In [None]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy

In [None]:
import spacy
from re import sub, split

# Clean the data

In [None]:
ratings_raw = pd.read_csv('data/docs_before.csv')

In [None]:
ratings_raw.head()

In [None]:
ratings_raw["avg_ranking"] = ratings_raw.iloc[:,6:10].mean(axis=1)


In [None]:
ratings_raw.columns

In [None]:
useful_cols = ['hp_id',
               'ratemds_id',
               'hasorder',
               'order_id',
               'avg_help', 
               'avg_know', 
               'avg_punct', 
               'avg_staff',
               'avg_ranking',
               'spec_comb',
               'review_corpus'
              ]

In [None]:
ratings = ratings_raw[useful_cols]

In [None]:
ratings = ratings.drop_duplicates(subset='hp_id',keep='first')

In [None]:
ratings.head()

In [None]:
ratings

In [None]:
gender_info = pd.read_csv('data/genders.csv')

In [None]:
gender_info.drop_duplicates(subset='hp_id', keep='first',inplace = True)

# Merge Datasets

In [None]:
doc_review = pd.merge(ratings, gender_info, on='hp_id', how='left')

In [None]:
doc_review['spec_comb'] = doc_review['spec_comb'].map(lambda x: x.strip())

In [None]:
doc_review['sentence']= doc_review['review_corpus'].map(lambda x: x.split('|'))

In [None]:
for sentence in doc_review['sentence'][0]:
    print(sentence)

In [None]:
len(doc_review)

In [None]:
len(doc_review['hp_id'].unique())

In [None]:
doc_review

# Create a dataset with one row per review rather than one row per doctor

In [None]:
doc_review.reset_index(inplace= True,drop = True)

In [None]:
doc_review.head()

In [None]:
im_reviews = doc_review[doc_review["spec_comb"] == "Internal Medicine"].reset_index(drop = True)

In [None]:
im_reviews.info()

In [None]:
# Generate a new dataframe of doctors' reviews with exactly one review per row.
# The rows are collected in a list and the frame is built once: DataFrame.append
# was removed in pandas 2.0, and growing a frame inside a loop is quadratic.
# hp_id is stored as a string here, as before.
review_records = [
    {'Review': review, 'hp_id': str(hp_id)}
    for hp_id, sentences in zip(im_reviews['hp_id'], im_reviews['sentence'])
    for review in sentences
]
im_review_df = pd.DataFrame(review_records, columns=['Review', 'hp_id'])

In [None]:
im_review_df.info()

In [None]:
im_review_df

In [None]:
im_review_df['hp_id'] = im_review_df['hp_id'].map(lambda x: int(x))

In [None]:
im_review_df.info()

In [None]:
im_review_df = pd.merge(im_review_df, doc_review, on='hp_id', how = 'left')

In [None]:
im_review_df.info()

In [None]:
im_review_df = im_review_df.drop(['review_corpus', 'sentence', 'spec_comb'], axis=1)


In [None]:
im_review_df.head()

In [None]:
im_review_df.gender.value_counts()

In [None]:
im_review_df.gender.value_counts()/len(im_review_df)

In [None]:
im_review_df.hasorder.value_counts()

In [None]:
im_review_df.hasorder.value_counts()/len(im_review_df)

In [None]:
def cols_to_binary(val, col_name):
    """Map a raw column value to a binary label.

    Parameters
    ----------
    val : the raw cell value (a gender code or a numeric rating).
    col_name : "gender" or "ranking"; selects which rule is applied.

    Returns
    -------
    1 or 0, or the string "NA" when the value matches neither rule
    (for example a missing gender or a NaN rating).

    Raises
    ------
    ValueError
        If col_name is not "gender" or "ranking". Previously this case fell
        through and failed with an UnboundLocalError.
    """
    if col_name == "gender":
        if val == "F":
            return 1
        if val == "M":
            return 0
        return "NA"
    if col_name == "ranking":
        if val >= 4.0:
            return 1
        if val < 4.0:
            return 0
        # Neither comparison is true for NaN, so missing ratings end up here.
        return "NA"
    raise ValueError(
        'col_name must be "gender" or "ranking", got {!r}'.format(col_name)
    )

In [None]:
# Binary gender label (rule "gender").
im_review_df["gen_bin"] = im_review_df["gender"].apply(cols_to_binary, args=("gender",))

# Binary "high rating" flags (rule "ranking"), created in this order:
# new column name -> source rating column.
ranking_targets = (
    ("high_avg_rank", "avg_ranking"),
    ("high_avg_help", "avg_help"),
    ("high_avg_know", "avg_know"),
    ("high_avg_punc", "avg_punct"),
    ("high_avg_staf", "avg_staff"),
)
for target_col, source_col in ranking_targets:
    im_review_df[target_col] = im_review_df[source_col].apply(cols_to_binary, args=("ranking",))


In [None]:
im_review_df

# Now prepare the data for training

In [None]:
# shuffle them (fixed seed so the train/test split is the same on every run)
df = im_review_df.sample(frac=1, random_state=42).reset_index(drop=True)


In [None]:
# Row position where the training set ends; cast to int because it is used as a slice bound.
eighty_percent = int(len(df) * .8)

In [None]:
# take 80% for train // 20% for test
# Slice bounds must be integers, so the cut-off is cast before use.
split_at = int(eighty_percent)
df_train = df.iloc[:split_at]
df_test = df.iloc[split_at:]


In [None]:
df_train = df_train.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

In [None]:
df_train

In [None]:
# split x data (literally just the review) from the other potential y variables (which also serve as metadata analysis features)

x_train = df_train["Review"]
y_train_gen = df_train["gen_bin"]
y_train_rank = df_train["high_avg_rank"]
y_train_help = df_train["high_avg_help"]
y_train_know = df_train["high_avg_know"]
y_train_punc = df_train["high_avg_punc"]

x_test = df_test["Review"]
y_test_gen = df_test["gen_bin"]
y_test_rank = df_test["high_avg_rank"]
y_test_help = df_test["high_avg_help"]
y_test_know = df_test["high_avg_know"]
y_test_punc = df_test["high_avg_punc"]

# Clean Explanations

In [None]:
x_train

In [None]:
def clean_word(s):
    """Normalise one review string before tokenisation.

    Applies a fixed sequence of regular-expression substitutions (percent
    signs, HTML ampersands, a few contraction/possessive suffixes) and then
    lower-cases the result.
    """
    # (pattern, replacement) pairs; they are applied in exactly this order.
    substitutions = (
        ("%", " percent"),   # percents
        ("&amp;", "and"),    # ampersands
        ("'s", ""),          # possessive or contraction
        ("'re", ""),         # contraction
        ("'ll", ""),         # contraction
        # NOTE(review): looks like a mis-decoded typographic apostrophe - confirm against the raw data.
        ("‚Äô", ""),
        ("'t", " not"),      # contraction
    )
    cleaned = s
    for pattern, replacement in substitutions:
        cleaned = sub(pattern, replacement, cleaned)

    # Lower-casing happens last, so the patterns above only match lower-case suffixes.
    return cleaned.lower()

In [None]:
x_train = x_train.apply(lambda x: clean_word(x))
x_test = x_test.apply(lambda x: clean_word(x))


## The most basic model: Naive Bayes

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorize text reviews to numbers
vec = CountVectorizer(stop_words='english')
x_train_vec = vec.fit_transform(x_train).toarray()
x_test_vec = vec.transform(x_test).toarray()


In [None]:
from sklearn.naive_bayes import MultinomialNB

model_nb = MultinomialNB()
model_nb.fit(x_train_vec, y_train_gen)

In [None]:
model_nb.score(x_train_vec, y_train_gen)


In [None]:
model_nb.score(x_test_vec, y_test_gen)


In [None]:
# Predicting the Test set results
y_pred = model_nb.predict(x_test_vec)
 

In [None]:
def conf_matrix(pred_val, actual_val):
    """Label one prediction/actual pair as "TP", "FP", "FN" or "TN".

    Returns "ERROR IN DATA" when the prediction is neither 1 nor 0, and None
    when the prediction is valid but the actual value is neither 1 nor 0.
    """
    predicted_positive = pred_val == 1
    if not predicted_positive and not (pred_val == 0):
        return "ERROR IN DATA"
    if actual_val == 1:
        return "TP" if predicted_positive else "FN"
    if actual_val == 0:
        return "FP" if predicted_positive else "TN"
    # Valid prediction but unusable actual value: no label.
    return None

In [None]:
def create_conf_df(model, x_df, y_df):
    """Build a per-row comparison of model predictions against actual labels.

    Parameters
    ----------
    model : fitted estimator exposing predict().
    x_df : features passed to model.predict().
    y_df : actual labels, in the same row order as x_df.

    Returns
    -------
    DataFrame with columns "preds", "actuals", "error_type" (the result of
    conf_matrix for that row) and "count" (always 1, for later summing).
    """
    compared_df = pd.DataFrame()
    compared_df["preds"] = model.predict(x_df)
    # Assign by position: a Series would be aligned on its index, which pairs
    # labels with the wrong predictions whenever y_df is not indexed 0..n-1.
    compared_df["actuals"] = np.asarray(y_df)
    compared_df["error_type"] = compared_df.apply(
        lambda row: conf_matrix(row.preds, row.actuals), axis=1
    )
    compared_df["count"] = 1
    return compared_df

In [None]:
def summary_cost_matrix(model, x_df, y_df):
    """Print confusion counts and precision / recall / F1 / F0.5 for a model.

    Parameters
    ----------
    model : fitted estimator exposing predict().
    x_df : features to predict on.
    y_df : actual labels for x_df.

    Returns
    -------
    None; all results are printed.
    """
    conf_df = create_conf_df(model, x_df, y_df)
    summary_df = conf_df[["error_type", "count"]].groupby(by=["error_type"]).sum()

    def _count(label):
        # Look the category up by name. Reading rows by position gives wrong
        # numbers as soon as a category is missing or an extra one appears.
        return int(summary_df["count"].get(label, 0))

    def _ratio(numerator, denominator):
        # An undefined ratio (zero denominator) is reported as NaN.
        return numerator / denominator if denominator else float("nan")

    fn = _count("FN")
    fp = _count("FP")
    tn = _count("TN")
    tp = _count("TP")
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1_score = _ratio(2 * precision * recall, precision + recall)
    F0_5score = _ratio((1 + 0.5**2) * precision * recall, 0.5**2 * precision + recall)
    print(summary_df)
    print("FN:", fn)
    print("FP:", fp)
    print("TN:", tn)
    print("TP:", tp)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1:", f1_score)
    print("F0.5:", F0_5score)

In [None]:
summary_cost_matrix(model_nb, x_train_vec, y_train_gen)

In [None]:
summary_cost_matrix(model_nb, x_test_vec, y_test_gen)

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
 
# n_estimators = # of trees
rf_model_no_extra = RandomForestClassifier(n_estimators = 501,
                                           criterion = 'entropy')
                             
rf_model_no_extra.fit(x_train_vec, y_train_gen)

In [None]:
rf_model_no_extra.score(x_train_vec, y_train_gen)


In [None]:
rf_model_no_extra.score(x_test_vec, y_test_gen)


In [None]:
# Predicting the Test set results
y_pred_rf_no_extra = rf_model_no_extra.predict(x_test_vec)
 

In [None]:
summary_cost_matrix(rf_model_no_extra, x_train_vec, y_train_gen)

In [None]:
summary_cost_matrix(rf_model_no_extra, x_test_vec, y_test_gen)

# Adding lemmatization, stemming etc

In [None]:
# Tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

word_tokenize(x_train[0])

In [None]:
# Lemmatize
lemm = WordNetLemmatizer()

def lemm_sentence(tokenized_sentence):
    """Return a new list with every token passed through the module-level lemmatizer `lemm`."""
    return [lemm.lemmatize(token) for token in tokenized_sentence]

In [None]:
# x_train_lemmatized = x_train_tokenized.apply(lambda x: lemm_sentence(x))
# x_test_lemmatized = x_test_tokenized.apply(lambda x: lemm_sentence(x))

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
  
ps = PorterStemmer()
 
def stem_sentence(lemmatized_sentence):
    """Return a new list with every token passed through the module-level stemmer `ps`."""
    return [ps.stem(token) for token in lemmatized_sentence]


In [None]:
# x_train_stemmed = x_train_lemmatized.apply(lambda x: stem_sentence(x))
# x_test_stemmed = x_test_lemmatized.apply(lambda x: stem_sentence(x))

In [None]:
def norm_sentence_again(stem_sentence):
    """Join tokens back into one string, each token preceded by a single space.

    The result therefore starts with a space (" good doctor"); an empty token
    list gives an empty string.
    """
    return "".join(" " + token for token in stem_sentence)

In [None]:
# x_train_to_vec = x_train_stemmed.apply(lambda x: norm_sentence_again(x))
# x_test_to_vec = x_test_stemmed.apply(lambda x: norm_sentence_again(x))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 100000)

In [None]:
def create_bigram_strings(string_sentence):
    """Turn a whitespace-separated sentence into a string of underscore-joined bigrams.

    "a b c" becomes " a_b b_c": every bigram is preceded by one space, so the
    result starts with a space. Sentences with fewer than two words give "".
    """
    # Split once; the previous version re-split the whole sentence twice per word.
    words = string_sentence.split()
    return "".join(
        " " + first + "_" + second for first, second in zip(words, words[1:])
    )

In [None]:
def train_ratings_to_vectors(x_train, y_train, x_test, y_test):
    """Turn raw review text into unigram count vectors.

    Parameters
    ----------
    x_train, x_test : pandas Series of review strings.
    y_train, y_test : labels; returned unchanged.

    Returns
    -------
    (train_vectors, y_train, test_vectors, y_test), where the vectors are dense
    arrays from a CountVectorizer fitted on the training text only.
    """
    def _normalise(reviews):
        # clean -> tokenize -> lemmatize -> stem -> re-join into one string per review.
        # lemm_sentence / stem_sentence use the module-level lemmatizer and stemmer.
        return (reviews.apply(clean_word)
                       .apply(word_tokenize)
                       .apply(lemm_sentence)
                       .apply(stem_sentence)
                       .apply(norm_sentence_again))

    train_text = _normalise(x_train)
    test_text = _normalise(x_test)

    # now count vectorize; the vocabulary comes from the training text only
    cv = CountVectorizer()
    inner_x_train_vec = cv.fit_transform(train_text).toarray()
    inner_x_test_vec = cv.transform(test_text).toarray()

    return inner_x_train_vec, y_train, inner_x_test_vec, y_test

In [None]:
def train_ratings_to_vectors_bigram(x_train, y_train, x_test, y_test):
    """Turn raw review text into combined unigram + bigram count vectors.

    Parameters
    ----------
    x_train, x_test : pandas Series of review strings.
    y_train, y_test : labels; returned unchanged.

    Returns
    -------
    (train_vectors, y_train, test_vectors, y_test), where the vectors are dense
    arrays from a CountVectorizer fitted on the training text only.
    """
    def _normalise(reviews):
        # clean -> tokenize -> lemmatize -> stem -> re-join into one string per review.
        # lemm_sentence / stem_sentence use the module-level lemmatizer and stemmer.
        return (reviews.apply(clean_word)
                       .apply(word_tokenize)
                       .apply(lemm_sentence)
                       .apply(stem_sentence)
                       .apply(norm_sentence_again))

    def _with_bigrams(text):
        # Append the "w1_w2" bigram tokens to the unigram sentence so one
        # vectorizer counts both kinds of token.
        return text + text.apply(create_bigram_strings)

    comb_inner_x_train_to_vec = _with_bigrams(_normalise(x_train))
    comb_inner_x_test_to_vec = _with_bigrams(_normalise(x_test))

    # now count vectorize; the vocabulary comes from the training text only
    cv = CountVectorizer()
    inner_x_train_vec = cv.fit_transform(comb_inner_x_train_to_vec).toarray()
    inner_x_test_vec = cv.transform(comb_inner_x_test_to_vec).toarray()

    return inner_x_train_vec, y_train, inner_x_test_vec, y_test


In [None]:
bi_uni_df = train_ratings_to_vectors_bigram(x_train, y_train_gen, x_test, y_test_gen)

In [None]:
standard_df = train_ratings_to_vectors(x_train, y_train_gen, x_test, y_test_gen)

In [None]:
def all_eval_metrics(trained_model, x_train, y_train, x_test, y_test):
    """Print train/test scores and precision, recall, F1 and F0.5 on the test set.

    x_train and x_test must already be in the vectorised form the model was
    trained on. Nothing is returned.
    """
    print("Train fit: ", trained_model.score(x_train, y_train))
    print("Test fit: ", trained_model.score(x_test, y_test))

    # Predicting the Test set results
    test_predictions = trained_model.predict(x_test)

    # NOTE(review): prec_reca_f1 is not defined in the visible part of this
    # notebook - confirm it exists before relying on this function.
    scores = prec_reca_f1(y_test, test_predictions)
    for label, position in (("Precision: ", 0), ("Recall: ", 1), ("F1: ", 2), ("F0.5: ", 3)):
        print(label, scores[position])


# Mix in the dictionaries

In [None]:
# First we have to redo the tokenization because we want it to be in a different format to compute cosine similarity, we will not use these after we have the dictionary cosine similarity vals

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


In [None]:
# Lemmatize
lemm = WordNetLemmatizer()

def lemm_sentence(tokenized_sentence):
    """Return a new list with every token passed through the module-level lemmatizer `lemm`."""
    return [lemm.lemmatize(token) for token in tokenized_sentence]

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
  
ps = PorterStemmer()
 
def stem_sentence(lemmatized_sentence):
    """Return a new list with every token passed through the module-level stemmer `ps`."""
    return [ps.stem(token) for token in lemmatized_sentence]


In [None]:
def norm_sentence_again(stem_sentence):
    """Join tokens back into one string, each token preceded by a single space.

    The result therefore starts with a space (" good doctor"); an empty token
    list gives an empty string.
    """
    return "".join(" " + token for token in stem_sentence)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)

In [None]:
def train_ratings_to_vectors_STEMMED(x_train, y_train, x_test, y_test):
    """Clean, tokenize, lemmatize and stem the reviews without vectorising them.

    Parameters
    ----------
    x_train, x_test : pandas Series of review strings.
    y_train, y_test : labels; returned unchanged.

    Returns
    -------
    (train_tokens, y_train, test_tokens, y_test), where the token Series hold
    one list of stemmed tokens per review.
    """
    def _to_stemmed_tokens(reviews):
        # lemm_sentence / stem_sentence use the module-level lemmatizer and stemmer.
        return (reviews.apply(clean_word)
                       .apply(word_tokenize)
                       .apply(lemm_sentence)
                       .apply(stem_sentence))

    return _to_stemmed_tokens(x_train), y_train, _to_stemmed_tokens(x_test), y_test


In [None]:
standard_df_for_str = train_ratings_to_vectors_STEMMED(x_train, y_train_gen, x_test, y_test_gen)

In [None]:
def tokenized_to_string(tokenized_sent):
    """Join tokens into a single space-separated string with no leading space."""
    return " ".join(tokenized_sent)

In [None]:
standard_df_str = (standard_df_for_str[0].apply(tokenized_to_string),
                   standard_df_for_str[1],
                   standard_df_for_str[2].apply(tokenized_to_string),
                   standard_df_for_str[3],)

# Make covariate columns

In [None]:
mass_dict_pd = pd.read_excel('data/LIWC2007dictionary_cleaned.xls', sheet_name = "Cleaned")

In [None]:
mass_dict = {}

In [None]:
mass_dict_pd.columns

In [None]:
for col in mass_dict_pd.columns:
    # Keep the non-missing entries of each dictionary column, with "*" removed.
    # The loop variable is deliberately not named `clean_word`: that name would
    # overwrite the clean_word() function defined earlier in this notebook.
    mass_dict[col] = [
        entry.replace("*", "")
        for entry in mass_dict_pd[col]
        if pd.notna(entry)
    ]

In [None]:
mass_dict

# Dictionary Semantic Embeddings

In [None]:
from empath import Empath
lexicon = Empath()

In [None]:
import spacy
from collections import defaultdict

In [None]:
nlp = spacy.load("en_core_web_lg")

In [None]:
dicts_as_str = defaultdict(str) 
dicts_as_nlp = defaultdict(lambda: 'initial')

In [None]:
for col_key in mass_dict:
    # Assign the string instead of appending to the previous value, so that
    # re-running this cell does not duplicate every dictionary word.
    # Each word is followed by one space, as before.
    dicts_as_str[col_key] = "".join(item + " " for item in mass_dict[col_key])
    dicts_as_nlp[col_key] = nlp(dicts_as_str[col_key])

In [None]:
# dicts_as_str = the dictionary as a concatenated string
# dicts_as_nlp = the nlp(objects) of these concatenated string

In [None]:
standard_df_str[0]

In [None]:
dicts_as_nlp.keys()

## Semantic Cosine Similarity Embeddings

In [None]:
import statistics as st


In [None]:
def median_cosine_similarity(string_sentence, dict_col, dict_to_compare = dicts_as_nlp):
    """Median similarity between a sentence and the words of one dictionary.

    Parameters
    ----------
    string_sentence : the sentence as a plain string (it is lower-cased here).
    dict_col : key of the dictionary to compare against.
    dict_to_compare : mapping from dictionary key to an nlp() document of that
        dictionary's words; defaults to the module-level dicts_as_nlp.

    Returns
    -------
    The median of the distinct similarity scores between the sentence and each
    token of the selected dictionary document.
    """
    main_doc = nlp(string_sentence.lower())

    # Read from the mapping that was passed in. Previously the argument was
    # ignored and the global dicts_as_nlp was always used.
    # NOTE(review): a set drops repeated scores before the median is taken;
    # kept as-is - confirm whether a list was intended.
    all_scores = {
        main_doc.similarity(nlp_word) for nlp_word in dict_to_compare[dict_col]
    }

    return st.median(all_scores)

In [None]:
def create_meta_cols_sem_we(string_sentence, dict_to_compare = dicts_as_nlp):
    """Build a one-row DataFrame of dictionary-similarity features for a sentence.

    Parameters
    ----------
    string_sentence : the sentence as a plain string, not a token list; the
        original note says it works best with a concatenated string of
        tokenized words.
    dict_to_compare : mapping from dictionary key to an nlp() document;
        defaults to the module-level dicts_as_nlp.

    Returns
    -------
    DataFrame with a single row (index 0): the fixed dictionary columns below
    (0 where no score is computed), "Length" (word count of the sentence), and
    one median-similarity score per key of dict_to_compare.
    """
    new_meta_cols = ['Quant',
                     'Numbers',
                     'Humans',
                     'Affect',
                     'Posemo',
                     'Negemo',
                     'Cause',
                     'Health',
                     'Money',
                     'Death',
                     'Ipron',
                     'aux_verb',
                     'adverbs',
                     'Negate',
                     'Family',
                     'Anx',
                     'Anger',
                     'Sad',
                     'CogMech',
                     'Insight',
                     'Discrep',
                     'Tentat',
                     'Certain',
                     'Inhib',
                     'Incl',
                     'Excl',
                     'Bio',
                     'Body',
                     'Sexual',
                     'Time',
                     'Achiev']
    # Start every known column at 0, then add the sentence length.
    row_values = dict.fromkeys(new_meta_cols, 0)
    row_values['Length'] = len(string_sentence.split())

    string_doc1 = string_sentence.lower()

    for dict_key in dict_to_compare:
        # Forward dict_to_compare so a caller-supplied mapping is actually
        # used for scoring, not only for choosing the keys.
        row_values[dict_key] = median_cosine_similarity(string_doc1, dict_key, dict_to_compare)

    return pd.DataFrame([row_values])

In [None]:
def create_meta_dfs_sem_we(train_df_tokenized_strings, test_df_tokenized_strings):
    """Build the dictionary-similarity feature frames for the train and test sets.

    Parameters
    ----------
    train_df_tokenized_strings, test_df_tokenized_strings : iterables of
        sentence strings (tokenized words joined back into one string).

    Returns
    -------
    (meta_train, meta_test): one row per input sentence, indexed 0..n-1, with
    the columns produced by create_meta_cols_sem_we.
    """
    new_meta_cols = ['Quant',
                     'Numbers',
                     'Humans',
                     'Affect',
                     'Posemo',
                     'Negemo',
                     'Cause',
                     'Health',
                     'Money',
                     'Death',
                     'Ipron',
                     'aux_verb',
                     'adverbs',
                     'Negate',
                     'Family',
                     'Anx',
                     'Anger',
                     'Sad',
                     'CogMech',
                     'Insight',
                     'Discrep',
                     'Tentat',
                     'Certain',
                     'Inhib',
                     'Incl',
                     'Excl',
                     'Bio',
                     'Body',
                     'Sexual',
                     'Time',
                     'Achiev']

    def _stack_rows(sentences):
        # Build every one-row frame first and concatenate once:
        # DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
        row_frames = [create_meta_cols_sem_we(sentence) for sentence in sentences]
        if not row_frames:
            return pd.DataFrame(columns=new_meta_cols)
        return pd.concat(row_frames, ignore_index=True)

    meta_train = _stack_rows(train_df_tokenized_strings)
    meta_test = _stack_rows(test_df_tokenized_strings)

    return meta_train, meta_test

In [None]:
x_train_meta_data, x_test_meta_data = create_meta_dfs_sem_we(standard_df_str[0], standard_df_str[2])

In [None]:
# Grab proper ngram data
x_train_bi_uni = bi_uni_df[0]
y_train_bi_uni = bi_uni_df[1]
x_test_bi_uni = bi_uni_df[2]
y_test_bi_uni = bi_uni_df[3]

# The modelling cells below use plain `y_train` / `y_test`, which no visible
# cell defines. The vectorising helpers return the labels unchanged, so bind
# those names here to keep a fresh Restart & Run All working.
y_train = y_train_bi_uni
y_test = y_test_bi_uni

In [None]:
# Grab proper ngram data
x_train_uni = standard_df[0]
y_train_uni = standard_df[1]
x_test_uni = standard_df[2]
y_test_uni = standard_df[3]

In [None]:
standard_df

In [None]:
# Prefix the positional n-gram columns so every column name is a string:
# recent scikit-learn versions raise a TypeError when names mix str and int.
meta_plus_unibi_train = pd.concat([x_train_meta_data, pd.DataFrame(x_train_bi_uni).add_prefix("ngram_")], axis =1)

In [None]:
meta_plus_unibi_train

In [None]:
# Prefix the positional n-gram columns so every column name is a string:
# recent scikit-learn versions raise a TypeError when names mix str and int.
meta_plus_unibi_test = pd.concat([x_test_meta_data, pd.DataFrame(x_test_bi_uni).add_prefix("ngram_")], axis =1)

In [None]:
meta_plus_unibi_test

In [None]:
# Prefix the positional n-gram columns so every column name is a string:
# recent scikit-learn versions raise a TypeError when names mix str and int.
meta_plus_uni_train = pd.concat([x_train_meta_data, pd.DataFrame(x_train_uni).add_prefix("ngram_")], axis =1)

In [None]:
# Prefix the positional n-gram columns so every column name is a string:
# recent scikit-learn versions raise a TypeError when names mix str and int.
meta_plus_uni_test = pd.concat([x_test_meta_data, pd.DataFrame(x_test_uni).add_prefix("ngram_")], axis =1)

# logreg


In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics
logreg = LogisticRegressionCV(cv = 5, max_iter = 10000)


In [None]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
df_train_scaled = min_max_scaler.fit_transform(meta_plus_unibi_train)
# Re-use the ranges learned on the training data. Re-fitting on the test set
# would scale the two sets differently and let test statistics leak in.
df_test_scaled = min_max_scaler.transform(meta_plus_unibi_test)


In [None]:
logreg.fit(df_train_scaled, y_train)

In [None]:
all_eval_metrics(logreg, df_train_scaled, y_train, df_test_scaled, y_test)

# Run models on combined set

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

rf_model = RandomForestClassifier(criterion = 'entropy') 

In [None]:
# 5-Fold Cross validation
np.mean(cross_val_score(rf_model, meta_plus_unibi_train, y_train, cv=5))

In [None]:
intro_param_grid = {
                    'bootstrap': [True],
                    'criterion': ['entropy'],
                    'max_depth': [20, 25],
                    'max_features': ['sqrt'],
                    'min_samples_leaf': [1, 2, 3],
                    'min_samples_split': [3, 5],
                    'n_estimators': [1000]
                   }

In [None]:
from sklearn.model_selection import GridSearchCV

grid_rf_model = GridSearchCV(rf_model, intro_param_grid, cv=5)
grid_rf_model.fit(meta_plus_unibi_train, y_train)

In [None]:
meta_plus_unibi_train

In [None]:
# grid_rf_model.best_estimator_ 

In [None]:
# grid_rf_model.best_params_

In [None]:
all_eval_metrics(grid_rf_model, meta_plus_unibi_train, y_train, meta_plus_unibi_test, y_test)

# CHANGE METRIC

In [None]:
from sklearn.metrics import fbeta_score, make_scorer

In [None]:
# NOTE: despite its name, this scorer is F-beta with beta=0.5 (the F0.5 score,
# which weights precision more than recall), not F1.
fone_scorer = make_scorer(fbeta_score, beta=0.5)
fone_scorer

In [None]:
rf_model_f1 = RandomForestClassifier(criterion = 'entropy') 

In [None]:
# Random-forest search space.
# 'auto' was dropped from max_features: for classifiers it meant the same as
# 'sqrt' (so it doubled the search for nothing) and newer scikit-learn
# versions no longer accept it.
param_grid1 = {
                 'n_estimators': [1000, 1100],
                 'max_depth': [15, 20, 25],
                 'criterion': ["entropy"],
                 'max_features': ['sqrt'],
                 'min_samples_split': [2, 3, 4],
                 'min_samples_leaf': [1, 2],
                 'bootstrap': [True, False]
             }

In [None]:
from sklearn.model_selection import GridSearchCV

grid_rf_model_f1 = GridSearchCV(rf_model_f1, param_grid1, cv=5, scoring=fone_scorer)
grid_rf_model_f1.fit(meta_plus_uni_train, y_train)

In [None]:
grid_rf_model_f1.best_estimator_ 

In [None]:
grid_rf_model_f1.best_params_

In [None]:
all_eval_metrics(grid_rf_model_f1, meta_plus_uni_train, y_train, meta_plus_uni_test, y_test)

In [None]:
preds = grid_rf_model_f1.predict(meta_plus_uni_test)

In [None]:
grid_rf_model_f1.best_params_

In [None]:
best_params = {'bootstrap': [False],
 'criterion': ['entropy'],
 'max_depth': [25],
 'max_features': ['sqrt'],
 'min_samples_leaf': [1],
 'min_samples_split': [2],
 'n_estimators': [1000]}

In [None]:
rf_model_best = RandomForestClassifier(criterion = 'entropy') 

In [None]:
from sklearn.model_selection import GridSearchCV

grid_rf_model_best = GridSearchCV(rf_model_best, best_params, cv=5, scoring=fone_scorer)
grid_rf_model_best.fit(meta_plus_uni_train, y_train)

In [None]:
meta_plus_uni_train

In [None]:
all_eval_metrics(grid_rf_model_best, meta_plus_uni_train, y_train, meta_plus_uni_test, y_test)

### Bigram Inclusion

In [None]:
rf_model_f1_bi = RandomForestClassifier(criterion = 'entropy') 

In [None]:
# Random-forest search space for the bigram model.
# 'auto' was dropped from max_features: for classifiers it meant the same as
# 'sqrt' (so it doubled the search for nothing) and newer scikit-learn
# versions no longer accept it.
param_grid1 = {
                 'n_estimators': [1000, 1100],
                 'max_depth': [15, 20, 25],
                 'criterion': ["entropy"],
                 'max_features': ['sqrt'],
                 'min_samples_split': [2, 3, 4],
                 'min_samples_leaf': [1, 2],
                 'bootstrap': [True, False]
             }

In [None]:
from sklearn.model_selection import GridSearchCV

grid_rf_model_f1_bi = GridSearchCV(rf_model_f1_bi, param_grid1, cv=5, scoring=fone_scorer)
grid_rf_model_f1_bi.fit(meta_plus_unibi_train, y_train)

In [None]:
grid_rf_model_f1_bi.best_estimator_ 

In [None]:
grid_rf_model_f1_bi.best_params_

In [None]:
all_eval_metrics(grid_rf_model_f1_bi, meta_plus_unibi_train, y_train, meta_plus_unibi_test, y_test)

In [None]:
#preds = grid_rf_model_f1_bi.predict(meta_plus_unibi_test)

In [None]:
best_params_bi = {'bootstrap': [False],
 'criterion': ['entropy'],
 'max_depth': [25],
 'max_features': ['sqrt'],
 'min_samples_leaf': [1],
 'min_samples_split': [2],
 'n_estimators': [1000]}

In [None]:
rf_model_best_bi = RandomForestClassifier(criterion = 'entropy') 

In [None]:
from sklearn.model_selection import GridSearchCV

grid_rf_model_best_bi = GridSearchCV(rf_model_best_bi, best_params_bi, cv=5, scoring=fone_scorer)
grid_rf_model_best_bi.fit(meta_plus_unibi_train, y_train)

In [None]:
# Evaluate the model fitted on the unigram+bigram features in the cell above.
# grid_rf_model_best was trained on the unigram-only features instead.
all_eval_metrics(grid_rf_model_best_bi, meta_plus_unibi_train, y_train, meta_plus_unibi_test, y_test)



# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics
logreg = LogisticRegressionCV(cv = 5, max_iter = 10000)


In [None]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
df_train_scaled = min_max_scaler.fit_transform(meta_plus_unibi_train)
# Re-use the ranges learned on the training data. Re-fitting on the test set
# would scale the two sets differently and let test statistics leak in.
df_test_scaled = min_max_scaler.transform(meta_plus_unibi_test)


In [None]:
# Search space for the logistic-regression grid search.
# The 'refit' list was missing its closing bracket and comma, which made this
# cell a syntax error.
param_grid_logr = {    
                     'fit_intercept': [True, False],
                     'max_iter': [50000],
                     'penalty': ['l2'],
                     'refit': [True, False],
                     'scoring': [fone_scorer],
                     'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
}


In [None]:
grid_rf_model_regr = GridSearchCV(logreg, param_grid_logr, cv=5, scoring=fone_scorer)
grid_rf_model_regr.fit(df_train_scaled, y_train)

In [None]:
all_eval_metrics(grid_rf_model_regr, df_train_scaled, y_train, df_test_scaled, y_test)

In [None]:
def conf_matrix(pred_val, actual_val):
    """Label one prediction/actual pair as "TP", "FP", "FN" or "TN".

    Returns "ERROR IN DATA" when the prediction is neither 1 nor 0, and None
    when the prediction is valid but the actual value is neither 1 nor 0.
    """
    predicted_positive = pred_val == 1
    if not predicted_positive and not (pred_val == 0):
        return "ERROR IN DATA"
    if actual_val == 1:
        return "TP" if predicted_positive else "FN"
    if actual_val == 0:
        return "FP" if predicted_positive else "TN"
    # Valid prediction but unusable actual value: no label.
    return None

In [None]:
def create_conf_df(model, x_df, y_df):
    """Build a per-row comparison of model predictions against actual labels.

    Parameters
    ----------
    model : fitted estimator exposing predict().
    x_df : features passed to model.predict().
    y_df : actual labels, in the same row order as x_df.

    Returns
    -------
    DataFrame with columns "preds", "actuals", "error_type" (the result of
    conf_matrix for that row) and "count" (always 1, for later summing).
    """
    compared_df = pd.DataFrame()
    compared_df["preds"] = model.predict(x_df)
    # Assign by position: a Series would be aligned on its index, which pairs
    # labels with the wrong predictions whenever y_df is not indexed 0..n-1.
    compared_df["actuals"] = np.asarray(y_df)
    compared_df["error_type"] = compared_df.apply(
        lambda row: conf_matrix(row.preds, row.actuals), axis=1
    )
    compared_df["count"] = 1
    return compared_df

In [None]:
def summary_cost_matrix(model, x_df, y_df):
    """Print confusion counts and precision / recall / F1 / F0.5 for a model.

    Parameters
    ----------
    model : fitted estimator exposing predict().
    x_df : features to predict on.
    y_df : actual labels for x_df.

    Returns
    -------
    None; all results are printed.
    """
    conf_df = create_conf_df(model, x_df, y_df)
    summary_df = conf_df[["error_type", "count"]].groupby(by=["error_type"]).sum()

    def _count(label):
        # Look the category up by name. Reading rows by position gives wrong
        # numbers as soon as a category is missing or an extra one appears.
        return int(summary_df["count"].get(label, 0))

    def _ratio(numerator, denominator):
        # An undefined ratio (zero denominator) is reported as NaN.
        return numerator / denominator if denominator else float("nan")

    fn = _count("FN")
    fp = _count("FP")
    tn = _count("TN")
    tp = _count("TP")
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1_score = _ratio(2 * precision * recall, precision + recall)
    F0_5score = _ratio((1 + 0.5**2) * precision * recall, 0.5**2 * precision + recall)
    print(summary_df)
    print("FN:", fn)
    print("FP:", fp)
    print("TN:", tn)
    print("TP:", tp)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1:", f1_score)
    print("F0.5:", F0_5score)

In [None]:
def fitness_check(model, x_train, y_train, x_test, y_test):
    """Print the model's score on the training data, then on the test data."""
    splits = (
        ("Train fit: ", x_train, y_train),
        ("Test fit: ", x_test, y_test),
    )
    for label, features, targets in splits:
        print(label, model.score(features, targets))

In [None]:
fitness_check(grid_rf_model_regr, df_train_scaled, y_train, df_test_scaled, y_test)

In [None]:
summary_cost_matrix(grid_rf_model_regr, df_test_scaled, y_test)