# Logistic Regression

redo model training with new data in this script

In [1]:
import pandas as pd
import rdata
from sklearn.linear_model import LogisticRegression
import json
from sklearn.model_selection import GroupShuffleSplit
import numpy as np
from functions import get_cv_score
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_pickle("../data/df_uni_bi_tri.pickle")
df =df.reset_index(col_level=1).rename(columns = {'': 'id'}, level =0)

In [4]:
# language for labels
path_ind_diff = r"../data/version 1.1/primary data/individual differences data/joint.ind.diff.l2.rda"
parsed_readrate = rdata.parser.parse_file(path_ind_diff)
converted_readrate = rdata.conversion.convert(parsed_readrate)
df_readrate = converted_readrate['joint_id'][['uniform_id', 'lang']]

df_readrate.columns = pd.MultiIndex.from_product([['labels'], df_readrate.columns])

In [5]:
# join lang
df_main = df.merge(df_readrate, left_on = [('id', 'uniform_id')], right_on= [('labels', 'uniform_id')])
df_main = df_main.drop([('labels', 'uniform_id')], axis=1)


In [9]:
# encode language as lables
with open("../data/outputs/language_lookup.json") as json_file:
    language_lookup = json.load(json_file)

df_main[('labels', 'lang_code')] = df_main[('labels', 'lang')].replace(language_lookup).values

## LogReg NLIR

In [7]:
logreg = LogisticRegression(C=1e5, multi_class='multinomial', solver='lbfgs', random_state=0)

In [13]:
# columns used in training
columns = ['TF_uni', 'FP_uni', 'FF_uni', 'TF_bi', 'FP_bi', 'FF_bi', 'TF_tri', 'FP_tri', 'FF_tri' ]

Cross validation score: only needed for report and not for further coding

In [22]:
cross_val_scores = get_cv_score(df_main, columns, logreg)
print(np.mean(cross_val_scores))
cross_val_scores

5it [00:03,  1.29it/s]

0.5800000000000001
0.5800000000000001





[0.65, 0.5666666666666667, 0.6, 0.5166666666666667, 0.5666666666666667]

In [16]:
# real traintest split to be used: 
train_subset = df_main[columns]
train_label_subset = df_main[('labels', 'lang_code')]

gss = GroupShuffleSplit(n_splits=5, test_size = 0.3, random_state=42)
split_indexes = list(gss.split(train_subset, train_label_subset, df_main[('id','uniform_id')]))[0]
train_idx = list(split_indexes[0])
test_idx = list(split_indexes[1])

X_train = train_subset.iloc[train_idx]
y_train = train_label_subset.iloc[train_idx]
X_test = train_subset.iloc[test_idx]
y_test = train_label_subset.iloc[test_idx]

test_lang = df_main[('labels', 'lang')].iloc[test_idx]
test_lang_codes = df_main[('labels', 'lang_code')].iloc[test_idx]

In [27]:
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
test_acc_score = accuracy_score(y_test, y_pred)
print(f'Test set accuracy: {test_acc_score}; overall cross validation accuracy" {np.mean(cross_val_scores)}')

Test set accuracy: 0.6888888888888889; overall cross validation accuracy" 0.5800000000000001


### LogReg Similarity

In [50]:
X_test['lang'] = test_lang # add language column
languages = list(test_lang.unique())
dict_lang = {}
for ll in languages:
    df = X_test[X_test['lang'] == ll]
    dict_lang[ll] = df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['lang'] = test_lang # add language column


In [51]:
dict_proba ={}
len_dfs = []
lang_identifier = []
for key in dict_lang.keys():
    lang_df = dict_lang[key][columns]
    test_proba = logreg.predict_proba(lang_df)
    len_df = len(lang_df)
    test_proba = test_proba.sum(axis=0)   # .mean(axis=0) # do sum instead of mean 
    dict_proba[key] = test_proba
    len_dfs.append(len_df) # record length of language df
    lang_identifier.append(key)
    

In [53]:
df_predictions = pd.DataFrame(dict_proba).reset_index()#.rename(columns = {'index': 'lang_code'})
# cols are lang on which predicted and rows are outputs


In [54]:
# add col with lenghts and lang name
df_predictions['length'] = len_dfs
df_predictions['lang'] = lang_identifier

In [33]:
# with open("../data/outputs/language_lookup.json") as json_file:
#     language_lookup = json.load(json_file)

# reversed_language_lookup  = {v: k for k, v in language_lookup.items()}
# df_predictions["lang_name"] = df_predictions['lang_code'].replace(reversed_language_lookup).values

In [56]:
df_predictions

Unnamed: 0,index,ee,fi,ge,he,it,en,sp,length,lang
0,0,4.404483,1.033553,0.1051325,1.647245e-16,0.000294,3.75439e-09,1.099893,10,ee
1,1,2.19452e-12,1.560682e-07,1.2816230000000001e-27,5.985482e-37,3e-06,7.999949,0.005478773,6,fi
2,2,1.00302,3.434715,3.179167e-11,1.0,0.205143,5.093741e-05,0.8932847,8,ge
3,3,1.272788,0.9956819,7.891825,2.071659e-14,1.000016,2.400117e-13,7.749357e-11,2,he
4,4,1.00018,0.004521053,0.003041186,1.0,0.999969,7.979861e-18,0.00141968,6,it
5,5,0.5359143,0.396259,8.12118e-07,1.495791e-23,3.786561,4.205258e-14,0.7135236,8,en
6,6,1.783615,0.1352705,1.10329e-09,1.041547e-14,0.008015,6.281173e-13,2.2864,5,sp


In [59]:
lang_values = []
lang_pairs = []
#lang_similarities = {}
for key1 in df_predictions['lang'].unique(): # language_lookup.keys()
    for key2 in df_predictions['lang'].unique(): # language_lookup.keys()
        #print((key1, key2))
        val1 = float(df_predictions[df_predictions['lang'] == key1][key2])
        val2 = float(df_predictions[df_predictions['lang'] == key2][key1])
        len1 = float(df_predictions[df_predictions['lang'] == key1]['length'])
        len2 = float(df_predictions[df_predictions['lang'] == key2]['length'])
        
        #lang_similarities[f'{key1}_{key2}'] = ((val1 + val2) /(len1+len2)) #  divide by corpus length bc summed above
        lang_values.append((val1 + val2) / (len1+len2))
        lang_pairs.append(f'{key1}_{key2}')

In [60]:
df_lang_similarities = pd.DataFrame({'pair': lang_pairs, 'logreg_val': lang_values})

In [61]:
df_lang_similarities['logreg_val_inverted'] = 1 - df_lang_similarities['logreg_val']

In [62]:
#lang_similarities # invert as 1- proba. 
df_lang_similarities

Unnamed: 0,pair,logreg_val,logreg_val_inverted
0,ee_ee,0.4404483,0.559552
1,ee_fi,0.06459703,0.935403
2,ee_ge,0.06156402,0.938436
3,ee_he,0.1060656,0.893934
4,ee_it,0.0625296,0.93747
5,ee_en,0.02977302,0.970227
6,ee_sp,0.1922339,0.807766
7,fi_ee,0.06459703,0.935403
8,fi_fi,2.601136e-08,1.0
9,fi_ge,0.2453368,0.754663


In [63]:
# with open("../data/outputs/logreg_similarity.json", "w") as outfile:
#     json.dump(lang_similarities, outfile)

df_lang_similarities.to_csv("../data/outputs/logreg_similarity_new.csv", index = False)