# Logistic Regression

redo model training with new data in this script

In [1]:
import pandas as pd
import rdata
from sklearn.linear_model import LogisticRegression
import json
from sklearn.model_selection import GroupShuffleSplit
import numpy as np
from functions import get_cv_score
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_pickle("../data/df_uni_bi_tri.pickle")
df =df.reset_index(col_level=1).rename(columns = {'': 'id'}, level =0)

Add language that is going to be the label/ target for model:

In [3]:
# read df with language:
path_ind_diff = r"../data/version 1.1/primary data/individual differences data/joint.ind.diff.l2.rda"
parsed_readrate = rdata.parser.parse_file(path_ind_diff)
converted_readrate = rdata.conversion.convert(parsed_readrate)
df_readrate = converted_readrate['joint_id'][['uniform_id', 'lang']]

# need multiindex to merge properly:
df_readrate.columns = pd.MultiIndex.from_product([['labels'], df_readrate.columns]) 

# join language
df_main = df.merge(df_readrate, left_on = [('id', 'uniform_id')], right_on= [('labels', 'uniform_id')])
df_main = df_main.drop([('labels', 'uniform_id')], axis=1)

# encode language as lables ( using old lookup json):
with open("../data/outputs/language_lookup.json") as json_file:
    language_lookup = json.load(json_file)

df_main[('labels', 'lang_code')] = df_main[('labels', 'lang')].replace(language_lookup).values

In [4]:
##### only for 4 lang check: #####
#df_main[('labels', 'lang')].unique()
# use ee-fi and it-sp
df_main = df_main[df_main[('labels', 'lang')].isin(['ee', 'fi', 'it', 'sp'])]

##########

## LogReg NLIR

In [5]:
# defining model with correct solver
logreg = LogisticRegression(C=1e5, multi_class='multinomial', solver='lbfgs', random_state=0)

In [6]:
# columns used in training
columns = ['TF_uni', 'FP_uni', 'FF_uni', 'TF_bi', 'FP_bi', 'FF_bi', 'TF_tri', 'FP_tri', 'FF_tri' ]

Cross validation score: 

(only needed for report and not for further coding)

In [7]:
cross_val_scores = get_cv_score(df_main, columns, logreg)
print(np.mean(cross_val_scores))
#cross_val_scores # [0.65, 0.5666666666666667, 0.6, 0.5166666666666667, 0.5666666666666667]

5it [00:01,  3.00it/s]

0.4914285714285714
0.4914285714285714





In [10]:
# real train-test split to be used for further analysis: 
train_subset = df_main[columns]
train_label_subset = df_main[('labels', 'lang_code')]

gss = GroupShuffleSplit(n_splits=5, test_size = 0.3, random_state=42)
split_indexes = list(gss.split(train_subset, train_label_subset, df_main[('id','uniform_id')]))[0]
train_idx = list(split_indexes[0])
test_idx = list(split_indexes[1])

X_train = train_subset.iloc[train_idx]
y_train = train_label_subset.iloc[train_idx]
X_test = train_subset.iloc[test_idx]
y_test = train_label_subset.iloc[test_idx]

test_lang = df_main[('labels', 'lang')].iloc[test_idx]
test_lang_codes = df_main[('labels', 'lang_code')].iloc[test_idx]

In [12]:
# fit model:
logreg.fit(X_train, y_train)

# only for score count:
y_pred = logreg.predict(X_test)
test_acc_score = accuracy_score(y_test, y_pred)
print(f'Test set accuracy: {test_acc_score}; overall cross validation accuracy {np.mean(cross_val_scores)}')
# Test set accuracy: 0.6888888888888889; overall cross validation accuracy 0.5800000000000001 for all languages
# Test set accuracy: 0.4074074074074074; overall cross validation accuracy 0.4914285714285714 for 4 languages


Test set accuracy: 0.4074074074074074; overall cross validation accuracy 0.4914285714285714


### LogReg Similarity

Create dictionary with df for each language separately to be used for similarity metrics:

In [13]:
X_test['lang'] = test_lang # add language column
languages = list(test_lang.unique())
dict_lang = {}
for ll in languages:
    df = X_test[X_test['lang'] == ll]
    dict_lang[ll] = df


dict_proba ={}
len_dfs = []
lang_identifier = []
for key in dict_lang.keys():
    lang_df = dict_lang[key][columns]
    test_proba = logreg.predict_proba(lang_df) # predicting probabilities here
    len_df = len(lang_df)
    test_proba = test_proba.sum(axis=0)   # sum bc they are normalised by number of participants
    dict_proba[key] = test_proba
    len_dfs.append(len_df) # record length of language df
    lang_identifier.append(key)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['lang'] = test_lang # add language column


In [14]:
# to pandas:
df_predictions = pd.DataFrame(dict_proba).reset_index()
# cols are lang on which predicted and rows are outputs

# add col with lenghts and lang name
df_predictions['length'] = len_dfs
df_predictions['lang'] = lang_identifier

In [15]:
df_predictions

Unnamed: 0,index,ee,fi,it,sp,length,lang
0,0,1.0088,1.0,5.1556170000000005e-17,1.000196,9,ee
1,1,2.820599,1.979845,5.632468e-08,1.888169,5,fi
2,2,3e-05,0.998007,3.453242,1.368055,4,it
3,3,5.170571,1.022147,0.5467576,4.74358,9,sp


Convert into pair dictionary:

In [16]:
lang_values = []
lang_pairs = []
#lang_similarities = {}
for key1 in df_predictions['lang'].unique(): 
    for key2 in df_predictions['lang'].unique(): 
        val1 = float(df_predictions[df_predictions['lang'] == key1][key2])
        val2 = float(df_predictions[df_predictions['lang'] == key2][key1])
        len1 = float(df_predictions[df_predictions['lang'] == key1]['length'])
        len2 = float(df_predictions[df_predictions['lang'] == key2]['length'])
        
        #lang_similarities[f'{key1}_{key2}'] = ((val1 + val2) /(len1+len2)) #  divide by corpus length bc summed above
        lang_values.append((val1 + val2) / (len1+len2))
        lang_pairs.append(f'{key1}_{key2}')

In [17]:
df_lang_similarities = pd.DataFrame({'pair': lang_pairs, 'logreg_val': lang_values})

In [18]:
# inverted 
df_lang_similarities['logreg_val_inverted'] = 1 - df_lang_similarities['logreg_val']

In [19]:
#lang_similarities # invert as 1- proba. 
df_lang_similarities

Unnamed: 0,pair,logreg_val,logreg_val_inverted
0,ee_ee,0.112089,0.887911
1,ee_fi,0.2729,0.7271
2,ee_it,2e-06,0.999998
3,ee_sp,0.34282,0.65718
4,fi_ee,0.2729,0.7271
5,fi_fi,0.395969,0.604031
6,fi_it,0.11089,0.88911
7,fi_sp,0.20788,0.79212
8,it_ee,2e-06,0.999998
9,it_fi,0.11089,0.88911


In [20]:
# write 

df_lang_similarities.to_csv("../data/outputs/logreg_similarity_4lang.csv", index = False) # rename for other saves

**ToDos:**

- investigate why 4 lang perform worse than all. maybe try with 2? ( very different and very similar - eg ee vs fi and ee and sp)
- do separate runs with only bi and bi+tri-grams and fill table in report
- plot language tree using both similarity metrics
- do majority class baseline (cv accuracy only)

