# Logistic regression

## Data Load

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import json
#import fastparquet

In [2]:
from functions import get_cv_score, custom_train_test_split

In [3]:
data_extended_path = r"../data/data_extended_v1_1/data_v1_1.parquet.gzip"
df = pd.read_csv(data_extended_path)
#df = pd.read_parquet(data_extended_path, engine='fastparquet')
#pd.read_parquet(r"../data/data_extended_v1_1/data_v1_1.parquet.gzip")

In [4]:
df

Unnamed: 0,subid,firstrun.gopast,firstfix.dur,dur,total_dur,lang,FF_norm,FP_norm,TF_norm,lang_code
0,DU_04,0.0,278.0,464.0,5486.0,du,0.000000,0.050674,0.084579,0
1,DU_04,944.0,164.0,480.0,5486.0,du,0.172074,0.029894,0.087495,0
2,DU_04,155.0,155.0,155.0,5486.0,du,0.028254,0.028254,0.028254,0
3,DU_04,323.0,323.0,323.0,5486.0,du,0.058877,0.058877,0.058877,0
4,DU_04,462.0,265.0,462.0,5486.0,du,0.084214,0.048305,0.084214,0
...,...,...,...,...,...,...,...,...,...,...
1732238,tr_52,0.0,560.0,1598.0,26247.0,tr,0.000000,0.021336,0.060883,11
1732239,tr_52,0.0,432.0,1810.0,26247.0,tr,0.000000,0.016459,0.068960,11
1732240,tr_52,0.0,766.0,2109.0,26247.0,tr,0.000000,0.029184,0.080352,11
1732241,tr_52,0.0,652.0,1459.0,26247.0,tr,0.000000,0.024841,0.055587,11


## LogReg NLIR

In [5]:
logreg = LogisticRegression(C=1e5, multi_class='multinomial', solver='lbfgs', random_state=0)

In [6]:
train_cols = ['FF_norm', 'FP_norm', 'TF_norm'] # more cols for additional task of training on more columns.


In [None]:
cross_val_scores = get_cv_score(df, train_cols, "LogisticRegression", logreg)
print(np.mean(cross_val_scores))

In [8]:
X_train, X_test, y_train, y_test, test_lang = custom_train_test_split(df, train_cols)

In [9]:
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
test_acc_score = accuracy_score(y_test, y_pred)

#print(f'Test set accuracy: {test_acc_score}; overall cross validation accuracy" {np.mean(cross_val_scores)}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [53]:
# get other metrics as well, maybe store and analyze output?
# write predictions
# out path
# np.savetxt(r"../data/outputs/NLIR_predictions_logreg.csv", y_pred, delimiter=",")

### LogReg Similarity

In [10]:
X_test['lang'] = test_lang # add language column
languages = list(test_lang.unique())
dict_lang = {}
for ll in languages:
    df = X_test[X_test['lang'] == ll]
    dict_lang[ll] = df

In [28]:
dict_proba ={}
for key in dict_lang.keys():
    test_proba = logreg.predict_proba(dict_lang[key][train_cols])
    test_proba = test_proba.mean(axis=0)
    dict_proba[key] = test_proba

In [59]:
df_predictions = pd.DataFrame(dict_proba).reset_index().rename(columns = {'index': 'lang_code'})
# cols are lang on which predicted and rows are outputs

In [61]:
with open("../data/outputs/language_lookup.json") as json_file:
    language_lookup = json.load(json_file)

In [62]:
reversed_language_lookup  = {v: k for k, v in language_lookup.items()}
df_predictions["lang_name"] = df_predictions['lang_code'].replace(reversed_language_lookup).values

In [64]:
df_predictions

Unnamed: 0,lang_code,du,ee,fi,ge,gr,he,it,en,no,ru,sp,tr,lang_name
0,0,0.051728,0.051585,0.051948,0.04894,0.051722,0.049133,0.050932,0.049314,0.051155,0.051508,0.050243,0.049665,du
1,1,0.123299,0.123247,0.123716,0.118953,0.123329,0.119501,0.122155,0.118465,0.1225,0.122885,0.121064,0.120259,ee
2,2,0.1146,0.113261,0.115021,0.106171,0.114718,0.107219,0.112581,0.116178,0.113384,0.114505,0.110629,0.108753,en
3,3,0.112928,0.112421,0.113437,0.106153,0.112946,0.106617,0.111049,0.108561,0.111604,0.112505,0.109373,0.107924,fi
4,4,0.081438,0.082278,0.080084,0.096636,0.081025,0.093344,0.084502,0.088572,0.083224,0.082113,0.088208,0.090977,ge
5,5,0.070061,0.070096,0.07022,0.068868,0.070128,0.069278,0.069901,0.069063,0.070008,0.069992,0.069595,0.069419,gr
6,6,0.082905,0.083454,0.082462,0.089013,0.082881,0.088611,0.084593,0.086123,0.084048,0.083256,0.086099,0.087433,he
7,7,0.096206,0.096684,0.095921,0.10081,0.09624,0.100826,0.097669,0.099148,0.097275,0.096535,0.098798,0.099861,it
8,8,0.075594,0.075269,0.075907,0.071689,0.075655,0.072116,0.074623,0.073814,0.074958,0.075408,0.073652,0.072831,no
9,9,0.085949,0.085879,0.086216,0.08339,0.086022,0.083867,0.085396,0.083959,0.08561,0.08578,0.084744,0.084279,ru


In [77]:
float(df_predictions[df_predictions['lang_name'] == 'ee']['du'])

0.12329865756409215

In [78]:
lang_similarities = {}
for key1 in language_lookup.keys():
    for key2 in language_lookup.keys():
        #print((key1, key2))
        val1 = float(df_predictions[df_predictions['lang_name'] == key1][key2])
        val2 = float(df_predictions[df_predictions['lang_name'] == key2][key1])
        lang_similarities[f'{(key1, key2)}'] = ((val1 + val2) /2)


In [79]:
lang_similarities

{"('du', 'du')": 0.051728463762166096,
 "('du', 'ee')": 0.08744177160499488,
 "('du', 'en')": 0.08195702408330519,
 "('du', 'fi')": 0.08243784346619092,
 "('du', 'ge')": 0.06518940974880452,
 "('du', 'gr')": 0.0608914265826926,
 "('du', 'he')": 0.06601896465384695,
 "('du', 'it')": 0.07356899666173465,
 "('du', 'no')": 0.06337460364414688,
 "('du', 'ru')": 0.06872860378414086,
 "('du', 'sp')": 0.058424903795437896,
 "('du', 'tr')": 0.044175416031702996,
 "('ee', 'du')": 0.08744177160499488,
 "('ee', 'ee')": 0.123247342436857,
 "('ee', 'en')": 0.1158629484154857,
 "('ee', 'fi')": 0.11806838384813537,
 "('ee', 'ge')": 0.10061509166705229,
 "('ee', 'gr')": 0.0967127485048275,
 "('ee', 'he')": 0.10147763770697615,
 "('ee', 'it')": 0.10941955918385163,
 "('ee', 'no')": 0.09888468461098196,
 "('ee', 'ru')": 0.10438180299677446,
 "('ee', 'sp')": 0.09402184232414491,
 "('ee', 'tr')": 0.07955256734669733,
 "('en', 'du')": 0.08195702408330519,
 "('en', 'ee')": 0.1158629484154857,
 "('en', 'en')"