# Logistic regression

## Data Load

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import json
#import fastparquet

In [3]:
from functions import get_cv_score, custom_train_test_split

In [4]:
data_extended_path = r"../data/data_extended_v1_1/data_v1_1.parquet.gzip"
df = pd.read_csv(data_extended_path)
#df = pd.read_parquet(data_extended_path, engine='fastparquet')
#pd.read_parquet(r"../data/data_extended_v1_1/data_v1_1.parquet.gzip")

In [5]:
df

Unnamed: 0,subid,firstrun.gopast,firstfix.dur,dur,total_dur,lang,FF_norm,FP_norm,TF_norm,lang_code
0,DU_04,0.0,278.0,464.0,5486.0,du,0.000000,0.050674,0.084579,0
1,DU_04,944.0,164.0,480.0,5486.0,du,0.172074,0.029894,0.087495,0
2,DU_04,155.0,155.0,155.0,5486.0,du,0.028254,0.028254,0.028254,0
3,DU_04,323.0,323.0,323.0,5486.0,du,0.058877,0.058877,0.058877,0
4,DU_04,462.0,265.0,462.0,5486.0,du,0.084214,0.048305,0.084214,0
...,...,...,...,...,...,...,...,...,...,...
1732238,tr_52,0.0,560.0,1598.0,26247.0,tr,0.000000,0.021336,0.060883,11
1732239,tr_52,0.0,432.0,1810.0,26247.0,tr,0.000000,0.016459,0.068960,11
1732240,tr_52,0.0,766.0,2109.0,26247.0,tr,0.000000,0.029184,0.080352,11
1732241,tr_52,0.0,652.0,1459.0,26247.0,tr,0.000000,0.024841,0.055587,11


## LogReg NLIR

In [6]:
logreg = LogisticRegression(C=1e5, multi_class='multinomial', solver='lbfgs', random_state=0)

In [7]:
train_cols = ['FF_norm', 'FP_norm', 'TF_norm'] # more cols for additional task of training on more columns.


In [None]:
cross_val_scores = get_cv_score(df, train_cols, "LogisticRegression", logreg)
print(np.mean(cross_val_scores))

In [9]:
X_train, X_test, y_train, y_test, test_lang = custom_train_test_split(df, train_cols)

In [10]:
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
test_acc_score = accuracy_score(y_test, y_pred)

#print(f'Test set accuracy: {test_acc_score}; overall cross validation accuracy" {np.mean(cross_val_scores)}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# get other metrics as well, maybe store and analyze output?
# write predictions
# out path
# np.savetxt(r"../data/outputs/NLIR_predictions_logreg.csv", y_pred, delimiter=",")

### LogReg Similarity

In [12]:
X_test['lang'] = test_lang # add language column
languages = list(test_lang.unique())
dict_lang = {}
for ll in languages:
    df = X_test[X_test['lang'] == ll]
    dict_lang[ll] = df

In [13]:
dict_proba ={}
len_dfs = []
for key in dict_lang.keys():
    lang_df = dict_lang[key][train_cols]
    test_proba = logreg.predict_proba(lang_df)
    len_df = len(lang_df)
    test_proba = test_proba.sum(axis=0)   # .mean(axis=0) # do sum instead of mean 
    dict_proba[key] = test_proba
    len_dfs.append(len_df)
    # record length of language df as well 

In [14]:
df_predictions = pd.DataFrame(dict_proba).reset_index().rename(columns = {'index': 'lang_code'})
# cols are lang on which predicted and rows are outputs

In [17]:
# add col with lenghts
df_predictions['length'] = len_dfs

In [18]:
with open("../data/outputs/language_lookup.json") as json_file:
    language_lookup = json.load(json_file)

In [19]:
reversed_language_lookup  = {v: k for k, v in language_lookup.items()}
df_predictions["lang_name"] = df_predictions['lang_code'].replace(reversed_language_lookup).values

In [20]:
df_predictions

Unnamed: 0,lang_code,du,ee,fi,ge,gr,he,it,en,no,ru,sp,tr,length,lang_name
0,0,3771.367108,3739.80104,3850.66991,3030.977013,3481.866499,2056.920464,3013.337429,2817.557139,2410.795295,2526.231676,2767.687611,1343.400583,72907,du
1,1,8989.335227,8935.185832,9170.568013,7366.967849,8302.397211,5002.799274,7227.170939,6768.504938,5773.075472,6026.894041,6668.909067,3252.883743,72498,ee
2,2,8355.14348,8211.17941,8526.025454,6575.377284,7722.71379,4488.615969,6660.72841,6637.803589,5343.465744,5615.875586,6094.135553,2941.655909,74126,en
3,3,8233.245956,8150.284927,8408.634012,6574.269866,7603.410963,4463.41198,6570.128403,6202.619686,5259.566573,5517.798759,6024.914177,2919.227579,61932,fi
4,4,5937.430662,5964.965591,5936.308602,5984.874759,5454.531815,3907.762721,4999.501034,5060.553468,3922.104448,4027.256408,4859.039412,2460.843642,67319,ge
5,5,5107.934007,5081.842637,5205.095254,4265.140854,4720.914435,2900.244154,4135.649004,3945.920115,3299.290538,3432.77769,3833.690129,1877.721253,41864,gr
6,6,6044.320855,6050.251764,6112.560069,5512.753139,5579.497263,3709.626782,5004.882441,4920.625892,3960.946932,4083.281005,4742.857879,2364.962927,59164,he
7,7,7014.094569,7009.41438,7110.215381,6243.391326,6478.800294,4220.98168,5778.468842,5664.803464,4584.269465,4734.557804,5442.367667,2701.149569,57135,it
8,8,5511.325816,5456.851083,5626.712234,4439.817486,5092.993446,3019.061147,4415.024075,4217.365697,3532.527339,3698.401494,4057.17146,1970.00837,47127,no
9,9,6266.266446,6226.028336,6390.835508,5164.53644,5790.927027,3510.995664,5052.38448,4796.988495,4034.551359,4207.089624,4668.224718,2279.653561,49045,ru


In [24]:
lang_values = []
lang_pairs = []
#lang_similarities = {}
for key1 in language_lookup.keys():
    for key2 in language_lookup.keys():
        #print((key1, key2))
        val1 = float(df_predictions[df_predictions['lang_name'] == key1][key2])
        val2 = float(df_predictions[df_predictions['lang_name'] == key2][key1])
        len1 = float(df_predictions[df_predictions['lang_name'] == key1]['length'])
        len2 = float(df_predictions[df_predictions['lang_name'] == key2]['length'])
        
        #lang_similarities[f'{key1}_{key2}'] = ((val1 + val2) /(len1+len2)) #  divide by corpus length bc summed above
        lang_values.append((val1 + val2) / (len1+len2))
        lang_pairs.append(f'{key1}_{key2}')

In [31]:
df_lang_similarities = pd.DataFrame({'pair': lang_pairs, 'logreg_val': lang_values})

In [32]:
#lang_similarities # invert as 1- proba. 
df_lang_similarities

Unnamed: 0,pair,logreg_val
0,du_du,0.051728
1,du_ee,0.087543
2,du_en,0.075988
3,du_fi,0.089617
4,du_ge,0.063957
...,...,...
139,tr_it,0.059549
140,tr_no,0.051325
141,tr_ru,0.054937
142,tr_sp,0.049100


In [33]:
# with open("../data/outputs/logreg_similarity.json", "w") as outfile:
#     json.dump(lang_similarities, outfile)

df_lang_similarities.to_csv("../data/outputs/logreg_similarity.csv", index = False)