# Logistic regression

## Data Load

In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [59]:
def custom_train_test_split(data, columns):
    """
    Function for train test split.
    Arguments:
        data: pandas dataset (with train columns, labels ('lang_code') and groups ('subid'))
        columns: columns to train on.
    Returns: train and test sets with labels.
    """
    train_subset = data[columns]
    train_label_subset = data['lang_code']
    gss = GroupShuffleSplit(n_splits=1, test_size = 0.4, random_state=42)
    split_indexes = list(gss.split(train_subset, train_label_subset, data['subid']))[0]
    train_idx = list(split_indexes[0])
    test_idx = list(split_indexes[1])

    X_train = train_subset.iloc[train_idx]
    y_train = train_label_subset.iloc[train_idx]
    X_test = train_subset.iloc[test_idx]
    y_test = train_label_subset.iloc[test_idx]

    test_lang = data['lang'].iloc[test_idx]
    return X_train, X_test, y_train, y_test, test_lang

In [60]:
def get_cv_score(data, columns, model_type, model, cv = 5):
    """
    Function to get cv score. 
    Arguments:
        data: pandas dataset (with train columns, labels ('lang_code') and groups ('subid'))
        columns: columns to train on.
        model_type: LogisticRegression or LSTM.
        model: defined model.
        cv: how many cv splits, default = 5.
    Returns: train and test sets with labels.
    """
    train_subset = data[columns]
    train_label_subset = data['lang_code']

    gss = GroupShuffleSplit(n_splits = cv, test_size = 0.4, random_state=42)
    cv_scores = []

    if model_type == 'LogisticRegression':
        for train_idx, test_idx in tqdm(gss.split(train_subset, train_label_subset, groups = data['subid'])): # wtf what are labels
            model.fit(train_subset.iloc[train_idx], train_label_subset.iloc[train_idx])
            score = model.score(train_subset.iloc[test_idx], train_label_subset.iloc[test_idx])
            cv_scores.append(score)
        print(np.mean(cv_scores))
        return cv_scores

    elif model_type == 'LSTM':
        print('TBD. Must be implemented if logreg framwork does not work.')
        return 

    else:
        print("Model type must be 'LogisticRegression' or 'LSTM'.")

In [61]:
data_extended_path = r"../data/data_extended_v1_1/data_v1_1.csv"
df = pd.read_csv(data_extended_path)

## LogReg NLIR

In [62]:
logreg = LogisticRegression(C=1e5, multi_class='multinomial', solver='lbfgs', random_state=0)

In [23]:
train_cols = ['FF_norm', 'FP_norm', 'TF_norm'] # more cols for additional task of training on more columns.
cross_val_scores = get_cv_score(df, train_cols, "LogisticRegression", logreg)
print(np.mean(cross_val_scores))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.10766933200140638


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[0.12052354462300001,
 0.09862741399573424,
 0.12235904605331728,
 0.08941046432303136,
 0.10742619101194907]

In [63]:
X_train, X_test, y_train, y_test, test_lang = custom_train_test_split(df, train_cols)

In [65]:
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
test_acc_score = accuracy_score(y_test, y_pred)

#print(f'Test set accuracy: {test_acc_score}; overall cross validation accuracy" {np.mean(cross_val_scores)}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [53]:
# get other metrics as well, maybe store and analyze output?
# write predictions
# out path
# np.savetxt(r"../data/outputs/NLIR_predictions_logreg.csv", y_pred, delimiter=",")

### LogReg Similarity

In [66]:
X_test['lang'] = test_lang # add language column
languages = list(test_lang.unique())
dict_lang = {}
for ll in languages:
    df = X_test[X_test['lang'] == ll]
    dict_lang[ll] = df

In [None]:
# double for loop: for each key/df in dict -> predict, record pred as dict, put to df? and get mean s meatric

In [54]:
dict_lang['du']#[:-1]

Unnamed: 0,FF_norm,FP_norm,TF_norm,lang
0,0.000000,0.050674,0.084579,du
1,0.172074,0.029894,0.087495,du
2,0.028254,0.028254,0.028254,du
3,0.058877,0.058877,0.058877,du
4,0.084214,0.048305,0.084214,du
...,...,...,...,...
1548361,0.000000,0.126645,0.152259,du
1548362,0.058281,0.058281,0.058281,du
1548363,0.136319,0.201515,0.308528,du
1548364,0.329602,0.062232,0.157392,du


In [67]:
for key in dict_lang.keys():
    test_proba = logreg.predict_proba(dict_lang[key][train_cols])

dict_keys(['du', 'ee', 'fi', 'ge', 'gr', 'he', 'it', 'en', 'no', 'ru', 'sp', 'tr'])

In [57]:
logreg.classes_ # now map from lang code to lang. write external function to map and unmap 

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [56]:
test_proba

array([[0.04921869, 0.11924527, 0.10763136, ..., 0.08447535, 0.06897935,
        0.03995224],
       [0.04515644, 0.11346616, 0.09746004, ..., 0.08097361, 0.07399716,
        0.04168536],
       [0.04814203, 0.11560505, 0.10773757, ..., 0.08345771, 0.06689759,
        0.03899432],
       ...,
       [0.06363794, 0.14859503, 0.13145295, ..., 0.09593921, 0.06338954,
        0.03767961],
       [0.04654687, 0.11844404, 0.0979013 , ..., 0.08257245, 0.07612656,
        0.04244673],
       [0.05620283, 0.13185129, 0.12173659, ..., 0.09025828, 0.0640588 ,
        0.03797565]])

In [22]:
# on same data as training or need different data?
#score = logreg.score()
# X - put spanish
# y - put portuguese and see how high the accuracy? or see how often predicted portuguese?
