In [1]:
import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn import linear_model, svm, naive_bayes, neighbors, ensemble
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

## Call data, choose features and target

In [2]:
df = pd.read_csv('../../data/processed/fetzer_processed_data.csv')

In [3]:
X = pd.get_dummies(df.loc[:, 'Region':'NONEU_2001Migrantshare'])
y = df.loc[:,'Leave?']
# X, X_test, y, y_test = model_selection.train_test_split(X, y, test_size=0.2)

In [4]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
# X_test = scaler.transform(X_test)

In [5]:
y.value_counts()

1    262
0    115
Name: Leave?, dtype: int64

## Make a logistic regression cross validation

In [6]:
kf = KFold(n_splits=3, shuffle=True, random_state = 11)
accuracies = [] 
precisions = [] 
recalls = [] 
roc_aucs = []

for train_ind, val_ind in kf.split(X,y):
    
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind] 
    
    lr_model = linear_model.LogisticRegression(solver="lbfgs")
    lr_model.fit(X_train, y_train)
    y_pred = (lr_model.predict_proba(X_val)[:, 1] >= 0.55)
    
    accuracies.append(accuracy_score(y_val, y_pred))
    precisions.append(precision_score(y_val, y_pred))
    recalls.append(recall_score(y_val, y_pred))
    roc_aucs.append(roc_auc_score(y_val, y_pred))


print(f'Accuracy: {np.mean(accuracies):.3f} +- {np.std(accuracies):.3f}')
print(f'Precision: {np.mean(precisions):.3f} +- {np.std(precisions):.3f}')
print(f'Recall: {np.mean(recalls):.3f} +- {np.std(recalls):.3f}')
print(f'ROC AUC: {np.mean(roc_aucs):.3f} +- {np.std(roc_aucs):.3f}')

Accuracy: 0.883 +- 0.037
Precision: 0.898 +- 0.052
Recall: 0.937 +- 0.026
ROC AUC: 0.850 +- 0.040


## Tuning C-value

In [7]:
# BEST C-VALUE IS 1
scores = {}
c_values = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

for i in range(0,10):
    kf = KFold(n_splits=3, shuffle=True, random_state = 11)
    roc_aucs = []
    c_value = c_values[i]
    
    for train_ind, val_ind in kf.split(X,y):
        
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind] 
        
        lr_model = linear_model.LogisticRegression(solver="lbfgs", C=c_value)
        lr_model.fit(X_train, y_train)
        y_pred = (lr_model.predict_proba(X_val)[:, 1] >= 0.55)

        roc_aucs.append(roc_auc_score(y_val, y_pred))

    scores[c_value] = np.mean(roc_aucs)
    
scores

{1e-05: 0.5,
 0.0001: 0.5,
 0.001: 0.5329022988505747,
 0.01: 0.7638938793186791,
 0.1: 0.8035665516998929,
 1: 0.8497081867647136,
 10: 0.8420002760142064,
 100: 0.8420002760142064,
 1000: 0.8420002760142064,
 10000: 0.8420002760142064}

## Solver?

In [8]:
# THESE ALL SEEM TO GIVE THE SAME VALUE, DONT REALLY UNDERSTAND WHAT THEY MEAN
scores = {}
solver_choices = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

for i in range(0,5):
    kf = KFold(n_splits=3, shuffle=True, random_state = 11)
    roc_aucs = []
    solver_choice = solver_choices[i]
    
    for train_ind, val_ind in kf.split(X,y):
        
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind] 
        
        lr_model = linear_model.LogisticRegression(solver=solver_choice, C=1)
        lr_model.fit(X_train, y_train)
        y_pred = (lr_model.predict_proba(X_val)[:, 1] >= 0.55)

        roc_aucs.append(roc_auc_score(y_val, y_pred))

    scores[solver_choice] = np.mean(roc_aucs)
    
scores



{'newton-cg': 0.8497081867647136,
 'lbfgs': 0.8497081867647136,
 'liblinear': 0.8497081867647136,
 'sag': 0.8497081867647136,
 'saga': 0.8497081867647136}

## Tuning threshold

In [9]:
# BEST THRESHOLD IS 0.55
scores = {}

for i in range(1,100):
    kf = KFold(n_splits=3, shuffle=True, random_state = 11)
    roc_aucs = []
    threshold = np.linspace(0,1,100)[i]
    
    for train_ind, val_ind in kf.split(X,y):
        
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind] 
        
        lr_model = linear_model.LogisticRegression(solver="lbfgs")
        lr_model.fit(X_train, y_train)
        y_pred = (lr_model.predict_proba(X_val)[:, 1] >= threshold)

        roc_aucs.append(roc_auc_score(y_val, y_pred))

    scores[i] = np.mean(roc_aucs)
    
scores

{1: 0.5807283858070965,
 2: 0.6509995002498751,
 3: 0.6968140929535233,
 4: 0.7223075962018991,
 5: 0.7374250374812593,
 6: 0.741591704147926,
 7: 0.7452148925537231,
 8: 0.7452148925537231,
 9: 0.7545852073963019,
 10: 0.7545852073963019,
 11: 0.7618315842078961,
 12: 0.7696214392803599,
 13: 0.7737881059470265,
 14: 0.7737881059470265,
 15: 0.7754505100390982,
 16: 0.781823059058706,
 17: 0.781823059058706,
 18: 0.7859897257253726,
 19: 0.7859897257253726,
 20: 0.7896535188288208,
 21: 0.7896535188288208,
 22: 0.7974433739012846,
 23: 0.7991057779933562,
 24: 0.7991057779933562,
 25: 0.7991057779933562,
 26: 0.8032724446600229,
 27: 0.8032724446600229,
 28: 0.8011891113266897,
 29: 0.7994708982682704,
 30: 0.7994708982682704,
 31: 0.8030940866740676,
 32: 0.8030940866740676,
 33: 0.8108839417465313,
 34: 0.8089231574328059,
 35: 0.8089231574328059,
 36: 0.8130898240994724,
 37: 0.8111290397857469,
 38: 0.8090457064524136,
 39: 0.8073274933939943,
 40: 0.811494160060661,
 41: 0.811494

## Look for best features

In [10]:
# MOST IMPORTANT FEATURES PREDICTING REMAIN ARE PAY, MIGRANT SHARE, SCOTLAND
importances = dict(set(zip(pd.get_dummies(df.loc[:, 'Region':'NONEU_2001Migrantshare']).columns, 
                lr_model.coef_[0])))

importances

{'Region_East Midlands': 0.33230918081648747,
 'median_hourly_pay_growth': -0.46876797674034515,
 'Region_East': 0.7479438904301375,
 'Region_North East': -0.10007904926518246,
 'Region_Yorkshire and The Humber': -0.21855727855433424,
 'Region_North West': -0.24748785373350743,
 'ResidentAge45to59share': 0.2445794971766845,
 'Region_South West': -0.09934637927592774,
 'NONEU_2001Migrantshare': -0.9772018169438317,
 'Region_West Midlands': 0.3956670807363221,
 'ResidentAge30to44share': 0.584723363924468,
 'umemployment_rate_aps': 0.14957847306176147,
 'Region_Wales': -0.3043902405261387,
 'ResidentAge60plusshare': 0.40256323348628387,
 'EU_2001Migrantshare': -1.4555419195927894,
 'median_hourly_pay2005': -1.3007441556666792,
 'Region_Scotland': -1.7945510989248623,
 'Region_London': 0.5021408356625209,
 'Region_South East': 0.3844081441570912}

In [11]:
lr_model.coef_[0]

array([ 0.58472336,  0.2445795 ,  0.40256323, -1.30074416, -0.46876798,
        0.14957847, -1.45554192, -0.97720182,  0.74794389,  0.33230918,
        0.50214084, -0.10007905, -0.24748785, -1.7945511 ,  0.38440814,
       -0.09934638, -0.30439024,  0.39566708, -0.21855728])