In [1]:
import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn import linear_model, svm, naive_bayes, neighbors, ensemble
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

## Call data, choose features and target

In [2]:
df = pd.read_csv('../../data/processed/fetzer_processed_data.csv')

In [3]:
X = pd.get_dummies(df.loc[:, 'Region':'NONEU_2001Migrantshare'])
y = df.loc[:,'Leave?']
# X, X_test, y, y_test = model_selection.train_test_split(X, y, test_size=0.2)

In [4]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
# X_test = scaler.transform(X_test)

In [5]:
y.value_counts()

1    262
0    115
Name: Leave?, dtype: int64

## Make a random forest cross validation

In [6]:
kf = KFold(n_splits=3, shuffle=True, random_state = 13)
accuracies = [] 
precisions = [] 
recalls = [] 
roc_aucs = []

for train_ind, val_ind in kf.split(X,y):
    
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind] 
    
    svc_model = svm.SVC(probability=True, gamma="scale")
    svc_model.fit(X_train, y_train)
    y_pred = (svc_model.predict_proba(X_val)[:, 1] >= 0.55)
    
    accuracies.append(accuracy_score(y_val, y_pred))
    precisions.append(precision_score(y_val, y_pred))
    recalls.append(recall_score(y_val, y_pred))
    roc_aucs.append(roc_auc_score(y_val, y_pred))


print(f'Accuracy: {np.mean(accuracies):.3f} +- {np.std(accuracies):.3f}')
print(f'Precision: {np.mean(precisions):.3f} +- {np.std(precisions):.3f}')
print(f'Recall: {np.mean(recalls):.3f} +- {np.std(recalls):.3f}')
print(f'ROC AUC: {np.mean(roc_aucs):.3f} +- {np.std(roc_aucs):.3f}')

Accuracy: 0.867 +- 0.013
Precision: 0.866 +- 0.019
Recall: 0.958 +- 0.015
ROC AUC: 0.810 +- 0.022


## Tuning C-value

In [7]:
# BEST C-VALUE IS 1
scores = {}
c_values = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

for i in range(0,10):
    kf = KFold(n_splits=3, shuffle=True, random_state = 11)
    roc_aucs = []
    c_value = c_values[i]
    
    for train_ind, val_ind in kf.split(X,y):
        
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind] 
        
        svc_model = svm.SVC(probability=True, gamma="scale", C=c_value)
        svc_model.fit(X_train, y_train)
        y_pred = (svc_model.predict_proba(X_val)[:, 1] >= 0.55)

        roc_aucs.append(roc_auc_score(y_val, y_pred))

    scores[c_value] = np.mean(roc_aucs)
    
scores

{1e-05: 0.6802255936919351,
 0.0001: 0.6825116739689586,
 0.001: 0.7788007622174359,
 0.01: 0.7860471390290301,
 0.1: 0.7902138056956968,
 1: 0.8076596783476061,
 10: 0.7881743842299591,
 100: 0.7765888602848364,
 1000: 0.7629902358553894,
 10000: 0.7737778567508241}

## Auto gamma?

In [8]:
# AUTO GAMMA IS NOT AS GOOD
kf = KFold(n_splits=3, shuffle=True, random_state = 13)
accuracies = [] 
precisions = [] 
recalls = [] 
roc_aucs = []

for train_ind, val_ind in kf.split(X,y):
    
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind] 
    
    svc_model = svm.SVC(probability=True, gamma="auto")
    svc_model.fit(X_train, y_train)
    y_pred = (svc_model.predict_proba(X_val)[:, 1] >= 0.55)
    
    accuracies.append(accuracy_score(y_val, y_pred))
    precisions.append(precision_score(y_val, y_pred))
    recalls.append(recall_score(y_val, y_pred))
    roc_aucs.append(roc_auc_score(y_val, y_pred))


print(f'Accuracy: {np.mean(accuracies):.3f} +- {np.std(accuracies):.3f}')
print(f'Precision: {np.mean(precisions):.3f} +- {np.std(precisions):.3f}')
print(f'Recall: {np.mean(recalls):.3f} +- {np.std(recalls):.3f}')
print(f'ROC AUC: {np.mean(roc_aucs):.3f} +- {np.std(roc_aucs):.3f}')

Accuracy: 0.862 +- 0.016
Precision: 0.863 +- 0.021
Recall: 0.954 +- 0.017
ROC AUC: 0.803 +- 0.026


## False probability?

In [9]:
## TRUE PROBABILITY WITH THRESHOLD OF 0.55 IS BETTER
kf = KFold(n_splits=3, shuffle=True, random_state = 13)
accuracies = [] 
precisions = [] 
recalls = [] 
roc_aucs = []

for train_ind, val_ind in kf.split(X,y):
    
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind] 
    
    svc_model = svm.SVC(gamma="scale")
    svc_model.fit(X_train, y_train)
    y_pred = svc_model.predict(X_val)
    
    accuracies.append(accuracy_score(y_val, y_pred))
    precisions.append(precision_score(y_val, y_pred))
    recalls.append(recall_score(y_val, y_pred))
    roc_aucs.append(roc_auc_score(y_val, y_pred))


print(f'Accuracy: {np.mean(accuracies):.3f} +- {np.std(accuracies):.3f}')
print(f'Precision: {np.mean(precisions):.3f} +- {np.std(precisions):.3f}')
print(f'Recall: {np.mean(recalls):.3f} +- {np.std(recalls):.3f}')
print(f'ROC AUC: {np.mean(roc_aucs):.3f} +- {np.std(roc_aucs):.3f}')

Accuracy: 0.870 +- 0.019
Precision: 0.859 +- 0.024
Recall: 0.973 +- 0.011
ROC AUC: 0.804 +- 0.033


## Tuning threshold

In [10]:
# BEST THRESHOLD IS 0.55
scores = {}

for i in range(0,100):
    kf = KFold(n_splits=3, shuffle=True, random_state = 11)
    roc_aucs = []
    threshold = np.linspace(0,1,100)[i]
    
    for train_ind, val_ind in kf.split(X,y):
        
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind] 
        
        svc_model = svm.SVC(probability=True, gamma="scale")
        svc_model.fit(X_train, y_train)
        y_pred = (svc_model.predict_proba(X_val)[:, 1] >= threshold)

        roc_aucs.append(roc_auc_score(y_val, y_pred))

    scores[i] = np.mean(roc_aucs)
    
scores

{0: 0.5,
 1: 0.5077898550724638,
 2: 0.5608195902048975,
 3: 0.6096826586706647,
 4: 0.6075087456271865,
 5: 0.6486319340329836,
 6: 0.6704928785607196,
 7: 0.6934813843078461,
 8: 0.694487131434283,
 9: 0.7226792853573213,
 10: 0.7117285107446275,
 11: 0.7381618469115958,
 12: 0.7381618469115958,
 13: 0.7439089733483772,
 14: 0.7475321617541745,
 15: 0.7439089733483772,
 16: 0.7402299759762326,
 17: 0.7419481890346519,
 18: 0.7459771024130143,
 19: 0.7668104357463476,
 20: 0.7475169574854781,
 21: 0.7683502908188115,
 22: 0.7683502908188115,
 23: 0.7666320777603922,
 24: 0.7723792041971738,
 25: 0.780712537530507,
 26: 0.7848792041971736,
 27: 0.7885023926029708,
 28: 0.79220679040407,
 29: 0.780712537530507,
 30: 0.7984161857064191,
 31: 0.7879995190397523,
 32: 0.8020799788098674,
 33: 0.8020799788098674,
 34: 0.8020799788098674,
 35: 0.8020799788098674,
 36: 0.8020799788098674,
 37: 0.7981584101824164,
 38: 0.7981584101824164,
 39: 0.8001191944961419,
 40: 0.8020241698435197,
 41: 