In [1]:
import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn import linear_model, svm, naive_bayes, neighbors, ensemble
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

## Call data, choose features and target

In [2]:
df = pd.read_csv('../../data/processed/fetzer_processed_data.csv')

In [3]:
X = pd.get_dummies(df.loc[:, 'Region':'NONEU_2001Migrantshare'])
y = df.loc[:,'Leave?']
# X, X_test, y, y_test = model_selection.train_test_split(X, y, test_size=0.2)

In [4]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
# X_test = scaler.transform(X_test)

In [5]:
y.value_counts()

1    262
0    115
Name: Leave?, dtype: int64

## Make a Gaussian naive bayes cross validation

In [6]:
# GUASSIAN NAIVE BAYES ISNT SO GOOD
kf = KFold(n_splits=3, shuffle=True, random_state = 11)
accuracies = [] 
precisions = [] 
recalls = [] 
roc_aucs = []

for train_ind, val_ind in kf.split(X,y):
    
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind] 
    
    nb_model = naive_bayes.GaussianNB()
    nb_model.fit(X_train, y_train)
    y_pred = (nb_model.predict_proba(X_val)[:, 1] >= 0.55)
    
    accuracies.append(accuracy_score(y_val, y_pred))
    precisions.append(precision_score(y_val, y_pred))
    recalls.append(recall_score(y_val, y_pred))
    roc_aucs.append(roc_auc_score(y_val, y_pred))


print(f'Accuracy: {np.mean(accuracies):.3f} +- {np.std(accuracies):.3f}')
print(f'Precision: {np.mean(precisions):.3f} +- {np.std(precisions):.3f}')
print(f'Recall: {np.mean(recalls):.3f} +- {np.std(recalls):.3f}')
print(f'ROC AUC: {np.mean(roc_aucs):.3f} +- {np.std(roc_aucs):.3f}')

Accuracy: 0.756 +- 0.138
Precision: 0.866 +- 0.027
Recall: 0.747 +- 0.259
ROC AUC: 0.742 +- 0.082


## Bernoulli?

In [7]:
# BERNOULLI IS MUCH BETTER THAN GAUSSIAN
kf = KFold(n_splits=3, shuffle=True, random_state = 11)
accuracies = [] 
precisions = [] 
recalls = [] 
roc_aucs = []

for train_ind, val_ind in kf.split(X,y):
    
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind] 
    
    nb_model = naive_bayes.BernoulliNB()
    nb_model.fit(X_train, y_train)
    y_pred = (nb_model.predict_proba(X_val)[:, 1] >= 0.77)
    
    accuracies.append(accuracy_score(y_val, y_pred))
    precisions.append(precision_score(y_val, y_pred))
    recalls.append(recall_score(y_val, y_pred))
    roc_aucs.append(roc_auc_score(y_val, y_pred))


print(f'Accuracy: {np.mean(accuracies):.3f} +- {np.std(accuracies):.3f}')
print(f'Precision: {np.mean(precisions):.3f} +- {np.std(precisions):.3f}')
print(f'Recall: {np.mean(recalls):.3f} +- {np.std(recalls):.3f}')
print(f'ROC AUC: {np.mean(roc_aucs):.3f} +- {np.std(roc_aucs):.3f}')

Accuracy: 0.812 +- 0.014
Precision: 0.921 +- 0.039
Recall: 0.798 +- 0.010
ROC AUC: 0.822 +- 0.025


## Changing threshold

In [8]:
# THRESHOLD FOR BERNOULLI SEEMS TO PEAK AT 0.77
scores = {}

for i in range(0,100):
    kf = KFold(n_splits=3, shuffle=True, random_state = 11)
    roc_aucs = []
    threshold = np.linspace(0,1,100)[i]
    
    for train_ind, val_ind in kf.split(X,y):
        
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind] 
        
        nb_model = naive_bayes.BernoulliNB()
        nb_model.fit(X_train, y_train)
        y_pred = (nb_model.predict_proba(X_val)[:, 1] >= threshold)

        roc_aucs.append(roc_auc_score(y_val, y_pred))

    scores[i] = np.mean(roc_aucs)
    
scores

{0: 0.5,
 1: 0.5775487256371815,
 2: 0.6181690404797601,
 3: 0.6181690404797601,
 4: 0.6223357071464268,
 5: 0.6366612649557574,
 6: 0.6449058540014408,
 7: 0.6593108293488186,
 8: 0.6671006844212825,
 9: 0.6671006844212825,
 10: 0.6787588259387589,
 11: 0.6845059523755405,
 12: 0.6845059523755405,
 13: 0.6984486588573512,
 14: 0.7025595165576702,
 15: 0.7098058933692643,
 16: 0.7098058933692643,
 17: 0.7098058933692643,
 18: 0.711468297461336,
 19: 0.715091485867133,
 20: 0.7112899394753806,
 21: 0.723514432898772,
 22: 0.7292615593355536,
 23: 0.7292615593355536,
 24: 0.7344246028138145,
 25: 0.7426201828588436,
 26: 0.7442825869509151,
 27: 0.7421992536175818,
 28: 0.7583224420233788,
 29: 0.7712753405741036,
 30: 0.7712753405741036,
 31: 0.7737237941823509,
 32: 0.7773469825881479,
 33: 0.7773469825881479,
 34: 0.7788868376606116,
 35: 0.7882571525031906,
 36: 0.8054985318135351,
 37: 0.8096651984802019,
 38: 0.8062287723633634,
 39: 0.8098519607691607,
 40: 0.8098519607691607,
 41