In [1]:
import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn import linear_model, svm, naive_bayes, neighbors, ensemble
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import imblearn.over_sampling
from collections import Counter

In [2]:
df = pd.read_csv('../../data/processed/fetzer_processed_data.csv')

In [3]:
X = pd.get_dummies(df.loc[:, 'Region':'NONEU_2001Migrantshare'])
y = df.loc[:,'Leave?']
X, X_test, y, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=27)

In [4]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

In [5]:
# # PICKLING SCALER
# import pickle
# pickle.dump(scaler, open("my_pickled_scaler.p", "wb"))

## Oversample

In [6]:
n_pos = np.sum(y == 1)
n_neg = np.sum(y == 0)
n_pos, n_neg

(210, 91)

In [7]:
# # OVERSAMPLE NEGATIVES BY A FACTOR OF 2
# ratio = {1 : n_pos, 0 : n_neg * 2}
# ROS = imblearn.over_sampling.RandomOverSampler(sampling_strategy = ratio, random_state=42) 

# X, y = ROS.fit_sample(X, y)

## Make models

In [8]:
lr_model = linear_model.LogisticRegression(solver="lbfgs")
nb_model = naive_bayes.BernoulliNB()
svc_model = svm.SVC(probability=True, gamma="scale")
rf_model = ensemble.RandomForestClassifier(n_estimators=28, random_state=3)

models = ["lr_model", "nb_model", "svc_model", "rf_model"]
model_vars = [eval(n) for n in models]
model_list = list(zip(models, model_vars))

## Try max voting ensemble

In [9]:
voting_classifer = VotingClassifier(estimators=model_list, voting='hard', n_jobs=-1)
voting_classifer.fit(X, y)
y_pred = voting_classifer.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred):.3f}')
print(f'Precision: {precision_score(y_test, y_pred):.3f}')
print(f'Recall: {recall_score(y_test, y_pred):.3f}')
print(f'ROC AUC: {roc_auc_score(y_test, y_pred):.3f}')

Accuracy: 0.882
Precision: 0.906
Recall: 0.923
ROC AUC: 0.857


## Try average voting ensemble

In [10]:
voting_classifer = VotingClassifier(estimators=model_list, voting='soft', n_jobs=-1)
voting_classifer.fit(X, y)
y_pred = voting_classifer.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred):.3f}')
print(f'Precision: {precision_score(y_test, y_pred):.3f}')
print(f'Recall: {recall_score(y_test, y_pred):.3f}')
print(f'ROC AUC: {roc_auc_score(y_test, y_pred):.3f}')

Accuracy: 0.895
Precision: 0.893
Recall: 0.962
ROC AUC: 0.856


## Try weighted voting ensemble

In [11]:
# THIS I A PRETTY GOOD WEIGHT
weights = [1.7, 0.7, 0.5, 1]

In [12]:
voting_classifer = VotingClassifier(estimators=model_list, voting='soft', weights = weights)
voting_classifer.fit(X, y)
y_pred = voting_classifer.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred):.3f}')
print(f'Precision: {precision_score(y_test, y_pred):.3f}')
print(f'Recall: {recall_score(y_test, y_pred):.3f}')
print(f'ROC AUC: {roc_auc_score(y_test, y_pred):.3f}')

Accuracy: 0.895
Precision: 0.879
Recall: 0.981
ROC AUC: 0.845


In [13]:
# # PICKLING FINAL MODEL
# model = voting_classifer
# import pickle
# pickle.dump(model, open("my_pickled_model.p", "wb"))

## Tune the weights

In [14]:
scores = {}

for i in range(1,3):
    for j in range(0,2):
        for k in range(0,2):
            for l in range(1,3):
                weights = [i, j, k, l]
                voting_classifer = VotingClassifier(estimators=model_list, voting='soft', weights = weights)
                voting_classifer.fit(X, y)
                y_pred = voting_classifer.predict(X_test)
                
                scores[(i, j, k, l)] = roc_auc_score(y_test, y_pred)
                
                
sort_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)

for i in sort_scores:
    print(i[0], i[1])             
                

(2, 0, 0, 1) 0.8862179487179488
(1, 0, 0, 1) 0.8653846153846154
(2, 0, 0, 2) 0.8653846153846154
(1, 0, 0, 2) 0.8557692307692308
(1, 1, 1, 1) 0.8557692307692308
(2, 1, 0, 1) 0.8557692307692308
(2, 1, 1, 1) 0.844551282051282
(1, 0, 1, 1) 0.8349358974358974
(1, 0, 1, 2) 0.8349358974358974
(1, 1, 0, 1) 0.8349358974358974
(1, 1, 1, 2) 0.8349358974358974
(2, 0, 1, 2) 0.8349358974358974
(2, 1, 0, 2) 0.8349358974358974
(2, 1, 1, 2) 0.8349358974358974
(1, 1, 0, 2) 0.8253205128205128
(2, 0, 1, 1) 0.8237179487179488


In [15]:
sort_scores[0]

((2, 0, 0, 1), 0.8862179487179488)