In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Definitions
pd.set_option('display.float_format', lambda x: '%.6f' % x)
%matplotlib inline
njobs = 4
randomState = 0

In [2]:
# Load preprocessed data
data = pd.read_csv("cleanData/data.csv")
y = pd.read_csv("cleanData/y.csv", header = None)
y = y[0]

X_train, X_test, y_train, y_test = train_test_split(data, y, test_size = 0.25, random_state = randomState)

In [3]:
# Logistic Regression
lr = LogisticRegression(random_state = randomState, 
                        C = 3,
                        n_jobs = njobs)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print(lr.score(X_test, y_test))

  " = {}.".format(self.n_jobs))


0.897737205011


In [4]:
# SVM
svm = SVC(random_state = randomState)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print(svm.score(X_test, y_test))

0.897348742352


In [5]:
# Random Forests
rf = RandomForestClassifier(random_state = randomState, 
                            n_estimators = 300, 
                            max_depth = 5,
                            n_jobs = njobs)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print(rf.score(X_test, y_test))

0.897542973682


In [6]:
# KNN
knn = KNeighborsClassifier(n_neighbors = 20,
                           n_jobs = njobs)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
print(knn.score(X_test, y_test))

0.897251626687


In [7]:
# XGB
xgb = XGBClassifier(max_depth=4,
                    learning_rate=0.1,
                    n_estimators = 100)

xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print(xgb.score(X_test, y_test))

0.899582402642


In [8]:
# Combine predictions
def computePreds(preds, cutoff) :
    preds[preds > cutoff] = 1
    preds[preds <= cutoff] = 0
    return(preds)

names = []
scores50 = []
scores75 = []
range = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 0.9, 1]
for a in range :
    for b in range : 
        for c in range :
            for d in range :
                for e in range :
                    if ((a + b + c + d + e) == 1) :
                        name = str(a) + "-" + str(b) + "-" + str(c) + "-" + str(d) + "-" + str(e)
                        names.append(name)
                        preds = (a * y_pred_lr) + (b * y_pred_svm) + (c * y_pred_rf) + (d * y_pred_knn) + (e * y_pred_xgb)
                        preds = computePreds(preds, 0.5)
                        score = accuracy_score(y_test, preds)
                        scores50.append(score)
                        preds = computePreds(preds, 0.75)
                        score = accuracy_score(y_test, preds)
                        scores75.append(score)

df = pd.DataFrame()
df.loc[:, "names"] = pd.Series(names)
df.loc[:, "scores50"] = pd.Series(scores50)
df.loc[:, "scores75"] = pd.Series(scores75)
print("Head : ")
display(df.sort_values("scores50", ascending = False).head(10))
display(df.sort_values("scores75", ascending = False).head(10))
print("Tail : ")
display(df.sort_values("scores50", ascending = False).tail(10))
display(df.sort_values("scores75", ascending = False).tail(10))

Head : 


Unnamed: 0,names,scores50,scores75
0,0-0-0-0-1,0.899582,0.899582
228,0.1-0-0-0.1-0.8,0.899582,0.899582
412,0.2-0-0.1-0.1-0.6,0.899582,0.899582
29,0-0-0.4-0-0.6,0.899582,0.899582
547,0.3-0-0.1-0-0.6,0.899582,0.899582
272,0.1-0.1-0-0.2-0.6,0.899582,0.899582
271,0.1-0.1-0-0-0.8,0.899582,0.899582
542,0.3-0-0-0.1-0.6,0.899582,0.899582
418,0.2-0-0.2-0-0.6,0.899582,0.899582
73,0-0.1-0.3-0-0.6,0.899582,0.899582


Unnamed: 0,names,scores50,scores75
0,0-0-0-0-1,0.899582,0.899582
228,0.1-0-0-0.1-0.8,0.899582,0.899582
412,0.2-0-0.1-0.1-0.6,0.899582,0.899582
29,0-0-0.4-0-0.6,0.899582,0.899582
547,0.3-0-0.1-0-0.6,0.899582,0.899582
272,0.1-0.1-0-0.2-0.6,0.899582,0.899582
271,0.1-0.1-0-0-0.8,0.899582,0.899582
542,0.3-0-0-0.1-0.6,0.899582,0.899582
418,0.2-0-0.2-0-0.6,0.899582,0.899582
73,0-0.1-0.3-0-0.6,0.899582,0.899582


Tail : 


Unnamed: 0,names,scores50,scores75
79,0-0.1-0.3-0.6-0,0.897252,0.897252
245,0.1-0-0.2-0.4-0.3,0.897252,0.897252
454,0.2-0.1-0.1-0.6-0,0.897252,0.897252
241,0.1-0-0.1-0.8-0,0.897252,0.897252
72,0-0.1-0.2-0.6-0.1,0.897252,0.897252
240,0.1-0-0.1-0.6-0.2,0.897252,0.897252
234,0.1-0-0-0.9-0,0.897252,0.897252
233,0.1-0-0-0.8-0.1,0.897252,0.897252
232,0.1-0-0-0.6-0.3,0.897252,0.897252
313,0.1-0.2-0-0.6-0.1,0.897252,0.897252


Unnamed: 0,names,scores50,scores75
79,0-0.1-0.3-0.6-0,0.897252,0.897252
245,0.1-0-0.2-0.4-0.3,0.897252,0.897252
454,0.2-0.1-0.1-0.6-0,0.897252,0.897252
241,0.1-0-0.1-0.8-0,0.897252,0.897252
72,0-0.1-0.2-0.6-0.1,0.897252,0.897252
240,0.1-0-0.1-0.6-0.2,0.897252,0.897252
234,0.1-0-0-0.9-0,0.897252,0.897252
233,0.1-0-0-0.8-0.1,0.897252,0.897252
232,0.1-0-0-0.6-0.3,0.897252,0.897252
313,0.1-0.2-0-0.6-0.1,0.897252,0.897252


In [9]:
print("Best ensemble is a 50-50 weighted combination of the Random Forests and KNN models, but difference best and worse \
combinations seems pretty negligible (about 0.015 accuracy)")

Best ensemble is a 50-50 weighted combination of the Random Forests and KNN models, but difference best and worse combinations seems pretty negligible (about 0.015 accuracy)
