In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# 1. Preprocess data

In [None]:
# Read the train and test splits from CSV files in the working directory.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [None]:
# (rows, columns) of each split. Single-argument print(...) behaves the same
# on Python 2 and Python 3, whereas the bare `print x` statement is Python 2 only.
print(train_data.shape)
print(test_data.shape)

In [None]:
# Per-column summary statistics of the training data (shown as the cell output).
train_data.describe()

In [None]:
# Box plot of every raw training column on one wide figure.
plt.figure(figsize=(14, 6))
train_data.boxplot(ax=plt.gca())
plt.show()

In [None]:
from sklearn import preprocessing

# Standardise all columns of train_data into a separate frame (`tmp`) so the
# scaled distributions can be inspected without touching train_data itself.
scaler = preprocessing.StandardScaler()
tmp = pd.DataFrame(
    scaler.fit(train_data).transform(train_data),
    columns=train_data.columns
)

In [None]:
# Box plot of the standardised copy of the training data.
plt.figure(figsize=(14, 6))
tmp.boxplot(ax=plt.gca())
plt.show()

In [None]:
# Remove the target column from the scaled copy and keep it in tmp2.
# NOTE(review): pop() mutates `tmp` in place, so running this cell a second
# time without re-creating `tmp` raises KeyError.
tmp2 = tmp.pop('eyeDetection')

In [None]:
from scipy import stats

# Keep only the rows whose absolute z-score is below 3 in every column.
# The z-scores are computed over all columns of train_data as it stands here,
# i.e. the target column has not been split off yet.
# NOTE(review): train_data is overwritten with its own filtered version, so
# re-running this cell filters the already-filtered data again, and the
# surviving rows keep their original (now gapped) index labels.
train_data = train_data[(np.abs(stats.zscore(train_data)) < 3).all(axis=1)]

In [None]:
# Split each frame into features (x) and the "eyeDetection" target (y).
#
# drop()/column selection build new objects instead of aliasing the source
# frame and pop()-ing from it: the old form removed the column from
# train_data / test_data themselves, so this cell raised KeyError when re-run.
#
# reset_index(drop=True) gives every split a clean 0..n-1 index. The outlier
# filter above leaves gaps in train_data's index; without the reset, any later
# assignment of train_set_y into a freshly indexed DataFrame would align on
# labels and pair targets with the wrong rows.
train_set_x = train_data.drop("eyeDetection", axis=1).reset_index(drop=True)
train_set_y = train_data["eyeDetection"].reset_index(drop=True)
test_set_x = test_data.drop("eyeDetection", axis=1).reset_index(drop=True)
test_set_y = test_data["eyeDetection"].reset_index(drop=True)

In [None]:
# Fit the scaler on the training features only, then apply that same fitted
# transformation to both the training and the test features.
# The rebuilt frames keep the column names but get a fresh default index.
scaler = preprocessing.StandardScaler().fit(train_set_x)

train_set_x = pd.DataFrame(scaler.transform(train_set_x), columns=train_set_x.columns)
test_set_x = pd.DataFrame(scaler.transform(test_set_x), columns=test_set_x.columns)

In [None]:
# Box plot of the standardised training features after outlier removal.
plt.figure(figsize=(14, 6))
train_set_x.boxplot(ax=plt.gca())
plt.show()

In [None]:
# Number of training rows per target class (shown as the cell output).
train_set_y.value_counts()

In [None]:
# Baseline accuracy: the share of the most frequent class in the training
# labels, i.e. the accuracy of always predicting that class.
# Computed from the data rather than from hand-copied class counts, so it
# stays correct if the CSV or the outlier filter changes.
class_counts = train_set_y.value_counts()
base_score = float(class_counts.max()) / class_counts.sum()
print(base_score)

# 2. Logistic regression

In [None]:
#http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
from sklearn import linear_model
#http://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics
from sklearn import metrics

# Logistic regression with a very large C.
# NOTE(review): in scikit-learn C is the inverse regularisation strength, so
# this presumably means "practically unregularised" - confirm intent.
clf_logistic = linear_model.LogisticRegression(C=1000000)
clf_logistic.fit(train_set_x, train_set_y)

# Hard class predictions on both splits.
train_predictions = clf_logistic.predict(train_set_x)
test_predictions = clf_logistic.predict(test_set_x)

# accuracy_score is called as (y_true, y_pred), matching the calls further
# down; print(...) works on both Python 2 and Python 3.
print(metrics.accuracy_score(train_set_y, train_predictions))
print(metrics.accuracy_score(test_set_y, test_predictions))

In [None]:
# Histogram of the predicted test labels, one panel per true class.
# The targets are attached as a plain array (.values) so they are paired with
# the predictions by position; assigning the Series itself would align on
# index labels, which is only correct while test_set_y has a 0..n-1 index.
tmp = pd.DataFrame()
tmp["predictions"] = test_predictions
tmp["targets"] = test_set_y.values

tmp.hist(column="predictions", by="targets", sharey=True)
plt.show()

In [None]:
# Same plot as for the hard labels, but using predicted probabilities.
# Note: this rebinds test_predictions from class labels to the 2-D output of
# predict_proba; column 1 is the one plotted.
test_predictions = clf_logistic.predict_proba(test_set_x)

# Attach targets by position (.values) rather than by index label so each
# probability is paired with the target of the same row.
tmp = pd.DataFrame()
tmp["predictions"] = test_predictions[:, 1]
tmp["targets"] = test_set_y.values

tmp.hist(column="predictions", by="targets", sharey=True)
plt.show()

# 3. Model evaluation

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated scores of the logistic model on the training set:
# the per-fold values first, then their mean.
# print(...) is used so the cell runs on both Python 2 and Python 3.
cv_scores = cross_val_score(clf_logistic, train_set_x, train_set_y, cv=5)
print(cv_scores)
print(np.mean(cv_scores))

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn import metrics

# Out-of-fold predictions for every training row (5 folds), then the accuracy
# of those predictions against the true labels.
# print(...) is used so the cell runs on both Python 2 and Python 3.
predicted = cross_val_predict(clf_logistic, train_set_x, train_set_y, cv=5)
print(metrics.accuracy_score(train_set_y, predicted))

In [None]:
# Histogram of the cross-validated label predictions, one panel per true class.
#
# `predicted` is a positional array, while train_set_y may still carry the
# gapped index left by the outlier filter. Assigning the Series directly would
# align on index labels and pair predictions with the wrong targets (or NaN),
# so the targets are attached by position via .values.
tmp = pd.DataFrame()
tmp["predictions"] = predicted
tmp["targets"] = train_set_y.values

tmp.hist(column="predictions", by="targets", sharey=True)
plt.show()

In [None]:
# Out-of-fold class probabilities for every training row (5 folds).
# This rebinds `predicted` from hard labels to the predict_proba output.
# print(...) is used so the cell runs on both Python 2 and Python 3.
predicted = cross_val_predict(clf_logistic, train_set_x, train_set_y, cv=5, method="predict_proba")
print(predicted)

In [None]:
# Histogram of the cross-validated probabilities (column 1), one panel per
# true class.
#
# The probabilities are positional, while train_set_y may still carry the
# gapped index left by the outlier filter. Attaching the targets with .values
# pairs them by position instead of by index label, which would otherwise
# mismatch rows and introduce NaN targets.
tmp = pd.DataFrame()
tmp["predictions"] = predicted[:, 1]
tmp["targets"] = train_set_y.values

tmp.hist(column="predictions", by="targets", sharey=True)
plt.show()

In [None]:
from sklearn.model_selection import KFold

# Manual 5-fold cross-validation: fit on each training part, score on the
# held-out part, and print the accuracy of every fold.
# clf_logistic is refit in place on each split, so after the loop it holds the
# model fitted on the last fold's training part.
# print(...) is used so the cell runs on both Python 2 and Python 3.
kf = KFold(n_splits=5)
for train, test in kf.split(train_set_x):
    clf_logistic.fit(train_set_x.iloc[train], train_set_y.iloc[train])
    predicted = clf_logistic.predict(train_set_x.iloc[test])
    print(metrics.accuracy_score(train_set_y.iloc[test], predicted))

In [None]:
from sklearn.metrics import roc_curve

# Cross-validated class probabilities for the training set (5 folds) ...
predicted = cross_val_predict(
    estimator=clf_logistic,
    X=train_set_x,
    y=train_set_y,
    cv=5,
    method="predict_proba"
)

# ... and the ROC operating points computed from column 1 of those probabilities.
fpr, tpr, thresholds = roc_curve(y_true=train_set_y, y_score=predicted[:, 1])

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve on the current matplotlib axes.

    fpr, tpr -- x and y values of the curve (false / true positive rates).
    label    -- optional label attached to the curve's line.

    Also draws a dashed black diagonal from (0, 0) to (1, 1) and fixes both
    axes to the [0, 1] range. Returns None.
    """
    axes = plt.gca()
    axes.plot(fpr, tpr, linewidth=2, label=label)
    axes.plot([0, 1], [0, 1], 'k--')
    axes.axis([0, 1, 0, 1])
    axes.set_xlabel('False Positive Rate', fontsize=16)
    axes.set_ylabel('True Positive Rate', fontsize=16)

# Plot the cross-validated ROC curve on a fresh 8x6 figure.
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
plt.show()

In [None]:
# Area under the ROC curve computed from the (fpr, tpr) points above.
# print(...) is used so the cell runs on both Python 2 and Python 3.
print(metrics.auc(fpr, tpr))

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision/recall values computed from column 1 of the cross-validated
# probabilities in `predicted`.
# Note: this rebinds `thresholds`, replacing the array returned by roc_curve.
precisions, recalls, thresholds = precision_recall_curve(train_set_y,predicted[:,1])

In [None]:
def plot_precision_vs_recall(precisions, recalls):
    """Draw precision (y) against recall (x) on the current matplotlib axes.

    precisions, recalls -- equal-length sequences of values to plot.

    Draws a solid blue line and fixes both axes to the [0, 1] range.
    Returns None.
    """
    axes = plt.gca()
    axes.plot(recalls, precisions, "b-", linewidth=2)
    axes.set_xlabel("Recall", fontsize=16)
    axes.set_ylabel("Precision", fontsize=16)
    axes.axis([0, 1, 0, 1])

# Plot the cross-validated precision/recall curve on a fresh 8x6 figure.
plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions, recalls)
plt.show()

In [None]:
# Refit on the complete training set and show the learned coefficients
# (one weight per feature column).
# print(...) is used so the cell runs on both Python 2 and Python 3.
clf_logistic.fit(train_set_x, train_set_y)
print(clf_logistic.coef_)

In [None]:
#Exercise: perform 5-fold cross-validation on the smaller train-set below and plot the ROC
# Reduced training set: the first 200 rows of the features and of the labels.
train_set_x_small = train_set_x.head(200)
train_set_y_small = train_set_y.head(200)

In [None]:
#Exercise: fit a logistic regression model on the full and the small train-set 
#          and plot a ROC curve for the predictions on the test-set


# 4. Regularization

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the logistic-regression parameter C.
params = {'C': [0.001, 0.01, 0.1, 1, 10]}

# Exhaustive 5-fold cross-validated search over the grid.
GSCV = GridSearchCV(clf_logistic, params, cv=5)
GSCV.fit(train_set_x, train_set_y)

# Best mean cross-validated score and the estimator that achieved it.
# print(...) is used so the cell runs on both Python 2 and Python 3.
print(GSCV.best_score_)
print(GSCV.best_estimator_)

In [None]:
#Exercise: find the optimal value for C for a logistic regression model with penalty='l1'


In [None]:
#Exercise: print the model-parameters for the best model obtained above


In [None]:
#Exercise: how well does the best model perform on the test-set (compute AUC)?


In [None]:
#Exercise: fit a logistic regression model with penalty='l1' and C=0.01
#          and print the model parameters again.


In [None]:
#Exercise: how well does this model perform on the test-set (compute the AUC)?


# 5. Linear SVM

In [None]:
#Exercise: find the best value for C on the small train-set

#http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn import svm

# Linear-kernel support vector classifier, created unfitted as the starting
# point for the exercise.
clf_svm = svm.SVC(kernel='linear')

# Candidate values of C to search over. Note: this rebinds the name `params`
# that earlier held the logistic-regression grid.
params = {'C':[0.001,0.01,0.1,1,10,100]}

In [None]:
#Exercise: what is the performance (accuracy) of the best model fitted on the full train-set on the test-set?


In [None]:
#Exercise: what is the AUC performance on the test-set?


# 6. Decision tree

In [None]:
#Exercise: optimize a Decision tree on the train-set

#http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
from sklearn.tree import DecisionTreeClassifier

# Decision tree with all-default settings, created unfitted as the starting
# point for the exercise.
clf_DT = DecisionTreeClassifier()


In [None]:
#Exercise: what is the performance (accuracy) of the best model fitted on the full train-set on the test-set?


# 7. RBF SVM

In [None]:
#Exercise: what is the performance (accuracy and AUC) of an optimized RBF SVM on the test set?
#Tip: optimize on small train-set

# RBF-kernel support vector classifier, created unfitted for the exercise.
# Note: this rebinds clf_svm, replacing the linear-kernel instance created earlier.
clf_svm = svm.SVC(kernel='rbf')


In [None]:
#Exercise: plot the ROC curve for the test-set


# 8. Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 50 logistic-regression models, each trained on a
# sample of 70% of the training rows (max_samples=0.7).
clf_logistic_bagged = BaggingClassifier(clf_logistic, n_estimators=50, max_samples=0.7)

# 5-fold cross-validated scores: per-fold values, then their mean.
# print(...) is used so the cell runs on both Python 2 and Python 3.
scores = cross_val_score(clf_logistic_bagged, train_set_x, train_set_y, cv=5)
print(scores)
print(np.mean(scores))

In [None]:
# Grid over the ensemble size and the fraction of rows given to each member.
params_bagged = {'n_estimators':[10,30,50], 'max_samples':[0.4,0.6,0.8]} #make them increase max_depth

# Exhaustive 5-fold cross-validated search over the grid.
# Note: this rebinds GSCV, replacing the earlier logistic-regression search.
GSCV = GridSearchCV(clf_logistic_bagged, params_bagged, cv=5)
GSCV.fit(train_set_x, train_set_y)

# Best mean cross-validated score and the estimator that achieved it.
# print(...) is used so the cell runs on both Python 2 and Python 3.
print(GSCV.best_score_)
print(GSCV.best_estimator_)

In [None]:
#Exercise: what is the effect of bagging on a Decision tree classifier?


# 9. Random Forest

In [None]:
#Exercise: what is the performance (accuracy and AUC) of an optimized Random Forest on the test set?
#http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint 
from scipy.stats import uniform 

# Random forest with default settings; its hyper-parameters are tuned by the
# randomized search below.
# NOTE(review): no random_state is passed to the forest or to the search, so
# results presumably differ between runs - confirm whether that is intended.
clf_RF = RandomForestClassifier()

# Sampling distributions for the search.
# NOTE(review): scipy's uniform(loc, scale) spans [loc, loc + scale], so
# max_features would be drawn from [0.1, 0.7], and randint excludes its upper
# bound - confirm against the scipy.stats documentation.
params_dist_RF = {
                "max_features": uniform(0.1,0.6),
                "min_samples_split": randint(2, 100),
                "n_estimators": randint(10, 100)
                }

# Number of parameter settings sampled by the search.
n_iter_search = 50

# 5-fold cross-validated randomized search using all available cores.
RSCV = RandomizedSearchCV(clf_RF, params_dist_RF,cv=5,n_jobs=-1,n_iter=n_iter_search)

# The search must be fitted before best_estimator_ exists; without this the
# next cell (which reads RSCV.best_estimator_) fails on a fresh Run All.
RSCV.fit(train_set_x, train_set_y)


In [None]:
# Feature importances of the best random forest found by the search
# (requires RSCV to have been fitted).
# print(...) is used so the cell runs on both Python 2 and Python 3.
print(RSCV.best_estimator_.feature_importances_)

# 10. Extreme Gradient Boosting

In [None]:
#conda install py-xgboost
#Exercise: what is the performance (accuracy and AUC) of an optimized XGBoost on the test set?
from xgboost import XGBClassifier 

# XGBoost classifier with all-default settings, created unfitted as the
# starting point for the exercise.
clf_xgb = XGBClassifier() 
