In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import cm
import seaborn as sns
import pydotplus

from sklearn import preprocessing
from sklearn.cross_validation import train_test_split, KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.grid_search import GridSearchCV
from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.externals.six import StringIO
from sklearn.metrics import precision_recall_curve, roc_curve

from sklearn_bayes.rvm import RVC
from sklearn_bayes.logistic.variational_logistic import VariationalLogisticRegression 

import pickle

%matplotlib qt

In [2]:
# Import the data
df_exam = pd.read_csv('exam.dat', header=None, sep='[1-4]:', engine='python')

In [3]:
# Summarize
sns.set()
plot_scatterplot = sns.pairplot(df_exam, hue=0, vars=np.arange(1, 5))
plot_scatterplot.savefig("scatterplot.pdf", dpi=10)

print("n = {0}, d = {1}".format(*df_exam.shape))
print("count(g = 0) = {0}, count(g = 1) = {1}".format(sum(df_exam[0] < 0.5), sum(df_exam[0] > 0.5)))

n = 3089, d = 5
count(g = 0) = 1089, count(g = 1) = 2000


In [4]:
X = df_exam.values[:, 1 :]
g = df_exam.values[:, 0]

X_train, X_test, g_train, g_test = train_test_split (X, g, test_size = 0.25, random_state=33)

mu = np.mean(X_train, axis = 0)
sigma = np.std(X_train, axis = 0)
X_train = (X_train - mu) / sigma
X_test = (X_test - mu) / sigma

cv = KFold(X_train.shape[0], n_folds=10, random_state=8) # Generate the cross validation labels

In [5]:
classifiers = dict()
params_key = dict()
params_grid = dict()

score_train = dict()
score_test = dict()
score_cv_mean = dict()
score_cv_std = dict()

# KNN classifier
k_range = np.arange(1, 100, 2) # Range of parameter n_neighbors
classifiers["knn"] = KNeighborsClassifier()
params_key["knn"] = "n_neighbors"
params_grid["knn"] = dict([(params_key["knn"], k_range), ])

# Decision tree classifier
depth_range = np.arange(1, 21) # Range of parameter n_neighbors
classifiers["tree"] = DecisionTreeClassifier()
params_key["tree"] = "max_depth"
params_grid["tree"] = dict([(params_key["tree"], depth_range), ])

# Logistic regresssion 
C_range = np.logspace(-5, 5, 20) # Range of parameter n_neighbors
classifiers["logreg"] = linear_model.LogisticRegression()
params_key["logreg"] = "C"
params_grid["logreg"] = dict([(params_key["logreg"], C_range), ])

# Train all the classifiers
for classifier in classifiers.keys():
    
    print("Tuning {0}...".format(classifier))
    
    grid = GridSearchCV(classifiers[classifier], param_grid=params_grid[classifier], cv=cv , verbose=0, n_jobs=4)
    grid.fit(X_train, g_train)
    
    score_train[classifier] = np.empty(len(params_grid[classifier][params_key[classifier]]), dtype="float64")
    score_test[classifier] = np.empty(len(params_grid[classifier][params_key[classifier]]), dtype="float64")
    
    for idx, p in enumerate(params_grid[classifier][params_key[classifier]]):
        classifiers[classifier].set_params(**dict([(params_key[classifier], p), ]))
        classifiers[classifier].fit(X_train, g_train)
    
        score_train[classifier][idx] = classifiers[classifier].score(X_train, g_train)
        score_test[classifier][idx] = classifiers[classifier].score(X_test, g_test)
        
    score_cv_mean[classifier] = [entry[1] for entry in grid.grid_scores_]
    score_cv_std[classifier] = [np.std(entry[2]) for entry in grid.grid_scores_] 
    


Tuning logreg...
Tuning tree...
Tuning knn...


In [6]:
# Plot the tuning of model parameters
axis_font = {'size': '20'}
mpl.rcParams['xtick.labelsize'] = 16
mpl.rcParams['ytick.labelsize'] = 16

for classifier in classifiers.keys():
    plt.figure()

    plt.errorbar(params_grid[classifier][params_key[classifier]],\
                 score_cv_mean[classifier], yerr=score_cv_std[classifier],\
                 color='green', linewidth=2, label='10-fold CV', marker='.', markersize=12)
    plt.plot(params_grid[classifier][params_key[classifier]],\
             score_train[classifier], color='darkorange', linewidth=2, label='Train', marker='.', markersize=12)
    plt.plot(params_grid[classifier][params_key[classifier]],\
             score_test[classifier], color='blue', linewidth=2, label='Test', marker='.', markersize=12)
    
    plt.xlabel(params_key[classifier], **axis_font)
    plt.ylabel('Score', **axis_font)
    plt.grid()
    plt.legend(prop={'size':16})
    plt.savefig('{0}_tuning.pdf'.format(classifier), dpi=10)

In [8]:
# The svm classifier with rbf kernel
gamma_range = np.logspace(-4, 4, 9)
C_range = np.logspace(-4, 4, 9)

classifiers["svc"] = SVC(kernel='rbf')
params_grid["svc"] = dict(gamma=gamma_range, C=C_range)
params_key["svc"] = ["gamma", "C"]
params_grid["svc"] = dict([("gamma", gamma_range), ("C", C_range)])

print("Tuning SVC...")
    
grid = GridSearchCV(classifiers["svc"], param_grid=params_grid["svc"], cv=cv, verbose=0, n_jobs=4)
grid.fit(X_train, g_train)
grid.score(X_test, g_test)

Tuning SVC...


0.95989650711513586

In [9]:
score_train["svc"] = np.empty((len(gamma_range), len(C_range)), dtype="float64")
score_test["svc"] = np.empty((len(gamma_range), len(C_range)), dtype="float64")

for i, gamma in enumerate(gamma_range):
    for j, C in enumerate(C_range):
        classifiers["svc"].set_params(**dict([("gamma", gamma), ("C", C)]))
        classifiers["svc"].fit(X_train, g_train)
        
        score_train["svc"][i][j] = classifiers["svc"].score(X_train, g_train)
        score_test["svc"][i][j] = classifiers["svc"].score(X_test, g_test)
        
score_cv_mean["svc"] = np.reshape(np.array([entry[1] for entry in grid.grid_scores_]),\
                                  (len(gamma_range), len(C_range)),\
                                  order='F')
score_cv_std["svc"] = np.reshape(np.array([np.std(entry[2]) for entry in grid.grid_scores_]),\
                                  (len(gamma_range), len(C_range)),\
                                  order='F')

In [10]:
axis_font = {'size': '16'}
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12

plt.figure()

fig, axes = plt.subplots(nrows=2, ncols=2, sharex = True, sharey = True)

caxes = [None] * 4
im0 = axes[0][0].imshow(score_train["svc"], interpolation='none', cmap = cm.autumn, aspect='auto', origin='lower', vmin=0.6, vmax=1)
axes[0][0].set_title('Train')
axes[0][1].imshow(score_test["svc"], interpolation='none', cmap = cm.autumn, aspect='auto', origin='lower', vmin=0.6, vmax=1)
axes[0][1].set_title('Test')
axes[1][0].imshow(score_cv_mean["svc"], interpolation='none', cmap = cm.autumn, aspect='auto', origin='lower', vmin=0.6, vmax=1)
axes[1][0].set_title('10-fold CV mean')
im1 = axes[1][1].imshow(score_cv_std["svc"], interpolation='none', cmap = cm.winter, aspect='auto', origin='lower', vmin=0, vmax=0.03)
axes[1][1].set_title('10-fold CV std')

for row in range(2):
    axes[row][0].set_ylabel('$\log\gamma$', **axis_font)     
    axes[row][0].set_yticks(np.arange(len(gamma_range)))
    axes[row][0].set_yticklabels(np.log10(gamma_range).astype(int))
for col in range(2):
    axes[1][col].set_xlabel('$\log C$', **axis_font)
    axes[1][col].set_xticks(np.arange(len(C_range)))
    axes[1][col].set_xticklabels(np.log10(C_range).astype(int))
    axes[1][col].grid(True)
    
fig.subplots_adjust(right=0.65)
cbar_ax0 = fig.add_axes([0.7, 0.15, 0.05, 0.7])
plt.colorbar(im0, cax=cbar_ax0)  

cbar_ax1 = fig.add_axes([0.85, 0.15, 0.05, 0.7])
plt.colorbar(im1, cax=cbar_ax1)

plt.savefig('svc_tuning.pdf', dpi=10)

In [11]:
classifiers["rfc"]  = RandomForestClassifier(n_estimators=10, max_depth=3) # Random forest classifier
classifiers["rfc"].fit(X_train, g_train)
        
score_train["rfc"] = classifiers["rfc"].score(X_train, g_train)
score_test["rfc"] = classifiers["rfc"].score(X_test, g_test)

score_train["rfc"], score_test["rfc"]

(0.9745250431778929, 0.96119016817593794)

In [12]:
classifiers["rvc"] = RVC(kernel = 'rbf', gamma = 1)
classifiers["rvc"].fit(X_train, g_train)
classifiers["rvc"].score(X_test, g_test)

0.96636481241914618

In [13]:
classifiers["vlr"] = VariationalLogisticRegression()
classifiers["vlr"].fit(X_train, g_train)
classifiers["vlr"].score(X_test, g_test)

0.94437257438551103

In [48]:
# Train the best classifier for each algorithms and evaluate their performances
fpr = dict() # False positive rate
fnr = dict() # False negative rate
precision = dict() # Precision
recall = dict() # Recall
score = dict() # Test score

classifiers["knn"].set_params(n_neighbors=15)
classifiers["tree"].set_params(max_depth=3)
classifiers["logreg"].set_params(C=1)
classifiers["svc"].set_params(C=1, gamma=1)
classifiers["rfc"].set_params(max_depth=3)
classifiers["rvc"].set_params(kernel = 'rbf', gamma = 1)
classifiers["vlr"].set_params()

for classifier in classifiers.keys():
    classifiers[classifier].fit(X_train, g_train)
    g_score = classifiers[classifier].predict(X_test)
    
    fpr[classifier], fnr[classifier] = sum((g_test==0) * (g_score==1)) / sum(g_test==0),\
                                       sum((g_test==1) * (g_score==0)) / sum(g_test==1)
        
    
    precision[classifier], recall[classifier] = (1 - fnr[classifier]) /\
                                                (1 - fnr[classifier] + fpr[classifier]),\
                                                (1 - fnr[classifier])/\
                                                (1 - fnr[classifier] + fnr[classifier])
    score[classifier] = classifiers[classifier].score(X_test, g_test)
    
    print("Classifier {0}...".format(classifier))

print('False positive: ', fpr)
print('False negative: ', fnr)
print('Precision: ', precision)
print('Recall: ', recall)
print('Score: ', score)   

Classifier tree...
Classifier svc...
Classifier vlr...
Classifier rvc...
Classifier logreg...
Classifier rfc...
Classifier knn...
False positive:  {'tree': 0.085106382978723402, 'rfc': 0.063829787234042548, 'vlr': 0.067375886524822695, 'rvc': 0.067375886524822695, 'logreg': 0.067375886524822695, 'svc': 0.056737588652482268, 'knn': 0.063829787234042548}
False negative:  {'tree': 0.026476578411405296, 'rfc': 0.022403258655804479, 'vlr': 0.048879837067209775, 'rvc': 0.022403258655804479, 'logreg': 0.048879837067209775, 'svc': 0.030549898167006109, 'knn': 0.030549898167006109}
Precision:  {'tree': 0.91960704052394593, 'rfc': 0.93870927474722254, 'vlr': 0.93384767023818804, 'rvc': 0.93552377858717672, 'logreg': 0.93384767023818804, 'svc': 0.94471032036484437, 'knn': 0.9382260431956384}
Recall:  {'tree': 0.97352342158859473, 'rfc': 0.9775967413441955, 'vlr': 0.95112016293279023, 'rvc': 0.9775967413441955, 'logreg': 0.95112016293279023, 'svc': 0.96945010183299385, 'knn': 0.96945010183299385}


In [49]:
# Plot the tree structure
dot_data = StringIO() 

export_graphviz(classifiers["tree"], out_file=dot_data,  
                filled=True, rounded=True,
                class_names=['0', '1'],
                special_characters=True)  
pydotplus.graph_from_dot_data(dot_data.getvalue()).write_pdf('tree.pdf')

True

In [39]:
svcbag = BaggingClassifier(SVC(kernel='rbf', gamma=0.1, C=10000))
svcbag.fit(X_train, g_train)
svcbag.score(X_test, g_test)

BaggingClassifier(base_estimator=SVC(C=10000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [47]:
classifiers["tree"].mu = mu # Mean and standard deviation used to do normalization
classifiers["tree"].sigma = sigma

pickle.dump(classifiers["tree"], open("classifier.p", "wb"))