In [15]:
#Import required packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn import metrics as mt
from sklearn.decomposition import RandomizedPCA 
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import GaussianNB


In [4]:
# Load the reduced, imputed training file.
# Data can be downloaded here: https://www.dropbox.com/s/2vcqmorh3n3cm21/train_booked_top5_imputed.csv?dl=0
# The 'Unnamed: 0' column is dropped straight after reading
# (presumably a leftover index column from an earlier to_csv -- TODO confirm).
train_booked_df = (
    pd.read_csv('data/train_booked_top5_imputed.csv')
      .drop(['Unnamed: 0'], axis=1)
)

In [5]:
# create variables we are more familiar with
X, y = train_booked_df.iloc[:, 1:].values, train_booked_df.iloc[:, 0].values
yhat = np.zeros(y.shape) # we will fill this with predictions

# create cross validation iterator
cv = StratifiedKFold(y, n_folds=10)

# get a handle to the classifier object, which defines the type
clf = KNeighborsClassifier(n_neighbors=3)

# now iterate through and get predictions, saved to the correct row in yhat
for train, test in cv:
    clf.fit(X[train],y[train])
    yhat[test] = clf.predict(X[test])

total_accuracy = mt.accuracy_score(y, yhat)
print 'KNN accuracy', total_accuracy

KNN accuracy 0.233946521767


In [8]:
# setup pipeline to take PCA, then fit a KNN classifier
clf_pipe = Pipeline(
    [('PCA_Expedia',RandomizedPCA(n_components=5)),
     ('CLF_Expedia',KNeighborsClassifier(n_neighbors=1))]
)

# now iterate through and get predictions, saved to the correct row in yhat
for train, test in cv:
    clf_pipe.fit(X[train],y[train])
    yhat[test] = clf_pipe.predict(X[test])

total_accuracy = mt.accuracy_score(y, yhat)
print 'KNN, pipeline accuracy', total_accuracy

KNN, pipeline accuracy 0.244501564705


In [26]:
clf_pipe = Pipeline(
    [('PCA',RandomizedPCA(n_components=5)),
     ('CLF',RandomForestClassifier(max_depth=50, n_estimators=50, n_jobs=-1))]
)

# now iterate through and get predictions, saved to the correct row in yhat
for train, test in cv:
    clf_pipe.fit(X[train],y[train])
    yhat[test] = clf_pipe.predict(X[test])
    
total_accuracy = mt.accuracy_score(y, yhat)
print 'Pipeline accuracy', total_accuracy

Pipeline accuracy 0.397991514956


In [27]:
clf = RandomForestClassifier(max_depth=50, n_estimators=50, n_jobs=-1, oob_score=True)

# now iterate through and get predictions, saved to the correct row in yhat
for train, test in cv:
    clf.fit(X[train],y[train])
    yhat[test] = clf.predict(X[test])
    
total_accuracy = mt.accuracy_score(y, yhat)
print 'Accuracy', total_accuracy

Accuracy 0.455187447213


In [None]:
# now lets get access to the different properties of our RF

print clf

plt.barh(range(len(clf.feature_importances_)), clf.feature_importances_)
plt.show()

print 'Generalization score estimate from training data', clf.oob_score_

In [14]:
dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)

# lets train some trees
clf_array = [
    dt_stump,
    AdaBoostClassifier(base_estimator=dt_stump,algorithm="SAMME.R",
                       learning_rate=0.5, n_estimators=50),
    ExtraTreesClassifier(n_estimators=50,min_samples_split=1),
    RandomForestClassifier(max_depth=50, n_estimators=50)
    ]

for clf in clf_array:
    acc = cross_val_score(clf,X,y)
    print acc.mean()

0.368115655377
0.377367175428
0.423207403589
0.447129574609


In [17]:
# setup pipeline to take PCA, then fit a different classifier
clf_pipe = Pipeline(
    [('PCA',RandomizedPCA(n_components=5)),
     ('CLF',GaussianNB())]
)
clf = RandomForestClassifier(max_depth=50, n_estimators=150, n_jobs=-1, oob_score=True)
# now iterate through and get predictions, saved to the correct row in yhat
for train, test in cv:
    clf_pipe.fit(X[train],y[train])
    yhat[test] = clf_pipe.predict(X[test])

total_accuracy = mt.accuracy_score(y, yhat)
print 'Pipeline accuracy', total_accuracy

Pipeline accuracy 0.337363485019


In [24]:
from sklearn.neighbors import NearestCentroid


clf_pipe = Pipeline(
    [('PCA',RandomizedPCA(n_components=5)),
     ('CLF',NearestCentroid(metric='euclidean'))]
)

# now iterate through and get predictions, saved to the correct row in yhat
for train, test in cv:
    clf_pipe.fit(X[train],y[train])
    yhat[test] = clf_pipe.predict(X[test])
    
total_accuracy = mt.accuracy_score(y, yhat)
print 'Pipeline accuracy', total_accuracy

Pipeline accuracy 0.190320312843


In [25]:
from sklearn import metrics as mt

freq_infreq_threshold = 40

# get various measures of performance
total_accuracy = mt.accuracy_score(y, yhat)

prec_for_freq_classes = []
recall_for_infreq_classes = []
rec_tot = []
prec_tot = []

for cls in np.unique(y):
    idx = (y==cls) # get classes
    ytmp_actual = np.zeros(y.shape) # make binary class problem
    ytmp_actual[idx] = 1 # set the instances for this specific class
    
    ytmp_predicted = np.zeros(y.shape) # binary prediction array
    ytmp_predicted[yhat==cls] = 1
    
    num_in_class = sum(idx)
    
    rec = mt.recall_score(ytmp_actual, ytmp_predicted)
    prec = mt.precision_score(ytmp_actual, ytmp_predicted)
    rec_tot.append(rec)
    prec_tot.append(prec)
    
    if num_in_class < freq_infreq_threshold:
        recall_for_infreq_classes.append(rec)
    elif num_in_class >= freq_infreq_threshold:
        prec_for_freq_classes.append(prec)
        
print 'Total Accuracy:',total_accuracy
print 'Number of infrequent faces:',len(recall_for_infreq_classes), 'with average recall of:', np.mean(recall_for_infreq_classes)
print 'Number of frequent faces:',len(prec_for_freq_classes), 'with average precision of:',np.mean(prec_for_freq_classes)

Total Accuracy: 0.190320312843
Number of infrequent faces: 0 with average recall of: nan
Number of frequent faces: 5 with average precision of: 0.200373769513


