# Example learning with sklearn

Joint grid search over the parameters of the models presented in 'models.ipynb'. A Python script with the source of this notebook can be found in `titanic.py`.

In [1]:
import numpy as np
import pandas as ps
from time import time

# Choice of models inspired by
# https://arxiv.org/pdf/1708.05070.pdf
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier

from sklearn.preprocessing import RobustScaler, OneHotEncoder, Imputer
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.model_selection import GridSearchCV, train_test_split

from misc import ColumnSelector, IntEncoder

# load the passenger table from disk
csv = ps.read_csv('titanic.csv')

# pull the target out of the frame: `pop` returns the 'Survived' column
# as `y` and removes it from `csv`, so only feature columns remain
y = csv.pop('Survived')

# plain numpy array of the feature columns; `.values` is used because
# `DataFrame.as_matrix()` was deprecated and later removed from pandas
X = csv.values
# column name -> positional index of that column in X
col_idx = {name: pos for pos, name in enumerate(csv.columns)}

In [2]:
# X is a numpy array with one row per passenger; show the first row
first_passenger = X[0]
print(first_passenger)
# col_idx maps a column name to its position within such a row
print(col_idx)

[1 3 'Braund, Mr. Owen Harris' 'male' 22.0 1 0 'A/5 21171' 7.25 nan 'S']
{'Gender': 3, 'Embarked': 10, 'Cabin': 9, 'SibSp': 5, 'Age': 4, 'Pclass': 1, 'PassengerId': 0, 'Name': 2, 'Parch': 6, 'Fare': 8, 'Ticket': 7}


In [3]:
def _one_hot_branch(column):
    """Build the sub-pipeline that one-hot encodes a single named column.

    The column is picked out by its index in `col_idx`, passed through
    IntEncoder and then through a dense OneHotEncoder. IntEncoder comes
    from the local `misc` module; presumably it maps category labels to
    integer codes -- confirm against `misc`.
    """
    return make_pipeline(
        ColumnSelector(col_idx[column]),
        IntEncoder(),
        OneHotEncoder(sparse=False),
    )


# numeric fare: select the column and replace missing values with the
# mean of the observed values (Imputer's default strategy)
fare_branch = make_pipeline(
    ColumnSelector(col_idx['Fare']),
    Imputer(),
)

# feature extraction: each branch preprocesses one column, and the
# union concatenates the branch outputs side by side
features = make_union(
    fare_branch,
    _one_hot_branch('Pclass'),
    _one_hot_branch('Embarked'),
)

In [10]:
# demonstrate the feature transformation: fit on all rows, then show
# the same three passengers before and after the transformation
demo_rows = slice(1, 4)
features.fit(X)
display(csv.iloc[demo_rows, :])
print(features.transform(X[demo_rows, :]))

Unnamed: 0,PassengerId,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


[[ 71.2833   1.       0.       0.       1.       0.       0.    ]
 [  7.925    0.       0.       1.       0.       0.       1.    ]
 [ 53.1      1.       0.       0.       0.       0.       1.    ]]


In [5]:
# base pipeline shared by every search: feature extraction, robust
# scaling, then a classifier in the 'model' step (each search space
# supplies its own classifier for that step via the 'model' key)
pipeline_steps = [
    ('feats', features),
    ('scale', RobustScaler()),
    ('model', GradientBoostingClassifier()),
]
estimator = Pipeline(pipeline_steps)

# search spaces, one per model class; each places its own classifier in
# the pipeline's 'model' step and lists the hyper-parameter values to try

# 20 log-spaced values between 1e-6 and 1e6, used by both SVM spaces
svm_scale_grid = [10 ** exponent for exponent in np.linspace(-6, 6, 20)]

# upper (exclusive) bound for the number of neighbours
# NOTE(review): the bound is derived from the full X, while fitting is
# done on cross-validation folds of a training subset -- on a much
# smaller dataset n_neighbors could exceed the fold size; confirm.
max_neighbors = min(len(X) - 1, 256)

gbrt = {
    'model': [GradientBoostingClassifier()],
    # powers of two: 2 .. 1024
    'model__n_estimators': [2 ** power for power in range(1, 11)],
    # powers of two: 2**-10 .. 2**9
    'model__learning_rate': [2 ** power for power in range(-10, 10)],
}
svc = {
    'model': [SVC()],
    'model__C': svm_scale_grid,
    'model__gamma': svm_scale_grid,
}
linsvc = {
    'model': [LinearSVC(max_iter=10000)],
    'model__C': svm_scale_grid,
}
knn = {
    'model': [KNeighborsClassifier()],
    'model__n_neighbors': range(1, max_neighbors),
}
dectree = {
    'model': [DecisionTreeClassifier()],
    'model__max_depth': range(1, 20),
    # floats 2**-20 .. 2**-2; scikit-learn reads a float here as a
    # fraction of the training samples rather than an absolute count
    'model__min_samples_split': [2 ** power for power in range(-20, -1)],
}

# joint search: every candidate of every listed space is cross-validated
# and the parameter combination with the best validation score is selected
# (`svc` is currently left out of the list)
search_spaces = [gbrt, linsvc, knn, dectree]
model = GridSearchCV(
    estimator=estimator,
    param_grid=search_spaces,
    n_jobs=-1,  # run the fits in parallel on all available cores
    verbose=1,
)

# baseline to compare against: a trivial classifier that either always
# predicts the most frequent class or guesses uniformly at random
baseline_space = {
    'model': [DummyClassifier()],
    'model__strategy': ['most_frequent', 'uniform'],
}
dummy_model = GridSearchCV(
    estimator=estimator,
    param_grid=[baseline_space],
    n_jobs=1,
)

# split data into training (75% of the rows) and testing data; the
# fixed random_state makes the shuffled split identical on every run,
# so a re-run of the notebook evaluates on the same held-out rows
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, random_state=0
)



In [6]:
# fit both searches on the training split: the full grid search first,
# then the trivial baseline (its fitted object is the cell's last
# expression, so its repr is what gets echoed as the cell output)
model.fit(X=X_train, y=y_train)
dummy_model.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 836 candidates, totalling 2508 fits


[Parallel(n_jobs=-1)]: Done 231 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 666 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done 2270 tasks      | elapsed:   27.7s
[Parallel(n_jobs=-1)]: Done 2508 out of 2508 | elapsed:   28.3s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('feats', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(memory=None,
     steps=[('columnselector', ColumnSelector(key=8, row_space=False)), ('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0))])), ('pipeline-2', Pipeline(memory=N...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'model': [DummyClassifier(constant=None, random_state=None, strategy='most_frequent')], 'model__strategy': ['most_frequent', 'uniform']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [7]:
# evaluate both fitted searches on the held-out test split
test_score = model.score(X_test, y_test)
dummy_score = dummy_model.score(X_test, y_test)

# report the baseline first so the model's score can be read against it
report_lines = (
    ("Trivial accuracy: %s", dummy_score),
    ("Model accuracy: %s", test_score),
)
for template, value in report_lines:
    print(template % value)

Trivial accuracy: 0.596412556054
Model accuracy: 0.72197309417
