In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import Imputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

class dataNtarget(object):
    '''
    Container that separates a table into a feature part (X) and a target
    part (y).

     data : pandas.DataFrame : full table, target column included
    yName : string           : label of the target column

    Attributes set by the constructor:
        data : the DataFrame exactly as it was passed in
        y    : result of data[yName]
        X    : data restricted to the columns whose label differs from yName
    '''
    def __init__(self, data, yName):
        self.data = data
        self.y = data[yName]
        # Element-wise comparison over the column labels: True marks every
        # column that is not the target and therefore belongs to X.
        isFeatureColumn = data.columns != yName
        self.X = data.loc[:, isFeatureColumn]


def rmCol(pandsDF, colnamesList):
    '''
    Remove the columns you don't want.
        pandsDF      : pandas.DataFrame : table to filter
        colnamesList : list             : labels of the columns to remove; any
                                          iterable of labels is accepted and a
                                          single string is treated as one label
    return a dataframe holding every column whose label is not listed.
    Listed labels that do not occur in the dataframe are simply ignored.
    '''
    if isinstance(colnamesList, str):
        # A bare string would otherwise be searched by substring, so that
        # 'ab' would also remove the columns 'a' and 'b'.
        unwanted = [colnamesList]
    else:
        # Materialise once: a one-shot iterator would be exhausted by the
        # first membership test and later columns would be kept by mistake.
        unwanted = list(colnamesList)
    # One boolean per column, True for the columns that survive.
    keepMask = [label not in unwanted for label in pandsDF.columns]
    return pandsDF.loc[:, keepMask]

def getSplitingData(X, y, splitGenerator):
    '''
                 X : pandas.DataFrame : data without y
                 y : pandas.Series    : data of y
    splitGenerator : iterator         : yields (train_index, test_index) pairs
                                        of positional row indices
    Takes exactly one pair from splitGenerator per call, so every call
    advances the iterator and returns the next split.
    Returns the tuple (X_train, X_test, y_train, y_test).
    '''
    trainRows, testRows = next(splitGenerator)
    # Positional (.iloc) selection, features first, then targets.
    return (X.iloc[trainRows], X.iloc[testRows],
            y.iloc[trainRows], y.iloc[testRows])


In [2]:
if __name__ == '__main__':
    # Load the training table, drop the columns that are not used as
    # features, and expand the remaining categorical columns into
    # dummy/indicator columns.
    titanic = pd.read_csv('titanic/train.csv')
    titanic = rmCol(titanic, ['PassengerId','Name','Ticket','Cabin'])
    titanic = pd.get_dummies(titanic)
    titanic = dataNtarget(titanic, 'Survived')
    # Split the dataset into a development set and an evaluation set.
    # test_size=0.8 holds out 80% of the rows for evaluation, so only the
    # remaining 20% are used for the grid search (this is NOT an equal split).
    X_train, X_test, y_train, y_test = train_test_split(
    titanic.X, titanic.y, test_size=0.8, random_state=0)
    # Fit the column means on the development set only and reuse them for
    # both sets.
    # NOTE(review): preprocessing.Imputer was removed in scikit-learn 0.22;
    # sklearn.impute.SimpleImputer is its replacement on newer releases.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp = imp.fit(X_train)
    # The imputed matrices do not depend on the scoring metric, so compute
    # them once instead of once per loop iteration.
    X_train_imp = imp.transform(X_train)
    X_test_imp = imp.transform(X_test)
    # Set the parameters by cross-validation
    tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                         'C': [1, 10, 100, 1000]},
                        {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
    scores = ['precision', 'recall']

    # Run one full grid search per metric and report it.
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        # 5-fold cross-validated search, ranked by the macro-averaged metric.
        clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
                           scoring='%s_macro' % score)
        clf.fit(X_train_imp, y_train)

        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        # One line per parameter combination: mean score and +/- two
        # standard deviations across the folds.
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        # Evaluate the selected model on the held-out rows.
        y_true, y_pred = y_test, clf.predict(X_test_imp)
        print(classification_report(y_true, y_pred))
        print()


# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'kernel': 'rbf', 'C': 1000, 'gamma': 0.001}

Grid scores on development set:

0.622 (+/-0.080) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.001}
0.629 (+/-0.111) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.0001}
0.685 (+/-0.097) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.001}
0.641 (+/-0.078) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.0001}
0.788 (+/-0.119) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.001}
0.678 (+/-0.117) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.0001}
0.815 (+/-0.121) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.001}
0.788 (+/-0.129) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.0001}
0.802 (+/-0.114) for {'C': 1, 'kernel': 'linear'}
0.798 (+/-0.131) for {'C': 10, 'kernel': 'linear'}
0.791 (+/-0.124) for {'C': 100, 'kernel': 'linear'}
0.799 (+/-0.123) for {'C': 1000, 'kernel': 'linear'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed