In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, accuracy_score
import pandas as pd
import numpy as np
import random as rnd
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from IPython.display import Image 
from StringIO import StringIO
import pydotplus
import matplotlib.pyplot as plt
import time
%matplotlib inline

In [2]:
def timeit(method):
    """Decorator that measures the wall-clock run time of ``method``.

    Every call of the decorated function is timed with ``time.time()``.

    * If the call carries a ``log_time`` keyword argument, it is used as a
      mapping and the elapsed time, truncated to whole milliseconds, is
      stored in it. The key is the ``log_name`` keyword argument when given,
      otherwise the upper-cased name of ``method``.
    * Otherwise the function name and the elapsed milliseconds are printed.

    ``log_time`` and ``log_name`` are not stripped from the call: they are
    forwarded to ``method`` together with all other arguments, so a function
    that should be timed this way has to accept them (e.g. via ``**kwargs``).

    Returns the wrapper, which itself returns whatever ``method`` returns.
    """
    # Imported here so the decorator does not depend on the notebook's import cell.
    import functools

    # Without wraps() every decorated function would be called 'timed' and
    # lose its docstring.
    @functools.wraps(method)
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        elapsed_ms = (te - ts) * 1000

        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int(elapsed_ms)
        else:
            print('%r  %2.2f ms' % (method.__name__, elapsed_ms))

        return result

    return timed

In [3]:
def titanic(train, test):
    """Load the Titanic train/test CSV files and engineer the model features.

    Parameters
    ----------
    train, test : path or file-like
        Passed straight to ``pd.read_csv``. Both files need the columns
        Ticket, Cabin, Name, Sex, Pclass, Age, SibSp, Parch and Embarked;
        the train file also needs PassengerId and the test file Fare.

    Returns
    -------
    (train_df, test_df) : tuple of DataFrame
        Both frames with Ticket, Cabin and Name removed (PassengerId is
        removed from the train frame only) and with:

        * ``Title``      - honorific taken from Name, encoded 1-5 (0 = unknown)
        * ``Sex``        - 1 for female, 0 for male
        * ``Age``        - missing values imputed, then truncated to int
        * ``FamilySize`` - SibSp + Parch + 1
        * ``IsAlone``    - 1 when FamilySize == 1, else 0
        * ``Embarked``   - missing values replaced by the most frequent
          port of the train frame
        * ``Fare``       - (test frame only) missing values replaced by the
          test frame's median fare

    Raises
    ------
    ValueError
        If ages have to be imputed but the frame holds no known age at all,
        or if a row with a missing age has a Sex/Pclass combination outside
        the six that are imputed (the final integer cast fails on NaN).
    """
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                   'Rev', 'Sir', 'Jonkheer', 'Dona']
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

    train_df = pd.read_csv(train).drop(['Ticket', 'Cabin'], axis=1)
    test_df = pd.read_csv(test).drop(['Ticket', 'Cabin'], axis=1)

    for dataset in (train_df, test_df):
        # The title is the word directly in front of the first period in the
        # name. Raw string: '\.' is a regex escape, not a Python escape.
        title = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
        title = title.replace(rare_titles, 'Rare')
        title = title.replace('Mlle', 'Miss')
        title = title.replace('Ms', 'Miss')
        title = title.replace('Mme', 'Mrs')
        # Titles missing from the mapping become 0.
        dataset['Title'] = title.map(title_mapping).fillna(0)

    train_df = train_df.drop(['Name', 'PassengerId'], axis=1)
    test_df = test_df.drop(['Name'], axis=1)
    combine = [train_df, test_df]

    for dataset in combine:
        dataset['Sex'] = dataset['Sex'].map({'female': 1, 'male': 0}).astype(int)

        # Impute a missing age with the median age of the passengers of the
        # same sex and class within the same frame.
        overall_median = dataset['Age'].median()
        for sex in (0, 1):
            for pclass in (1, 2, 3):
                in_group = (dataset['Sex'] == sex) & (dataset['Pclass'] == pclass)
                missing = in_group & dataset['Age'].isnull()
                if not missing.any():
                    # Nothing to fill, so an empty group cannot cause an error.
                    continue
                age_guess = dataset.loc[in_group, 'Age'].median()
                if pd.isnull(age_guess):
                    # No known age in this group: use the frame-wide median.
                    age_guess = overall_median
                # Round the guess to the nearest half year.
                dataset.loc[missing, 'Age'] = int(age_guess / 0.5 + 0.5) * 0.5

        dataset['Age'] = dataset['Age'].astype(int)

        dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
        dataset['IsAlone'] = 0
        dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

    # The most frequent port is learned from the train frame only and then
    # applied to both frames.
    freq_port = train_df['Embarked'].dropna().mode()[0]
    for dataset in combine:
        dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)

    # Assign the result instead of calling fillna(inplace=True) on a column
    # selection, which may operate on a temporary copy.
    test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

    return (train_df, test_df)
# Build the feature frames from the CSV files that sit next to the notebook.
train_df, test_df = titanic(train="./train.csv", test="./test.csv")

In [4]:
def preprocess(train_df, test_df):
    """One-hot encode the ``Embarked`` column of both frames.

    Only ``Embarked`` is encoded; every other column is passed through
    unchanged. The set of ports is collected from both frames together, so
    the train and test results always carry the same ``Embarked_<port>``
    columns in the same order, even when a port is absent from one frame.

    Parameters
    ----------
    train_df, test_df : DataFrame
        Frames that contain an ``Embarked`` column. They are not modified.

    Returns
    -------
    (train_df, test_df) : tuple of DataFrame
        New frames in which ``Embarked`` is replaced by one indicator column
        per port.
    """
    # Shared, sorted category list so both frames get identical columns.
    ports = sorted(set(train_df['Embarked'].dropna()) |
                   set(test_df['Embarked'].dropna()))

    def _encode(frame):
        # A categorical column makes get_dummies emit a column for every
        # port, including ports that do not occur in this particular frame.
        frame = frame.assign(
            Embarked=pd.Categorical(frame['Embarked'], categories=ports))
        # 'columns=' must be given by keyword: the second positional
        # parameter of get_dummies is 'prefix', not 'columns'.
        return pd.get_dummies(frame, columns=['Embarked'])

    return _encode(train_df), _encode(test_df)

In [5]:
# Replace both frames with their one-hot encoded versions and preview the
# first five rows of the training frame.
train_df, test_df = preprocess(train_df=train_df, test_df=test_df)
train_df.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Title,FamilySize,IsAlone,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,22,1,0,7.25,1,2,0,0,0,1
1,1,1,1,38,1,0,71.2833,3,2,0,1,0,0
2,1,3,1,26,0,0,7.925,2,1,1,0,0,1
3,1,1,1,35,1,0,53.1,3,2,0,0,0,1
4,0,3,0,35,0,0,8.05,1,1,1,0,0,1


In [6]:
# Hold out the tail of the labelled data for evaluation: the first TRAIN_ROWS
# rows are used for fitting, all remaining rows for scoring.
TRAIN_ROWS = 713

# Pick the label by name instead of by position, so the split does not
# silently break if 'Survived' stops being the first column.
features = train_df.drop('Survived', axis=1)
target = train_df['Survived']

x_train, y_train = features.iloc[:TRAIN_ROWS], target.iloc[:TRAIN_ROWS]
x_test, y_test = features.iloc[TRAIN_ROWS:], target.iloc[TRAIN_ROWS:]

In [7]:
@timeit
def hyperparameters(model, parameters, train, test):
    """Run an accuracy-scored grid search over ``parameters`` for ``model``.

    Note the argument roles: ``train`` and ``test`` are handed to
    ``GridSearchCV.fit`` as its first and second argument, i.e. as the
    feature matrix and the target vector.

    Returns whatever ``GridSearchCV.fit`` returns.
    """
    accuracy = make_scorer(accuracy_score)
    search = GridSearchCV(estimator=model, param_grid=parameters, scoring=accuracy)
    return search.fit(train, test)

In [8]:
@timeit
def classifier(C, kernel, gamma = 'auto', degree = 3, coef0 = 0.0):
    """Build an unfitted ``SVC`` for the requested kernel.

    Only the hyperparameters that are passed on for the chosen kernel are
    forwarded; the others are ignored:

    * ``'linear'``  - C
    * ``'poly'``    - C, gamma, degree, coef0
    * ``'rbf'``     - C, gamma
    * ``'sigmoid'`` - C, gamma, coef0

    Raises
    ------
    ValueError
        If ``kernel`` is not one of the four names above. Previously such a
        call returned ``None`` and only failed later, at ``.fit``.
    """
    if kernel == 'linear':
        return SVC(C = C, kernel = kernel)
    elif kernel == 'poly':
        return SVC(C = C, kernel = kernel, gamma = gamma, degree = degree, coef0 = coef0)
    elif kernel == 'rbf':
        return SVC(C = C, kernel = kernel, gamma = gamma)
    elif kernel == 'sigmoid':
        return SVC(C = C, kernel = kernel, gamma = gamma, coef0 = coef0)
    # Fail loudly instead of handing the caller a None "model".
    raise ValueError(
        "Unsupported kernel %r; expected 'linear', 'poly', 'rbf' or 'sigmoid'"
        % (kernel,))

In [9]:
# Linear-kernel SVC with C = 0.01: fit on the training split, then report the
# accuracy on the held-out split as the cell's output.
clf = classifier(kernel='linear', C=0.01)
clf.fit(x_train, y_train)
prediction = clf.predict(x_test)
accuracy_score(y_true=y_test, y_pred=prediction)

'classifier'  0.09 ms


0.8539325842696629