In [1]:
import platform
import matplotlib
if platform.system() == 'Darwin':
    matplotlib.use('TkAgg')

import numpy as np
from sklearn import svm
from sklearn.model_selection import GridSearchCV
import pandas as pd

def normalization(x):
    """Mean-normalise ``x`` along axis 0.

    Each column has its mean subtracted and is then divided by its range
    (``max - min``).

    Parameters
    ----------
    x : array-like
        Numeric data; it is converted to a float ``numpy`` array first.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``x``.

    Notes
    -----
    A column whose values are all equal has a range of zero. Dividing by it
    would produce NaN (0/0), so the range of such a column is replaced by 1
    and the column comes out as all zeros.
    """
    x_ = np.asarray(x, dtype=float)
    center = x_.mean(axis=0)
    span = x_.max(axis=0) - x_.min(axis=0)
    # Guard against division by zero for constant columns.
    safe_span = np.where(span == 0, 1.0, span)
    return (x_ - center) / safe_span

class Data:
    """Train/test passenger tables plus helpers to build model inputs and export predictions.

    The two CSV files are read once in ``__init__``; the other methods derive
    feature matrices, labels, a submission file and a diagnostic histogram
    from those frames.
    """

    # Column names that ``X`` accepts as feature keys.
    FEATURES = ('PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
                'Ticket', 'Fare', 'Cabin', 'Embarked')

    def __init__(self):
        # Both frames get the literal strings "male"/"female" replaced by 0/1.
        self.df_train = self._read("./rawdata/train.csv")
        self.df_test = self._read("./rawdata/test.csv")

    @staticmethod
    def _read(path):
        """Read one CSV file and replace "male"/"female" with 0/1."""
        return pd.read_csv(path).replace("male",0).replace("female",1)

    def X(self, *args):
        """Return normalised feature matrices for the train and test tables.

        Parameters
        ----------
        *args : str
            Column names to use as features; each must be in ``FEATURES``.

        Returns
        -------
        tuple of numpy.ndarray
            ``(X_train, X_test)`` as float arrays, columns in the order of
            ``args``. Each column has the training mean subtracted and is
            divided by the training range (``max - min``).

        Raises
        ------
        ValueError
            If a key is not in ``FEATURES``, or a selected column cannot be
            converted to float.

        Notes
        -----
        * Missing ``Age`` values are filled with the median of the training
          ages in both tables, and the test matrix is scaled with the
          training mean/range. Using the test table's own statistics would
          map the same raw value to different features in train and test.
        * A training column with zero range is scaled by 1 instead, so
          constant columns yield zeros rather than NaN.
        * The stored frames are not modified; repeated calls give the same
          result.
        """
        for key in args:
            if key not in self.FEATURES:
                raise ValueError('{0} is invalid key'.format(key))

        columns = list(args)
        age_median = self.df_train["Age"].median()

        # ``assign`` returns a new frame, so the stored tables stay untouched
        # and no chained in-place ``fillna`` is needed.
        train = self.df_train.assign(Age=self.df_train["Age"].fillna(age_median))[columns]
        test = self.df_test.assign(Age=self.df_test["Age"].fillna(age_median))[columns]

        # Explicit float conversion: columns holding 0/1 after ``replace`` may
        # still have object dtype.
        train_values = train.to_numpy(dtype=float)
        test_values = test.to_numpy(dtype=float)

        center = train_values.mean(axis=0)
        span = train_values.max(axis=0) - train_values.min(axis=0)
        span = np.where(span == 0, 1.0, span)  # avoid 0/0 on constant columns

        return (train_values - center) / span, (test_values - center) / span

    def y(self):
        """Return the training ``Survived`` column as an integer array."""
        return self.df_train["Survived"].values.astype('int')

    def result2csv(self, predictedLabel, filename='result.csv'):
        """Write ``PassengerId``/``Survived`` for the test table to ``./result/<filename>``.

        Parameters
        ----------
        predictedLabel : array-like
            One predicted label per row of the test table.
        filename : str, optional
            File name inside ``./result/`` (default ``'result.csv'``).

        The predictions are also stored on ``self.df_test`` as a ``Survived``
        column. The output directory is created when it does not exist yet.
        """
        import os  # function-scope import keeps the class self-contained

        target = "./result/{0}".format(filename)
        os.makedirs(os.path.dirname(target), exist_ok=True)

        self.df_test["Survived"] = predictedLabel
        self.df_test[["PassengerId", "Survived"]].to_csv(target, index=False)

    def grade_hist(self):
        """Draw a stacked 3-bin histogram of ``Pclass`` split by ``Survived`` (0, then 1)."""
        # ``plt`` was never imported at module level, so the import lives here.
        # Importing lazily also means it runs after the module-level
        # ``matplotlib.use(...)`` backend selection.
        import matplotlib.pyplot as plt

        groups = [self.df_train[self.df_train.Survived == survived] for survived in [0, 1]]
        pclass_by_group = [group["Pclass"].dropna() for group in groups]
        plt.hist(pclass_by_group, histtype="barstacked", bins=3)

# Columns fed to the classifier, in matrix column order.
FEATURE_COLUMNS = ('Age', 'Sex', 'Parch', 'Pclass')

# Load both CSV tables, then derive the model inputs and training labels.
# (Call data.grade_hist() at this point to look at the Pclass histogram.)
data = Data()
X_train, X_test = data.X(*FEATURE_COLUMNS)
y_train = data.y()


In [2]:
# Candidate values shared by the per-kernel grids below.
C_GRID = [1, 10, 100, 1000]
GAMMA_GRID = [0.001, 0.0001]

# One grid per SVC kernel; every dict gets its own copy of the shared lists.
tuned_parameters = [
    dict(C=list(C_GRID), kernel=['linear']),
    dict(C=list(C_GRID), kernel=['rbf'], gamma=list(GAMMA_GRID)),
    dict(C=list(C_GRID), kernel=['poly'], degree=[2, 3, 4], gamma=list(GAMMA_GRID)),
    dict(C=list(C_GRID), kernel=['sigmoid'], gamma=list(GAMMA_GRID)),
]

# 5-fold cross-validated search scored by accuracy, run on two workers.
search_options = dict(cv=5, scoring='accuracy', verbose=True, n_jobs=2)
classifier = GridSearchCV(svm.SVC(), tuned_parameters, **search_options)
classifier.fit(X_train, y_train)
print(classifier.cv_results_)

Fitting 5 folds for each of 44 candidates, totalling 220 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'mean_fit_time': array([0.0036118 , 0.00378151, 0.0066865 , 1.74308844, 0.00783949,
       0.00768943, 0.00694885, 0.00777178, 0.00658021, 0.00693607,
       0.01749673, 0.01031556, 0.00518541, 0.0049643 , 0.0045054 ,
       0.0044178 , 0.00458617, 0.00426536, 0.00474467, 0.00427709,
       0.00409474, 0.00451598, 0.0049623 , 0.00430055, 0.00596595,
       0.00555778, 0.00521545, 0.00508761, 0.00766983, 0.00518761,
       0.0064054 , 0.00496035, 0.00498834, 0.00420904, 0.005514  ,
       0.00573506, 0.00667381, 0.00609279, 0.00651999, 0.00584822,
       0.00445795, 0.00576992, 0.00469894, 0.00480056]), 'std_fit_time': array([4.42186217e-04, 3.96678748e-04, 1.53234893e-03, 6.98055376e-01,
       5.72574643e-04, 3.66154911e-04, 3.42159971e-04, 3.53281355e-04,
       5.80087457e-04, 3.30507561e-04, 1.71348499e-03, 1.37322151e-03,
       5.24134285e-04, 1.45694231e-03, 5.60348730e-04, 5.27651280e-04,
       6.60529530e-04, 4.37045966e-04, 1.19609532e-03, 2.92775903e-04,
       1.01280100e

[Parallel(n_jobs=2)]: Done 220 out of 220 | elapsed:    6.9s finished


In [3]:
# Take the best estimator found by the grid search.
# Alternative: build svm.SVC(C=1, kernel='linear') and fit it on
# (X_train, y_train) to bypass the search.
estimator = classifier.best_estimator_

# Predict labels for the test matrix and write them out via the Data helper.
y_test = estimator.predict(X_test)
data.result2csv(predictedLabel=y_test)
