In [74]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [75]:
df = pd.read_csv('train.csv')

In [76]:
df = df[['Survived','Pclass','Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

In [77]:
df = df.dropna(axis=0, how='any')

In [78]:
y = df['Survived']

In [79]:
X = df[['Pclass','Sex', 'Age', 'SibSp', 'Parch','Fare', 'Embarked']]

In [80]:
from sklearn.preprocessing import LabelEncoder
X = np.array(X)

In [81]:
le_1 = LabelEncoder()

In [82]:
X[:, 1] = le_1.fit_transform(X[:, 1])

In [83]:
le_2 = LabelEncoder()
X[:, 6] = le_2.fit_transform(X[:, 6])

In [84]:
X = pd.DataFrame(data=X, columns=['Pclass','Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])

In [85]:
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
from sklearn.model_selection import train_test_split

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [105]:
sc = StandardScaler()

In [106]:
sc.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [107]:
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Evaluation Function 

In [108]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [109]:
def print_score(clf, X_train, X_test, y_train, y_test, train = True):
#     print accuracy score, classification report, confusion metrics
    if train:
#         training performance
        print('Train Result : \n')
        print('Accuracy Score {0:.4f}\n'.format(accuracy_score(y_train, clf.predict(X_train))))
        print('Classification Report : \n {} \n'.format(classification_report(y_train, clf.predict(X_train))))
        print('Confusion Metrics : \n {} \n'.format(confusion_matrix(y_train, clf.predict(X_train))))
        
        res = cross_val_score(clf, X_train, y_train, cv = 10, scoring='accuracy')
        print('Average Accuracy : {0:.4f}\n'.format(np.mean(res)))
        print('Accuracy SD : {0:.4f}\n'.format(np.std(res)))
        
    elif train == False:
#         test performance
        print('Test Result : \n')
        print('Accuracy Score {0:.4f}\n'.format(accuracy_score(y_test, clf.predict(X_test))))
        print('Classification Report : \n {}\n'.format(classification_report(y_test, clf.predict(X_test))))
        print('Confusion Metrics : \n {} \n'.format(confusion_matrix(y_test, clf.predict(X_test))))


## SVM

In [110]:
from sklearn.svm import SVC

In [129]:
clf = SVC(C = 5, degree=2, cache_size=100)

In [130]:
clf.fit(X_train, y_train)

SVC(C=5, cache_size=100, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=2, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [131]:
print_score(clf, X_train, X_test, y_train, y_test, train=True)
print_score(clf, X_train, X_test, y_train, y_test, train=False)

Train Result : 

Accuracy Score 0.8418

Classification Report : 
              precision    recall  f1-score   support

          0       0.84      0.92      0.88       344
          1       0.85      0.72      0.78       225

avg / total       0.84      0.84      0.84       569
 

Confusion Metrics : 
 [[316  28]
 [ 62 163]] 

Average Accuracy : 0.8138

Accuracy SD : 0.0531

Test Result : 

Accuracy Score 0.8112

Classification Report : 
              precision    recall  f1-score   support

          0       0.80      0.89      0.84        80
          1       0.83      0.71      0.77        63

avg / total       0.81      0.81      0.81       143


Confusion Metrics : 
 [[71  9]
 [18 45]] 



## Using grid search

In [27]:
from sklearn.model_selection import GridSearchCV

In [28]:
params = {'C': [0.001, 0.01, 0.1, 1, 2, 3],
          'cache_size': [100, 200, 400, 500],
          'kernel': ['rbf', 'linear', 'poly'],
          'degree': range(1, 5)}

In [29]:
grid_cv = GridSearchCV(clf, param_grid=params, n_jobs=-1)

In [30]:
grid_cv.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 2, 3], 'cache_size': [100, 200, 400, 500], 'kernel': ['rbf', 'linear', 'poly'], 'degree': range(1, 5)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [31]:
print_score(grid_cv, X_train, X_test, y_train, y_test, train=True)
print_score(grid_cv, X_train, X_test, y_train, y_test, train=False)

Train Result : 

Accuracy Score 0.8418

Classification Report : 
              precision    recall  f1-score   support

          0       0.82      0.94      0.88       344
          1       0.88      0.69      0.78       225

avg / total       0.85      0.84      0.84       569
 

Confusion Metrics : 
 [[323  21]
 [ 69 156]] 

Average Accuracy : 0.7996

Accuracy SD : 0.0438

Test Result : 

Accuracy Score 0.8252

Classification Report : 
              precision    recall  f1-score   support

          0       0.80      0.91      0.85        80
          1       0.87      0.71      0.78        63

avg / total       0.83      0.83      0.82       143


Confusion Metrics : 
 [[73  7]
 [18 45]] 



In [32]:
grid_cv.best_estimator_

SVC(C=1, cache_size=100, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=1, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)