In [1]:
import os, json, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
sample_submissions = pd.read_csv('gender_submission.csv')
sample_submissions.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [3]:
data = pd.read_csv('train.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
def dummy_var(df, var):
    for sv in set(transformed[var]):
        transformed[var + '_' + sv] = [1 if v == sv else 0 for v in transformed[var]]
    transformed.drop(var, axis=1, inplace=True)

transformed = data.copy()

transformed['Title'] = [i.split(',')[-1].strip().split(' ')[0] for i in transformed['Name']]
transformed['Cabin_Level'] = [i[0] if type(i) == str else 'N' for i in transformed['Cabin']]
transformed['Embarked'] = [i if type(i) == str else 'N' for i in transformed['Embarked']]
transformed['Age'].fillna(np.mean(transformed['Age']), inplace=True)

transformed.drop('Name', axis=1, inplace=True)
transformed.drop('Cabin', axis=1, inplace=True)
transformed.drop('Ticket', axis=1, inplace=True)

dummy_var(transformed, 'Title')
dummy_var(transformed, 'Sex')
dummy_var(transformed, 'Cabin_Level')
dummy_var(transformed, 'Embarked')

transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 39 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PassengerId      891 non-null    int64  
 1   Survived         891 non-null    int64  
 2   Pclass           891 non-null    int64  
 3   Age              891 non-null    float64
 4   SibSp            891 non-null    int64  
 5   Parch            891 non-null    int64  
 6   Fare             891 non-null    float64
 7   Title_Don.       891 non-null    int64  
 8   Title_Jonkheer.  891 non-null    int64  
 9   Title_Mr.        891 non-null    int64  
 10  Title_Miss.      891 non-null    int64  
 11  Title_Lady.      891 non-null    int64  
 12  Title_Mlle.      891 non-null    int64  
 13  Title_Col.       891 non-null    int64  
 14  Title_Capt.      891 non-null    int64  
 15  Title_Master.    891 non-null    int64  
 16  Title_Dr.        891 non-null    int64  
 17  Title_Mme.      

In [5]:
n = transformed.shape[0]
n_survived = sum(transformed['Survived'])
pct_survived = n_survived*100/n
print(pct_survived)

labels = ['Survived']
variables = [col for col in transformed.columns if col not in labels]
print(labels)

38.38383838383838
['Survived']


In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
     transformed[variables], transformed['Survived'], test_size=0.2, 
     random_state=0, shuffle=True, stratify=transformed[labels]
)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((712, 38), (712,), (179, 38), (179,))

In [7]:
sum(y_test)*100/179, sum(y_train)*100/712

(38.547486033519554, 38.342696629213485)

In [8]:
param_grid = {
    'C': [0.1,1, 10, 100], 
    'gamma': [1,0.1,0.01,0.001],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}
param_grid

{'C': [0.1, 1, 10, 100],
 'gamma': [1, 0.1, 0.01, 0.001],
 'kernel': ['linear', 'rbf', 'poly', 'sigmoid']}

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
svc = SVC()
gridsearchcv = GridSearchCV(svc, param_grid, scoring='accuracy', n_jobs=-1, verbose=10, cv=5)
gridsearchcv.fit(X_train, y_train)
y_pred = gridsearchcv.predict(X_test)
y_train_pred = gridsearchcv.predict(X_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   18.5s


In [None]:
from sklearn.metrics import plot_confusion_matrix
disp = plot_confusion_matrix(svc, X_test, y_test, display_labels=['Survived', 'Not Survived'],cmap=plt.cm.Blues)
print(disp.confusion_matrix)
plt.show()

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_true, y_pred)