# Titanic Survival Classification


In [1]:
# imports
import  numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# check if dataset exist
!ls

README.md           titanic-dataset.csv titanic-svm.pdf
requirements.txt    titanic-svm.ipynb


In [3]:
df = pd.read_csv("./titanic-dataset.csv")

print("Dataset shape: ", df.shape)
df.head()

Dataset shape:  (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Data Preparation

Check the distribution of Survived

In [4]:
df['Survived'].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

Drop irrelevant variables, make sex a dummy variable

In [5]:
df.drop(['PassengerId','Name','Ticket','Fare','Embarked'],axis=1, inplace=True)
df.loc[df['Sex']=='male','Sex']=1
df.loc[df['Sex']=='female','Sex']=0

Check the percentage of null data 

In [6]:
((df.isnull().sum())/len(df))*100

Survived     0.000000
Pclass       0.000000
Sex          0.000000
Age         19.865320
SibSp        0.000000
Parch        0.000000
Cabin       77.104377
dtype: float64

In [7]:
df.drop('Cabin', axis=1,inplace=True)
df['Age'].fillna(df['Age'].mean(), inplace = True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 41.9+ KB


# Model

In [9]:
X = df.drop('Survived', axis=1)  # features
y = df['Survived']  # labels

X.shape, y.shape

((891, 5), (891,))

## Train, test split

In [10]:
from sklearn.model_selection import train_test_split

# random_state = random seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)

X_train.shape, X_test.shape

((801, 5), (90, 5))

## Data Standardization

In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

scaled_X_train= scaler.transform(X_train)
scaled_X_test= scaler.transform(X_test)

## SVM Model

We are applying soft SVM here, with default C value and linear kernel

In [12]:
from sklearn.svm import SVC

model = SVC(kernel='linear')
model.fit(scaled_X_train, y_train)

In [13]:
y_pred = model.predict(scaled_X_test)

In [14]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.90      0.81        51
           1       0.82      0.59      0.69        39

    accuracy                           0.77        90
   macro avg       0.78      0.75      0.75        90
weighted avg       0.78      0.77      0.76        90



### Hyperparameter Tuning

So far we used default values for hyperparameters, which is C.

In [15]:
# This gives us the implementation details and the parameters.

help(SVC)

Help on class SVC in module sklearn.svm._classes:

class SVC(sklearn.svm._base.BaseSVC)
 |  SVC(*, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None)
 |  
 |  C-Support Vector Classification.
 |  
 |  The implementation is based on libsvm. The fit time scales at least
 |  quadratically with the number of samples and may be impractical
 |  beyond tens of thousands of samples. For large datasets
 |  consider using :class:`~sklearn.svm.LinearSVC` or
 |  :class:`~sklearn.linear_model.SGDClassifier` instead, possibly after a
 |  :class:`~sklearn.kernel_approximation.Nystroem` transformer or
 |  other :ref:`kernel_approximation`.
 |  
 |  The multiclass support is handled according to a one-vs-one scheme.
 |  
 |  For details on the precise mathematical formulation of the provided
 |  kernel functions and how `g

In [16]:
from sklearn.model_selection import GridSearchCV

# in order to filter some sklearn warnings
import warnings
warnings.filterwarnings('ignore') 

svm = SVC(max_iter=500)
param_grid = {'C':[0.01, 0.1, 1, 10], 'gamma': [1, 0.1, 0.01, 0.001],
             'kernel': ['linear', 'rbf']}
grid = GridSearchCV(svm, param_grid)
grid.fit(scaled_X_train, y_train)

grid.best_params_

{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}

In [17]:
y_pred_grid = grid.predict(scaled_X_test)
print(classification_report(y_test, y_pred_grid))

              precision    recall  f1-score   support

           0       0.79      0.94      0.86        51
           1       0.90      0.67      0.76        39

    accuracy                           0.82        90
   macro avg       0.84      0.80      0.81        90
weighted avg       0.83      0.82      0.82        90

