In [1]:
import pandas as p
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import numpy as np
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

In [2]:
my_data = p.read_csv('train.csv')
my_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


First we start by reading the data in the train file, and we print the head to get to know the columns of the dataset and their types.

In [3]:
def preprocessing(my_data):
    encode = LabelEncoder()
    my_data['Name'] = encode.fit_transform(my_data['Name'])
    my_data['Sex'] = encode.fit_transform(my_data['Sex'])
    my_data['Ticket'] = encode.fit_transform(my_data['Ticket'])
    my_data['Cabin'] = encode.fit_transform(my_data['Cabin'].astype(str))
    my_data['Embarked'] = encode.fit_transform(my_data['Embarked'].astype(str))

From the head we realized that we had string values, we are going to build support vector machine model which works on numeric values, so we need to perform some preprocessing and encode these string values to be able to look at them as numeric values.
It is also taken into consideration that the values of cabin and embarked columns have a mix of numbers and characters so we need to convert them to string before we perform encoding.

In [4]:
preprocessing(my_data)
Features = my_data[['PassengerId','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']]
Target = my_data[['Survived']]
print(Features)
print(Target)

     PassengerId  Pclass  Name  Sex   Age  SibSp  Parch  Ticket     Fare  \
0              1       3   108    1  22.0      1      0     523   7.2500   
1              2       1   190    0  38.0      1      0     596  71.2833   
2              3       3   353    0  26.0      0      0     669   7.9250   
3              4       1   272    0  35.0      1      0      49  53.1000   
4              5       3    15    1  35.0      0      0     472   8.0500   
..           ...     ...   ...  ...   ...    ...    ...     ...      ...   
886          887       2   548    1  27.0      0      0     101  13.0000   
887          888       1   303    0  19.0      0      0      14  30.0000   
888          889       3   413    0   NaN      1      2     675  23.4500   
889          890       1    81    1  26.0      0      0       8  30.0000   
890          891       3   220    1  32.0      0      0     466   7.7500   

     Cabin  Embarked  
0      147         2  
1       81         0  
2      147        

From the problem description we knew that our mission is to classify data into survived and didn't survive and that means that Survived is our target while the remaining 11 columns are features, that's why we are going to separate them.
Also columns in Features have empty and infinite values so we will use a function from numpy to replace them with finite values.

In [5]:
Features = np.nan_to_num(Features)
model = LogisticRegression(solver ='liblinear') # We used liblinear as it applies Automatic parameter selection
rfe = RFE(model, 9)
fit = rfe.fit(Features,np.ravel(Target))
print("Selected Features: %s" % (fit.support_))

Selected Features: [False  True False  True  True  True  True  True  True  True  True]


We have large number of features, and some of them maybe weak or irrelevant so we will use something called feature importance scoring, to be able to perform feature reduction without affecting the accuracy of our model.
We used logistic regression to get feature importance scoring and we said that we wanted to pick the best 9 features, logistic regression compares between features based on the value of coef_ attribute after fitting the model.
After printing the selected features, we found that the weakest two features were Name and PassengerId so we decided to remove them by redifing Features as the selected 9 columns from our data_set.

In [47]:
Features = my_data[['Pclass','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']]
Features = np.nan_to_num(Features)
print(Features)

[[  3.       1.      22.     ...   7.25   147.       2.    ]
 [  1.       0.      38.     ...  71.2833  81.       0.    ]
 [  3.       0.      26.     ...   7.925  147.       2.    ]
 ...
 [  3.       0.       0.     ...  23.45   147.       2.    ]
 [  1.       1.      26.     ...  30.      60.       0.    ]
 [  3.       1.      32.     ...   7.75   147.       1.    ]]


After that we are going to split our data 80-20 and use 80% for training and 20% for validation, this help us evaluate our model and see if we want to tune hyperparameters to improve the performance of our model, by calculating it's accuracy after comparing model predictions with the actual classification.
Our model is support vector machine which tries to find a hyperplane that separates the points that belong to the two classes, survived and didn't survive.

In [7]:
X_train, X_test, y_train, y_test = train_test_split(Features,np.ravel(Target), test_size = 0.20)
clf = SVC() 
clf.fit(X_train, y_train)

SVC()

After training our model, we are going to evaluate it by using a confusion matrix, the matrix has 4 cells, (0,0) represent predicted no(0) and actual no(0), (0,1) represent predicted yes(1) and actual no(0), (1,0) represent predicted no(0) and actual yes(1) and at last (1,1) represent predicted yes(1) and actual yes(1).

In [8]:
predictions = clf.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

[[89 10]
 [53 27]]
              precision    recall  f1-score   support

           0       0.63      0.90      0.74        99
           1       0.73      0.34      0.46        80

    accuracy                           0.65       179
   macro avg       0.68      0.62      0.60       179
weighted avg       0.67      0.65      0.61       179



Then we are going to use parameter grid and cross_validation to get the best hyperparameters for our model.
In SVM we have several hyperparameters including the kernel which determines the shape of hyperplane, C whose value determines a tradeoff between maximizing correct classification and maximizing margin around hyperplane to ensure correct classification of unseen data if they were near the hyperplane, also gamma is a hyperparameter for non-linear kernels defines how far the influence of a single training example reaches.

In [9]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],'gamma': [1, 0.1, 0.01, 0.001, 0.0001],'kernel': ['sigmoid','rbf']}
grid = GridSearchCV(SVC(), param_grid , refit = True, verbose = 3,cv = 5)
model = grid.fit(X_train, y_train)
model.best_params_

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] C=0.1, gamma=1, kernel=sigmoid ..................................
[CV] ...... C=0.1, gamma=1, kernel=sigmoid, score=0.629, total=   0.0s
[CV] C=0.1, gamma=1, kernel=sigmoid ..................................
[CV] ...... C=0.1, gamma=1, kernel=sigmoid, score=0.629, total=   0.0s
[CV] C=0.1, gamma=1, kernel=sigmoid ..................................
[CV] ...... C=0.1, gamma=1, kernel=sigmoid, score=0.634, total=   0.0s
[CV] C=0.1, gamma=1, kernel=sigmoid ..................................
[CV] ...... C=0.1, gamma=1, kernel=sigmoid, score=0.634, total=   0.0s
[CV] C=0.1, gamma=1, kernel=sigmoid ..................................
[CV] ...... C=0.1, gamma=1, kernel=sigmoid, score=0.634, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.629, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] ..........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] .... C=0.1, gamma=0.1, kernel=sigmoid, score=0.629, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=sigmoid ................................
[CV] .... C=0.1, gamma=0.1, kernel=sigmoid, score=0.629, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=sigmoid ................................
[CV] .... C=0.1, gamma=0.1, kernel=sigmoid, score=0.634, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=sigmoid ................................
[CV] .... C=0.1, gamma=0.1, kernel=sigmoid, score=0.634, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=sigmoid ................................
[CV] .... C=0.1, gamma=0.1, kernel=sigmoid, score=0.634, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.629, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.629, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .

[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:    4.1s finished


{'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}

In [10]:
model.best_estimator_

SVC(C=1000, gamma=0.0001)

In [11]:
predictions = model.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

[[86 13]
 [36 44]]
              precision    recall  f1-score   support

           0       0.70      0.87      0.78        99
           1       0.77      0.55      0.64        80

    accuracy                           0.73       179
   macro avg       0.74      0.71      0.71       179
weighted avg       0.73      0.73      0.72       179



We have tried non-linear kernels and found out that the best among them was rbf with C = 1000 and gamma = 0.0001
now, we will try a linear kernel and try to find the best C

In [46]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],'kernel': ['linear']}
grid = GridSearchCV(SVC(), param_grid , refit = True, verbose = 3,cv = 5)
model = grid.fit(X_train, y_train)
model.best_params_

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................ C=0.1, kernel=linear, score=0.797, total=   4.0s
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.9s remaining:    0.0s


[CV] ................ C=0.1, kernel=linear, score=0.776, total=   3.1s
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.0s remaining:    0.0s


[CV] ................ C=0.1, kernel=linear, score=0.768, total=   7.3s
[CV] C=0.1, kernel=linear ............................................
[CV] ................ C=0.1, kernel=linear, score=0.782, total=   3.1s
[CV] C=0.1, kernel=linear ............................................
[CV] ................ C=0.1, kernel=linear, score=0.739, total=   7.3s
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.797, total=   8.5s
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.783, total=  10.6s
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.768, total=  14.5s
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.782, total=  11.6s
[CV] C=1, kernel=linear ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  7.6min finished


{'C': 10, 'kernel': 'linear'}

In [17]:
model.best_estimator_

SVC(C=1, kernel='linear')

In [43]:
predictions = model.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

[[100   8]
 [ 21  50]]
              precision    recall  f1-score   support

           0       0.83      0.93      0.87       108
           1       0.86      0.70      0.78        71

    accuracy                           0.84       179
   macro avg       0.84      0.82      0.82       179
weighted avg       0.84      0.84      0.83       179



After trials we found out that the best hyperparameters are linear kernel and C = 1, which gave an accuracy 0f 84%

In the last step, we read data from test file, and we perform preprocessing on test data, then we use our trained model to predict the class, we store our value in csv file to be submitted to kaggle.

In [45]:
test = p.read_csv('test.csv')
preprocessing(test)
Test = test[['Pclass','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']]
Test = np.nan_to_num(Test)
prediction = model.predict(Test)
print(prediction)
submission = p.DataFrame({'PassengerId':test['PassengerId'],'Survived':prediction})
filename = 'Titanic competition.csv'
submission.to_csv(filename,index=False)
print('Saved file: ' + filename)

[0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 1 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]
Saved file: Titanic competition.csv
