## Titanic Kaggle Competition

* The objective to to use machine learning models to determine how many surviors there were on the Titanic.

In [1]:
# We will need to import the necessary libraries
import pandas as pd
import numpy as np
from pathlib import Path


In [2]:
# Lets take a look at all the csv files that were downloaded
train_path = pd.read_csv(Path("Data/train.csv"))

In [3]:
train_df = pd.DataFrame(train_path)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Lets take a look at the gender_submission
gender_path = pd.read_csv(Path("Data/gender_submission.csv"))
gender_df = pd.DataFrame(gender_path)
gender_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [5]:
# check for nulls values in the train_df
# since cabin is 77% null should i drop it all together?
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# drop the cabin column
new_df = train_df.drop(columns = "Cabin")
new_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [7]:
# drop the null values
new_df = new_df.dropna()

In [8]:
# make a new data frame with just the survivors in it
surviors_df = new_df["Survived"].to_frame()
surviors_df.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


In [9]:
# Drop the survived column
new_df = new_df.drop(columns = "Survived")

In [10]:
# check the balance of the value counts
surviors_df.value_counts()

Survived
0           424
1           288
dtype: int64

In [11]:
surviors_df.shape

(712, 1)

In [12]:
# We need to convert all the variable in the number format
# we will creat dummies to do so
new_df = pd.get_dummies(new_df)
new_df.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,"Name_Abbing, Mr. Anthony","Name_Abbott, Mr. Rossmore Edward","Name_Abbott, Mrs. Stanton (Rosa Hunt)","Name_Abelson, Mr. Samuel",...,Ticket_SW/PP 751,Ticket_W./C. 14258,Ticket_W./C. 14263,Ticket_W./C. 6608,Ticket_W.E.P. 5734,Ticket_W/C 14208,Ticket_WE/P 5735,Embarked_C,Embarked_Q,Embarked_S
0,1,3,22.0,1,0,7.25,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,1,38.0,1,0,71.2833,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,3,26.0,0,0,7.925,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,1,35.0,1,0,53.1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,3,35.0,0,0,8.05,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
new_df.shape


(712, 1264)

In [14]:
# Split the train data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(new_df, surviors_df, random_state =78)

In [16]:
y_test.shape

(178, 1)

In [17]:
# We now need to scale the training and testing data
# When i do this i will only be fitting the scaler to the training set
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [18]:
# Fit the scaler only to the TRAINING data
x_scaled = scaler.fit(X_train)

In [19]:
# now we can scale the training and testing data
x_train_scaled = x_scaled.transform(X_train)
x_test_scaled = x_scaled.transform(X_test)

In [53]:
# import the model that we are going to use to run our predictions
# I am using the support vector machine 
from sklearn.svm import SVC

model = SVC(kernel='sigmoid', C = 10, random_state = 78)

In [54]:
# Fit the model to the training data X_train_scaled and y_train
model.fit(x_train_scaled, y_train)

  return f(*args, **kwargs)


SVC(C=10, kernel='sigmoid', random_state=78)

In [55]:
# predict this model
predictions = model.predict(X_test)
predictions

array([1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1], dtype=int64)

## Evaluation of the model

In [58]:
from sklearn.metrics import confusion_matrix

In [61]:
# Creat a confusion matrix and classification report
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Predicted Died", "Actual Die"], columns=["Predicted Survive", "Predicted Survive"]
)

In [62]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted Survive,Predicted Survive.1
Predicted Died,8,97
Actual Die,6,67


Classification Report
              precision    recall  f1-score   support

           0       0.57      0.08      0.13       105
           1       0.41      0.92      0.57        73

    accuracy                           0.42       178
   macro avg       0.49      0.50      0.35       178
weighted avg       0.50      0.42      0.31       178



* Based on the results above we can see that our model did poorly.
* It looks to be that out model is not balanced, we can try a few techniques to assist

## First we can try to oversample

In [65]:
from imblearn.over_sampling import SMOTE

In [67]:
X_resampled, y_resampled = SMOTE(random_state = 1, sampling_strategy = 1.0).fit_resample(x_train_scaled, y_train)

In [68]:
# Retrain the model using the resampled data
model_2 = SVC(kernel='sigmoid', C = 10, random_state = 78)
model_2.fit(X_resampled, y_resampled)

  return f(*args, **kwargs)


SVC(C=10, kernel='sigmoid', random_state=78)

In [71]:
# Predict the new model
predictions_2 = model_2.predict(x_test_scaled)
predictions_2

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0], dtype=int64)

In [75]:
# Print out the confusion matrix to see the results
cm_2 = confusion_matrix(y_test, predictions_2)
cm_df_2 = pd.DataFrame(
    cm_2, index=["Actual Died", "Actual Survive"], columns=["Predicted Died", "Predicted Survive"]
)

In [76]:
# Displaying results
print("Confusion Matrix")
display(cm_df_2)
print("Classification Report")
print(classification_report(y_test, predictions_2))

Confusion Matrix


Unnamed: 0,Predicted Died,Predicted Survive
Actual Died,98,7
Actual Survive,28,45


Classification Report
              precision    recall  f1-score   support

           0       0.78      0.93      0.85       105
           1       0.87      0.62      0.72        73

    accuracy                           0.80       178
   macro avg       0.82      0.77      0.78       178
weighted avg       0.81      0.80      0.80       178



* This is a great outcome but lets try one more technique called SelectFromModel
* This will take the top features and run the model using only them

In [77]:
from sklearn.feature_selection import SelectFromModel

In [78]:
model_3 = SelectFromModel(model_2, prefit =True)

In [79]:
new_df_3 = model_3.transform(X_resampled)
x_test_transformed = model_3.transform(x_test_scaled)

ValueError: when `importance_getter=='auto'`, the underlying estimator SVC should have `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to feature selector or call fit before calling transform.