# Titanic - Predict survival!

Dataset from Kaggle https://www.kaggle.com/competitions/titanic/overview


In [70]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

In [71]:
# Load the Kaggle training split.
# The folder can be overridden with the TITANIC_DATA_DIR environment variable so the
# notebook is not tied to one machine; the default is the original local path.
data_dir = os.environ.get("TITANIC_DATA_DIR", "C:/Users/Home/Documents/Titanic")
dataset = pd.read_csv(f"{data_dir}/train.csv")
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [72]:
# Summary statistics for the numeric columns of the training data.
# No impossible values stand out (e.g. nobody is 130 years old; Age tops out at 80).
dataset.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [73]:
# Names of the training columns that contain at least one missing value
dataset.columns[dataset.isna().any(axis=0)]

Index(['Age', 'Cabin', 'Embarked'], dtype='object')

In [104]:
# Fraction of training rows with no Age: almost 20%
dataset["Age"].isna().sum() / len(dataset)

0.19865319865319866

In [105]:
# Fraction of training rows with no Cabin: 77% of rows!
dataset["Cabin"].isna().sum() / len(dataset)

0.7710437710437711

In [106]:
# Fraction of training rows with no Embarked value: only about 0.2%
dataset["Embarked"].isna().sum() / len(dataset)

0.002244668911335578

In [75]:
# Load the Kaggle test split (same columns as the training data, minus Survived).
# The folder can be overridden with the TITANIC_DATA_DIR environment variable so the
# notebook is not tied to one machine; the default is the original local path.
data_dir = os.environ.get("TITANIC_DATA_DIR", "C:/Users/Home/Documents/Titanic")
testset = pd.read_csv(f"{data_dir}/test.csv")
testset.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [99]:
# Names of the test columns that contain at least one missing value
testset.columns[testset.isna().any(axis=0)]

Index(['Age', 'Fare', 'Cabin'], dtype='object')

In [101]:
# Fraction of test rows with no Age: about 21%, similar to the training data
testset["Age"].isna().sum() / len(testset)

0.20574162679425836

In [102]:
# Fraction of test rows with no Fare: about 0.2%
testset["Fare"].isna().sum() / len(testset)

0.0023923444976076554

In [103]:
# Fraction of test rows with no Cabin: about 78%
testset["Cabin"].isna().sum() / len(testset)

0.7822966507177034

## Solution 1
Regression: Let's use some of the numeric features to see how good a prediction we can get. This is a logistic regression problem because survival is a binary variable: either the person survived or they didn't.

In [76]:
# Model 1: logistic regression on the numeric columns.
# Passengers without an Age are left out so that Age can be used as a predictor.
subset = dataset.dropna(subset=["Age"])
y = subset["Survived"]
X = sm.add_constant(subset[["Pclass","Age","SibSp","Parch","Fare"]])
model1 = sm.Logit(y, X).fit()
# Confusion table (rows = observed outcome, columns = predicted outcome).
# Deaths are predicted fairly well, survival much less so.
model1.pred_table()

Optimization terminated successfully.
         Current function value: 0.570854
         Iterations 6


array([[351.,  73.],
       [140., 150.]])

In [77]:
# Fit statistics and coefficient table for model1 (the logit fitted on rows with a known Age)
model1.summary()

0,1,2,3
Dep. Variable:,Survived,No. Observations:,714.0
Model:,Logit,Df Residuals:,708.0
Method:,MLE,Df Model:,5.0
Date:,"Fri, 19 Apr 2024",Pseudo R-squ.:,0.1548
Time:,15:33:06,Log-Likelihood:,-407.59
converged:,True,LL-Null:,-482.26
Covariance Type:,nonrobust,LLR p-value:,1.848e-30

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,3.4010,0.505,6.732,0.000,2.411,4.391
Pclass,-1.1530,0.146,-7.900,0.000,-1.439,-0.867
Age,-0.0446,0.007,-6.181,0.000,-0.059,-0.030
SibSp,-0.2923,0.106,-2.755,0.006,-0.500,-0.084
Parch,0.2479,0.109,2.273,0.023,0.034,0.462
Fare,0.0033,0.003,1.299,0.194,-0.002,0.008


In [78]:
# Model 2: the first model was not great, so try again without Age.
# Dropping Age from the predictors means no passenger has to be excluded.
y = dataset["Survived"]
X = sm.add_constant(dataset[["Pclass","SibSp","Parch","Fare"]])
model2 = sm.Logit(y, X).fit()
# Confusion table (rows = observed outcome, columns = predicted outcome).
# Even better at predicting death, but still really bad at predicting survival.
model2.pred_table()

Optimization terminated successfully.
         Current function value: 0.599982
         Iterations 6


array([[461.,  88.],
       [190., 152.]])

In [79]:
# Fit statistics and coefficient table for model2 (the logit fitted without Age)
model2.summary()

0,1,2,3
Dep. Variable:,Survived,No. Observations:,891.0
Model:,Logit,Df Residuals:,886.0
Method:,MLE,Df Model:,4.0
Date:,"Fri, 19 Apr 2024",Pseudo R-squ.:,0.09901
Time:,15:33:07,Log-Likelihood:,-534.58
converged:,True,LL-Null:,-593.33
Covariance Type:,nonrobust,LLR p-value:,1.838e-24

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.9117,0.300,3.035,0.002,0.323,1.501
Pclass,-0.6953,0.112,-6.188,0.000,-0.916,-0.475
SibSp,-0.1465,0.081,-1.812,0.070,-0.305,0.012
Parch,0.2557,0.099,2.580,0.010,0.061,0.450
Fare,0.0051,0.002,2.097,0.036,0.000,0.010


## Solution 2
Random Forests: I would like to start incorporating some of those categorical variables, and what better way than a whole bunch of decision trees!

In [134]:
# Build the random-forest inputs: drop Cabin (mostly missing), drop any remaining
# incomplete rows, then one-hot encode the categorical columns.
subset = dataset.drop("Cabin", axis = 1)
subset = subset.dropna()
print(subset.shape[0])
# .copy() makes `features` an independent frame, so the Pclass assignment below
# does not raise SettingWithCopyWarning.
features = subset[["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]].copy()
# Turn Pclass into a string so get_dummies treats it as a category, not a number
features['Pclass'] = features['Pclass'].astype(str) + "class"
features = pd.get_dummies(features)
print(features.head())
target = subset["Survived"]

712
    Age  SibSp  Parch     Fare  Pclass_1class  Pclass_2class  Pclass_3class  \
0  22.0      1      0   7.2500          False          False           True   
1  38.0      1      0  71.2833           True          False          False   
2  26.0      0      0   7.9250          False          False           True   
3  35.0      1      0  53.1000           True          False          False   
4  35.0      0      0   8.0500          False          False           True   

   Sex_female  Sex_male  Embarked_C  Embarked_Q  Embarked_S  
0       False      True       False       False        True  
1        True     False        True       False       False  
2        True     False       False       False        True  
3        True     False       False       False        True  
4       False      True       False       False        True  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['Pclass'] = features['Pclass'].astype(str) + "class"


In [135]:
# Keep the column names before converting to arrays: the feature-importance
# listing needs them, and the ndarray no longer carries them.
# NOTE: `features` is rebound from a DataFrame to an ndarray here, so this cell
# cannot be re-run on its own without re-running the encoding cell first.
feature_list = features.columns.tolist()
labels = np.array(target)
features = np.array(features)

In [136]:
# Apply the same one-hot encoding to the test data.
# NOTE(review): dropna() discards every test passenger with a missing value in any
# remaining column, so no feature row (and hence no prediction) exists for them.
subset_test = testset.drop("Cabin", axis = 1)
subset_test = subset_test.dropna()
print(subset_test.shape[0])
# .copy() makes `features_test` an independent frame, so the Pclass assignment below
# does not raise SettingWithCopyWarning.
features_test = subset_test[["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]].copy()
features_test['Pclass'] = features_test['Pclass'].astype(str) + "class"
features_test = pd.get_dummies(features_test)
# NOTE(review): the dummy columns are derived from the categories present in the test
# data alone; confirm they match the training columns (same names, same order)
# before passing this array to the fitted model.
feature_list_test = list(features_test.columns)
features_test = np.array(features_test)

331


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_test['Pclass'] = features_test['Pclass'].astype(str) + "class"


In [140]:
# Fit a forest of 100 trees, each at most 4 levels deep, on the encoded training rows.
# random_state is fixed so that the fit is repeatable.
# NOTE(review): this is a regressor on a 0/1 target, so predict() returns a continuous
# score that has to be thresholded; a classifier would be the conventional choice.
rf1 = RandomForestRegressor(max_depth = 4, n_estimators = 100, random_state = 42)
rf1.fit(features, labels)

In [141]:
# Threshold the forest's continuous output at 0.5 to obtain 0/1 predictions, then
# report the share of rows predicted correctly.
# NOTE: this is accuracy on the same rows the model was trained on, so it is an
# optimistic estimate of how the model will do on unseen passengers.
predictions = (rf1.predict(features) >= 0.5).astype(int)
np.mean(predictions == labels)

0.8497191011235955

In [142]:
# Pair every feature name with its importance (rounded to 2 decimals) and list
# the pairs from most to least important.
importances = list(rf1.feature_importances_)
feature_importances = sorted(
    ((feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)),
    key = lambda x: x[1],
    reverse = True,
)
# A plain loop instead of a list comprehension: the comprehension was only used for
# its print side effect and left a stray list of None values as the cell's output.
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

Variable: Sex_female           Importance: 0.28
Variable: Sex_male             Importance: 0.24
Variable: Pclass_3class        Importance: 0.15
Variable: Age                  Importance: 0.14
Variable: Fare                 Importance: 0.09
Variable: SibSp                Importance: 0.04
Variable: Pclass_1class        Importance: 0.04
Variable: Parch                Importance: 0.01
Variable: Pclass_2class        Importance: 0.0
Variable: Embarked_C           Importance: 0.0
Variable: Embarked_Q           Importance: 0.0
Variable: Embarked_S           Importance: 0.0


[None, None, None, None, None, None, None, None, None, None, None, None]

This seems like a logical result to me. I would expect Sex to be a strong predictor of survival; as the saying goes, "Women and children first." I can also imagine that being in the lowest of the three classes on the ship did not help your chances of survival either.