In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Load the training split (includes the "Survived" label) and the test split.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Exploratory Data Analysis

In [3]:
# Stack the train and test feature columns into one frame so that every
# preprocessing step below is applied to both splits in the same way.
# The identifier column is removed from both; the label is removed from train.
train_features = train_data.drop(columns=["PassengerId", "Survived"])
test_features = test_data.drop(columns=["PassengerId"])
full_data = pd.concat([train_features, test_features])

# Training labels as a plain array, kept separately from the features.
y_train = train_data["Survived"].values

In [4]:
# Count the missing values in each column of the combined frame.
full_data.isna().sum()

Pclass         0
Name           0
Sex            0
Age          263
SibSp          0
Parch          0
Ticket         0
Fare           1
Cabin       1014
Embarked       2
dtype: int64

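This time around I am just going to add a column for the name prefix (title) and drop the columns with missing values (`Age`, `Cabin`, `Embarked`), along with `Ticket`, rather than imputing them; the single missing `Fare` value is left as-is.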
Mostly borrowed from https://www.kaggle.com/code/ihelon/titanic-hyperparameter-tuning-with-gridsearchcv

# Preprocessing

In [5]:
# Discard the columns that are not used as model features.
unused_columns = ["Age", "Cabin", "Embarked", "Ticket"]
full_data = full_data.drop(columns=unused_columns)

In [6]:

# Pull the title out of each name: the run of letters that follows a space and
# is immediately followed by a period, e.g. "Mr" in "Braund, Mr. Owen Harris".
# A raw string keeps `\.` from being treated as an (invalid) string escape, and
# expand=False makes str.extract return a Series rather than a one-column frame.
full_data["Prefix"] = full_data["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)

# Fold the rarer titles into the common title they are grouped with.
title_groups = {
    "Miss": ["Ms", "Mlle"],
    "Mrs": ["Mme", "Countess", "Lady", "Dona"],
    "Mr": ["Dr", "Major", "Col", "Sir", "Rev", "Jonkheer", "Capt", "Don"],
}
for common_title, rare_titles in title_groups.items():
    full_data["Prefix"] = full_data["Prefix"].replace(rare_titles, common_title)

# The raw name is no longer needed once the title has been extracted.
full_data = full_data.drop(columns=["Name"])
full_data.sample(5)

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Prefix
134,2,male,0,0,13.0,Mr
116,3,male,0,0,6.4375,Mr
51,2,male,0,0,15.0333,Mr
686,3,male,4,1,39.6875,Mr
119,2,female,1,0,26.0,Mrs


In [7]:
# Encode the two categorical columns as integer codes.
# astype(int) raises if any value is missing from the lookup tables below.
sex_codes = {"male": 1, "female": 0}
prefix_codes = {"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3}

full_data = full_data.assign(
    Sex=full_data["Sex"].map(sex_codes).astype(int),
    Prefix=full_data["Prefix"].map(prefix_codes).astype(int),
)

In [8]:
# One-hot encode the 'Prefix' column into Prefix_0 ... Prefix_3 indicator columns.
# dtype=int pins the indicators to 0/1 integers; without it, recent pandas
# versions return boolean columns instead.
prefix_dummies = pd.get_dummies(full_data["Prefix"], prefix="Prefix", dtype=int)

# Swap the integer-coded column for its indicator columns.
full_data = pd.concat([full_data.drop(columns="Prefix"), prefix_dummies], axis=1)
full_data

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Prefix_0,Prefix_1,Prefix_2,Prefix_3
0,3,1,1,0,7.2500,1,0,0,0
1,1,0,1,0,71.2833,0,0,1,0
2,3,0,0,0,7.9250,0,1,0,0
3,1,0,1,0,53.1000,0,0,1,0
4,3,1,0,0,8.0500,1,0,0,0
...,...,...,...,...,...,...,...,...,...
413,3,1,0,0,8.0500,1,0,0,0
414,1,0,0,0,108.9000,0,0,1,0
415,3,1,0,0,7.2500,1,0,0,0
416,3,1,0,0,8.0500,1,0,0,0


In [9]:
# Apply a standard scaler to every feature column.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Build a new float frame from the scaled array instead of writing the floats
# into the existing integer columns via `.loc[:] = ...`, which relies on an
# implicit dtype upcast that recent pandas versions deprecate.
# NOTE: the scaler is fitted on the combined train + test rows, so test-set
# statistics influence the scaling of the training features.
full_data = pd.DataFrame(
    scaler.fit_transform(full_data),
    columns=full_data.columns,
    index=full_data.index,
)
full_data

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Prefix_0,Prefix_1,Prefix_2,Prefix_3
0,0.841916,0.743497,0.481288,-0.445000,-0.503402,0.819619,-0.502625,-0.425920,-0.221084
1,-1.546098,-1.344995,0.481288,-0.445000,0.734222,-1.220079,-0.502625,2.347858,-0.221084
2,0.841916,-1.344995,-0.479087,-0.445000,-0.490356,-1.220079,1.989556,-0.425920,-0.221084
3,-1.546098,-1.344995,0.481288,-0.445000,0.382778,-1.220079,-0.502625,2.347858,-0.221084
4,0.841916,0.743497,-0.479087,-0.445000,-0.487940,0.819619,-0.502625,-0.425920,-0.221084
...,...,...,...,...,...,...,...,...,...
413,0.841916,0.743497,-0.479087,-0.445000,-0.487940,0.819619,-0.502625,-0.425920,-0.221084
414,-1.546098,-1.344995,-0.479087,-0.445000,1.461271,-1.220079,-0.502625,2.347858,-0.221084
415,0.841916,0.743497,-0.479087,-0.445000,-0.503402,0.819619,-0.502625,-0.425920,-0.221084
416,0.841916,0.743497,-0.479087,-0.445000,-0.487940,0.819619,-0.502625,-0.425920,-0.221084


In [10]:
# Undo the earlier concatenation: the first len(y_train) rows are the training
# features and the remaining rows are the test features.
n_train = y_train.shape[0]
X_train_processed = full_data.iloc[:n_train]
X_test_processed = full_data.iloc[n_train:]

# Sanity-check the shapes of the resulting splits.
print(f"Train X shape: {X_train_processed.shape}")
print(f"Train y shape: {y_train.shape}")
print(f"Test X shape: {X_test_processed.shape}")

Train X shape: (891, 9)
Train y shape: (891,)
Test X shape: (418, 9)


# Model

Using an XGBoost classifier (`XGBClassifier`) and `GridSearchCV` to search for the best hyperparameters

In [11]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid to search: 4 x 3 x 3 = 36 combinations.
parameters = {
    'max_depth': [7, 9, 12, 15],
    'n_estimators': [150, 200, 250],
    'learning_rate': [0.01, 0.05, 0.1],
}

# Grid search over the XGBoost classifier, scoring each combination by
# accuracy with cv=5 cross-validation.
model_xgb = GridSearchCV(
    estimator=xgb.XGBClassifier(),
    param_grid=parameters,
    cv=5,
    scoring='accuracy',
)
model_xgb.fit(X_train_processed, y_train)

# Report the winning combination and its mean cross-validated score.
best_score_text = f'{model_xgb.best_score_:.3f}'
print(f'Best parameters {model_xgb.best_params_}')
print('Mean cross-validated accuracy score of the best_estimator: ' + best_score_text)

Best parameters {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 150}
Mean cross-validated accuracy score of the best_estimator: 0.841


^ copied from https://www.kaggle.com/code/ihelon/titanic-hyperparameter-tuning-with-gridsearchcv and tweaked

In [12]:

# Predict on the test features with the fitted grid search object.
predictions = model_xgb.predict(X_test_processed)

# Pair each test passenger id with its prediction and write the submission file.
output = pd.DataFrame(
    {
        'PassengerId': test_data['PassengerId'],
        'Survived': predictions,
    }
)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


# Things to investigate/Improve:
- What does including the 'Embarked' column in the training data do?
- Ensemble other than xgb