In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report, precision_score, recall_score, f1_score, RocCurveDisplay
from xgboost import XGBClassifier

%matplotlib inline

RANDOM_STATE = 55

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [38]:
train_dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/titanic_ml/train.csv')
test_dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/titanic_ml/test.csv')
print(test_dataset.info())
train_dataset.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB
None


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Cleaning data

In [34]:
missing_age = train_dataset['Age'].isna()
train_dataset.loc[ missing_age, "Age" ] = train_dataset["Age"].median()

train_dataset['Embarked'] = train_dataset['Embarked'].fillna('S')
train_dataset.isna().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,0
SibSp,0
Parch,0
Ticket,0
Fare,0


In [10]:
cat_variables = ['Sex', 'Embarked']
train_dataset = pd.get_dummies(data = train_dataset,
                         prefix = cat_variables,
                         columns = cat_variables)
train_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,False,True,False,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,True,False,True,False,False
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,True,False,False,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,True,False,False,False,True
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,False,True,False,False,True


In [11]:
features = [x for x in train_dataset.columns if x not in ['PassengerId', 'Survived', 'Name','Ticket', 'Cabin']]
print(len(features))

10


# Splitting the Dataset

In [12]:
X_train, X_val, y_train, y_val = train_test_split(
    train_dataset[features],
    train_dataset['Survived'],
    train_size=0.8,
    random_state=RANDOM_STATE,
    stratify=train_dataset['Survived']
)
print("Training set:")
print(y_train.value_counts(normalize=True))
print("\nValidation set:")
print(y_val.value_counts(normalize=True))

Training set:
Survived
0    0.616573
1    0.383427
Name: proportion, dtype: float64

Validation set:
Survived
0    0.614525
1    0.385475
Name: proportion, dtype: float64


# Training model

### LogisticRegression

In [25]:
log_reg_grid = {"C": np.logspace(-4, 4, 20), "solver": ["liblinear"]}
print(log_reg_grid)
np.random.seed(42)

rs_log_reg = GridSearchCV(LogisticRegression(), param_grid=log_reg_grid, cv=5, verbose=True)
rs_log_reg.fit(X_train, y_train)

{'C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]), 'solver': ['liblinear']}
Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [23]:
rs_log_reg.best_params_

{'C': np.float64(11.288378916846883), 'solver': 'liblinear'}

In [24]:
rs_log_reg.score(X_val, y_val)

0.8100558659217877

### RandomForestClassifier

In [18]:
param_grid = {
    'n_estimators': [650, 700, 750],
    'max_depth': [5, 10, 15],
    'min_samples_split': [15, 20, 25]
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    verbose=2
)
grid_search.fit(X_train, y_train)

print("New best params:", grid_search.best_params_)
print("New best CV score:", grid_search.best_score_)
print("Validation score:", grid_search.score(X_val, y_val))

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END max_depth=5, min_samples_split=15, n_estimators=650; total time=   2.0s
[CV] END max_depth=5, min_samples_split=15, n_estimators=650; total time=   1.8s
[CV] END max_depth=5, min_samples_split=15, n_estimators=650; total time=   1.2s
[CV] END max_depth=5, min_samples_split=15, n_estimators=650; total time=   1.2s
[CV] END max_depth=5, min_samples_split=15, n_estimators=650; total time=   1.2s
[CV] END max_depth=5, min_samples_split=15, n_estimators=700; total time=   1.3s
[CV] END max_depth=5, min_samples_split=15, n_estimators=700; total time=   1.3s
[CV] END max_depth=5, min_samples_split=15, n_estimators=700; total time=   1.3s
[CV] END max_depth=5, min_samples_split=15, n_estimators=700; total time=   4.0s
[CV] END max_depth=5, min_samples_split=15, n_estimators=700; total time=   4.9s
[CV] END max_depth=5, min_samples_split=15, n_estimators=750; total time=   2.3s
[CV] END max_depth=5, min_samples_split=15, n_e

In [20]:
best_model = RandomForestClassifier(**grid_search.best_params_, random_state=RANDOM_STATE)
scores = cross_val_score(best_model, X_train, y_train, cv=10, n_jobs=-1)
print(f"Verified score: {scores.mean():.4f} ± {scores.std():.4f}")

Verified score: 0.8230 ± 0.0353


## Make predictions

In [41]:
missing_age = test_dataset['Age'].isna()
missing_fare = test_dataset['Fare'].isna()

test_dataset.loc[ missing_age, "Age" ] = test_dataset["Age"].median()
test_dataset.loc[ missing_fare, "Fare" ] = test_dataset["Fare"].median()

test_dataset.isna().sum()

Unnamed: 0,0
PassengerId,0
Pclass,0
Name,0
Sex,0
Age,0
SibSp,0
Parch,0
Ticket,0
Fare,0
Cabin,327


In [43]:
cat_variables = ['Sex', 'Embarked']
test_dataset = pd.get_dummies(data = test_dataset,
                         prefix = cat_variables,
                         columns = cat_variables)
test_dataset.head()

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,892,3,"Kelly, Mr. James",34.5,0,0,330911,7.8292,,False,True,False,True,False
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,363272,7.0,,True,False,False,False,True
2,894,2,"Myles, Mr. Thomas Francis",62.0,0,0,240276,9.6875,,False,True,False,True,False
3,895,3,"Wirz, Mr. Albert",27.0,0,0,315154,8.6625,,False,True,False,False,True
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,3101298,12.2875,,True,False,False,False,True


In [44]:
features = [x for x in test_dataset.columns if x not in ['PassengerId', 'Survived', 'Name','Ticket', 'Cabin']]
print(len(features))

10


In [45]:
y_preds = grid_search.predict(test_dataset[features])
y_preds

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [47]:
submission = pd.DataFrame({
    "PassengerId": test_dataset["PassengerId"],
    "Survived": y_preds
})

submission.to_csv("submission.csv", index=False)