In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [11]:
from sklearn.preprocessing import LabelEncoder

In [78]:
titanic = pd.read_csv('../data/titanic.csv') # make sure you get the correct path if the file is in your local device
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [38]:
titanic_data = titanic.drop(columns = ['Parch','Ticket','Name','PassengerId','Survived','Cabin'])
titanic_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Embarked,Sex_num
0,3,male,22.0,1,7.25,S,1
1,1,female,38.0,1,71.2833,C,0
2,3,female,26.0,0,7.925,S,0
3,1,female,35.0,1,53.1,S,0
4,3,male,35.0,0,8.05,S,1


In [80]:
titanic_data['Age'] = titanic_data.groupby(['Pclass', 'Sex_num'])['Age'].transform(lambda x: x.fillna(x.median()))

In [83]:
titanic_data.isna().sum()

Pclass        0
Age           0
SibSp         0
Fare          0
Sex_num       0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [82]:
le = LabelEncoder()

In [40]:
Sex_num= le.fit_transform(titanic['Sex'])

In [42]:
titanic_data['Sex_num'] = Sex_num
titanic_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Embarked,Sex_num
0,3,male,22.0,1,7.25,S,1
1,1,female,38.0,1,71.2833,C,0
2,3,female,26.0,0,7.925,S,0
3,1,female,35.0,1,53.1,S,0
4,3,male,35.0,0,8.05,S,1


In [43]:
titanic_data.drop(columns = 'Sex',inplace=True)

In [44]:
titanic_data.head()

Unnamed: 0,Pclass,Age,SibSp,Fare,Embarked,Sex_num
0,3,22.0,1,7.25,S,1
1,1,38.0,1,71.2833,C,0
2,3,26.0,0,7.925,S,0
3,1,35.0,1,53.1,S,0
4,3,35.0,0,8.05,S,1


In [57]:
titanic_data['Embarked'].fillna("S",inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_data['Embarked'].fillna("S",inplace=True)


In [64]:
titanic_data = pd.get_dummies(titanic_data, columns=['Embarked'], drop_first=False)

In [65]:
titanic_data.isna().sum()

Pclass        0
Age           0
SibSp         0
Fare          0
Sex_num       0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [66]:
from sklearn.model_selection import train_test_split

In [71]:
X_train,X_test,y_train,y_test = train_test_split(titanic_data,titanic['Survived'],train_size=0.8)

In [68]:
from sklearn.ensemble import RandomForestClassifier

In [69]:
forest = RandomForestClassifier()

In [84]:
forest.fit(titanic_data,titanic['Survived'])

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [74]:
print("Score of titanic train: {:.2f}".format(forest.score(X_train,y_train)))

Score of titanic train: 0.98


In [73]:
print("Score of titanic test: {:.2f}".format(forest.score(X_test,y_test)))

Score of titanic test: 0.99


In [76]:
from sklearn.metrics import confusion_matrix, classification_report
y_pred = forest.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[119   0]
 [  2  58]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       119
           1       1.00      0.97      0.98        60

    accuracy                           0.99       179
   macro avg       0.99      0.98      0.99       179
weighted avg       0.99      0.99      0.99       179



In [116]:
# Select 5 random samples from X_test
sample_data = X_test.sample(n=5, random_state=42)

# Predict using your trained model (RandomForest)
predictions = forest.predict(sample_data)

# Get the actual target values for those same samples
actuals = y_test.loc[sample_data.index]

In [118]:
# Display nicely
for i in range(len(sample_data)):
    print(f"Passenger {i+1}:")
    print("Features:", sample_data.iloc[i].to_dict())
    print("Predicted:", predictions[i])
    print("Actual   :", actuals.iloc[i])
    print("-" * 80)

Passenger 1:
Features: {'Pclass': 2, 'Age': 50.0, 'SibSp': 0, 'Fare': 10.5, 'Sex_num': 0, 'Embarked_C': False, 'Embarked_Q': False, 'Embarked_S': True}
Predicted: 1
Actual   : 1
--------------------------------------------------------------------------------
Passenger 2:
Features: {'Pclass': 2, 'Age': 18.0, 'SibSp': 0, 'Fare': 13.0, 'Sex_num': 1, 'Embarked_C': False, 'Embarked_Q': False, 'Embarked_S': True}
Predicted: 0
Actual   : 0
--------------------------------------------------------------------------------
Passenger 3:
Features: {'Pclass': 3, 'Age': 18.0, 'SibSp': 0, 'Fare': 9.8417, 'Sex_num': 0, 'Embarked_C': False, 'Embarked_Q': False, 'Embarked_S': True}
Predicted: 1
Actual   : 1
--------------------------------------------------------------------------------
Passenger 4:
Features: {'Pclass': 1, 'Age': 17.0, 'SibSp': 0, 'Fare': 110.8833, 'Sex_num': 1, 'Embarked_C': True, 'Embarked_Q': False, 'Embarked_S': False}
Predicted: 1
Actual   : 1
---------------------------------------