In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import joblib

## Using the cleaned data

In [34]:
# Load the pre-processed training data from the relative Data directory
# and preview the first rows as a quick sanity check.
df = pd.read_csv(filepath_or_buffer='../../Data/train_processed.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,HasCabin,AgeGroup,FareGroup,FamilySize,IsAlone
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,1,0,YoungAdult,Low,2,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,2,1,Adult,VeryHigh,2,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,1,0,YoungAdult,Mid,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,1,1,YoungAdult,VeryHigh,2,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,1,0,YoungAdult,Mid,1,1


In [35]:
# Count missing values per column (`isna` is the canonical name for `isnull`).
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
HasCabin         0
AgeGroup         0
FareGroup        0
FamilySize       0
IsAlone          0
dtype: int64

Dropping the columns that are not used by the model because they are not useful as features (e.g., string values).

In [36]:
# Remove the columns that are not fed to the model.
# Rebinding `df` (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: a second execution is a no-op rather than
# a KeyError for columns that were already removed.
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], errors="ignore")

In [37]:
# Preview the first five rows to confirm which columns remain.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,HasCabin,AgeGroup,FareGroup,FamilySize,IsAlone
0,0,3,0,22.0,1,0,7.25,1,0,YoungAdult,Low,2,0
1,1,1,1,38.0,1,0,71.2833,2,1,Adult,VeryHigh,2,0
2,1,3,1,26.0,0,0,7.925,1,0,YoungAdult,Mid,1,1
3,1,1,1,35.0,1,0,53.1,1,1,YoungAdult,VeryHigh,2,0
4,0,3,0,35.0,0,0,8.05,1,0,YoungAdult,Mid,1,1


# Converting categorical columns using One Hot Encoding
In pandas, the `get_dummies()` function converts categorical variables into dummy/indicator variables with True/False values (a technique known as one-hot encoding).

for example: 

Suppose a column has the categories ['Low', 'Mid', 'High', 'VeryHigh']:

<pre>
       Low     Mid       High    VeryHigh         
0       0       1         0        0         
1       0       0         0        1          
2       0       0         1        0          
3       ...

</pre>

In [38]:

# One-hot encode the two binned categorical columns. drop_first=True omits
# the first level of each column, so that level is represented by all of the
# remaining indicator columns being 0/False.
df = pd.get_dummies(
    data=df,
    columns=["AgeGroup", "FareGroup"],
    drop_first=True,
)

In [39]:
# Separate the label column from the feature columns.
X = df.drop(columns=['Survived'])  # features: every column except the label
y = df['Survived']                 # target variable

In [40]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [41]:
# Random forest with hand-picked hyperparameters; the fixed random_state
# makes the fitted forest reproducible.
RF = RandomForestClassifier(
    random_state=42,
    n_estimators=300,
    max_features='sqrt',
    # Tree-size limits.
    max_depth=6,
    min_samples_split=8,
    min_samples_leaf=3,
)

In [42]:
# Fit the forest on the training split, then report its accuracy on both the
# training split and the held-out split.
RF.fit(train_X, train_y)

train_acc = RF.score(train_X, train_y)
test_acc = RF.score(test_X, test_y)

print("Train score:", train_acc)
print("Test score:", test_acc)


Train score: 0.875
Test score: 0.8044692737430168


In [43]:
# Predict labels for the held-out rows and compute the fraction predicted
# correctly.
predictions = RF.predict(X=test_X)
accuracy = accuracy_score(y_true=test_y, y_pred=predictions)

In [44]:
# Report the accuracy value rounded to four decimal places.
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8045


In [45]:
# Hyperparameter search space: each of the 50 iterations samples one value
# per key.
param_dist = {
    "n_estimators": np.arange(100, 500, 50),
    "max_depth": [None, 4, 6, 8, 10],
    "min_samples_split": np.arange(2, 10),
    "min_samples_leaf": np.arange(1, 5)
}

# Randomized search with 5-fold cross-validation scored by accuracy, using
# all available cores.
random_search = RandomizedSearchCV(
    estimator=RF,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1,
)

# NOTE(review): the search is fit on the full X, y rather than on the
# training split only — confirm this is intended before comparing
# best_score_ with the hold-out score.
random_search.fit(X, y)

# Best mean cross-validated score, followed by the parameters that produced it.
print(random_search.best_score_, random_search.best_params_, sep="\n")


0.8339212855439081
{'n_estimators': np.int64(300), 'min_samples_split': np.int64(2), 'min_samples_leaf': np.int64(4), 'max_depth': None}


In [46]:
# Persist the fitted `RF` model to disk with joblib.
# NOTE(review): this stores `RF`, not `random_search.best_estimator_` —
# confirm that saving the untuned model is intended.
joblib.dump(value=RF, filename='RF_Model.joblib')

['RF_Model.joblib']

## Confusion Matrix

In [47]:
# Confusion matrix for the held-out predictions: rows are the true classes,
# columns are the predicted classes.
cm = confusion_matrix(y_true=test_y, y_pred=predictions)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[93 12]
 [23 51]]


The confusion matrix shows that the model correctly predicts most non-survivors and survivors.
However, it misses some survivors (recall for survivors = 51 / 74 ≈ 0.69), indicating room for improvement in recall.   
Validating the confusion matrix:    
Correct predictions = 93 + 51 = 144     
Total samples = 93 + 12 + 23 + 51 = 179     
Accuracy = 144 / 179 ≈ 0.8045   
This matches the accuracy reported above.   

93 non-survivors correctly identified (true negatives)   
51 survivors correctly identified (true positives)   
12 non-survivors predicted as survived (false positives)    
23 survivors missed (false negatives)