In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
import joblib

## Load the cleaned data

In [48]:
# Read the preprocessed training set from disk, then preview its first rows.
df = pd.read_csv(filepath_or_buffer="../../Data/train_processed.csv")
df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,HasCabin,AgeGroup,FareGroup,FamilySize,IsAlone
0,0,3,0,22.0,1,0,7.25,1,Mr,0,YoungAdult,Low,2,0
1,1,1,1,38.0,1,0,71.2833,2,Mrs,1,Adult,VeryHigh,2,0
2,1,3,1,26.0,0,0,7.925,1,Miss,0,YoungAdult,Mid,1,1
3,1,1,1,35.0,1,0,53.1,1,Mrs,1,YoungAdult,VeryHigh,2,0
4,0,3,0,35.0,0,0,8.05,1,Mr,0,YoungAdult,Mid,1,1


In [49]:
# Per-column count of missing values (isna is the pandas alias of isnull).
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked      0
Title         0
HasCabin      0
AgeGroup      0
FareGroup     0
FamilySize    0
IsAlone       0
dtype: int64

# Converting categorical columns using One Hot Encoding
In pandas, the `get_dummies()` function converts categorical variables into dummy/indicator variables (known as one-hot encoding): each category becomes its own column holding True/False values.

for example: 

we have ['Low', 'Mid', 'High', 'VeryHigh']

<pre>
       Low     Mid       High    VeryHigh         
0       0       1         0        0         
1       0       0         0        1          
2       0       0         1        0          
3       ...

</pre>

In [50]:
# One-hot encode the three string-valued categorical columns.
# drop_first=True omits the first level of each column, so a row whose
# indicators are all False belongs to that omitted level.
# NOTE: this rebinds `df`; re-running the cell on the already-encoded frame
# fails because the source columns no longer exist.
df = pd.get_dummies(
    data=df,
    columns=["AgeGroup", "FareGroup", "Title"],
    drop_first=True,
)

In [51]:
# Preview the encoded frame to confirm the new indicator columns.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,HasCabin,FamilySize,...,AgeGroup_Senior,AgeGroup_Teen,AgeGroup_YoungAdult,FareGroup_Low,FareGroup_Mid,FareGroup_VeryHigh,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,0,3,0,22.0,1,0,7.25,1,0,2,...,False,False,True,True,False,False,False,True,False,False
1,1,1,1,38.0,1,0,71.2833,2,1,2,...,False,False,False,False,False,True,False,False,True,False
2,1,3,1,26.0,0,0,7.925,1,0,1,...,False,False,True,False,True,False,True,False,False,False
3,1,1,1,35.0,1,0,53.1,1,1,2,...,False,False,True,False,False,True,False,False,True,False
4,0,3,0,35.0,0,0,8.05,1,0,1,...,False,False,True,False,True,False,False,True,False,False


In [52]:
# Split the frame into predictors and label.
X = df.drop(columns=['Survived'])  # feature matrix: every column except the label
y = df['Survived']                 # target variable

In [53]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [54]:
# Base estimator handed to the hyperparameter search; only the seed is fixed
# here, every other setting is left at the library default.
RF = RandomForestClassifier(
    random_state=42,
)

## Adding RandomizedSearchCV 

Its purpose is to tune the Random Forest hyperparameters efficiently by sampling random combinations instead of trying every possible one.

In [55]:
# Candidate values the search samples from.
param_dist = dict(
    n_estimators=np.arange(100, 500, 50),   # 100, 150, ..., 450
    max_depth=[None, 4, 6, 8, 10],          # None leaves tree depth unlimited
    min_samples_split=np.arange(2, 10),     # 2 .. 9
    min_samples_leaf=np.arange(1, 5),       # 1 .. 4
)

# Evaluate 50 randomly drawn combinations with 5-fold cross-validation,
# ranking them by accuracy. n_jobs=-1 uses every available core.
random_search = RandomizedSearchCV(
    estimator=RF,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1,
)

# Fit on the training split only, so the held-out test rows stay unseen.
random_search.fit(train_X, train_y)

# Report the best cross-validated score and the parameters that produced it.
print(random_search.best_score_, random_search.best_params_, sep="\n")


0.8384221412390428
{'n_estimators': np.int64(450), 'min_samples_split': np.int64(8), 'min_samples_leaf': np.int64(3), 'max_depth': 6}


Select the best model found by RandomizedSearchCV

In [56]:
# Take the estimator refit with the winning parameters and score it on the
# held-out test split.
best_model = random_search.best_estimator_
predictions = best_model.predict(test_X)

test_accuracy = accuracy_score(y_true=test_y, y_pred=predictions)
print("Final Test Accuracy:", round(test_accuracy, 4))

Final Test Accuracy: 0.8324


Saving feature columns and trained Random Forest model for inference

In [14]:
# Persist the training column order next to the model, so inference code can
# rebuild its input frame with the same columns in the same order.
feature_cols = train_X.columns.tolist()
joblib.dump(value=feature_cols, filename="feature_cols.pkl")
joblib.dump(value=best_model, filename="RF_Model.joblib")

['RF_Model.joblib']

## Confusion Matrix

In [11]:
# Rows are the true classes and columns the predicted classes.
cm = confusion_matrix(y_true=test_y, y_pred=predictions)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[92 13]
 [17 57]]


The confusion matrix shows that the model correctly predicts most non-survivors and survivors.
However, it misses some survivors, indicating room for improvement in recall.   
Validating confusion matrix-    
Correct predictions = 92 + 57 = 149     
Total samples = 92 + 13 + 17 + 57 = 179     
Accuracy = 149 / 179 ≈ 0.8324  
This matches the reported test accuracy exactly.   

92 non-survivors correctly identified   
57 survivors correctly identified   
13 non-survivors incorrectly predicted as survivors (false positives)    
17 survivors missed (false negatives)

## Additional Evaluation Metrics

To better understand the model performance beyond accuracy, we evaluate:
- **Precision**: how many predicted survivors were actually survivors  
- **Recall**: how many actual survivors were correctly identified  
- **F1-score**: harmonic mean of precision and recall

In [None]:

# Metrics for the positive class (label 1) on the held-out test split.
precision = precision_score(y_true=test_y, y_pred=predictions)
recall = recall_score(y_true=test_y, y_pred=predictions)
f1 = f1_score(y_true=test_y, y_pred=predictions)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Per-class breakdown plus macro and weighted averages.
print("\nClassification Report:")
print(classification_report(y_true=test_y, y_pred=predictions))


Precision: 0.8143
Recall: 0.7703
F1-score: 0.7917

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       105
           1       0.81      0.77      0.79        74

    accuracy                           0.83       179
   macro avg       0.83      0.82      0.83       179
weighted avg       0.83      0.83      0.83       179



The precision score indicates that when the model predicts survival, it is often correct.  
Recall shows that some survivors are still missed by the model.  
The F1-score balances both precision and recall, giving a more complete picture of model performance.