In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import joblib

## used the cleaned data

In [21]:
df=pd.read_csv('../../Data/train_processed.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,HasCabin,Title,AgeGroup,FareGroup,FamilySize,IsAlone
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,1,0,Mr,YoungAdult,Low,2,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,2,1,Mrs,Adult,VeryHigh,2,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,1,0,Miss,YoungAdult,Mid,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,1,1,Mrs,YoungAdult,VeryHigh,2,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,1,0,Mr,YoungAdult,Mid,1,1


In [22]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
HasCabin         0
Title            0
AgeGroup         0
FareGroup        0
FamilySize       0
IsAlone          0
dtype: int64

Dropping the string type colums which are not used in model which are not useful(ex: string values)

In [23]:
df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], inplace=True)

In [24]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,HasCabin,Title,AgeGroup,FareGroup,FamilySize,IsAlone
0,0,3,0,22.0,1,0,7.25,1,0,Mr,YoungAdult,Low,2,0
1,1,1,1,38.0,1,0,71.2833,2,1,Mrs,Adult,VeryHigh,2,0
2,1,3,1,26.0,0,0,7.925,1,0,Miss,YoungAdult,Mid,1,1
3,1,1,1,35.0,1,0,53.1,1,1,Mrs,YoungAdult,VeryHigh,2,0
4,0,3,0,35.0,0,0,8.05,1,0,Mr,YoungAdult,Mid,1,1


# Converting categorical columns using One Hot Encoding
In Pandas, the get_dummies() function converts categorical variables into dummy/indicator variables (known as one-hot encoding) True/false values

for example: 

we have [Low','Mid','High','VeryHigh]

<pre>
       Low     Mid       High    VeryHigh         
0       0       1         0        0         
1       0       0         0        1          
2       0       0         0        0          
3       ...

</pre>

In [25]:
df = pd.get_dummies(df,columns=["AgeGroup", "FareGroup"],drop_first=True)

In [26]:
df = df.select_dtypes(exclude=["object"]) #apeksha added to exclude object type columns

In [27]:
y = df['Survived']  # target variable
X = df.drop('Survived' , axis=1)  #featured

In [28]:
train_X,test_X,train_y,test_y = train_test_split(X,y,test_size=0.2,random_state=42)

In [29]:
RF = RandomForestClassifier(
        n_estimators=300,
        max_depth=6,
        min_samples_leaf=3,
        min_samples_split=8,
        max_features='sqrt',
        random_state=42,
        
        
    )

In [30]:
train_X.dtypes

Pclass                   int64
Sex                      int64
Age                    float64
SibSp                    int64
Parch                    int64
Fare                   float64
Embarked                 int64
HasCabin                 int64
FamilySize               int64
IsAlone                  int64
AgeGroup_Child            bool
AgeGroup_Senior           bool
AgeGroup_Teen             bool
AgeGroup_YoungAdult       bool
FareGroup_Low             bool
FareGroup_Mid             bool
FareGroup_VeryHigh        bool
dtype: object

In [31]:
train_X.select_dtypes(include=["object"]).columns

Index([], dtype='object')

In [32]:
RF.fit(train_X,train_y)

print("Train score:", RF.score(train_X, train_y))
print("Test score:", RF.score(test_X, test_y))


Train score: 0.8735955056179775
Test score: 0.8212290502793296


In [33]:
predictions = RF.predict(test_X)
accuracy=accuracy_score(test_y,predictions)

In [34]:
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8212


In [35]:
param_dist = {
    "n_estimators": np.arange(100, 500, 50),
    "max_depth": [None, 4, 6, 8, 10],
    "min_samples_split": np.arange(2, 10),
    "min_samples_leaf": np.arange(1, 5)
}

random_search = RandomizedSearchCV(
    RF,
    param_dist,
    n_iter=50,
    cv=5,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1
)

random_search.fit(X, y)

print(random_search.best_score_)
print(random_search.best_params_)


0.8327851358985626
{'n_estimators': np.int64(400), 'min_samples_split': np.int64(8), 'min_samples_leaf': np.int64(2), 'max_depth': 10}


In [36]:
# Save the model as a pickle in a file
joblib.dump(RF, 'RF_Model.joblib')

['RF_Model.joblib']

## Confusion Matrix

In [37]:
cm = confusion_matrix(test_y, predictions)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[93 12]
 [20 54]]


The confusion matrix shows that the model correctly predicts most non-survivors and survivors.
However, it misses some survivors, indicating room for improvement in recall.   
Validating confusion matrix-    
Correct predictions = 91 + 55 = 146     
Total samples = 91 + 14 + 19 + 55 = 179     
Accuracy = 146 / 179 ≈ 0.8156   
Matches the accuracy exactly.   

91 non-survivors correctly identified   
55 survivors correctly identified   
14 Predicted survived but didn’t    
19 survivors missed (false negatives)

## Additional Evaluation Metrics

To better understand the model performance beyond accuracy, we evaluate:
- **Precision**: how many predicted survivors were actually survivors  
- **Recall**: how many actual survivors were correctly identified  
- **F1-score**: harmonic mean of precision and recall

In [38]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

precision = precision_score(test_y, predictions)
recall = recall_score(test_y, predictions)
f1 = f1_score(test_y, predictions)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

print("\nClassification Report:")
print(classification_report(test_y, predictions))


Precision: 0.8182
Recall: 0.7297
F1-score: 0.7714

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.89      0.85       105
           1       0.82      0.73      0.77        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



The precision score indicates that when the model predicts survival, it is often correct.  
Recall shows that some survivors are still missed by the model.  
The F1-score balances both precision and recall, giving a more complete picture of model performance.