## RF Model 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import joblib

In [2]:
# Read the processed training CSV into a DataFrame and preview the first rows.
df = pd.read_csv("../../Data/train_processed.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,HasCabin,Title,AgeGroup,FareGroup,FamilySize,IsAlone
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,1,0,Mr,YoungAdult,Low,2,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,2,1,Mrs,Adult,VeryHigh,2,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,1,0,Miss,YoungAdult,Mid,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,1,1,Mrs,YoungAdult,VeryHigh,2,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,1,0,Mr,YoungAdult,Mid,1,1


In [3]:
# Count the missing values in each column.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
HasCabin         0
Title            0
AgeGroup         0
FareGroup        0
FamilySize       0
IsAlone          0
dtype: int64

In [4]:
# Remove the columns that are not used as model features.
# The frame is rebound rather than mutated in place, and errors="ignore"
# skips labels that are already absent, so this cell can be re-run safely
# (the in-place version raised KeyError on a second execution).
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], errors="ignore")

In [5]:
# Preview the first rows of the current frame.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,HasCabin,Title,AgeGroup,FareGroup,FamilySize,IsAlone
0,0,3,0,22.0,1,0,7.25,1,0,Mr,YoungAdult,Low,2,0
1,1,1,1,38.0,1,0,71.2833,2,1,Mrs,Adult,VeryHigh,2,0
2,1,3,1,26.0,0,0,7.925,1,0,Miss,YoungAdult,Mid,1,1
3,1,1,1,35.0,1,0,53.1,1,1,Mrs,YoungAdult,VeryHigh,2,0
4,0,3,0,35.0,0,0,8.05,1,0,Mr,YoungAdult,Mid,1,1


In [6]:
# One-hot encode the three categorical columns; drop_first=True omits the
# first level of each so the remaining indicator columns are not redundant.
df = pd.get_dummies(
    df,
    columns=["AgeGroup", "FareGroup", "Title"],
    drop_first=True,
)

In [7]:
# Preview the first rows of the current frame.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,HasCabin,FamilySize,...,AgeGroup_Senior,AgeGroup_Teen,AgeGroup_YoungAdult,FareGroup_Low,FareGroup_Mid,FareGroup_VeryHigh,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,0,3,0,22.0,1,0,7.25,1,0,2,...,False,False,True,True,False,False,False,True,False,False
1,1,1,1,38.0,1,0,71.2833,2,1,2,...,False,False,False,False,False,True,False,False,True,False
2,1,3,1,26.0,0,0,7.925,1,0,1,...,False,False,True,False,True,False,True,False,False,False
3,1,1,1,35.0,1,0,53.1,1,1,2,...,False,False,True,False,False,True,False,False,True,False
4,0,3,0,35.0,0,0,8.05,1,0,1,...,False,False,True,False,True,False,False,True,False,False


In [8]:
# Separate the label column from the predictors.
X = df.drop(columns=["Survived"])  # features: every column except the label
y = df["Survived"]                 # target variable

In [9]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Base random-forest estimator; random_state is fixed so repeated runs build the same forest.
RF = RandomForestClassifier(random_state=42)

In [11]:
# Hyperparameter space sampled by the randomized search.
param_dist = dict(
    n_estimators=np.arange(100, 500, 50),  # 100, 150, ..., 450
    max_depth=[None, 4, 6, 8, 10],         # None places no limit on tree depth
    min_samples_split=np.arange(2, 10),    # 2 .. 9
    min_samples_leaf=np.arange(1, 5),      # 1 .. 4
)

# Try 50 sampled configurations, each scored by 5-fold cross-validated accuracy,
# using all available cores (n_jobs=-1).
random_search = RandomizedSearchCV(
    estimator=RF,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1,
)
random_search.fit(train_X, train_y)

# Report the best mean cross-validation score and the configuration that achieved it.
print(random_search.best_score_)
print(random_search.best_params_)


0.8384221412390428
{'n_estimators': np.int64(450), 'min_samples_split': np.int64(8), 'min_samples_leaf': np.int64(3), 'max_depth': 6}


In [12]:
# Score the best estimator found by the search on the held-out test split.
best_model = random_search.best_estimator_
predictions = best_model.predict(test_X)
test_accuracy = accuracy_score(y_true=test_y, y_pred=predictions)

print("Final Test Accuracy:", round(test_accuracy, 4))

Final Test Accuracy: 0.8324


Test Case 1: Dataset Integrity & Structure Validation  
Objective:  
Verify that the cleaned dataset is properly loaded, correctly structured, and free from missing values.

In [17]:
print(df.shape)           # number of samples and columns

# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()                 # column names, non-null counts and dtypes

print(df.isnull().sum())  # per-column count of missing values


(891, 22)
<class 'pandas.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Survived             891 non-null    int64  
 1   Pclass               891 non-null    int64  
 2   Sex                  891 non-null    int64  
 3   Age                  891 non-null    float64
 4   SibSp                891 non-null    int64  
 5   Parch                891 non-null    int64  
 6   Fare                 891 non-null    float64
 7   Embarked             891 non-null    int64  
 8   HasCabin             891 non-null    int64  
 9   FamilySize           891 non-null    int64  
 10  IsAlone              891 non-null    int64  
 11  AgeGroup_Child       891 non-null    bool   
 12  AgeGroup_Senior      891 non-null    bool   
 13  AgeGroup_Teen        891 non-null    bool   
 14  AgeGroup_YoungAdult  891 non-null    bool   
 15  FareGroup_Low        891 non-null    bool

Test Case 2: Feature–Target Separation Test  
Objective: ensure correct separation of the independent variables (X) and the target variable (y).  
Confirms correct dimensionality.  
Validates the class distribution for imbalance.

In [18]:
# Print the feature-matrix shape, the target shape and the class counts, one per line.
print(X.shape, y.shape, y.value_counts(), sep="\n")

(891, 21)
(891,)
Survived
0    549
1    342
Name: count, dtype: int64


## Error Handling

1. While Loading the Dataset  
Scenario:  
File path is incorrect  
CSV file is missing or corrupted

In [19]:
# Load the processed CSV and report failures as messages instead of raising.
# Note: on success this rebinds `df` to the contents of the file on disk.
try:
    df = pd.read_csv('../../Data/train_processed.csv')
except FileNotFoundError:
    # The path does not point to an existing file.
    print("Error: Dataset file not found.")
except pd.errors.EmptyDataError:
    # The file exists but contains nothing to parse.
    print("Error: Dataset file is empty.")
except Exception as err:
    # Any other failure is reported together with its message.
    print("Unexpected error while loading data:", err)
else:
    print("Dataset loaded successfully.")


Dataset loaded successfully.


2. During Preprocessing  
Scenario:  
Required columns are missing  
Wrong data types

In [None]:
# Columns that must be present in the frame.
required_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked']

try:
    available = set(df.columns)
    # Keep the order of required_columns so the message lists them predictably.
    missing_cols = [name for name in required_columns if name not in available]
    if missing_cols:
        raise ValueError(f"Missing columns: {missing_cols}")
except ValueError as err:
    print("Preprocessing Error:", err)

# Nothing is printed when every required column is present.

3. Error Handling During Model Training  
Scenario:  
Invalid hyperparameters  
Training failure due to bad data  

In [21]:
# Run the randomized search inside a guard so that a training failure is
# reported as a message instead of an unhandled traceback.
try:
    random_search.fit(train_X, train_y)
    best_model = random_search.best_estimator_
    print("Model trained successfully.")
except Exception as err:
    # ValueError gets its own label; every other exception is reported as unexpected.
    label = "Training Error:" if isinstance(err, ValueError) else "Unexpected training error:"
    print(label, err)


Model trained successfully.
