# Advanced Model Evaluation and Optimization
`Objective:`

Enhance Random Forest model’s performance by exploring advanced evaluation metrics, cross-validation strategies, 
and hyperparameter optimization techniques. Compare the optimized model with Day 7 deployment.



In [71]:
# import libraries
import pandas as pd
import joblib
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
def wrangle(filepath):
    df = pd.read_csv(filepath)

    return df

In [4]:
df = wrangle(r"C:\Users\User\Desktop\100DayOfCode\Titanic_clean.csv")

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [46]:
def preprocessing(df):
    df_processed = df.copy()
    # Drop irrelevant column
    df_processed = df_processed.drop(columns = ["Unnamed: 0"])
    # subset data
    df_processed["FamilySize"] = df_processed["SibSp"] + df_processed["Parch"] + 1
    df_processed["Title"] = df_processed["Name"].str.extract(" ([A-Za-z]+)\.", expand = False)
    # Drop unwanted columns
    df_processed.drop(columns = ["Name", "SibSp", "Parch"], inplace = True)

    # One hot encoing using pandas
    cols = ["Sex", "Embarked", "Title"]
    df_processed = pd.get_dummies(df_processed, columns = cols, drop_first=1)

    # To convert bool to int
    for col in df_processed.columns:
        if df_processed[col].dtype == "bool":
            df_processed[col] = df_processed[col].astype(int)
            
     # Standardize numerical columns
    num_cols = ["Age", "Fare", "FamilySize", "Pclass"]
    scaler = StandardScaler()
    df_processed[num_cols] = scaler.fit_transform(df_processed[num_cols])
    return df_processed, scaler

In [47]:
df_clean, scaler = preprocessing(df)
df_clean.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,Ticket,Fare,FamilySize,Sex_male,Embarked_Q,Embarked_S,...,Title_Major,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir
0,1,0,0.827377,-0.565736,A/5 21171,-0.502445,0.05916,1,0,1,...,0,0,0,0,0,1,0,0,0,0
1,2,1,-1.566107,0.663861,PC 17599,0.786845,0.05916,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,3,1,0.827377,-0.258337,STON/O2. 3101282,-0.488854,-0.560975,0,0,1,...,0,0,1,0,0,0,0,0,0,0
3,4,1,-1.566107,0.433312,113803,0.42073,0.05916,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,5,0,0.827377,0.433312,373450,-0.486337,-0.560975,1,0,1,...,0,0,0,0,0,1,0,0,0,0


In [52]:
features = (col for col in df_clean.columns if col not in ["PassengerId","Survived","Ticket"])
X = df_clean[features]
y = df_clean["Survived"]

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Random forest model
rf_model = RandomForestClassifier(random_state = 42)
params = {
    "n_estimators": [50,100,200],
    "max_depth": [None, 10, 100]
}

grid_search = GridSearchCV(rf_model, params, cv = 5, scoring = "accuracy")
grid_search.fit(X_train, y_train)

In [56]:
# To save the best model
best_rf_model = grid_search.best_estimator_
joblib.dump(best_rf_model, "Titanic_rf_model.joblib")
joblib.dump(scaler, "Titanic_scaler.joblib")

['Titanic_scaler.joblib']

In [49]:
X.head()

Unnamed: 0,Pclass,Age,Fare,FamilySize,Sex_male,Embarked_Q,Embarked_S,Title_Col,Title_Countess,Title_Don,...,Title_Major,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir
0,0.827377,-0.565736,-0.502445,0.05916,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,-1.566107,0.663861,0.786845,0.05916,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0.827377,-0.258337,-0.488854,-0.560975,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,-1.566107,0.433312,0.42073,0.05916,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0.827377,0.433312,-0.486337,-0.560975,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [63]:
# Prediction using the saved model
loaded_model = joblib.load("Titanic_rf_model.joblib")
y_pred = loaded_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Rf_Model_accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Rf_Model_accuracy: 0.84
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.89      0.87       105
           1       0.83      0.77      0.80        74

    accuracy                           0.84       179
   macro avg       0.84      0.83      0.83       179
weighted avg       0.84      0.84      0.84       179

Confusion Matrix:
 [[93 12]
 [17 57]]


In [70]:
# Stratified K-fold cross validation
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
cv_score = []
# To iterate through the 5 folds and return accuracy
for train_idx, val_idx in skf.split(X, y):
    X_train_fold, X_test_fold = X.iloc[train_idx], y.iloc[val_idx]
    y_train_fold, y_test_fold = y.iloc[train_idx], y.iloc[val_idx]
    rf2_model = RandomForestClassifier(random_state = 42)
    rf2_model.fit(X_train_fold, y_train_fold)
    cv_score.append(rf2_model.score(X_train_fold, y_train_fold))

# Evaluate models accuracy
print(f"Stratified K-fold scores: {cv_score}")
print(f"Mean Cv accuray:\n {np.mean(cv_score):.2f} (+/- {np.std(cv_score)*2 :.2f})")

Stratified K-fold scores: [0.9817415730337079, 0.9873772791023843, 0.9845722300140253, 0.9859747545582047, 0.9803646563814866]
Mean Cv accuray:
 0.98 (+/- 0.01)


In [80]:
# Hyperparameter optimization with RandomizedSearchCV
param_dist = {
    "n_estimators": [50,100, 200,300],
    "max_depth": [None, 10, 20,30],
    "min_samples_split": [2,5,10],
    "min_samples_leaf": [1,2,4],
    "max_features":["auto", "sqrt"]
    
}
rf3 = RandomForestClassifier(random_state= 42)
random_search = RandomizedSearchCV(rf3,  param_distributions = param_dist, n_iter = 10, cv = 5, scoring = "f1", random_state = 42, n_jobs = -1)
random_search.fit(X_train, y_train)

In [86]:
print("Best Parameters fromm Randomized Search:", random_search.best_params_)
best_rf_model = random_search.best_estimator_
y_pred_optimized = best_rf_model.predict(X_test)
print(f"Optimized Model Accuracy: {accuracy_score(y_test, y_pred_optimized):.2f}")
print("Optimized Classification Report:\n", classification_report(y_test, y_pred_optimized))

Best Parameters fromm Randomized Search: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None}
Optimized Model Accuracy: 0.85
Optimized Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.89      0.87       105
           1       0.83      0.80      0.81        74

    accuracy                           0.85       179
   macro avg       0.85      0.84      0.84       179
weighted avg       0.85      0.85      0.85       179

