# Implementing a Random Forest Classifier

`Objective:` 

Replace the logistic regression model from Day 4 with a Random Forest Classifier to predict Titanic survival. 
Then compare the two models' performance, focusing on feature importance and model evaluation.



In [69]:
# To import the required libraries
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [5]:
def wrangle(filepath):
    """Read the CSV at ``filepath`` and return it as a pandas DataFrame.

    ``filepath`` is handed to ``pd.read_csv`` unchanged and no cleaning is
    applied here.
    """
    return pd.read_csv(filepath)
               

In [28]:
# Location of the cleaned Titanic CSV. The default is the original machine-specific
# path; set the TITANIC_CSV_PATH environment variable to load the file from
# somewhere else without editing the notebook.
DATA_PATH = os.environ.get(
    "TITANIC_CSV_PATH",
    r"C:\Users\User\Desktop\100DayOfCode\Titanic_clean.csv",
)
df = wrangle(DATA_PATH)

In [53]:
# Preview the first rows of the loaded data to check the columns read in as expected.
df.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [54]:
# Custom preprocessing function
def preprocess_data(df):
    """Engineer features, encode categoricals and scale numerics for modelling.

    The input frame is copied first, so the caller's DataFrame is not modified.

    Steps:
      1. Drop the "Unnamed: 0" column if it is present.
      2. Add "FamilySize" = SibSp + Parch + 1.
      3. Add "Title": the first run of letters in "Name" that follows a space
         and is immediately followed by a period.
      4. Drop "Name", "SibSp" and "Parch".
      5. One-hot encode "Embarked", "Title" and "Sex", dropping the first
         level of each.
      6. Cast boolean columns to integers.
      7. Standardise "FamilySize", "Age", "Fare" and "Pclass".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Name", "SibSp", "Parch", "Embarked", "Sex", "Age",
        "Fare" and "Pclass". Any other columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied.
    """
    df_processed = df.copy()
    # errors="ignore" keeps the function usable on frames that do not carry
    # the "Unnamed: 0" column.
    df_processed = df_processed.drop(columns=["Unnamed: 0"], errors="ignore")

    # Feature engineering
    df_processed["FamilySize"] = df_processed["SibSp"] + df_processed["Parch"] + 1
    # Raw string: "\." is a regex escape, not a Python string escape. Without
    # the r-prefix newer Python versions warn about an invalid escape sequence.
    df_processed["Title"] = df_processed["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df_processed = df_processed.drop(["Name", "SibSp", "Parch"], axis=1)

    # Data preprocessing
    # Encoding categorical variables
    categorical_cols = ["Embarked", "Title", "Sex"]
    df_processed = pd.get_dummies(df_processed, columns=categorical_cols, drop_first=True)

    # Convert bool columns to int in a single astype call.
    bool_to_int = {
        col: int for col, dtype in df_processed.dtypes.items() if dtype == "bool"
    }
    df_processed = df_processed.astype(bool_to_int)

    # Scale numerical variables
    # NOTE(review): the scaler is fitted on every row passed in. If this runs
    # before a train/test split, test-set statistics influence the scaling;
    # consider fitting on the training rows only.
    numeric_cols = ["FamilySize", "Age", "Fare", "Pclass"]
    scaler = StandardScaler()
    df_processed[numeric_cols] = scaler.fit_transform(df_processed[numeric_cols])

    return df_processed
        

In [55]:
# Build the model-ready frame; `df` is left untouched because preprocess_data works on a copy.
df_clean = preprocess_data(df)

In [56]:
# Inspect the first rows of the engineered, encoded and scaled frame.
df_clean.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,Ticket,Fare,FamilySize,Embarked_Q,Embarked_S,Title_Col,...,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir,Sex_male
0,1,0,0.827377,-0.565736,A/5 21171,-0.502445,0.05916,0,1,0,...,0,0,0,0,1,0,0,0,0,1
1,2,1,-1.566107,0.663861,PC 17599,0.786845,0.05916,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3,1,0.827377,-0.258337,STON/O2. 3101282,-0.488854,-0.560975,0,1,0,...,0,1,0,0,0,0,0,0,0,0
3,4,1,-1.566107,0.433312,113803,0.42073,0.05916,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,0,0.827377,0.433312,373450,-0.486337,-0.560975,0,1,0,...,0,0,0,0,1,0,0,0,0,1


In [67]:
# Features and target
# "PassengerId" and "Ticket" are excluded from the inputs, and "Survived" is the label.
# `features` is a list rather than a generator so it can be reused after this
# cell; a generator would be exhausted by its first use below.
features = [col for col in df_clean.columns if col not in ("PassengerId", "Ticket", "Survived")]
X = df_clean[features]
y = df_clean["Survived"]

# Train/test split: 20% of the rows are held out, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)

# Train model
# Hyperparameter tuning with grid search: every combination of the values below
# (3 x 3 = 9 candidates) is scored by accuracy using 5-fold cross-validation
# on the training set only.
rf_model = RandomForestClassifier(random_state = 42)
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20]
}
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring = "accuracy")
grid_search.fit(X_train, y_train)


In [70]:
# Evaluate the best estimator found by the grid search on the held-out test set.
best_rf_model = grid_search.best_estimator_
y_pred_rf = best_rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_rf)
# ": " separates the label from the parameter dict so they do not run together.
print(f"Best Random Forest Parameters: {grid_search.best_params_}")
print(f"Accuracy Score: {accuracy}")
# Print the label and the report separately: passing both to a single print()
# inserts the separator space right after the newline, which shifts the
# report's first row one column to the right.
print("Classification_report:")
print(classification_report(y_test, y_pred_rf))


Best Random Forest Parameters{'max_depth': 10, 'n_estimators': 100}
Accuracy Score: 0.8324022346368715
Classification_report:
               precision    recall  f1-score   support

           0       0.84      0.88      0.86       105
           1       0.81      0.77      0.79        74

    accuracy                           0.83       179
   macro avg       0.83      0.82      0.83       179
weighted avg       0.83      0.83      0.83       179



In [76]:
# Feature Importance
# Take the feature names from the frame the model was fitted on, so they are
# guaranteed to line up with `feature_importances_` without repeating the
# column filter used to build X.
features = list(X_train.columns)
feature_importance = pd.DataFrame({'Feature': features, 'Importance': best_rf_model.feature_importances_})
# Label and table are printed separately so print()'s separator space does not
# push the table's header row out of alignment.
print("\nRandom Forest Feature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))


Random Forest Feature Importance:
            Feature  Importance
2             Fare    0.202434
17        Title_Mr    0.158577
1              Age    0.158558
22        Sex_male    0.152821
0           Pclass    0.096598
3       FamilySize    0.090865
18       Title_Mrs    0.045341
14      Title_Miss    0.035463
5       Embarked_S    0.022585
4       Embarked_Q    0.012062
13    Title_Master    0.009796
20       Title_Rev    0.005518
9         Title_Dr    0.003328
12     Title_Major    0.001559
15      Title_Mlle    0.001376
6        Title_Col    0.001349
11      Title_Lady    0.000704
19        Title_Ms    0.000577
16       Title_Mme    0.000421
7   Title_Countess    0.000070
10  Title_Jonkheer    0.000000
8        Title_Don    0.000000
21       Title_Sir    0.000000
