In [18]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV 

In [2]:
# Read the training and test CSV files into DataFrames.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# not run on another machine without editing them.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "D:\\LSU\\Sp_2025\\BE7910\\titanic\\train.csv",
        "D:\\LSU\\Sp_2025\\BE7910\\titanic\\test.csv",
    )
)

In [3]:
# Combine the Parch and SibSp counts into a single FamilySize feature,
# applied identically to both splits.
train["FamilySize"] = train["Parch"].add(train["SibSp"])
test["FamilySize"] = test["Parch"].add(test["SibSp"])

In [4]:
# Feature matrix (six predictor columns) and target, both taken from the
# training frame. y is kept as a one-column DataFrame.
x = train.loc[:, ['Pclass', 'Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']]
y = train.loc[:, ['Survived']]

In [5]:
# Same six predictor columns as the training feature matrix, from the test frame.
x_test = test.loc[:, ['Pclass', 'Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']]

In [6]:
# Count missing values per training feature column.
x.isnull().sum()

Pclass          0
Sex             0
Age           177
FamilySize      0
Fare            0
Embarked        2
dtype: int64

In [7]:
# Count missing values per test feature column.
x_test.isnull().sum()

Pclass         0
Sex            0
Age           86
FamilySize     0
Fare           1
Embarked       0
dtype: int64

In [8]:
# One-hot encode the categorical columns. The encoder is fitted on the training
# frame only; the fitted `encoder` and `feature_names` are reused for the test set.
encoder = OneHotEncoder()
encoded_df = encoder.fit_transform(x[['Sex', 'Embarked']])

feature_names = encoder.get_feature_names_out()
# Carry x's index over to the dense frame: pd.concat(axis=1) aligns rows by
# index label, so a fresh 0..n-1 index would silently introduce NaN rows if x
# were ever filtered or re-indexed upstream.
encoded_df = pd.DataFrame(encoded_df.toarray(), columns=feature_names, index=x.index)

# Drop the last encoded column.
# NOTE(review): 'Embarked' has missing values in the training data, and the
# columns that survive this drop are Sex_female/Sex_male/Embarked_C/Q/S, so the
# column removed here is presumably the encoder's missing-value indicator for
# Embarked rather than a redundant dummy level. Confirm before relying on this
# step to avoid the dummy variable trap.
encoded_df = encoded_df.drop(feature_names[-1], axis=1)

# Integrate back into X: swap the raw categorical columns for the encoded ones
x1 = x.drop(['Sex', 'Embarked'], axis=1)
x1 = pd.concat([x1, encoded_df], axis=1)
x1.isna().sum()

Pclass          0
Age           177
FamilySize      0
Fare            0
Sex_female      0
Sex_male        0
Embarked_C      0
Embarked_Q      0
Embarked_S      0
dtype: int64

In [9]:
# Encode the test categoricals with the encoder already fitted on the training
# data (transform only, no refit).
encoded_df_test = encoder.transform(x_test[['Sex', 'Embarked']])

# Carry x_test's index over to the dense frame: pd.concat(axis=1) aligns rows
# by index label, so a fresh 0..n-1 index would silently introduce NaN rows if
# x_test were ever filtered or re-indexed upstream.
encoded_df_test = pd.DataFrame(
    encoded_df_test.toarray(), columns=feature_names, index=x_test.index
)

# Drop the same column that is dropped from the training encoding (the last
# entry of feature_names) so both splits keep identical encoded columns.
# NOTE(review): the surviving columns are Sex_female/Sex_male/Embarked_C/Q/S,
# so this is presumably the missing-value indicator for Embarked rather than a
# redundant dummy level. Confirm.
encoded_df_test = encoded_df_test.drop(feature_names[-1], axis=1)

# Integrate back into X: swap the raw categorical columns for the encoded ones
x1_test = x_test.drop(['Sex', 'Embarked'], axis=1)
x1_test = pd.concat([x1_test, encoded_df_test], axis=1)
x1_test.isna().sum()

Pclass         0
Age           86
FamilySize     0
Fare           1
Sex_female     0
Sex_male       0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
dtype: int64

In [10]:
# Fill missing ages with the median age. The imputer is fitted on the training
# data here and reused (transform only) for the test set.
imputer = SimpleImputer(strategy='median')
age_imputed = imputer.fit_transform(x1[['Age']])

# Keep x1's index on the imputed column so the column-wise concat below lines
# up row-for-row even if x1 does not carry the default 0..n-1 index.
imputed_df = pd.DataFrame(age_imputed, columns=["Age"], index=x1.index)

# Replace original Age column (the imputed column is appended as the last column)
x1 = x1.drop(['Age'], axis=1)
x1 = pd.concat([x1, imputed_df], axis=1)
x1.isna().sum()

Pclass        0
FamilySize    0
Fare          0
Sex_female    0
Sex_male      0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
Age           0
dtype: int64

In [11]:
# Impute the test set using statistics learned from the training set only.
age_imputed_test = imputer.transform(x1_test[['Age']])

# Fare is complete in the training features but has a missing value in the
# test features, so a median imputer is fitted on the training fares and
# applied to the test fares.
fare_imputer = SimpleImputer(strategy='median')

# The transformed training fares are not merged back into x1.
fare_imputed = fare_imputer.fit_transform(x1[['Fare']])
fare_imputed_test = fare_imputer.transform(x1_test[['Fare']])

# Keep x1_test's index on the imputed columns so the column-wise concat below
# lines up row-for-row even if x1_test does not carry the default 0..n-1 index.
imputed_df_age_test = pd.DataFrame(age_imputed_test, columns=["Age"], index=x1_test.index)
imputed_df_fare_test = pd.DataFrame(fare_imputed_test, columns=["Fare"], index=x1_test.index)

# Replace the original Age and Fare columns with their imputed versions
x1_test = x1_test.drop(['Age', 'Fare'], axis=1)
x1_test = pd.concat([x1_test, imputed_df_age_test, imputed_df_fare_test], axis=1)
x1_test.isna().sum()

Pclass        0
FamilySize    0
Sex_female    0
Sex_male      0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
Age           0
Fare          0
dtype: int64

In [12]:
# Min-max scale every training feature. The scaler is fitted on the training
# matrix and the result is wrapped back into a DataFrame with the same column names.
scaler = MinMaxScaler()
x_scaled = pd.DataFrame(scaler.fit_transform(x1), columns=x1.columns)

In [13]:
# Put the test columns into the training column order before applying the
# scaler that was fitted on the training matrix.
x1_test = x1_test.loc[:, x1.columns]

x_test_scaled = pd.DataFrame(
    scaler.transform(x1_test),
    columns=x1_test.columns,
)

# Select the columns in the order of the scaled training frame.
x_test_scaled = x_test_scaled.loc[:, x_scaled.columns]

In [14]:
# Sanity check: count missing values per column of the scaled test matrix.
x_test_scaled.isnull().sum()

Pclass        0
FamilySize    0
Fare          0
Sex_female    0
Sex_male      0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
Age           0
dtype: int64

In [None]:
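# Baseline (disabled): random forest with default hyperparameters, kept for
# reference. The grid-searched model below is the one used for predictions.
#model = RandomForestClassifier()
#model.fit(x_scaled, y.squeeze())

#y_pred = model.predict(x_test_scaled)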

Hyperparameter Tuning

In [19]:
# Candidate values for the random-forest hyperparameters (3 x 3 x 2 = 18 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5],
)

# Base estimator with a fixed seed.
rf = RandomForestClassifier(random_state=42)

# Search over param_grid, scoring by accuracy with 5-fold cross-validation
# and using all available CPU cores (n_jobs=-1).
grid_search = GridSearchCV(rf, param_grid, scoring='accuracy', cv=5, n_jobs=-1)

# Fit on the scaled training features; y is squeezed from a one-column
# DataFrame down to a Series.
grid_search.fit(x_scaled, y.squeeze())

# Report the winning combination and its score.
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Best Hyperparameters: {'max_depth': 15, 'min_samples_split': 5, 'n_estimators': 100}
Best Score: 0.8282844768062269


In [20]:
# Use the best estimator to predict on test data
best_rf = grid_search.best_estimator_
y_test_pred = best_rf.predict(x_test_scaled)

In [21]:
# Pair each test PassengerId with its predicted Survived label.
output = pd.DataFrame(
    {
        'PassengerId': test['PassengerId'],
        'Survived': y_test_pred,
    }
)

In [22]:
# Write the predictions to CSV without the DataFrame index column.
# NOTE(review): absolute, machine-specific output path.
output.to_csv(
    path_or_buf="D:\\LSU\\Sp_2025\\BE7910\\titanic\\pred_RF_classifier_HP_tuning.csv",
    index=False,
)