In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
# Read the training and test splits from CSV files in the working directory.
# Replace the two file names below with your own data files if they differ.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

In [3]:
# One-hot encode the categorical columns (the 'survived' target included),
# dropping the first level of each so k levels become k-1 indicator columns.
# NOTE(review): the two frames are encoded independently, so a level that is
# absent from one split yields no column there, and the dropped baseline
# level could differ between splits -- confirm both CSVs share the same levels.
train_data, test_data = (
    pd.get_dummies(
        frame,
        columns=['pclass', 'age', 'gender', 'survived'],
        drop_first=True,
    )
    for frame in (train_data, test_data)
)

In [4]:
# Preview the first ten rows of the encoded training frame.
train_data.iloc[:10]

Unnamed: 0,pclass_2nd,pclass_3rd,pclass_crew,age_child,gender_male,survived_yes
0,False,False,False,False,True,True
1,False,False,False,False,True,True
2,False,False,False,False,True,True
3,False,False,False,False,True,True
4,False,False,False,False,True,True
5,False,False,False,False,True,True
6,False,False,False,False,True,True
7,False,False,False,False,True,True
8,False,False,False,False,True,True
9,False,False,False,False,True,True


In [5]:
# Relabel the six encoded columns by position. The names must stay in the
# same order as the get_dummies output (2nd, 3rd, crew, child, male, survived).
train_data = train_data.set_axis(
    [
        'pclass_is2nd', 'pclass_is3rd', 'pclass_iscrew',
        'age_ischild', 'gender_ismale', 'survived',
    ],
    axis=1,
)

train_data

Unnamed: 0,pclass_is2nd,pclass_is3rd,pclass_iscrew,age_ischild,gender_ismale,survived
0,False,False,False,False,True,True
1,False,False,False,False,True,True
2,False,False,False,False,True,True
3,False,False,False,False,True,True
4,False,False,False,False,True,True
...,...,...,...,...,...,...
2145,False,False,True,False,False,True
2146,False,False,True,False,False,True
2147,False,False,True,False,False,False
2148,False,False,True,False,False,False


In [6]:
# Relabel the encoded test columns by position. Only five names are given,
# with no crew indicator -- presumably the test split has no crew rows, so
# get_dummies emitted no such column (TODO confirm against test.csv).
test_data = test_data.set_axis(
    [
        'pclass_is2nd', 'pclass_is3rd',
        'age_ischild', 'gender_ismale', 'survived',
    ],
    axis=1,
)

# test_data

In [7]:
# Columns that exist in the training frame but not in the test frame.
missing_features = set(train_data.columns).difference(test_data.columns)

# Add each absent column to the test frame as a constant False indicator.
test_data = test_data.assign(**{feat: False for feat in missing_features})

In [8]:
# Select the test columns in the same order used for the training frame,
# with the 'survived' target last.
test_data = test_data.loc[:, [
    'pclass_is2nd', 'pclass_is3rd', 'pclass_iscrew',
    'age_ischild', 'gender_ismale', 'survived',
]]

test_data

Unnamed: 0,pclass_is2nd,pclass_is3rd,pclass_iscrew,age_ischild,gender_ismale,survived
0,False,False,False,False,True,True
1,False,False,False,False,True,True
2,False,False,False,False,True,True
3,False,False,False,False,True,True
4,False,False,False,False,True,True
...,...,...,...,...,...,...
61,False,True,False,False,True,True
62,False,True,False,False,True,True
63,False,True,False,False,True,True
64,False,True,False,False,True,True


In [9]:
# Separate the indicator features from the 'survived' target in each split.
X_train, y_train = train_data.drop(columns='survived'), train_data['survived']
X_test, y_test = test_data.drop(columns='survived'), test_data['survived']

In [16]:
# Baseline model: a random forest with default hyperparameters and a fixed
# seed (random_state=42), trained on the encoded training features.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict the target for the test rows.
y_pred = rf_classifier.predict(X_test)

# Score the predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Report each metric as a percentage with five decimal places.
print(f"Accuracy  (Initial Model): {accuracy * 100:.5f}%")
print(f"Precision (Initial Model): {precision * 100:.5f}%")
print(f"Recall    (Initial Model): {recall * 100:.5f}%")

Accuracy  (Initial Model): 53.03030%
Precision (Initial Model): 100.00000%
Recall    (Initial Model): 49.18033%


In [11]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter values to try; every combination is evaluated
# (3 * 4 * 3 * 3 = 108 candidates).
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 1, 2, 3],
    min_samples_split=[10, 20, 30],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid with 5-fold cross-validation, run in
# parallel (n_jobs=-1) and with progress messages enabled (verbose=1).
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Take the best estimator found by the search and predict the test rows.
best_rf_classifier = grid_search.best_estimator_
y_pred_tuned = best_rf_classifier.predict(X_test)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [18]:
# Display the best estimator selected by the grid search.
best_rf_classifier

In [12]:
# Score the tuned model's test predictions with the same three metrics used
# for the initial model.
accuracy_tuned, precision_tuned, recall_tuned = (
    metric(y_test, y_pred_tuned)
    for metric in (accuracy_score, precision_score, recall_score)
)

In [17]:
# Report the tuned model's metrics as percentages with five decimal places.
print(f"Accuracy (Tuned Model): {accuracy_tuned * 100:.5f}%")
print(f"Precision (Tuned Model): {precision_tuned * 100:.5f}%")
print(f"Recall (Tuned Model): {recall_tuned * 100:.5f}%")

Accuracy (Tuned Model): 53.03030%
Precision (Tuned Model): 100.00000%
Recall (Tuned Model): 49.18033%
