In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [3]:
# Load the Titanic training data
data = pd.read_csv('../titanic/train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Preprocess the data
# Fill missing Age with median
data['Age'].fillna(data['Age'].median(), inplace=True)

# Fill missing Embarked with mode
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)

# Drop columns that are not useful: PassengerId, Name, Ticket, Cabin
data = data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

# Encode categorical variables
le_sex = LabelEncoder()
data['Sex'] = le_sex.fit_transform(data['Sex'])

le_embarked = LabelEncoder()
data['Embarked'] = le_embarked.fit_transform(data['Embarked'])

# Features and target
X = data.drop('Survived', axis=1)
y = data['Survived']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
data['Embarked']

0      2
1      0
2      2
3      2
4      2
      ..
886    2
887    2
888    2
889    0
890    1
Name: Embarked, Length: 891, dtype: int64

In [6]:
# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Create a Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Perform Grid Search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters found: ", grid_search.best_params_)

# Best score
print("Best cross-validation score: ", grid_search.best_score_)

# Evaluate on test set
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
print("Test set accuracy: ", accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; tot

In [8]:
# sklearn removed `grid_scores_`; use cv_results_ instead
grid_mean_scores = grid_search.cv_results_['mean_test_score']
print("Grid mean scores: ", grid_mean_scores)

Grid mean scores:  [0.80060081 0.79779376 0.79920221 0.81460652 0.81459667 0.81602482
 0.82019108 0.82298828 0.81876293 0.81458682 0.81316852 0.81598542
 0.81878263 0.81457697 0.81599527 0.81736433 0.82017138 0.81735448
 0.81737418 0.82298828 0.81877278 0.81737418 0.82298828 0.81877278
 0.81876293 0.82157983 0.82580518 0.81598542 0.81598542 0.81879248
 0.81596572 0.81737418 0.81737418 0.81736433 0.82158968 0.81453758
 0.82297843 0.81877278 0.81456712 0.81595588 0.81455727 0.81596572
 0.81876293 0.81734463 0.81876293 0.82159953 0.82439673 0.82017138
 0.82159953 0.82439673 0.82017138 0.82718408 0.82439673 0.82862208
 0.7978233  0.79639515 0.80060081 0.81179947 0.81459667 0.81602482
 0.82019108 0.82298828 0.81876293 0.81458682 0.81316852 0.81598542
 0.82019108 0.81457697 0.81599527 0.81736433 0.82017138 0.81735448
 0.81737418 0.82298828 0.81877278 0.81737418 0.82298828 0.81877278
 0.81876293 0.82157983 0.82580518 0.80060081 0.79779376 0.79920221
 0.81460652 0.81459667 0.81602482 0.8201910

In [None]:
plt.plot()