# Model

## Importing Libraries

In [110]:
import pandas as pd
from sklearn.model_selection import (
   train_test_split,
   GridSearchCV
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
   accuracy_score,
   precision_score,
   recall_score
)
from joblib import dump

## Loading Clean Data

### Train Data

In [111]:
# Read the cleaned training set; the PassengerId column becomes the row index.
df_train = pd.read_csv("Datasets/Clean/train.csv", index_col="PassengerId")

# Preview the first rows as the cell output.
df_train.head()

Unnamed: 0_level_0,Survived,Class,Gender,Age,SibSp,ParCh,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,0,22,1,0,7.25,2
2,1,1,1,38,1,0,71.2833,0
3,1,3,1,26,0,0,7.925,2
4,1,1,1,35,1,0,53.1,2
5,0,3,0,35,0,0,8.05,2


### Validation Data

In [124]:
# Read the cleaned test.csv file; the PassengerId column becomes the row index.
# This frame is only used for the final predictions written out at the end
# of the notebook.
df_validation = pd.read_csv("Datasets/Clean/test.csv", index_col="PassengerId")

# Preview the first rows as the cell output.
df_validation.head()

Unnamed: 0_level_0,Class,Gender,Age,SibSp,ParCh,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,3,0,34,0,0,7.8292,1
893,3,1,47,1,0,7.0,2
894,2,0,62,0,0,9.6875,1
895,3,0,27,0,0,8.6625,2
896,3,1,22,1,1,12.2875,2


## Split Data into Train and Test Sets

In [112]:
# Columns used as model inputs; "Survived" is the prediction target.
feature_columns = [
   "Class",
   "Gender",
   "Age",
   "SibSp",
   "ParCh",
   "Fare",
   "Embarked",
]
X = df_train[feature_columns]
y = df_train["Survived"]

# Keep 90% of the rows for training and hold out the rest for evaluation.
# The fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
   X,
   y,
   train_size=0.9,
   random_state=5,
)

## Training the Model with a Hyperparameter Grid Search

### Random Forest Classifier

In [120]:
# Base estimator handed to the grid search. It is created with default
# settings; the hyperparameters (including random_state) are supplied by the
# parameter grid defined in the next cell.
rfc = RandomForestClassifier()

### Grid of parameters

In [123]:
# Hyperparameter grid explored by the grid search:
#   n_estimators   -> 100, 150, ..., 400 (7 values)
#   max_leaf_nodes -> 40, 45, ..., 80    (9 values)
#   random_state   -> single fixed seed so every candidate is reproducible
params = dict(
   n_estimators=[*range(100, 401, 50)],
   max_leaf_nodes=[*range(40, 81, 5)],
   random_state=[1],
)

### Grid Search

In [113]:
# Exhaustive search over every combination in `params`, ranking candidates by
# accuracy. `cv` is left at its default (the fit log below reports 5 folds).
# n_jobs=-1 asks for all available cores; verbose=1 prints the fit summary.
grid_search = GridSearchCV(
   estimator=rfc,
   param_grid=params,
   scoring="accuracy",
   n_jobs=-1,
   verbose=1,
)

# Fit on the training split only; X_test / y_test stay untouched for the
# evaluation further down.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 63 candidates, totalling 315 fits


### Best parameters found

In [114]:
# Show the parameter combination selected by the grid search.
grid_search.best_params_

{'max_leaf_nodes': 40, 'n_estimators': 150, 'random_state': 1}

## Model Score and Feature Importance

In [115]:
# Predict on the held-out split using the grid search's best estimator.
y_pred = grid_search.predict(X_test)

# Compute the metrics first so the report section below is pure formatting.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Scores:")
print(f"  Accuracy: {accuracy:.2%}")
print(f"  Precision: {precision:.2%}")
print(f"  Recall: {recall:.2%}")

# Pair each input column with the importance reported by the best estimator.
importances = grid_search.best_estimator_.feature_importances_
print("\nFeature Importance:")
for name, importance in zip(X.columns, importances):
   print(f"  {name}: {importance:.2%}")

Scores:
  Accuracy: 88.76%
  Precision: 88.46%
  Recall: 76.67%

Feature Importance:
  Class: 12.25%
  Gender: 35.81%
  Age: 17.70%
  SibSp: 5.47%
  ParCh: 3.79%
  Fare: 21.70%
  Embarked: 3.27%


## Saving results

In [117]:
# Select exactly the columns the model was trained on, in the same order.
# Passing the whole frame only works while test.csv happens to contain the
# training features and nothing else; selecting by X.columns ignores extra
# columns and raises a clear KeyError if a feature is missing.
validation_features = df_validation[X.columns]

# One "Survived" prediction per passenger, keeping the PassengerId index so
# it is written out as the first column of the CSV.
df_result = pd.DataFrame(
   data={
      "Survived": grid_search.predict(validation_features)
   },
   index=df_validation.index
)
df_result.to_csv("Datasets/predictions.csv")

## Saving the best model

In [118]:
# Persist the best estimator found by the grid search with joblib.
best_model = grid_search.best_estimator_
dump(best_model, "model.joblib")

['model.joblib']