In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
# Look for missing values
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Imputing missing values
train['Pclass'] = train['Pclass'].astype('category')
train['Age'].fillna(train['Age'].mean(), inplace=True)

train['Cabin'] = train[~train['Cabin'].isna()]['Cabin'].map(lambda x: x[0])
train['Cabin'].fillna(train['Cabin'].mode()[0], inplace=True)

In [5]:
train['Cabin'].value_counts()

C    746
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: Cabin, dtype: int64

In [6]:
# Since there are no T-Cabins in the test dataset, we drop that sample
#train = train[train['Cabin'] != 'T']

In [75]:
# 1-hot encoding for Categorical data
X = pd.get_dummies(train.drop(['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin' \
                               ], axis=1), drop_first=True).values

y = train['Survived'].values

In [76]:
# Split training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, stratify=y)

In [86]:
# Load the model
forest = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=9)

# Fit the model
forest.fit(X_train, y_train)

# Accuracy of the model
forest.score(X_test,y_test)

0.8470149253731343

In [53]:
# Train the model using the whole dataset
forest.fit(X,y)

RandomForestClassifier(max_depth=4, n_estimators=140, random_state=9)

### GridSearch

In [54]:
from sklearn.model_selection import GridSearchCV, KFold

In [55]:
param_grid = {'max_depth':np.arange(1,10,1), 'n_estimators':np.arange(70,150, 10) }

kf = KFold(n_splits=6, shuffle=True, random_state=9)

forest = RandomForestClassifier()

forest_cv = GridSearchCV(forest, param_grid, cv=kf)

forest_cv.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=6, random_state=9, shuffle=True),
             estimator=RandomForestClassifier(),
             param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
                         'n_estimators': array([ 70,  80,  90, 100, 110, 120, 130, 140])})

In [78]:
forest_cv.best_params_

{'max_depth': 5, 'n_estimators': 90}

In [79]:
forest_cv.best_score_

0.8427153348269852

In [80]:
forest = RandomForestClassifier(max_depth=5, n_estimators=90)

forest.fit(X,y)

RandomForestClassifier(max_depth=5, n_estimators=90)

### Prepare the test data

In [81]:
test.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [82]:
# Imputing missing values
test['Pclass'] = test['Pclass'].astype('category')
test['Age'].fillna(train['Age'].mean(), inplace=True)

test['Cabin'] = test[~test['Cabin'].isna()]['Cabin'].map(lambda x: x[0])
test['Cabin'].fillna(train['Cabin'].mode()[0], inplace=True)

test['Fare'].fillna(train['Fare'].mean(), inplace=True)

In [83]:
# 1-hot encoding for Categorical data
X_new = pd.get_dummies(test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin' \
                               ], axis=1), drop_first=True).values

In [84]:
# Predict the survivals
pred = forest.predict(X_new)

In [85]:
survival = pd.DataFrame({'PassengerId':test['PassengerId'], 'Survived': pred})

# Export the predictions
survival.to_csv('survival.csv', index=False)