In [18]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier

In [19]:
# Columns used as model inputs, and the label column to predict.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
target = 'Survived'

# Read both Kaggle splits, index them by PassengerId and keep only the
# columns used downstream (the test split has no label column selected).
train = (
    pd.read_csv('/kaggle/input/titanic/train.csv')
    .set_index('PassengerId')[features + [target]]
)
test = (
    pd.read_csv('/kaggle/input/titanic/test.csv')
    .set_index('PassengerId')[features]
)

In [20]:
# Fraction of missing values per feature, for the train and the test split.
# (Values are fractions in [0, 1], i.e. NaN count divided by row count.)
nan_percentage_train = [train[col].isna().sum() / len(train) for col in features]
nan_percentage_test = [test[col].isna().sum() / len(test) for col in features]

# One row per feature, one column per split; shown as the cell output.
df = pd.DataFrame(
    {'train': nan_percentage_train, 'test': nan_percentage_test},
    index=features,
)
df

In [21]:
# Stack the train and test feature rows into one frame so that encoding and
# imputation below see the same columns and categories for both splits.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# replacement. Selecting `test[features]` explicitly keeps any column added to
# `test` later in the notebook (e.g. the predicted 'Survived') out of the
# feature matrix when this cell is re-run.
data = pd.concat([train[features], test[features]])

# One-hot encode the categorical columns 'Pclass' and 'Embarked'.
# NOTE(review): with get_dummies' defaults a missing 'Embarked' becomes an
# all-zero indicator row, so it is not filled by the imputer below.
data = pd.get_dummies(data=data, columns=['Pclass', 'Embarked'], prefix=['Pclass', 'Embarked'])

# Encode 'Sex' as integer codes.
data['Sex'] = LabelEncoder().fit_transform(data['Sex'])

# Fill the remaining missing values with an iterative (MICE-style) imputer.
# NOTE(review): the imputer is fitted on train and test rows together.
imp = IterativeImputer(max_iter=10, random_state=0)
data_mice = imp.fit_transform(data)

# fit_transform returns a plain array; restore column labels and the index.
data_mice = pd.DataFrame(data_mice, columns=data.columns, index=data.index)
data = data_mice

In [22]:
# Min-max scale every column of the imputed frame, then wrap the resulting
# array back into a DataFrame carrying the original labels and index.
data_scaled = pd.DataFrame(
    MinMaxScaler().fit_transform(data),
    index=data.index,
    columns=data.columns,
)
data = data_scaled

In [23]:
# Split the combined, preprocessed frame back into the two original splits
# by looking up each split's index labels.
X_train, X_test = data.loc[train.index], data.loc[test.index]

# Labels come from the untouched training frame.
y_train = train.loc[:, 'Survived']

In [24]:
# Hyper-parameter grid explored for the random forest.
parameters = dict(
    n_estimators=[100, 200, 500, 1000],
    max_depth=[2, 3, 4, 5, 6],
)

# Exhaustive search over the grid using GridSearchCV's default
# cross-validation and scoring; the seed is fixed on the base estimator.
rfc = RandomForestClassifier(random_state=0)
clf = GridSearchCV(estimator=rfc, param_grid=parameters)
clf.fit(X_train, y_train)

In [25]:
# Report the best cross-validated score and the parameters that achieved it,
# one per line.
print(clf.best_score_, clf.best_params_, sep='\n')

In [26]:
# Train the final model with the hyper-parameters selected by the grid search
# rather than hand-copied values, so the model stays in sync with the search
# whenever the grid, the data or the library version changes.
ran_forestclf = RandomForestClassifier(random_state=0, **clf.best_params_)
ran_forestclf.fit(X_train, y_train)

# Predict on the test rows and write them as the submission file; the
# index of `test` (PassengerId) is written alongside the 'Survived' column.
test['Survived'] = ran_forestclf.predict(X_test)
test['Survived'].to_csv('titanic_submission.csv')