In [182]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

#Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline        

#DS
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

#ML
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.utils import shuffle

In [183]:
# Load gender_submission.csv from the Titanic input directory into a DataFrame.
gender_submission = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")

In [184]:
# Read the training CSV and preview its first rows (head() defaults to 5).
train_df = pd.read_csv("/kaggle/input/titanic/train.csv")
train_df.head()

In [185]:
# Read the test CSV and preview its first rows (head() defaults to 5).
test_df = pd.read_csv("/kaggle/input/titanic/test.csv")
test_df.head()

In [186]:
# Remove the identifier column from the training frame before modelling.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError because the column
# is already gone.
train_df = train_df.drop(columns=['PassengerId'], errors='ignore')
train_df.describe()

**Survival rate based on gender**

In [187]:
# Mean of `Survived` per `Sex`, highest rate first.
(
    train_df
    .groupby('Sex', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

**Survival rate based on Pclass**

In [188]:
# Mean of `Survived` per `Pclass`, highest rate first.
(
    train_df
    .groupby('Pclass', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

**Sex**

In [189]:
# Integer-encode `Sex` in both frames via pandas category codes.
for frame in (train_df, test_df):
    frame['Sex_clean'] = frame['Sex'].astype('category').cat.codes

**Embarked**

In [190]:
# Fill missing embarkation ports with 'S' (the default this notebook uses).
# Assigning the result back replaces `Series.fillna(..., inplace=True)` on a
# column selection, which is chained assignment: deprecated in recent pandas
# and a silent no-op under Copy-on-Write. The fill is applied to the test frame
# as well so a missing port there cannot turn into category code -1.
for frame in (train_df, test_df):
    frame['Embarked'] = frame['Embarked'].fillna('S')

# Encode both frames against one category list taken from the training data,
# so a given code always refers to the same port. Sorting reproduces the order
# that `astype('category')` would infer, keeping the training codes unchanged.
embarked_dtype = pd.CategoricalDtype(sorted(train_df['Embarked'].unique()))
for frame in (train_df, test_df):
    frame['Embarked_clean'] = frame['Embarked'].astype(embarked_dtype).cat.codes

**Family**

In [191]:
# Family = the passenger plus siblings/spouses plus parents/children aboard;
# Solo flags passengers whose family size is exactly one.
for frame in (train_df, test_df):
    frame['Family'] = 1 + frame['SibSp'] + frame['Parch']
    frame['Solo'] = frame['Family'].eq(1)

**Fare**

In [192]:
# Bin fares into quintiles. The bin edges are learned from the training data
# only and then reused for the test data: running qcut separately on each frame
# gives different edges, so the same Fare_clean code would describe different
# fare ranges in train and test.
train_df['FareBin'], fare_edges = pd.qcut(train_df['Fare'], 5, retbins=True)
train_df['Fare_clean'] = train_df['FareBin'].cat.codes

# Open the outermost edges so test fares outside the training range still land
# in the first/last bin instead of becoming NaN.
test_fare_edges = np.concatenate(([-np.inf], fare_edges[1:-1], [np.inf]))
# A missing test fare would otherwise be encoded as -1; substitute the training
# median for binning purposes (the raw `Fare` column is left untouched).
test_fare = test_df['Fare'].fillna(train_df['Fare'].median())
test_df['FareBin'] = pd.cut(test_fare, bins=test_fare_edges)
test_df['Fare_clean'] = test_df['FareBin'].cat.codes

**Title**

In [193]:
# Honorific = the run of letters that follows a space and ends with a period.
# Raw string: `\.` must reach the regex engine as an escaped dot; in a normal
# string literal it is an invalid Python escape sequence and triggers a warning.
TITLE_PATTERN = r' ([A-Za-z]+)\.'

# Infrequent honorifics are collapsed into a single 'Other' bucket ...
RARE_TITLES = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
# ... and spelling variants are folded into their common form.
TITLE_SYNONYMS = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for frame in (train_df, test_df):
    frame['Title'] = (
        frame['Name']
        .str.extract(TITLE_PATTERN, expand=False)
        .replace(RARE_TITLES, 'Other')
        .replace(TITLE_SYNONYMS)
    )

# Encode both frames against the titles seen in training so a code means the
# same title everywhere. Sorting reproduces the order `astype('category')`
# would infer, so training codes are unchanged; a title that occurs only in the
# test data becomes -1 instead of shifting the other codes.
title_dtype = pd.CategoricalDtype(sorted(train_df['Title'].dropna().unique()))
for frame in (train_df, test_df):
    frame['Title_clean'] = frame['Title'].astype(title_dtype).cat.codes

**Age**

In [194]:
# Right-closed age bands: (-inf, 10] -> 0, (10, 16] -> 1, (16, 20] -> 2,
# (20, 26] -> 3, (26, 30] -> 4, (30, 36] -> 5, (36, 40] -> 6, (40, 46] -> 7,
# (46, 50] -> 8, (50, 60] -> 9, (60, inf) -> 10.
AGE_EDGES = [-np.inf, 10, 16, 20, 26, 30, 36, 40, 46, 50, 60, np.inf]

for frame in (train_df, test_df):
    # Impute a missing age with the median age of passengers sharing the same
    # Title within that frame. The result is assigned back rather than using
    # `fillna(..., inplace=True)` on a column selection, which is chained
    # assignment (deprecated; a no-op under pandas Copy-on-Write).
    median_by_title = frame.groupby('Title')['Age'].transform('median')
    frame['Age'] = (
        frame['Age']
        .fillna(median_by_title)
        # Fallback for a Title group with no known ages at all, so no NaN
        # reaches the model.
        .fillna(frame['Age'].median())
    )
    # labels=False returns the integer band index; pd.cut is right-closed by
    # default, matching the original `> lo` / `<= hi` comparisons. The column
    # holds integers here rather than floats.
    frame['Age_clean'] = pd.cut(frame['Age'], bins=AGE_EDGES, labels=False)

**Cabin**

In [195]:
# Ordinal code for the deck letter (first character of the cabin string).
cabin = {
    'A': 0,
    'B': 1,
    'C': 2,
    'D': 3,
    'E': 4,
    'F': 5,
    'G': 6,
    'T': 7
}

for frame in (train_df, test_df):
    # Deck code where a cabin is recorded, NaN otherwise.
    deck = frame['Cabin'].str[:1].map(cabin)
    # Median deck code of each passenger's class within this frame.
    class_median = deck.groupby(frame['Pclass']).transform('median')
    # Only the gaps are filled with the per-class median. Assigning the
    # transform result to the whole column (as before) threw away every known
    # deck and made Cabin_clean a pure function of Pclass.
    frame['Cabin_clean'] = (
        deck
        .fillna(class_median)
        # Fallback if a whole class has no recorded cabin in this frame.
        .fillna(deck.median())
    )

**Feature / Label**

In [214]:
# Columns fed to the model, in the order they appear in the design matrix.
feature = [
    # columns used as loaded
    'Pclass', 'SibSp', 'Parch',
    # integer-coded categoricals
    'Sex_clean', 'Embarked_clean',
    # family size and travelling-alone flag
    'Family', 'Solo',
    # derived / binned columns
    'Title_clean', 'Age_clean', 'Cabin_clean', 'Fare_clean',
]

# Target column, kept as a one-element list.
label = ['Survived']

In [215]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Design matrices built from the selected feature columns.
X_train = train_df[feature]
X_test = test_df[feature]
# `label` is a one-element list, so the selection is 2-D; flatten it to the
# 1-D target vector scikit-learn expects.
Y_train = train_df[label].values.ravel()

# Shuffled 10-fold split and a depth-limited random forest, both seeded.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
clf = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=0)

# Mean accuracy across the folds is the cell's displayed output.
fold_accuracy = cross_val_score(clf, X_train, Y_train, cv=k_fold, scoring='accuracy')
fold_accuracy.mean()

In [209]:
# Fit on the full training set, then predict the test set.
clf.fit(X_train, Y_train)

# Build the submission from the ids of the rows that were actually predicted.
# The previous version pasted predictions positionally into a separately loaded
# frame, which silently mislabels passengers if the two files ever differ in
# row order or length. Rebinding `gender_submission` (instead of mutating the
# loaded frame) keeps the name available and makes the cell safe to re-run.
gender_submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'].to_numpy(),
    'Survived': clf.predict(X_test),
})
gender_submission.to_csv('submission_update.csv', index=False)