In [132]:
#data preprocessing
# measure data quality, handling missing data, cleaning
import warnings
warnings.filterwarnings('ignore')


import scipy as sp
import pandas as pd
import re

import matplotlib as mlt
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [133]:
# Read the Titanic train and test CSV files, then preview the first 15 training rows.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (r'../input/titanic/train.csv', r'../input/titanic/test.csv')
)
train.head(15)

In [134]:
# PREPROCESSING

# check missing data and gives feature data types(numerical variable/ categorical variable, string)
def missing_data(data):
    """Summarise missing values and dtypes for every column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One column per column of ``data`` and three rows: ``'Total'`` (number
        of null entries), ``'Precent'`` (null entries as a percentage of the
        row count) and ``'Types'`` (the column dtype as a string).
    """
    null_mask = data.isnull()
    total = null_mask.sum()
    # count() on the boolean mask is the number of rows, so this is the null share in percent.
    percent = total / null_mask.count() * 100
    # dtypes is already indexed by column name, so it aligns with the two Series above.
    types = data.dtypes.astype(str)

    # The 'Precent' label (sic) is kept unchanged so existing lookups keep working.
    summary = pd.DataFrame({'Total': total, 'Precent': percent, 'Types': types})

    # DataFrame.T replaces sp.transpose: the NumPy aliases in the top-level scipy
    # namespace are deprecated and no longer available in recent SciPy releases.
    return summary.T

# A cell only renders its last expression, so the training summary is shown
# explicitly with display(); otherwise only the test summary would appear.
display(missing_data(train))
missing_data(test)

Check distribution for features that have missing values, so that we can have an idea on how to fix missing data. 
* "Age" and "Fare" are continuous features, so we can draw a density plot for each to see its distribution.
* "Embarked" is categorical feature, has value of "S","C", or "Q".

In [135]:
# Check the distribution of the training data to decide how to fix missing values,
# e.g. pairwise deletion / mean substitution / median substitution / regression imputation.
train.describe()

In [136]:
# sns.distplot is deprecated and scheduled for removal from seaborn; a density
# histogram with a KDE overlay is the documented equivalent.
sns.histplot(train['Fare'], kde=True, stat='density', kde_kws=dict(cut=3))

In [137]:
# sns.distplot is deprecated and scheduled for removal from seaborn; a density
# histogram with a KDE overlay is the documented equivalent.
sns.histplot(train['Age'], kde=True, stat='density', kde_kws=dict(cut=3))

* "Fare" & "Age" is right skewed -> consider median imputation
* "Embarked" use default mode imputation
* "Cabin" fillna with "no value" statement

In [138]:
# Drop the train labels and stack train & test so both parts get the same imputation.
ytrain = train['Survived']
data = pd.concat([train.drop(columns='Survived'), test])

# Fill every column through one DataFrame.fillna call that returns a new frame.
# The previous `data[col].fillna(..., inplace=True)` pattern is chained assignment:
# it acts on a temporary column object and leaves `data` unchanged once pandas
# Copy-on-Write is active (the warning was hidden by filterwarnings('ignore')).
data = data.fillna({
    'Age': data['Age'].median(),            # right-skewed -> median
    'Fare': data['Fare'].median(),          # right-skewed -> median
    'Embarked': data['Embarked'].mode()[0], # categorical -> most frequent value
    'Cabin': 'no value',                    # placeholder for unknown cabins
})
missing_data(data)

In [139]:
# Split the imputed frame back into its train and test parts.
# The split is positional: `data` still carries the original row labels of both
# files, so its index contains duplicates and label-based `.loc` slicing only
# works while the boundary label happens to be unique.
n_train = len(ytrain)
train = data.iloc[:n_train]
test = data.iloc[n_train:]
assert len(train) + len(test) == len(data)
train = pd.concat([ytrain, train], axis=1)

# Check the distribution of the labels to see if there is class imbalance.
sns.countplot(x='Survived', data=train).set_title('Survived probablity density function')
plt.show()

In [140]:
# Target vector first, then the feature matrix without the label and the
# 'Ticket', 'Cabin', 'Name' and 'PassengerId' columns.
y_train = train.loc[:, 'Survived']
X_train = train.drop(['Survived', "Ticket", 'Cabin', 'Name', 'PassengerId'], axis=1)
X_train

In [141]:
# Discretization: replace the "Embarked" and "Sex" categories with ordinal codes.
from sklearn.preprocessing import OrdinalEncoder
# fit() returns the encoder itself, so `enc` holds the fitted encoder.
enc = OrdinalEncoder().fit(X_train[["Embarked","Sex"]])
X_train[["Embarked","Sex"]] = enc.transform(X_train[["Embarked","Sex"]])

In [142]:
# Correlation matrix - quick view of how the features relate to each other and to 'Survived'.
# NOTE: `data` is the same object as X_train, so this adds a 'Survived' column to
# X_train; the next cell drops that column again before modelling.
data = X_train
data["Survived"] = y_train
corrmat = data.corr()
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);

# Rank the variables by their correlation with 'Survived'
# (k = 10 is at least the number of columns here, so every variable is kept).
k = 10
cols = corrmat.nlargest(k, 'Survived')['Survived'].index  # features ordered by correlation
# Reorder the correlations computed above instead of calling np.corrcoef: numpy is
# never imported in this notebook, so that call raised NameError on a fresh kernel.
cm = corrmat.loc[cols, cols].to_numpy()
sns.set(font_scale=1.25)
# Draw on a new figure; previously this heatmap was painted over the first one's axes.
fig_ranked, ax_ranked = plt.subplots(figsize=(12, 9))
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
                 yticklabels=cols.values, xticklabels=cols.values, ax=ax_ranked)
plt.show()

In [143]:
# Remove the label column that the correlation cell added to X_train.
# errors='ignore' makes the cell idempotent: re-running it no longer raises KeyError.
X_train = X_train.drop(columns='Survived', errors='ignore')

In [144]:
def confusion_matrix_scorer(clf, X, y):
    """Scorer for binary classifiers that returns the confusion-matrix counts.

    Intended to be passed as ``scoring=`` to ``cross_validate``.

    Parameters
    ----------
    clf : fitted estimator
        Must implement ``predict``.
    X : array-like
        Features to predict on.
    y : array-like
        True labels for ``X``.

    Returns
    -------
    dict
        Keys ``'tn'``, ``'fp'``, ``'fn'``, ``'tp'`` taken from the 2x2
        confusion matrix (rows are true labels, columns are predictions).
    """
    # The pasted "..." continuation prompts were removed; they only ran because
    # IPython strips them and are not valid plain Python.
    y_pred = clf.predict(X)
    # Use the classes seen during fit so the matrix stays 2x2 even when a fold
    # contains a single class; falls back to the default when classes_ is absent.
    cm = confusion_matrix(y, y_pred, labels=getattr(clf, 'classes_', None))
    return {'tn': cm[0, 0], 'fp': cm[0, 1],
            'fn': cm[1, 0], 'tp': cm[1, 1]}

In [145]:
# Random forest classifier: hold out 28% of the rows and fit on the remainder.
X_train_new, X_val, y_train_new, y_val = train_test_split(
    X_train, y_train, test_size=0.28, random_state=42
)
# fit() returns the estimator itself, so RFmodel is the fitted forest.
RFmodel = RandomForestClassifier(
    n_estimators=100,
    max_depth=len(X_train.columns) + 1,
    random_state=1,
).fit(X_train_new, y_train_new)
# Cross-validated confusion-matrix counts (5 folds) on the hold-out split.
# NOTE(review): cross_validate refits clones of RFmodel on folds of X_val, so the
# forest fitted above is not the one being scored here - confirm this is intended.
cv_results = cross_validate(RFmodel, X_val, y_val, scoring=confusion_matrix_scorer, cv=5)
cv_results

In [148]:
# RFmodel was fitted on X_train_new only, so the train accuracy is measured on that
# split; scoring on X_train would mix in the held-out rows under a "train" label.
print("train accuracy: \n ", metrics.accuracy_score(y_train_new, RFmodel.predict(X_train_new)))
# Accuracy of the fitted model on the rows it has not seen.
print("validation accuracy: \n ", metrics.accuracy_score(y_val, RFmodel.predict(X_val)))
print("cross validation accuracy:  \n",cross_val_score(RFmodel, X_val, y_val, cv=10))

In [151]:
# Mean of the 10 fold accuracies. `Average` is not defined anywhere in this
# notebook (NameError on a fresh kernel); cross_val_score returns a NumPy array,
# so its own .mean() is used instead.
cross_val_score(RFmodel, X_val, y_val, cv=10).mean()

In [147]:
# Preprocess X_test with the same steps as X_train.
X_test = test.drop(columns=["Ticket",'Cabin','Name','PassengerId'])
# Reuse the encoder fitted on the training features. Fitting a second encoder on
# the test rows could assign different codes to the same category, and the model
# was trained on the training codes. transform() also raises on unseen categories.
X_test[["Embarked","Sex"]] = enc.transform(X_test[["Embarked","Sex"]])
# Predict and write the submission file.
y_test = RFmodel.predict(X_test)
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': y_test})
output.to_csv('./submission.csv', index=False)
# Preview goes last so the cell actually renders it.
output.head()