In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix, precision_score
from sklearn.metrics import recall_score, f1_score, accuracy_score

In [None]:
# Apply seaborn's default theme and select the white-grid axes style in one call.
sns.set(style='whitegrid')

In [None]:
# Read the training CSV from the Kaggle input directory into a DataFrame.
train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/titanic-dataset/titanic_train.csv',
)

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() appended a stray "None".
train.info()

In [None]:
# Summary statistics of the frame; left as the cell's last expression so the
# notebook renders it as a table instead of plain printed text.
train.describe()

In [None]:
# Remove the PassengerId column, then preview the first rows. The preview is
# the cell's last expression so the notebook renders it as a table.
train = train.drop(columns=['PassengerId'])
train.head()

In [None]:
# Correlation heatmap of the numeric columns.
# The frame still contains text columns at this point (e.g. Name, which is
# parsed with .str further down). pandas >= 2.0 no longer drops such columns
# silently in DataFrame.corr() and raises instead, so the numeric columns are
# selected explicitly. On older pandas the result is unchanged.
sns.heatmap(train.select_dtypes(include='number').corr(), annot=True)
plt.tight_layout()

In [None]:
# Bar plot of Survived for each Parch value, split by Sex, without error bars.
sns.catplot(
    x='Parch',
    y='Survived',
    hue='Sex',
    kind='bar',
    ci=None,
    data=train,
)

In [None]:
# Bar plot of Survived for each SibSp value, split by Sex, without error bars.
sns.catplot(
    x='SibSp',
    y='Survived',
    hue='Sex',
    kind='bar',
    ci=None,
    data=train,
)

In [None]:
# Bar plot of Survived for each Pclass value, split by Sex, without error bars.
sns.catplot(
    x='Pclass',
    y='Survived',
    hue='Sex',
    kind='bar',
    ci=None,
    data=train,
)

In [None]:
# Joint distribution of Age against Fare, drawn in green.
sns.jointplot(
    x='Age',
    y='Fare',
    color='g',
    data=train,
)

In [None]:
# Distribution of Age within each Pclass value.
sns.boxplot(
    x='Pclass',
    y='Age',
    data=train,
)

In [None]:
# Distribution of Fare within each Pclass value.
sns.boxplot(
    x='Pclass',
    y='Fare',
    data=train,
)

In [None]:
# Discard the Ticket and Cabin columns; they are not used from here on.
train = train.drop(columns=['Ticket', 'Cabin'])

In [None]:
# Derive a Title column from Name: the run of letters that follows a space and
# is terminated by a period. The pattern is a raw string so that "\." reaches
# the regex engine unchanged; in a normal string literal "\." is an invalid
# escape sequence, which newer Python versions warn about.
train['Title'] = train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [None]:
# Collapse the listed uncommon titles into 'Rare' and fold spelling variants
# (Mlle/Ms -> Miss, Mme -> Mrs) into their common form.
train['Title'] = (
    train['Title']
    .replace(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
         'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
        'Rare',
    )
    .replace(['Mlle', 'Ms'], 'Miss')
    .replace('Mme', 'Mrs')
)

In [None]:
# Mean of Survived for every Title, returned as a two-column frame.
train.groupby('Title', as_index=False)['Survived'].mean()

In [None]:
# Name has been reduced to Title above, so the raw column is removed.
train = train.drop(columns=['Name'])

In [None]:
# Target vector: the Survived column as a NumPy array.
Y = train['Survived'].to_numpy()

In [None]:
# Remove the target from the feature frame now that it is stored in Y.
train = train.drop(columns=['Survived'])

In [None]:
# Check for missing values: info() reports the non-null count of every column,
# so any column whose count is below the number of rows has gaps.
train.info()

In [None]:
# Replace missing Age values with the mean of the observed ages.
# Series.mean() skips NaN by default, and Series.fillna() returns a Series,
# so the column is assigned a Series rather than a one-column DataFrame.
age_mean = train['Age'].mean()
train['Age'] = train['Age'].fillna(age_mean)

In [None]:
# Every NaN still left anywhere in the frame becomes the literal string 'missing'.
train = train.fillna(value='missing')

In [None]:
# Re-check the frame after imputation. info() prints its own report and
# returns None, so it is not wrapped in print() (that emitted a stray "None").
train.info()

In [None]:
# One-hot encode the categorical data: every object-dtype column plus Pclass.
categoricals = list(train.select_dtypes(include=['O']).columns) + ['Pclass']
# drop='first' omits the first category of each feature.
# The `sparse=False` keyword was renamed to `sparse_output` and later removed
# in newer scikit-learn releases, so neither spelling works on every version.
# Keeping the default sparse output and densifying it with .toarray() yields
# the same dense array on all versions.
encoder = OneHotEncoder(drop='first')
encoded = encoder.fit_transform(train[categoricals]).toarray()

In [None]:
# Wrap the encoded array in a DataFrame, append it to train and remove the
# original categorical columns.
# - Column names are "<feature>_<category>" strings, so the origin of every
#   dummy column is unambiguous and labels are not a mix of ints and strings.
#   cats[1:] skips the first category of each feature, matching drop='first'.
# - index=train.index keeps the rows aligned in the concat even if train does
#   not carry a default 0..n-1 index.
train_ohe = pd.DataFrame(
    encoded,
    index=train.index,
    columns=['{}_{}'.format(col, cat)
             for col, cats in zip(categoricals, encoder.categories_)
             for cat in cats[1:]],
)
X = pd.concat((train, train_ohe), axis=1).drop(columns=categoricals)

In [None]:
# Split into train and test parts (80% / 20%, fixed seed).
# np.asarray() converts the feature frame to an array and is a no-op when X is
# already an array, so this cell can be re-run safely; `X = X.values` raised
# AttributeError on a second run because X had already been rebound.
X = np.asarray(X)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

In [None]:
# Baseline decision tree: all hyperparameters at their defaults, seed fixed to 1.
model = DecisionTreeClassifier(random_state=1)

In [None]:
# Train the baseline tree on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
def calculate_metrics(model, X_test, y_test):
    """Print classification scores for ``model`` and return its confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix handed to ``model.predict``.
        y_test: True labels the predictions are compared against.

    Returns:
        The confusion matrix of ``y_test`` versus the model's predictions.
    """
    y_pred = model.predict(X_test)
    matrix = confusion_matrix(y_test, y_pred)
    # (label, value) pairs in the order they are reported.
    scores = (
        ('Accuracy', accuracy_score(y_test, y_pred)),
        ('Precision', precision_score(y_test, y_pred)),
        ('Recall', recall_score(y_test, y_pred)),
        ('F1_score', f1_score(y_test, y_pred)),
    )
    print('\n'.join('{}: {}'.format(label, value) for label, value in scores))
    return matrix

In [None]:
# Report the scores of the current model on the test split and draw its
# confusion matrix with integer-formatted annotations.
cm = calculate_metrics(model, X_test, y_test)
sns.heatmap(cm, fmt='g', annot=True)
plt.xlabel('Predicted label')
plt.ylabel('True label');

In [None]:
# Sweep max_depth over 2, 4, ..., 16 and report train/test accuracy for each.
max_depths = [2*x for x in range(1,9)]
for max_depth in max_depths:
    model = DecisionTreeClassifier(max_depth=max_depth, random_state=71830)
    model.fit(X_train, y_train)
    # A classifier's predict() already returns class labels, so they are used
    # as-is; thresholding them with `> 0.5` only turned the labels into bools.
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    print('Model params: max_depth:{}'.format(max_depth))
    print('Train_acc: {}\tTest_acc: {}'.format(accuracy_score(y_train, train_pred), accuracy_score(y_test, test_pred)))

In [None]:
# Sweep min_samples_split over 2, 5, 15, ..., 65 and report train/test
# accuracy for each value.
min_samples_splits = [2] + [5*x for x in range(1,15,2)]
for min_sample in min_samples_splits:
    model = DecisionTreeClassifier(min_samples_split=min_sample, random_state=71830)
    model.fit(X_train, y_train)
    # A classifier's predict() already returns class labels, so they are used
    # as-is; thresholding them with `> 0.5` only turned the labels into bools.
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    print('Model param: min_samples_split:{}'.format(min_sample))
    print('Train_acc: {}\tTest_acc: {}'.format(accuracy_score(y_train, train_pred), accuracy_score(y_test, test_pred)))

In [None]:
# Search space for the grid search over decision-tree hyperparameters.
parameters = {
    'criterion': ['entropy', 'gini'],
    # 2, then 5, 15, ..., 65
    'min_samples_split': [2] + list(range(5, 70, 10)),
    # odd numbers 1, 3, ..., 27
    'min_samples_leaf': list(range(1, 28, 2)),
    # unlimited, then even numbers 2, 4, ..., 16
    'max_leaf_nodes': [None] + list(range(2, 17, 2)),
    'max_depth': [None] + list(range(2, 17, 2)),
}

In [None]:
# Exhaustive search over `parameters` with 2-fold cross-validation on the
# training split, using all available cores.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=1),
    param_grid=parameters,
    cv=2,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [None]:
# Show the hyperparameter combination the grid search stored in best_params_.
print(grid_search.best_params_)

In [None]:
# Use the estimator the grid search selected. With the default refit=True,
# GridSearchCV has already refit it on the whole training split with the best
# parameters and the original random_state=1. Rebuilding a tree from
# best_params_ alone dropped the seed, so the evaluated model could differ
# from the selected one between runs.
best_model = grid_search.best_estimator_

cm = calculate_metrics(best_model, X_test, y_test)
sns.heatmap(cm, annot=True, fmt='g')
plt.ylabel('True label')
plt.xlabel('Predicted label');

In [None]:
# Fit a linear regression on the target and turn its continuous output into a
# class decision by thresholding at 0.5, then score it like the tree models.
model = LinearRegression()
model.fit(X_train, y_train)
pred = model.predict(X_test) > 0.5

cm = confusion_matrix(y_test, pred)
acc = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
fscore = f1_score(y_test, pred)

print('Accuracy: {}\nPrecision: {}\nRecall: {}\nF1_score: {}'.format(
    acc, precision, recall, fscore))

# Confusion matrix with integer-formatted annotations.
sns.heatmap(cm, fmt='g', annot=True)
plt.xlabel('Predicted label')
plt.ylabel('True label');