In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Directory that holds the competition CSV files read below.
BASE_PATH = "/kaggle/input/titanic"

### Let us first fetch the data!

In [None]:
# Read both splits from BASE_PATH.
train_data = pd.read_csv(f"{BASE_PATH}/train.csv")
test_data = pd.read_csv(f"{BASE_PATH}/test.csv")

# Keep the test passenger ids: the column is dropped from test_data later,
# but the submission files need it.
idx = test_data["PassengerId"]

In [None]:
# Report (rows, columns) for each split.
for split_name, frame in (("Training", train_data), ("Test", test_data)):
    print(f"{split_name} data shape: {frame.shape}")

So, as can be seen, we have **891** samples each having **12** features in our train dataset.

In our test dataset, we have **418** samples each having **11** features. 

(one less than the train dataset because it does not contain the target variable)

## For now, we will leave the test dataset alone and explore the train dataset.

In [None]:
# Peek at the first five training rows.
train_data.head(n=5)

It can be seen that the columns **PassengerId**, **Name**, **Ticket**, and **Cabin** are not useful, so we will drop these columns.

In [None]:
# Columns removed from both splits before modelling.
drop_cols = [
    "PassengerId",
    "Name",
    "Ticket",
    "Cabin",
]

In [None]:
# Remove the same columns from both splits so their feature sets stay in sync.
train_data, test_data = (
    frame.drop(columns=drop_cols) for frame in (train_data, test_data)
)
train_data.head()

In [None]:
# Numeric feature columns (this list is edited further down as features change).
num_cols = [
    "Pclass",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]
# Categorical feature columns, one-hot encoded later.
cat_cols = ["Sex", "Embarked"]

In [None]:
# Class balance of the target.
train_data.Survived.value_counts()

In [None]:
# Number of passengers per ticket class.
train_data.Pclass.value_counts()

In [None]:
# Number of passengers per embarkation port (missing values are not counted).
train_data.Embarked.value_counts()

## We will now explore the data.

### Let's first see how many males and females survived in proportion.

In [None]:
import seaborn as sns

# Count plot of the survival outcome, split by sex.
sns.catplot(data=train_data, x="Survived", hue="Sex", kind="count")
plt.show()

# Boolean masks reused for the head-counts below.
is_male = train_data["Sex"] == "male"
is_female = train_data["Sex"] == "female"
did_survive = train_data["Survived"] == 1

# Totals, survivors and non-survivors per sex.
total_males = is_male.to_numpy().sum()
survived_males = did_survive[is_male].sum()
not_survived_males = total_males - survived_males

total_females = is_female.to_numpy().sum()
survived_females = did_survive[is_female].sum()
not_survived_females = total_females - survived_females

# Survival rate within each sex, expressed as a percentage.
prop_survived_males = (100 * survived_males) / total_males
prop_survived_females = (100 * survived_females) / total_females

print(f"Proportion of males survived w.r.o males: {prop_survived_males}")
print(f"Proportion of females survived w.r.o females: {prop_survived_females}")

#### As it is seen, only 18.9% of the males survive, whereas 74.2% of the females survive.

In [None]:
# Age distribution for each survival outcome, split by sex.
sns.catplot(
    data=train_data,
    x="Survived",
    y="Age",
    hue="Sex",
    kind="violin",
)

### Let's observe the correlation matrix

In [None]:
# train_data still contains the string columns Sex and Embarked at this point.
# pandas >= 2.0 raises when corr() meets non-numeric data, so correlate the
# numeric columns only (the same columns older pandas silently kept).
sns.heatmap(train_data.select_dtypes(include="number").corr(), annot=True)

#### We can see that the survival feature is negatively correlated with the class. Let's plot the relationship.

In [None]:
# Survival counts broken down by ticket class.
sns.catplot(
    data=train_data,
    x="Survived",
    hue="Pclass",
    kind="count",
)

Oh **wow**. That is QUITE surprising.

In [None]:
# Survival counts broken down by embarkation port.
sns.catplot(
    data=train_data,
    x="Survived",
    hue="Embarked",
    kind="count",
)

#### We will create a new feature, `family`, which is the sum of SibSp and Parch.

In [None]:
# family = siblings/spouses + parents/children, added to both splits.
for frame in (train_data, test_data):
    frame["family"] = frame["SibSp"] + frame["Parch"]

# Register the new column as a numeric feature.
num_cols += ["family"]
train_data["family"].describe()

In [None]:
# Histogram of the combined family size.
sns.histplot(train_data, x="family", bins=10)

In [None]:
# Passenger counts per family size, split by survival outcome.
sns.catplot(
    data=train_data,
    x="family",
    hue="Survived",
    kind="count",
)

I think this data looks much more insightful than SibSp and Parch alone.

In [None]:
# Correlation matrix including the new family feature. Only numeric columns
# are correlated: train_data still holds the string columns Sex and Embarked,
# and pandas >= 2.0 raises if corr() is given non-numeric data.
sns.heatmap(train_data.select_dtypes(include="number").corr(), annot=True)

In [None]:
# Passenger counts per family size, split by ticket class.
sns.catplot(
    data=train_data,
    x="family",
    hue="Pclass",
    kind="count",
)

In [None]:
# Fare paid against family size, coloured by ticket class.
sns.catplot(
    data=train_data,
    x="family",
    y="Fare",
    hue="Pclass",
)

In [None]:
# Mean survival per sex, one line per ticket class.
sns.catplot(
    data=train_data,
    x="Sex",
    y="Survived",
    hue="Pclass",
    kind="point",
)

In [None]:
# Mean family size per sex, one line per survival outcome.
sns.catplot(
    data=train_data,
    x="Sex",
    y="family",
    hue="Survived",
    kind="point",
)

In [None]:
# SibSp and Parch are now represented by `family`; remove them from both
# splits and from the numeric feature list.
merged_cols = ["SibSp", "Parch"]
train_data = train_data.drop(columns=merged_cols)
test_data = test_data.drop(columns=merged_cols)
for merged_col in merged_cols:
    num_cols.remove(merged_col)

Now, let's check for skewed features.

In [None]:
from scipy.stats import shapiro
# Columns whose dtype is not `object` (this still includes the Survived target).
is_non_object = train_data.dtypes != 'object'
num_features = train_data.dtypes[is_non_object].index

# Skewness per column, most right-skewed first.
skew_feats = train_data[num_features].skew()
skew_feats = skew_feats.sort_values(ascending=False)
print(skew_feats)

Looks like the fare and family attributes are the most skewed, let's look at them.

In [None]:
# Density estimate of Fare. `fill` replaces the deprecated `shade` keyword
# (available since seaborn 0.11, which this notebook already needs for histplot).
sns.kdeplot(train_data["Fare"], fill=True)

It can be seen that this is heavily right-skewed.

In [None]:
# Density estimate of family size. `fill` replaces the deprecated `shade` keyword
# (available since seaborn 0.11, which this notebook already needs for histplot).
sns.kdeplot(train_data["family"], fill=True)

Now, let us normalize/transform both features.

In [None]:
# Count missing values in the two columns about to be transformed.
for skewed_col in ("Fare", "family"):
    print(train_data[skewed_col].isna().sum())

In [None]:
from sklearn.preprocessing import PowerTransformer
# A single PowerTransformer instance is refitted per column: fit on the
# training column, then apply that fit to the matching test column before
# moving on to the next column.
yeojohnson = PowerTransformer()
for skewed_col in ("Fare", "family"):
    train_column = train_data[skewed_col].values.reshape(-1, 1)
    train_data[skewed_col] = yeojohnson.fit_transform(train_column)
    test_column = test_data[skewed_col].values.reshape(-1, 1)
    test_data[skewed_col] = yeojohnson.transform(test_column)

# Summary statistics of the transformed training columns.
for skewed_col in ("Fare", "family"):
    print(train_data[skewed_col].describe())

In [None]:
# Histogram of every numeric training column.
train_data.hist(bins=20, figsize=(20, 20))

#### Checking for null values

In [None]:
# Missing-value count per training column.
train_data.isnull().sum()

As there are not many null values for **Embarked**, we will just drop these two entries.

In [None]:
# Training rows without an embarkation port are simply dropped.
train_data = train_data.dropna(subset=["Embarked"])

# Test rows must never be dropped: every PassengerId saved in `idx` needs a
# prediction, and removing rows would desynchronise predictions from ids.
# Fill any missing test value with the most frequent training port instead.
test_data = test_data.fillna({"Embarked": train_data["Embarked"].mode()[0]})

train_data["Embarked"].isna().sum()

In [None]:
# Separate the target from the feature columns.
y_train = train_data["Survived"]
X_train = train_data.drop(columns="Survived")

Cool!


uhhhhh.... well we still have a *lot* of null values in the **Age** column.
We have a few things that can be done here:
1. Imputation based on mean
2. Imputation based on median
3. Drop all instances

In [None]:
from sklearn.impute import SimpleImputer

# Missing numeric values will be replaced by the column mean.
imputer = SimpleImputer(strategy="mean")

In [None]:
# Learn the per-column means from the training features only.
imputer.fit(X_train.loc[:, num_cols])

In [None]:
# Fill values learned by the mean imputer, one per column of num_cols.
imputer.statistics_

In [None]:
# Apply the training-set means to both splits.
for frame in (X_train, test_data):
    frame[num_cols] = imputer.transform(frame[num_cols])

# Remaining missing values per training column.
X_train.isna().sum()

Ok, null values are officially GONE.

Now, for our categorical columns, we will encode them using one-hot encoding.

In [None]:
# One-hot encode the categorical columns of both splits.
one_hot_train = pd.get_dummies(X_train[cat_cols])
one_hot_test = pd.get_dummies(test_data[cat_cols])

# The two frames are encoded independently, so a category that is absent from
# one split would produce a different column set. Force the test dummies onto
# the training layout: missing columns become 0, unseen extras are discarded.
one_hot_test = one_hot_test.reindex(columns=one_hot_train.columns, fill_value=0)

# Swap the raw categorical columns for their encoded versions.
X_train = pd.concat([X_train.drop(columns=cat_cols), one_hot_train], axis=1)
test_data = pd.concat([test_data.drop(columns=cat_cols), one_hot_test], axis=1)
X_train.head()

In [None]:
# First five target values.
y_train.head(n=5)

Perfect! Now, we move forward to model selection

In [None]:
# Request float explicitly: X_train mixes float columns with the dummy columns
# (bool on pandas >= 2.0), and a plain to_numpy() on such a frame yields an
# object-dtype array.
X_train_numpy = X_train.to_numpy(dtype=float)
y_train_numpy = y_train.to_numpy()

Let's try Logistic Regression first because, why not xD

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
# Baseline model, fitted on the full training set.
lr = LogisticRegression(solver="liblinear")
lr.fit(X_train_numpy, y_train_numpy)

# 10-fold cross-validated F1.
scores = cross_val_score(
    lr, X_train_numpy, y_train_numpy, cv=10, scoring="f1"
)
print(f"{scores.mean():.2f} f1 score with a standard deviation of {scores.std():.2f}.")

# 10-fold cross-validated accuracy.
acc_scores = cross_val_score(
    lr, X_train_numpy, y_train_numpy, cv=10, scoring="accuracy"
)
print(f"{acc_scores.mean():.2f} accuracy with a standard deviation of {acc_scores.std():.2f}.")

In [None]:
from sklearn.metrics import precision_recall_curve, precision_score, recall_score
# Precision and recall of the fitted model on the data it was trained on.
preds = lr.predict(X_train_numpy)
print(f"Precision: {precision_score(y_train_numpy, preds):.3f}")
print(f"Recall: {recall_score(y_train_numpy, preds):.3f}")

# Out-of-fold decision scores (3 folds) feed the threshold curves plotted next.
scores = cross_val_predict(
    lr, X_train_numpy, y_train_numpy, cv=3, method="decision_function"
)
precisions, recalls, thresholds = precision_recall_curve(y_train_numpy, scores)

In [None]:
# Precision and recall as functions of the decision threshold. The curve
# arrays carry one more entry than `thresholds`, hence the [:-1].
plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
plt.plot(thresholds, recalls[:-1], "g--", label="Recall")
plt.axis([-3.7, 4.2, 0, 1])
plt.xlabel("Threshold")
plt.ylabel("Value")
plt.legend()  # the label= arguments above are only shown once a legend is drawn
plt.show()

In [None]:
from sklearn.metrics import roc_curve
# ROC curve built from the out-of-fold decision scores.
fpr, tpr, thresholds = roc_curve(y_train_numpy, scores)

_, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, linewidth=2)
roc_ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
roc_ax.set_xlabel("How many negative examples gone unseen?")
roc_ax.set_ylabel("How many positive examples were seen?")
plt.show()

In [None]:
def evaluate_model(model, X_train, y_train, th_low = -5, th_high = 5):
    """Print summary metrics for a fitted binary classifier and plot its curves.

    Parameters
    ----------
    model
        An already-fitted estimator. It must expose ``predict`` and either
        ``decision_function`` or ``predict_proba``.
    X_train, y_train
        Features and binary labels used for every metric and both plots.
    th_low, th_high
        x-axis limits of the precision/recall-vs-threshold plot.

    Notes
    -----
    Accuracy and F1 are 10-fold cross-validated. Recall and precision come
    from ``model.predict`` on the same data that is passed in, so they are
    not cross-validated. Nothing is returned; results are printed and shown.
    """
    acc = cross_val_score(model, X = X_train, y = y_train, scoring = "accuracy", cv = 10).mean()
    f1 = cross_val_score(model, X = X_train, y = y_train, scoring = "f1", cv = 10).mean()

    preds = model.predict(X_train)
    recall = recall_score(y_train, preds)
    precision = precision_score(y_train, preds)
    print(f"Accuracy : {acc:.3f}\nF1 Score : {f1:.3f}\nRecall : {recall:.3f}\nPrecision : {precision:.3f}")

    # Choose the scoring method up front rather than catching every exception:
    # a blanket `except` would also hide genuine failures inside cross-validation.
    if hasattr(model, "decision_function"):
        scores = cross_val_predict(model, X_train, y_train, method = "decision_function", cv = 10)
    else:
        # Column 1 is the probability of the positive class.
        scores = cross_val_predict(model, X_train, y_train, method = "predict_proba", cv = 10)[:, 1]

    precisions, recalls, thresholds = precision_recall_curve(y_train, scores)
    f, axs = plt.subplots(1, 2, figsize = (15, 5))

    # Left panel: precision and recall against the decision threshold. The
    # curve arrays carry one more entry than `thresholds`, hence the [:-1].
    axs[0].plot(thresholds, precisions[:-1], "b--", label = "Precision")
    axs[0].plot(thresholds, recalls[:-1], "g--", label = "Recall")
    axs[0].axis([th_low, th_high, 0, 1])
    axs[0].set_xlabel("Threshold")
    axs[0].set_ylabel("Value")
    axs[0].legend()

    # Right panel: ROC curve. Use the labels passed in, not a notebook global,
    # so the function is correct for any dataset it is called with.
    fpr, tpr, thresholds = roc_curve(y_train, scores)
    axs[1].plot(fpr, tpr, linewidth = 2)
    axs[1].plot([0, 1], [0, 1], 'k--')
    axs[1].set_xlabel("How many negative examples gone unseen?")
    axs[1].set_ylabel("How many positive examples were seen?")
    plt.show()

In [None]:
# Evaluate logistic regression with a custom x-range for the threshold plot.
evaluate_model(lr, X_train_numpy, y_train_numpy, th_low=-3.7, th_high=4.2)

In [None]:
from sklearn.linear_model import SGDClassifier
# SGD shuffles the training data, so fix the seed to make a full re-run of
# the notebook produce the same model.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train_numpy, y_train_numpy)

In [None]:
# Evaluate the SGD classifier with the default threshold range.
evaluate_model(sgd_clf, X_train=X_train_numpy, y_train=y_train_numpy)

In [None]:
!pip install delayed -U
!pip install percentile

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forests sample both rows and features at random; fix the seed so the
# reported metrics are reproducible on a fresh run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_numpy, y_train_numpy)

evaluate_model(rf, X_train_numpy, y_train_numpy)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Tree construction involves randomness (feature order at each split), so fix
# the seed to make the reported metrics reproducible on a fresh run.
dftree = DecisionTreeClassifier(random_state=42)
dftree.fit(X_train_numpy, y_train_numpy)

evaluate_model(dftree, X_train_numpy, y_train_numpy)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest neighbours with neighbour votes weighted by distance.
knn = KNeighborsClassifier(weights="distance")
knn.fit(X_train_numpy, y_train_numpy)

evaluate_model(knn, X_train=X_train_numpy, y_train=y_train_numpy)

#### Okay! Random Forest, Decision Tree, and KNN all look good here, so we are going to tune the hyperparameters of all three.

In [None]:
from sklearn.model_selection import GridSearchCV
# Search over neighbourhood size and vote weighting with 5-fold CV.
param_grid = [
    {
        'n_neighbors': [4, 5, 6],
        'weights': ['uniform', 'distance'],
    }
]
best_knn = GridSearchCV(knn, param_grid, cv=5)
best_knn.fit(X_train_numpy, y_train_numpy)

# Evaluate the search object itself, then report the winning combination.
evaluate_model(best_knn, X_train=X_train_numpy, y_train=y_train_numpy)
print(best_knn.best_params_)

In [None]:
# The model was fitted on a plain NumPy array, so pass an array to predict()
# as well; a DataFrame triggers scikit-learn's feature-name mismatch warning.
preds = best_knn.predict(test_data.to_numpy(dtype=float))

In [None]:
# Build both columns from raw arrays, in the conventional PassengerId/Survived
# order. Assigning the `idx` Series afterwards would align on index labels and
# silently yield NaN ids if the rows ever got out of step; passing arrays makes
# a length mismatch raise instead.
subm = pd.DataFrame({'PassengerId': idx.to_numpy(), 'Survived': preds})
subm.head()

In [None]:
# Write the KNN submission file without the DataFrame index column.
subm.to_csv("submission_knn.csv", index=False)

In [None]:
# 'auto' is intentionally left out of max_features: for a classification tree
# it meant the same as 'sqrt', and recent scikit-learn releases reject it, so
# keeping it would only add failed fits to the search.
param_grid = [{'criterion' : ['gini', 'entropy'], 'max_features' : ['sqrt', 'log2', None]}]
best_dftree = GridSearchCV(dftree, param_grid, cv = 5)
best_dftree.fit(X_train_numpy, y_train_numpy)

# Evaluate the search object itself, then report the winning combination.
evaluate_model(best_dftree, X_train_numpy, y_train_numpy)
print(best_dftree.best_params_)

In [None]:
# Predict on an array, matching how the model was fitted (avoids
# scikit-learn's feature-name mismatch warning for DataFrame input).
preds = best_dftree.predict(test_data.to_numpy(dtype=float))

# Build both columns from raw arrays so a length mismatch raises instead of
# silently producing NaN ids through index alignment.
subm = pd.DataFrame({'PassengerId': idx.to_numpy(), 'Survived': preds})
subm.to_csv('submission_dftree.csv', index=False)

In [None]:
# oob_score is not searched: it only switches on an extra out-of-bag estimate
# and does not change how the forest is built, so including it would double
# the number of fits without comparing different models.
param_grid = [{'criterion' : ['gini', 'entropy'], 'n_estimators' : [50, 100, 150, 200]}]
best_rf = GridSearchCV(rf, param_grid, cv = 5)
best_rf.fit(X_train_numpy, y_train_numpy)

# Evaluate the search object itself, then report the winning combination.
evaluate_model(best_rf, X_train_numpy, y_train_numpy)
print(best_rf.best_params_)

In [None]:
# Predict on an array, matching how the model was fitted (avoids
# scikit-learn's feature-name mismatch warning for DataFrame input).
preds = best_rf.predict(test_data.to_numpy(dtype=float))

# Build both columns from raw arrays so a length mismatch raises instead of
# silently producing NaN ids through index alignment.
subm = pd.DataFrame({'PassengerId': idx.to_numpy(), 'Survived': preds})
subm.to_csv('submission_rf.csv', index=False)