In [16]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file under the Kaggle input directory, then
# print them one per line in walk order.
input_paths = [
    os.path.join(root, name)
    for root, _subdirs, files in os.walk('/kaggle/input')
    for name in files
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [17]:
# Load the Titanic training and test tables from the Kaggle input directory.
train_path = "../input/titanic/train.csv"
test_path = "../input/titanic/test.csv"
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

In [18]:
# Summarize the training frame: column names, non-null counts and dtypes.
train.info()

In [19]:
# Summarize the test frame: column names, non-null counts and dtypes.
test.info()

From the basic info of the training and test datasets, we can see that there are many null values. Moreover, after reading the variable descriptions, I find that some of the columns are probably not useful, including the name and ticket. Therefore, we should first clean the dataset.

In [20]:
# Remove the same set of columns from both frames so their feature columns stay aligned.
unused_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train = train.drop(columns=unused_cols)
test = test.drop(columns=unused_cols)

In [23]:
# Count missing values per column in the training frame.
train.isna().sum()

For the age data, I would like to fill the missing values with the median of the training set.

In [24]:
# Impute missing ages with the training-set median.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `train.Age`: that form is chained assignment, which
# pandas warns about and which leaves `train` unchanged under Copy-on-Write.
train['Age'] = train['Age'].fillna(train['Age'].median())

In [35]:
# Show how often each Embarked category occurs in the training frame.
train.Embarked.value_counts()

For the embarked data, since it is categorical, I will fill the missing values with the most common category.

In [36]:
# Impute missing Embarked values with the most frequent category.
# value_counts() sorts by descending frequency, so index[0] is the mode.
# Assigning back (instead of a chained fillna(inplace=True)) guarantees that
# `train` itself is updated, including under pandas Copy-on-Write.
most_common_port = train['Embarked'].value_counts().index[0]
train['Embarked'] = train['Embarked'].fillna(most_common_port)

In [37]:
# Re-check missing values per column in the training frame after imputation.
train.isna().sum()

For the test data, I will fill the missing values using statistics computed from the training data, since we should only extract information from the training set to avoid data leakage.

In [38]:
# Count missing values per column in the test frame.
test.isna().sum()

In [39]:
# Impute the test frame using medians computed on the training frame only,
# so no statistic is derived from the test data.
# Results are assigned back to the columns: a chained fillna(inplace=True) on
# `test.Age` / `test.Fare` is not guaranteed to modify `test` (and does not
# under pandas Copy-on-Write).
test['Age'] = test['Age'].fillna(train['Age'].median())
test['Fare'] = test['Fare'].fillna(train['Fare'].median())

In [40]:
# Re-check missing values per column in the test frame after imputation.
test.isna().sum()

Then we can move on to the EDA and try to find some useful patterns in our dataset.

In [42]:
import seaborn as sns
import matplotlib.pyplot as plt

Next, I would like to go through each categorical feature and examine its relationship with survival, hoping to find some patterns.

In [64]:
# Every column other than the target and the two continuous features is
# plotted as a categorical variable.
non_categorical = ['Survived', 'Age', 'Fare']
cat_col = train.columns.drop(non_categorical)

In [73]:
# One bar plot per categorical column, side by side; bar height is the mean
# of Survived within each category.
fig, axes = plt.subplots(nrows=1, ncols=len(cat_col), figsize=(20, 5))
for ax, cat in zip(axes, cat_col):
    sns.barplot(data=train, x=cat, y='Survived', ax=ax)

From the bar plots, we find that Sex, Pclass, and Embarked may be crucial to the survival rate.

In [81]:
# Kernel density estimate of Age, drawn separately for each Survived value.
sns.displot(data = train, x = 'Age', hue = 'Survived', kind="kde")

From the distribution of age by survival, we find that passengers aged 20-40 have the lowest survival rate.

In [82]:
# Kernel density estimate of Fare, drawn separately for each Survived value.
sns.displot(data = train, x = 'Fare', hue = 'Survived', kind="kde")

Similarly, passengers with cheaper tickets are less likely to survive.

Given the EDA, I would like to add a new feature to the dataset which indicates whether the passenger is between 20 and 40.

In [86]:
# Binary flag: 1 when Age lies in [20, 40] (both ends inclusive), else 0.
# Series.between is vectorized and replaces the per-row Python lambda;
# missing ages compare as False and therefore map to 0, as before.
test['20_40'] = test['Age'].between(20, 40).astype('int64')

In [88]:
# Binary flag: 1 when Age lies in [20, 40] (both ends inclusive), else 0.
# Series.between is vectorized and replaces the per-row Python lambda;
# missing ages compare as False and therefore map to 0, as before.
train['20_40'] = train['Age'].between(20, 40).astype('int64')

Now, we can do the machine learning part.

First, since the survival rate is not balanced, we should use StratifiedShuffleSplit (sss) to split the data while preserving the survival ratio.

In [93]:
from sklearn.model_selection import StratifiedShuffleSplit

In [95]:
# Hold out 20% of the labelled data, stratified on Survived so both parts keep
# the same class ratio.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields arrays of *positional* row indices, so they are passed
# straight to .iloc. Intersecting them with the label index first (as was done
# previously) is only correct when the index happens to be 0..n-1.
train_index, test_index = next(sss.split(train, train['Survived']))
train_set = train.iloc[train_index]
test_set = train.iloc[test_index]

In [103]:
# Ratio of survivors (label 1) to non-survivors (label 0) in the training split.
train_counts = train_set.Survived.value_counts()
train_counts[1] / train_counts[0]

In [102]:
# Ratio of survivors (label 1) to non-survivors (label 0) in the held-out split.
holdout_counts = test_set.Survived.value_counts()
holdout_counts[1] / holdout_counts[0]

In [118]:
# Separate the feature columns from the Survived target for each split.
train_set_X, train_set_y = train_set.drop(columns='Survived'), train_set['Survived']
test_set_X, test_set_y = test_set.drop(columns='Survived'), test_set['Survived']

Then we should do some preprocessing of the data.

In [108]:
# Summarize the training split: column names, non-null counts and dtypes.
train_set.info()

In [119]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Numeric columns are listed explicitly; every other feature column is
# treated as categorical.
num_attribs = ['Age', 'SibSp', 'Parch', 'Fare']
cat_attribs = train_set_X.columns.drop(num_attribs).tolist()

In [120]:
# Preprocessing: standardize the numeric columns and one-hot encode the
# categorical ones.
# handle_unknown="ignore" makes the encoder emit an all-zero row for a category
# that was absent from the rows it was fitted on, instead of raising. This
# matters because the encoder is fitted on only part of the labelled data and
# later applied to other frames.
full_pipeline = ColumnTransformer([
        ("num", StandardScaler(), num_attribs),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
    ])

# Fit on the training split only, then apply the fitted transform to the
# held-out split so no statistics leak from it.
train_X_prepared = full_pipeline.fit_transform(train_set_X)
test_X_prepared = full_pipeline.transform(test_set_X)

In [92]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [91]:
# Candidate models to compare. `names` and `classifiers` are parallel lists:
# names[i] is the display label for classifiers[i].
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes"]

# Fixed seed for the estimators that draw random numbers while fitting, so the
# reported accuracies are the same on every run.
RANDOM_STATE = 42

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,
                           random_state=RANDOM_STATE),
    MLPClassifier(alpha=1, max_iter=1000, random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GaussianNB(),
    ]

In [124]:
# Fit each candidate on the training split and report its accuracy on the
# held-out split.
for name, clf in zip(names, classifiers):
    clf.fit(train_X_prepared, train_set_y)
    y_pred = clf.predict(test_X_prepared)
    # Accuracy is the fraction of matching labels. Computing it from y_pred
    # avoids the second full prediction pass that clf.score() would perform.
    accuracy = (y_pred == test_set_y.to_numpy()).mean()
    print("%s Accuracy: %.2f%%" % (name,accuracy * 100.0))

We can see that the Gaussian Process and Nearest Neighbors accuracies are the highest. I will choose the Gaussian Process model and check the accuracy of its predictions on the test data.

In [125]:
# Final model: a Gaussian process classifier with a scaled RBF kernel, fitted
# on the prepared training split.
gp_kernel = 1.0 * RBF(1.0)
model = GaussianProcessClassifier(kernel=gp_kernel)
model.fit(train_X_prepared, train_set_y)

In [127]:
# Apply the already-fitted preprocessing to the Kaggle test frame (transform
# only, no refit).
transformed_test = full_pipeline.transform(test)

In [134]:
# Predict survival for the Kaggle test frame.
predictions = model.predict(transformed_test)

# Use the sample submission file as a template (presumably it carries the
# PassengerId column, which was dropped from `test` earlier -- verify).
submission = pd.read_csv("../input/titanic/gender_submission.csv")

# Bracket assignment always writes a DataFrame column. Attribute assignment
# (`submission.Survived = ...`) would only set a Python attribute if the column
# were missing, and the predictions would then be absent from the CSV.
submission["Survived"] = predictions
submission.to_csv("submission.csv", index=False)
display(submission)

The final accuracy is 0.76794.