In [1]:
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.io import output_notebook

%matplotlib inline

In [2]:
FILE_PATH_TRAIN = 'https://raw.githubusercontent.com/jay619/Kaggle-Competitions/main/1-Titanic/titanic/train.csv'
FILE_PATH_TEST = 'https://raw.githubusercontent.com/jay619/Kaggle-Competitions/main/1-Titanic/titanic/train.csv'

In [3]:
train_data = pd.read_csv(FILE_PATH_TRAIN)
test_data = pd.read_csv(FILE_PATH_TEST)
train_data.columns = train_data.columns.str.lower()
test_data.columns = test_data.columns.str.lower()

In [4]:
train_data.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
test_data.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## EDA

In [6]:
def describe_category(data):
    nas = data.isna().sum()
    if nas > 0:
        print(f"{nas} records missing data")
    described = data.value_counts().to_frame(name="counts").sort_index()
    described["frequency"] = data.value_counts(normalize=True).sort_index()
    described["names"] = data.value_counts().sort_index().index.tolist()
    return described

In [7]:
def plot_bokeh_vbar(data, feature, has_numerical_categories=False):
    described = describe_category(data[feature])

    source = ColumnDataSource(data=described)
    if has_numerical_categories:
        p = figure(
            title = f"Distribution by {feature}",
            plot_width=400,
            plot_height=400,
            tooltips=[
                ("Value", "@names"),
                ("counts", "@counts"),
                ("frequency", "@frequency{%0.1f}")
            ],
            x_axis_label=f"{feature}"
        )
    else:
        p = figure(
            title = f"Distribution by {feature}",
            plot_width=400,
            plot_height=400,
            tooltips=[
                ("Value", "@names"),
                ("counts", "@counts"),
                ("frequency", "@frequency{%0.1f}")
            ],
            x_axis_label=f"{feature}",
            x_range=described["names"]
        )
    
    width = 1/1.5
    p.vbar('names', top='frequency', width=width, source=source)
    show(p)

In [8]:
output_notebook()

### Single Variable

#### Passenger ID

Passenger ID doesn't provide any useful insignts into the data so we'll skip this column

#### PClass

Represents passengers ticket class. We have 3 classes; 1st class, 2nd class and 3rd class. Ticket class also represents the socio-economic class. `1st` = Upper class, `2nd` = middle class and `3rd` = lower class

In [9]:
described = describe_category(train_data["pclass"])
described

Unnamed: 0,counts,frequency,names
1,216,0.242424,1
2,184,0.20651,2
3,491,0.551066,3


In [16]:
# source = ColumnDataSource(data=described)
#
# p = figure(
#     title = "Distribution by Pclass",
#     plot_width=400,
#     plot_height=400,
#     tooltips=[
#         ("Value", "@names"),
#         ("counts", "@counts"),
#         ("frequency", "@frequency{%0.1f}")
#     ],
#     x_axis_label="pclass"
# )
# width = 1/1.5
# p.vbar('names', top='frequency', width=width, source=source)
# show(p)
plot_bokeh_vbar(train_data, "pclass", True)

#### Sex

In [17]:
described = describe_category(train_data["sex"])
described

Unnamed: 0,counts,frequency,names
female,314,0.352413,female
male,577,0.647587,male


In [18]:
source = ColumnDataSource(data=described)
p = figure(
    title = "Distribution by Sex",
    plot_width=400,
    plot_height=400,
    tooltips=[
        ("Value", "@names"),
        ("counts", "@counts"),
        ("frequency", "@frequency{%0.1f}")
    ],
    x_axis_label="sex",
    x_range=described["names"]
)
width = 1/1.5
p.vbar('names', top='frequency', width=width, source=source)
show(p)

#### Age

#### sibsp

In [19]:
plot_bokeh_vbar(train_data, "sibsp", True)

#### parch

`parch` is described as the number of parents or children the passenger had aboard the Titanic. Majority of the passengers were traveling by themselves (about 76%).

In [21]:
describe_category(train_data["parch"])

Unnamed: 0,counts,frequency,names
0,678,0.760943,0
1,118,0.132435,1
2,80,0.089787,2
3,5,0.005612,3
4,4,0.004489,4
5,5,0.005612,5
6,1,0.001122,6


In [22]:
plot_bokeh_vbar(train_data, "parch", True)

#### fare

#### cabin

In [23]:
describe_category(train_data["cabin"])

687 records missing data


Unnamed: 0,counts,frequency,names
A10,1,0.004902,A10
A14,1,0.004902,A14
A16,1,0.004902,A16
A19,1,0.004902,A19
A20,1,0.004902,A20
...,...,...,...
F33,3,0.014706,F33
F38,1,0.004902,F38
F4,2,0.009804,F4
G6,4,0.019608,G6


#### embarked

In [24]:
describe_category(train_data["embarked"])

2 records missing data


Unnamed: 0,counts,frequency,names
C,168,0.188976,C
Q,77,0.086614,Q
S,644,0.724409,S


In [26]:
plot_bokeh_vbar(train_data, "embarked", False)

2 records missing data


Most of the passengers embarked from the Southampton port. Not sure if this feature provides any useful information.

### Pairwise

## Preprocessing

## Modeling

### Logistic Regression

### Random Forest

### Bayes Classifier

## Model Evaluation

## Model Performance