In [1]:
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.io import output_notebook

%matplotlib inline

In [2]:
FILE_PATH_TRAIN = 'https://raw.githubusercontent.com/jay619/Kaggle-Competitions/main/1-Titanic/titanic/train.csv'
FILE_PATH_TEST = 'https://raw.githubusercontent.com/jay619/Kaggle-Competitions/main/1-Titanic/titanic/train.csv'

In [3]:
train_data = pd.read_csv(FILE_PATH_TRAIN)
test_data = pd.read_csv(FILE_PATH_TEST)
train_data.columns = train_data.columns.str.lower()
test_data.columns = test_data.columns.str.lower()

In [4]:
train_data.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
test_data.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## EDA

In [6]:
def describe_category(data):
    nas = data.isna().sum()
    if nas > 0:
        print(f"{nas} records missing data")
    described = data.value_counts().to_frame(name="counts").sort_index()
    described["frequency"] = data.value_counts(normalize=True).sort_index()
    described["names"] = data.value_counts().sort_index().index.tolist()
    return described

In [7]:
def plot_bokeh_vbar(data, feature, has_numerical_categories=False):
    described = describe_category(data[feature])

    source = ColumnDataSource(data=described)
    if has_numerical_categories:
        p = figure(
            title = f"Distribution by {feature}",
            plot_width=400,
            plot_height=400,
            tooltips=[
                ("Value", "@names"),
                ("counts", "@counts"),
                ("frequency", "@frequency{%0.1f}")
            ],
            x_axis_label=f"{feature}"
        )
    else:
        p = figure(
            title = f"Distribution by {feature}",
            plot_width=400,
            plot_height=400,
            tooltips=[
                ("Value", "@names"),
                ("counts", "@counts"),
                ("frequency", "@frequency{%0.1f}")
            ],
            x_axis_label=f"{feature}",
            x_range=described["names"]
        )
    
    width = 1/1.5
    p.vbar('names', top='frequency', width=width, source=source)
    show(p)

In [8]:
def plot_bokeh_histogram(data, feature):
    hist, edges = np.histogram(data[feature], bins="fd")
    p = figure(
        title = f"Distribution for {feature}",
        plot_width=400,
        plot_height=400,
    )
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="white", line_alpha=0.9)
    show(p)

In [9]:
output_notebook()

### Single Variable

#### Passenger ID

Passenger ID doesn't provide any useful insights into the data, so we'll skip this column

#### PClass

Represents passengers ticket class. We have 3 classes; 1st class, 2nd class and 3rd class. Ticket class also represents the socio-economic class. `1st` = Upper class, `2nd` = middle class and `3rd` = lower class

In [33]:
described = describe_category(train_data["pclass"])
described

Unnamed: 0,counts,frequency,names
1,216,0.242424,1
2,184,0.20651,2
3,491,0.551066,3


In [42]:
plot_bokeh_vbar(train_data, "pclass", True)

#### name

`name` does not provide a lot of insights in the data. Passenger name is unique for each passenger

#### Sex

In [12]:
described = describe_category(train_data["sex"])
described

Unnamed: 0,counts,frequency,names
female,314,0.352413,female
male,577,0.647587,male


In [13]:
plot_bokeh_vbar(train_data, "sex", False)

#### Age

In [36]:
train_data["age"].isna().sum()

177

There are 177 passengers for whom we don't have an age. We can either exclude these records while creating a model or we impute these records with either the mean, median or some other way.

In [14]:
train_data["age"].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: age, dtype: float64

The average age of the passenger was about 30 yrs nad a median age at 28 yrs. The youngest passenger was 0.42 yrs (does not make sense for the years to be in decimal. Does it mean 42 months?). The oldest passenger was 80 yrs old.

In [34]:
train_data[train_data["age"] < 1]

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
78,79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29.0,,S
305,306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
469,470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C
644,645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C
755,756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5,,S
803,804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C
831,832,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.75,,S


There are 7 records where the age of the passenger was less than 1 year old.

In [15]:
plot_bokeh_histogram(train_data[train_data["age"].notna()], "age")

We can also try to create a new feature that tells us the passengers age decade i.e., if someone is 39 years old, their age group would be 30.

In [38]:
train_data["age_group"] =(train_data["age"] // 10) * 10

In [43]:
describe_category(train_data["age_group"])

177 records missing data


Unnamed: 0,counts,frequency,names
0.0,62,0.086835,0.0
10.0,102,0.142857,10.0
20.0,220,0.308123,20.0
30.0,167,0.233894,30.0
40.0,89,0.12465,40.0
50.0,48,0.067227,50.0
60.0,19,0.026611,60.0
70.0,6,0.008403,70.0
80.0,1,0.001401,80.0


Majority of the patients in our dataset are between 10 and 30 years old.

In [41]:
plot_bokeh_vbar(train_data, "age_group", True)

177 records missing data


#### sibsp

`sibsp` represents the number of siblings / spouses the passengers is travelling with onboard Titanic.

In [44]:
describe_category(train_data["sibsp"])

Unnamed: 0,counts,frequency,names
0,608,0.682379,0
1,209,0.234568,1
2,28,0.031425,2
3,16,0.017957,3
4,18,0.020202,4
5,5,0.005612,5
8,7,0.007856,8


In [16]:
plot_bokeh_vbar(train_data, "sibsp", True)

Majority of the patients were travelling by themselves (about 68%).

#### parch

`parch` is described as the number of parents or children the passenger had aboard the Titanic. Majority of the passengers were traveling by themselves (about 76%).

In [17]:
describe_category(train_data["parch"])

Unnamed: 0,counts,frequency,names
0,678,0.760943,0
1,118,0.132435,1
2,80,0.089787,2
3,5,0.005612,3
4,4,0.004489,4
5,5,0.005612,5
6,1,0.001122,6


In [18]:
plot_bokeh_vbar(train_data, "parch", True)

#### fare

`fare` is the ticket fare the passenger paid for the Titanic.

In [50]:
train_data["fare"].isna().sum()

0

In [19]:
train_data["fare"].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: fare, dtype: float64

In [20]:
plot_bokeh_histogram(train_data, "fare")

The average ticket fare was 32 pounds (assuming it is in pounds) and the median fare was around 14.5 pounds. The highest fare was 512 pounds which I think is really expensive compared to 1912. We also have few passengers who did not have to pay any fare. Is this correct or is this a discrepancy in the data?

We can apply log transformation to `fare` and look at the distribution

In [48]:
train_data["log_fare"] = train_data["fare"].apply(lambda x: np.log(x+1))

In [49]:
plot_bokeh_histogram(train_data, "log_fare")

#### cabin

In [21]:
describe_category(train_data["cabin"])

687 records missing data


Unnamed: 0,counts,frequency,names
A10,1,0.004902,A10
A14,1,0.004902,A14
A16,1,0.004902,A16
A19,1,0.004902,A19
A20,1,0.004902,A20
...,...,...,...
F33,3,0.014706,F33
F38,1,0.004902,F38
F4,2,0.009804,F4
G6,4,0.019608,G6


`cabin` doesn't provide useful information so we'll skip this feature.

#### embarked

`embarked` tells us the port of embarkation for the passenger

In [22]:
describe_category(train_data["embarked"])

2 records missing data


Unnamed: 0,counts,frequency,names
C,168,0.188976,C
Q,77,0.086614,Q
S,644,0.724409,S


In [23]:
plot_bokeh_vbar(train_data, "embarked", False)

2 records missing data


Most of the passengers embarked from the Southampton port. Not sure if this feature provides any useful information.

### Pairwise

In [31]:
train_data.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

## Preprocessing

## Modeling

### Logistic Regression

### Random Forest

### Bayes Classifier

## Model Evaluation

## Model Performance