#  Titanic - Machine Learning from Disaster

## Imports

In [1]:
import numpy as np
import scipy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from bokeh import io, plotting, palettes

In [2]:
# Configure Bokeh so that later io.show() calls render inline in the notebook.
io.output_notebook()

## Loading the Data

In [3]:
# Read the train and test CSV files from the local data/ directory.
training_data, test_data = map(pd.read_csv, ("data/train.csv", "data/test.csv"))

# Display the number of rows in the training frame.
training_data.shape[0]

891

## Exploratory Analysis

In [4]:
# Rows whose "Survived" value is 1 and 0 respectively.
survived = training_data.loc[training_data["Survived"].eq(1)]
perished = training_data.loc[training_data["Survived"].eq(0)]

# Column labels of the training frame as an array; shown as the cell output.
features = training_data.columns.values
features

array(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

In [5]:
# Cross-tabulate outcome against sex: each cell holds the count of non-null
# PassengerId values for that (Survived, Sex) combination.
survival_by_sex = training_data.pivot_table(
    values="PassengerId",
    index="Survived",
    columns=["Sex"],
    aggfunc="count",
)
survival_by_sex

Sex,female,male
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,81,468
1,233,109


In [6]:
# Overall headcount and the sizes of the two outcome groups.
total = training_data.shape[0]
n_survived, n_perished = len(survived), len(perished)

# Headcount per sex: sum a boolean mask instead of materialising a filtered frame.
n_males = int(training_data["Sex"].eq("male").sum())
n_females = int(training_data["Sex"].eq("female").sum())

# Headcount per passenger class.
# NOTE: the spelling `n_firs_class` is kept because a later cell reads that name.
n_firs_class = int(training_data["Pclass"].eq(1).sum())
n_second_class = int(training_data["Pclass"].eq(2).sum())
n_third_class = int(training_data["Pclass"].eq(3).sum())

In [7]:
# x-axis groups for the chart and the stack labels within each bar.
categories = ["Total", "Perished", "Survived"]
sexes = ["Male", "Female"]

# For each sex, its share of: all passengers, those who perished, those who
# survived (same order as `categories`).
data = {
    "categories": categories,
    **{
        label: [
            n_of_sex / total,
            survival_by_sex.loc[0, sex] / n_perished,
            survival_by_sex.loc[1, sex] / n_survived,
        ]
        for label, sex, n_of_sex in zip(sexes, ("male", "female"), (n_males, n_females))
    },
}

# Two colours taken from the 5-colour RdYlBu palette, one per stack.
colors = palettes.linear_palette(palettes.RdYlBu[5], 2)

In [8]:
# Stacked bar chart: one bar per entry in `categories`, split into the
# "Male" and "Female" shares held in `data`.
# `height`/`width` are used instead of `plot_height`/`plot_width`: the latter
# were deprecated in Bokeh 2.4 and removed in Bokeh 3.0, where they raise an
# "unexpected attribute" error.
p = plotting.figure(x_range=categories,
                    height=300,
                    width=700,
                    title="Survival by Sex",
                    tools="hover",
                    # $name resolves to the hovered stack's column, so the
                    # tooltip reads "<stack name>: <its value>".
                    tooltips="$name: @$name",
                    toolbar_location=None)
# List-valued legend_label/color are applied per stack, in the order of `sexes`.
p.vbar_stack(sexes, x="categories", width=0.9, source=data, legend_label=sexes, color=colors)
io.show(p)

In [9]:
# Cross-tabulate outcome against passenger class: each cell holds the count of
# non-null PassengerId values for that (Survived, Pclass) combination.
survival_by_class = training_data.pivot_table(
    values="PassengerId",
    index="Survived",
    columns=["Pclass"],
    aggfunc="count",
)
survival_by_class

Pclass,1,2,3
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,80,97,372
1,136,87,119


In [10]:
# x-axis groups for the chart and the stack labels within each bar.
categories = ["Total", "Perished", "Survived"]
classes = ["First", "Second", "Third"]

# For each passenger class, its share of: all passengers, those who perished,
# those who survived (same order as `categories`).
data = {
    "categories": categories,
    **{
        label: [
            n_in_class / total,
            survival_by_class.loc[0, pclass] / n_perished,
            survival_by_class.loc[1, pclass] / n_survived,
        ]
        for label, pclass, n_in_class in zip(
            classes, (1, 2, 3), (n_firs_class, n_second_class, n_third_class)
        )
    },
}

# Three colours taken from the 5-colour RdYlBu palette, one per stack.
colors = palettes.linear_palette(palettes.RdYlBu[5], 3)

In [11]:
# Stacked bar chart: one bar per entry in `categories`, split into the
# "First", "Second" and "Third" class shares held in `data`.
# `height`/`width` are used instead of `plot_height`/`plot_width`: the latter
# were deprecated in Bokeh 2.4 and removed in Bokeh 3.0, where they raise an
# "unexpected attribute" error.
p = plotting.figure(x_range=categories,
                    height=300,
                    width=700,
                    title="Survival by Class",
                    tools="hover",
                    # $name resolves to the hovered stack's column, so the
                    # tooltip reads "<stack name>: <its value>".
                    tooltips="$name: @$name",
                    toolbar_location=None)
# List-valued legend_label/color are applied per stack, in the order of `classes`.
p.vbar_stack(classes, x="categories", width=0.9, source=data, legend_label=classes, color=colors)
io.show(p)