# Titanic: Machine Learning from Disaster

In this competition, you must predict the fate of the passengers aboard the RMS Titanic, which famously sank in the Atlantic ocean during its maiden voyage from the UK to New York City after colliding with an iceberg.
While there could hardly be a more chaotic event than frightened people scrambling to escape a sinking ship, the disaster is famous for saving “women and children first”. With an inadequate number of lifeboats available only a fraction of the passengers survived, and through this series of lessons, we’ll try to predict who they were.

As with most Kaggle competitions, you are given two datasets:

 - a training set, complete with the outcome (or target variable) for a group of passengers as well as a collection of other parameters such as their age, gender, etc. This is the dataset on which you must train your predictive model.
 - a test set, for which you must predict the now unknown target variable based on the other passenger attributes that are provided for both datasets.

In [76]:
# Import packages required fpr this problem
import pandas as pd
import os
os.chdir('C:\\Users\\Rahul\\Machine-Learning\\Approach Kaggle')

In [77]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [78]:
# You dont see any negative values in age, Siblings, Fare - commonsense
train.describe()



Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,,0.0,0.0,7.9104
50%,446.0,0.0,3.0,,0.0,0.0,14.4542
75%,668.5,1.0,3.0,,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [79]:
train.describe(include=['object'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Isham, Miss. Ann Elizabeth",male,CA. 2343,G6,S
freq,1,577,7,4,644


In [80]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [81]:
train.Survived.value_counts(normalize=True)

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [82]:
# The disaster was famous for saving “women and children first”
train.Sex.value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [83]:
pd.crosstab(train.Survived, train.Sex,normalize=True)

Sex,female,male
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.090909,0.525253
1,0.261504,0.122334


In [84]:
pd.crosstab(train.Survived,train.SibSp,normalize=True)

SibSp,0,1,2,3,4,5,8
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.446689,0.108866,0.016835,0.013468,0.016835,0.005612,0.007856
1,0.23569,0.125701,0.01459,0.004489,0.003367,0.0,0.0


In [85]:
# Define bins as 0 to 20, 20 to 40, 40 to 60, 60 to 80
bins = [0,20,40,60,80]
group_names = ['young', 'middle','adult', 'old']
train['Agebins'] = pd.cut(train.Age, bins, labels=group_names)
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Agebins
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,middle
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,middle
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,middle
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,middle
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,middle
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,adult
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,young
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,middle
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,young


In [86]:
pd.crosstab(train.Agebins,train.Survived)

Survived,0,1
Agebins,Unnamed: 1_level_1,Unnamed: 2_level_1
young,97,82
middle,232,153
adult,78,50
old,17,5


In [87]:
train.Agebins.value_counts()

middle    385
young     179
adult     128
old        22
Name: Agebins, dtype: int64

In [88]:
pd.crosstab([train.Agebins,train.Sex],train.Survived)

Unnamed: 0_level_0,Survived,0,1
Agebins,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1
young,female,24,53
young,male,73,29
middle,female,29,107
middle,male,203,46
adult,female,11,34
adult,male,67,16
old,female,0,3
old,male,17,2


In [89]:
train.Survived.cov(train.Age)

-0.55129609768170951

In [91]:
bins2 = [0,105,210,315,420,525]
labelz = ['low','lowmid','medium','midhigh','high']
train['pricebin'] = pd.cut(train.Fare,bins2,labels=labelz)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Agebins,pricebin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,middle,low
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,middle,low
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,middle,low
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,middle,low
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,middle,low


In [92]:
# All the very rich people survived
pd.crosstab(train.Survived,train.pricebin)

Fare,low,lowmid,medium,midhigh,high
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,521,8,6,0,0
1,302,25,11,0,3


In [136]:
# So here we can convert all the pricebin null values to zero
train[train.pricebin.isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Agebins,pricebin,designation,firstName,lastname
179,180,0,3,"Leonard, Mr. Lionel",male,36.0,0,0,LINE,0.0,,S,middle,,Leonard,"Leonard,",Lionel
263,264,0,1,"Harrison, Mr. William",male,40.0,0,0,112059,0.0,B94,S,middle,,Harrison,"Harrison,",William
271,272,1,3,"Tornquist, Mr. William Henry",male,25.0,0,0,LINE,0.0,,S,middle,,Tornquist,"Tornquist,",William
277,278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0.0,,S,,,Parkes,"Parkes,",Francis
302,303,0,3,"Johnson, Mr. William Cahoone Jr",male,19.0,0,0,LINE,0.0,,S,young,,Johnson,"Johnson,",William
413,414,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0.0,,S,,,Cunningham,"Cunningham,",Alfred
466,467,0,2,"Campbell, Mr. William",male,,0,0,239853,0.0,,S,,,Campbell,"Campbell,",William
481,482,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0.0,,S,,,Frost,"Frost,",Anthony
597,598,0,3,"Johnson, Mr. Alfred",male,49.0,0,0,LINE,0.0,,S,adult,,Johnson,"Johnson,",Alfred
633,634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0.0,,S,,,Parr,"Parr,",William


In [107]:
def get_position(mylist,position):
    return mylist[position]

In [118]:
train['firstName'] = train.Name.str.split().apply(get_position,position = 0)
train['designation'] = train.Name.str.split().apply(get_position,position = 1)
train['lastname'] = train.Name.str.split().apply(get_position,position = 2)
train.head()
train.designation.unique()

array(['Mr.', 'Mrs.', 'Miss.', 'Master.', 'Planke,', 'Don.', 'Rev.',
       'Billiard,', 'der', 'Walle,', 'Dr.', 'Pelsmaeker,', 'Mulder,', 'y',
       'Steen,', 'Carlo,', 'Mme.', 'Impe,', 'Ms.', 'Major.', 'Gordon,',
       'Messemaeker,', 'Mlle.', 'Col.', 'Capt.', 'Velde,', 'the',
       'Shawah,', 'Jonkheer.', 'Melkebeke,', 'Cruyssen,'], dtype=object)

In [215]:
a = train.Name.str.split('.').apply(get_position,position = 0)
train['lastname'] = train.Name.str.split('.').apply(get_position,position = 1)
train['designation'] = a.str.split(',').apply(get_position,position = 1)
train['firstName'] = a.str.split(',').apply(get_position,position = 0)
train.designation.unique()

array([' Mr', ' Mrs', ' Miss', ' Master', ' Don', ' Rev', ' Dr', ' Mme',
       ' Ms', ' Major', ' Lady', ' Sir', ' Mlle', ' Col', ' Capt',
       ' the Countess', ' Jonkheer'], dtype=object)

In [216]:
a = train.firstName.value_counts()
a = pd.DataFrame(a)
b = train.groupby('firstName').Survived.agg('sum')
b = pd.DataFrame(b)

In [217]:
a = a.reset_index(level=None, drop=False, inplace=False, col_level=0, col_fill='')
b = b.reset_index(level=None, drop=False, inplace=False, col_level=0, col_fill='')

In [220]:
a.rename(columns={'index':'firstName','firstName':'familycount'},inplace=True)

Index(['firstName', 'familycount'], dtype='object')
Index(['firstName', 'Survived'], dtype='object')


In [223]:
result = pd.merge(a, b, on='firstName')
result.familycount.cov(result.Survived)

0.23295109202155667

In [130]:
pd.crosstab([train.Sex,train.designation],train.Survived)

Unnamed: 0_level_0,Survived,0,1
Sex,designation,Unnamed: 2_level_1,Unnamed: 3_level_1
female,Dr.,0,1
female,"Gordon,",0,1
female,"Impe,",2,0
female,"Messemaeker,",0,1
female,Miss.,53,126
female,Mlle.,0,2
female,Mme.,0,1
female,Mrs.,24,97
female,Ms.,0,1
female,"Planke,",2,0
