# Imports

In [210]:
import pandas as pd

# Data

In [211]:
gender = pd.read_csv('gender_submission.csv')
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

# Clean

In [212]:
train.shape

(891, 12)

In [213]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [214]:
# make PassengerID the index
train.set_index('PassengerId', inplace = True)

In [215]:
train.index.is_unique

True

In [216]:
# make all columns lowercase
train.columns = train.columns.str.lower()

In [217]:
train.describe(include = 'object')

Unnamed: 0,name,sex,ticket,cabin,embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Harper, Miss. Annie Jessie ""Nina""",male,CA. 2343,B96 B98,S
freq,1,577,7,4,644


In [218]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
survived    891 non-null int64
pclass      891 non-null int64
name        891 non-null object
sex         891 non-null object
age         714 non-null float64
sibsp       891 non-null int64
parch       891 non-null int64
ticket      891 non-null object
fare        891 non-null float64
cabin       204 non-null object
embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 123.5+ KB


In [219]:
# convert sex to binary
train['sex'] = train.sex.map({
    'male' : 1,
    'female' : 0
})

In [220]:
# count cabins
train['cabin_count'] = train.cabin.str.split().str.len().fillna(0)

In [221]:
test = train.groupby('cabin_count').agg({
    'cabin_count' : 'count',
    'survived' : 'sum'
}).join(train.groupby('cabin_count').apply(lambda x : x.survived.sum() / x.cabin_count.count()).to_frame(name = 'ratio'))

In [222]:
test

Unnamed: 0_level_0,survived,cabin_count,ratio
cabin_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,206,687,0.299854
1.0,122,180,0.677778
2.0,9,16,0.5625
3.0,3,6,0.5
4.0,2,2,1.0


In [223]:
train.groupby('pclass').agg({
    'pclass' : 'count',
    'survived' : 'sum'
}).join(train.groupby('pclass').apply(lambda x : x.survived.sum() / x.pclass.count()).to_frame(name = 'ratio'))

Unnamed: 0_level_0,survived,pclass,ratio
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,136,216,0.62963
2,87,184,0.472826
3,119,491,0.242363


In [224]:
train.groupby([
    'sex',
    'pclass',
    'cabin_count'
]).agg({
    'pclass' : 'count',
    'survived' : 'sum'
}).join(train.groupby([
    'sex',
    'pclass',
    'cabin_count'
]).apply(lambda x : x.survived.sum() / x.pclass.count()).to_frame(name = 'rate'))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,survived,pclass,rate
sex,pclass,cabin_count,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,0.0,13,13,1.0
0,1,1.0,70,71,0.985915
0,1,2.0,4,6,0.666667
0,1,3.0,2,2,1.0
0,1,4.0,2,2,1.0
0,2,0.0,61,66,0.924242
0,2,1.0,9,10,0.9
0,3,0.0,68,138,0.492754
0,3,1.0,3,5,0.6
0,3,2.0,1,1,1.0
