In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt
%matplotlib inline

In [301]:
data_path = '/home/ubuntu/data/titanic'
titanic_df = pd.read_csv(data_path + '/train.csv')
train_df = titanic_df 
test_df = pd.read_csv(data_path + '/test.csv')
# only for test_df, since there is a missing "Fare" values
test_df["Fare"].fillna(test_df["Fare"].median(), inplace=True)
combine = [train_df, test_df]

# drop unnecessary columns, these columns won't be useful in analysis and prediction
titanic_df = titanic_df.drop(['PassengerId','Name','Ticket'], axis=1)
test_df    = test_df.drop(['Name','Ticket'], axis=1)

## Define our train and test sets

In [302]:
Y_train = titanic_df["Survived"]
X_train = titanic_df.drop(['Cabin','Survived', 'Sex', 'Embarked', 'Age'],axis=1)

X_test  = test_df.drop(['PassengerId','Cabin', 'Sex', 'Embarked', 'Age'],axis=1)

In [303]:
X_test.head()

Unnamed: 0,Pclass,SibSp,Parch,Fare
0,3,0,0,7.8292
1,3,1,0,7.0
2,2,0,0,9.6875
3,3,0,0,8.6625
4,3,1,1,12.2875


In [304]:
X_test.isnull().sum()

Pclass    0
SibSp     0
Parch     0
Fare      0
dtype: int64

In [305]:
from sklearn import tree

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
clf.score(X_train, Y_train)

0.83613916947250277

## Feature engineering 

### Impute missing values

In [306]:
titanic_df.isnull().sum()


Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [316]:
# titanic_df.loc[titanic_df['Age'].isnull()] = titanic_df['Age'].median()
# test_df.loc[test_df['Age'].isnull()] = test_df['Age'].median()

titanic_df.Age[titanic_df['Age'].isnull()] = 0

test_df.l[test_df['Age'].isnull()] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [308]:
titanic_df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Cabin       529
Embarked      2
dtype: int64

### One hot encode categorical variables

In [309]:
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [310]:
# # encode embarked column
# titanic_df["Embarked"] = titanic_df["Embarked"].fillna("S")

# embark_dummies_titanic  = pd.get_dummies(titanic_df['Embarked'])
# embark_dummies_titanic.drop(['S'], axis=1, inplace=True)
# embark_dummies_test  = pd.get_dummies(test_df['Embarked'])
# embark_dummies_test.drop(['S'], axis=1, inplace=True)

# titanic_df = titanic_df.join(embark_dummies_titanic)
# test_df    = test_df.join(embark_dummies_test)




In [311]:
# # bin and encode fare column

# bins = [0, 10, 20, 30, 520]
# group_names = ['Cheapest', 'Cheap', 'Middle', 'Expensive']
# titanic_df['FareGroup'] = pd.cut(train_df['Fare'], bins, labels=group_names)

# test_df['FareGroup'] = pd.cut(train_df['Fare'], bins, labels=group_names)


In [312]:
# fare_dummies_titanic  = pd.get_dummies(titanic_df['FareGroup'])
# fare_dummies_test  = pd.get_dummies(test_df['FareGroup'])

# titanic_df = titanic_df.join(fare_dummies_titanic)
# test_df    = test_df.join(fare_dummies_test)

# titanic_df.drop(['FareGroup'], axis=1,inplace=True)
# test_df.drop(['FareGroup'], axis=1,inplace=True)

In [313]:
# encode gender column

print(titanic_df['Sex'].value_counts())



gender_code = {"male":0 ,"female":1}
titanic_df['Gender'] = titanic_df['Sex'].apply(gender_code.get).astype(float)
test_df['Gender'] = test_df['Sex'].apply(gender_code.get).astype(float)

male      453
female    261
0         177
Name: Sex, dtype: int64


In [287]:
print(titanic_df['Gender'].value_counts())

male      577
female    314
Name: Sex, dtype: int64

## Create extra columns

In [271]:
# full_data = [titanic_df, test_df]

# for dataset in full_data:
#     dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

# for dataset in full_data:
#     dataset['IsAlone'] = 0
#     dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
    
# for dataset in full_data:
#     dataset['Child'] = 0
#     dataset.loc[dataset['Age'] < 16, 'Child'] = 1

In [280]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Survived    891 non-null float64
Pclass      891 non-null float64
Age         891 non-null float64
SibSp       891 non-null float64
Parch       891 non-null float64
Fare        891 non-null float64
Cabin       362 non-null object
Embarked    889 non-null object
Gender      714 non-null float64
dtypes: float64(7), object(2)
memory usage: 62.7+ KB


In [273]:
Y_train = titanic_df["Survived"]
X_train = titanic_df.drop(['Cabin','Survived','Fare', 'Embarked', 'Sex'],axis=1)

X_test  = test_df.drop(['Cabin', 'Fare', 'Embarked', 'Sex'],axis=1)


In [274]:
X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Gender
0,3.0,22.0,1.0,0.0,0.0
1,1.0,38.0,1.0,0.0,1.0
2,3.0,26.0,0.0,0.0,1.0
3,1.0,35.0,1.0,0.0,1.0
4,3.0,35.0,0.0,0.0,0.0


In [275]:
X_test.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Gender
0,892.0,3.0,34.5,0.0,0.0,0.0
1,893.0,3.0,47.0,1.0,0.0,1.0
2,894.0,2.0,62.0,0.0,0.0,0.0
3,895.0,3.0,27.0,0.0,0.0,0.0
4,896.0,3.0,22.0,1.0,1.0,1.0


In [276]:
from sklearn import tree

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
clf.score(X_train, Y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [277]:
X_train.isnull().sum()

Pclass      0
Age         0
SibSp       0
Parch       0
Gender    177
dtype: int64