#Implement baseline Random Forest model

In [27]:
import pandas as pd
import numpy as np

import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#Make Dummies

###Sex

In [3]:
# check type, distinct count, & values
print train.Sex.dtype, len(train.Sex.unique()), train.Sex.unique() 

# make boolean
train.Sex = (train.Sex == 'male')

# verify
print train.Sex.dtype, len(train.Sex.unique()), train.Sex.unique() 

object 2 ['male' 'female']
bool 2 [True False]


###Embarked

In [4]:
# treat nan; assume 'S'
print train.Embarked.value_counts(dropna=False)
train.Embarked = train.Embarked.replace(np.nan, 'S')

train.Embarked.value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64


S    646
C    168
Q     77
Name: Embarked, dtype: int64

In [5]:
print train.Embarked.dtype, len(train.Embarked.unique()), train.Embarked.unique()

dummies = pd.get_dummies(train.Embarked, prefix='embarked').astype(np.bool)
train = pd.concat([train, dummies], axis=1)
train.head()

object 3 ['S' 'C' 'Q']


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,embarked_C,embarked_Q,embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",True,22.0,1,0,A/5 21171,7.25,,S,False,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",False,38.0,1,0,PC 17599,71.2833,C85,C,True,False,False
2,3,1,3,"Heikkinen, Miss. Laina",False,26.0,0,0,STON/O2. 3101282,7.925,,S,False,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",False,35.0,1,0,113803,53.1,C123,S,False,False,True
4,5,0,3,"Allen, Mr. William Henry",True,35.0,0,0,373450,8.05,,S,False,False,True


#General Cleaning

In [6]:
# inspect types
train.ftypes

PassengerId      int64:dense
Survived         int64:dense
Pclass           int64:dense
Name            object:dense
Sex               bool:dense
Age            float64:dense
SibSp            int64:dense
Parch            int64:dense
Ticket          object:dense
Fare           float64:dense
Cabin           object:dense
Embarked        object:dense
embarked_C        bool:dense
embarked_Q        bool:dense
embarked_S        bool:dense
dtype: object

In [12]:

for col in train.columns:
    if train[col].dtype == 'int64':
        train[col] = train[col].astype(np.int32)
    elif train[col].dtype == 'float64':
        train[col] = train[col].astype(np.float32)
    
train.Age = train.Age.astype(np.float32)
train.Fare = train.Fare.astype(np.float32)

train.ftypes

PassengerId      int32:dense
Survived         int32:dense
Pclass           int32:dense
Name            object:dense
Sex               bool:dense
Age            float32:dense
SibSp            int32:dense
Parch            int32:dense
Ticket          object:dense
Fare           float32:dense
Cabin           object:dense
Embarked        object:dense
embarked_C        bool:dense
embarked_Q        bool:dense
embarked_S        bool:dense
dtype: object

In [8]:
# define feature columns
features_list = []
exclude_list = 'Survived','PassengerId'

for col in train.columns:
    if ((train[col].dtype != 'object') & (col not in exclude_list)):
        features_list.append(col)
        
features_list

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'embarked_C',
 'embarked_Q',
 'embarked_S']

In [20]:
train_y = train.Survived
train_X = train[train.columns.intersection(features_list)]
train_X = train_X.replace(np.nan, 0)

train_X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,embarked_C,embarked_Q,embarked_S
0,3,True,22.0,1,0,7.25,False,False,True
1,1,False,38.0,1,0,71.283302,True,False,False
2,3,False,26.0,0,0,7.925,False,False,True
3,1,False,35.0,1,0,53.099998,False,False,True
4,3,True,35.0,0,0,8.05,False,False,True


In [21]:
for col in train_X.columns:
    print train_X[col].value_counts(dropna=False)

3    491
1    216
2    184
Name: Pclass, dtype: int64
True     577
False    314
Name: Sex, dtype: int64
0.00     177
24.00     30
22.00     27
18.00     26
30.00     25
19.00     25
28.00     25
21.00     24
25.00     23
36.00     22
29.00     20
35.00     18
32.00     18
27.00     18
26.00     18
16.00     17
31.00     17
23.00     15
20.00     15
33.00     15
34.00     15
39.00     14
17.00     13
40.00     13
42.00     13
45.00     12
38.00     11
50.00     10
4.00      10
2.00      10
        ... 
57.00      2
64.00      2
28.50      2
0.83       2
13.00      2
71.00      2
55.00      2
70.00      2
0.75       2
30.50      2
59.00      2
45.50      2
40.50      2
63.00      2
80.00      1
70.50      1
23.50      1
20.50      1
0.42       1
14.50      1
55.50      1
74.00      1
12.00      1
36.50      1
66.00      1
24.50      1
53.00      1
34.50      1
0.67       1
0.92       1
Name: Age, dtype: int64
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dty

#Train Random Forest

In [22]:
clf = RandomForestClassifier()

clf.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [30]:
train_predicted = clf.predict(train_X)

cm = confusion_matrix(train_y, train_predicted)
cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

print cm, '\n\n\n', cm_norm

accuracy_score(train_y, train_predicted)

[[544   5]
 [ 20 322]] 


[[ 0.99089253  0.00910747]
 [ 0.05847953  0.94152047]]


0.97194163860830529