# Load Dataset #

In [1]:
import numpy as np
import pandas as pd

train = pd.read_csv("data/train.csv", index_col=["PassengerId"])

print(train.shape)
train.head()

(891, 11)


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
test = pd.read_csv("data/test.csv", index_col=["PassengerId"])

print(test.shape)
test.head()

(418, 10)


Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Preprocessing #

In [3]:
combined = pd.concat([train,test])

print(combined.shape)
combined.tail()

(1309, 11)


Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1305,,,S,8.05,"Spector, Mr. Woolf",0,3,male,0,,A.5. 3236
1306,39.0,C105,C,108.9,"Oliva y Ocana, Dona. Fermina",0,1,female,0,,PC 17758
1307,38.5,,S,7.25,"Saether, Mr. Simon Sivertsen",0,3,male,0,,SOTON/O.Q. 3101262
1308,,,S,8.05,"Ware, Mr. Frederick",0,3,male,0,,359309
1309,,,C,22.3583,"Peter, Master. Michael J",1,3,male,1,,2668


## Encode Sex ##

In [4]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
combined["Sex_encode"] = encoder.fit_transform(combined["Sex"])

print(combined.shape)
combined[["Sex_encode","Sex"]].head()

(1309, 12)


Unnamed: 0_level_0,Sex_encode,Sex
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,male
2,0,female
3,0,female
4,0,female
5,1,male


## Fill out missing fare ##

In [5]:
mean_fare = combined["Fare"].mean()
print (mean_fare)

33.2954792813456


In [6]:
combined["Fare_fillout"] = combined["Fare"]

combined.loc[pd.isnull(combined["Fare"]),"Fare_fillout"]=mean_fare

missing_fare = combined[pd.isnull(combined["Fare"])]

print(missing_fare.shape)
combined.loc[missing_fare.index,["Fare","Fare_fillout"]].head()

(1, 13)


Unnamed: 0_level_0,Fare,Fare_fillout
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1044,,33.295479


## Fill out missing age

In [7]:
mean_age = combined["Age"].mean()

combined["Age_fillout"] = combined["Age"]

combined.loc[pd.isnull(combined["Age"]),"Age_fillout"]=mean_age

missing_age = combined[pd.isnull(combined["Age"])]

print(missing_age.shape)
combined.loc[missing_age.index,["Age","Age_fillout"]].head()

(263, 14)


Unnamed: 0_level_0,Age,Age_fillout
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
6,,29.881138
18,,29.881138
20,,29.881138
27,,29.881138
29,,29.881138


## Combine Parch & SibSp ##

In [8]:
combined["Parch_SibSp"] = combined["Parch"] + combined["SibSp"]

print(combined.shape)
combined[["Parch_SibSp","Parch","SibSp"]].head()

(1309, 15)


Unnamed: 0_level_0,Parch_SibSp,Parch,SibSp
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,0,1
2,1,0,1
3,0,0,0
4,1,0,1
5,0,0,0


## Sort names ##

In [9]:
combined["IsMaster"] = combined["Name"]
combined["IsMr"] = combined["Name"]
combined["IsMrs"] = combined["Name"]



for i in range(1,len(combined["Name"])+1):
    if "Master" in combined["Name"][i]:
        combined["IsMaster"][i] = True
    else:
        combined["IsMaster"][i] = False
        
    if "Mr" in combined["Name"][i]:
        if "Mrs" not in combined["Name"][i]:
            combined["IsMr"][i] = True
            combined["IsMrs"][i] = False
        else:
            combined["IsMr"][i] = False
            combined["IsMrs"][i] = True
    else:
        combined["IsMr"][i] = False
        combined["IsMrs"][i] = False

print (combined.shape)
combined[["IsMr","IsMrs","IsMaster"]].head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://panda

(1309, 18)


Unnamed: 0_level_0,IsMr,IsMrs,IsMaster
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,True,False,False
2,False,True,False
3,False,False,False
4,False,True,False
5,True,False,False


## Encode Embarked ##

In [10]:
embarked = pd.get_dummies(combined["Embarked"],prefix="Embarked").astype(np.bool)

In [11]:
combined = pd.concat([combined,embarked],axis=1)

print(combined.shape)
combined[["Embarked","Embarked_C","Embarked_Q","Embarked_S"]].head()

(1309, 21)


Unnamed: 0_level_0,Embarked,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,S,False,False,True
2,C,True,False,False
3,S,False,False,True
4,S,False,False,True
5,S,False,False,True


## Split dataset into train/test ##

In [12]:
train = combined[pd.notnull(combined["Survived"])]

print(train.shape)
train.head()

(891, 21)


Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,...,Sex_encode,Fare_fillout,Age_fillout,Parch_SibSp,IsMaster,IsMr,IsMrs,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0.0,...,1,7.25,22.0,1,False,True,False,False,False,True
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1.0,...,0,71.2833,38.0,1,False,False,True,True,False,False
3,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,1.0,...,0,7.925,26.0,0,False,False,False,False,False,True
4,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,1.0,...,0,53.1,35.0,1,False,False,True,False,False,True
5,35.0,,S,8.05,"Allen, Mr. William Henry",0,3,male,0,0.0,...,1,8.05,35.0,0,False,True,False,False,False,True


In [13]:
test = combined[pd.isnull(combined["Survived"])]

test.drop("Survived",axis=1,inplace=True)

print(test.shape)
test.head()

(418, 20)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Ticket,Sex_encode,Fare_fillout,Age_fillout,Parch_SibSp,IsMaster,IsMr,IsMrs,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
892,34.5,,Q,7.8292,"Kelly, Mr. James",0,3,male,0,330911,1,7.8292,34.5,0,False,True,False,False,True,False
893,47.0,,S,7.0,"Wilkes, Mrs. James (Ellen Needs)",0,3,female,1,363272,0,7.0,47.0,1,False,False,True,False,False,True
894,62.0,,Q,9.6875,"Myles, Mr. Thomas Francis",0,2,male,0,240276,1,9.6875,62.0,0,False,True,False,False,True,False
895,27.0,,S,8.6625,"Wirz, Mr. Albert",0,3,male,0,315154,1,8.6625,27.0,0,False,True,False,False,False,True
896,22.0,,S,12.2875,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,3,female,1,3101298,0,12.2875,22.0,2,False,False,True,False,False,True


# Train #

In [14]:
feature_names = ["Pclass", "Sex_encode", "Fare_fillout", "Age_fillout", "IsMaster"]
feature_names = feature_names + list(embarked.columns)

label_name = "Survived"

In [15]:
X_train = train[feature_names]

print(X_train.shape)
X_train.head()

(891, 8)


Unnamed: 0_level_0,Pclass,Sex_encode,Fare_fillout,Age_fillout,IsMaster,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,1,7.25,22.0,False,False,False,True
2,1,0,71.2833,38.0,False,True,False,False
3,3,0,7.925,26.0,False,False,False,True
4,1,0,53.1,35.0,False,False,False,True
5,3,1,8.05,35.0,False,False,False,True


In [16]:
y_train = train[label_name]

print(y_train.shape)
y_train.head()

(891,)


PassengerId
1    0.0
2    1.0
3    1.0
4    1.0
5    0.0
Name: Survived, dtype: float64

In [17]:
from sklearn.tree import DecisionTreeClassifier
seed = 34
model = DecisionTreeClassifier(max_depth = 5, random_state=seed)

# Score #

In [18]:
from sklearn.cross_validation import cross_val_score
%time score = cross_val_score(model, X_train, y_train, cv = 100).mean()

print(score)



Wall time: 239 ms
0.834388888889


# Predict #

In [19]:
X_test = test[feature_names]

print(X_test.shape)
X_test.head()

(418, 8)


Unnamed: 0_level_0,Pclass,Sex_encode,Fare_fillout,Age_fillout,IsMaster,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,3,1,7.8292,34.5,False,False,True,False
893,3,0,7.0,47.0,False,False,False,True
894,2,1,9.6875,62.0,False,False,True,False
895,3,1,8.6625,27.0,False,False,False,True
896,3,0,12.2875,22.0,False,False,False,True


In [20]:
model.fit(X_train,y_train)

prediction = model.predict(X_test)

print(prediction.shape)
prediction[:20]

(418,)


array([ 0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,
        0.,  1.,  1.,  0.,  0.,  1.,  0.])

# Submit #

In [21]:
submission = pd.read_csv("data/gender_submission.csv",index_col="PassengerId")

submission["Survived"] = prediction.astype(np.int32)

print(submission.shape)
submission.head()

(418, 1)


Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1
