In [44]:
from sklearn import preprocessing 
from sklearn.model_selection import GridSearchCV 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import RandomForestRegressor

import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
# Disable pandas' SettingWithCopyWarning for the whole notebook.
# NOTE(review): this also hides genuine "assignment to a copy" mistakes in
# later cells; prefer explicit .copy() on slices and re-enable the warning.
pd.options.mode.chained_assignment = None

In [45]:
# Load the labelled training set, the unlabelled test set, and the sample
# submission file whose layout (PassengerId, Survived) is reused for output.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
submit = pd.read_csv("gender_submission.csv")

In [46]:
# Stack train and test so every feature-engineering step is applied to both.
# Test rows have no 'Survived' value, so that column is NaN for them; later
# cells use this to split the frame back apart.
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in 2.0);
# ignore_index=True yields the same fresh 0..n-1 index as the old reset_index.
data_combine = pd.concat([train, test], ignore_index=True)
data_combine.shape

(1309, 12)

## Feature engineering

In [47]:
# Combine SibSp and Parch into a single 'family' count feature.
data_combine['family'] = data_combine['SibSp'] + data_combine['Parch']
# Drop both source columns in one call. `axis` is passed by keyword because
# the positional form `drop(label, 1)` is no longer accepted by pandas 2.0.
data_combine = data_combine.drop(['SibSp', 'Parch'], axis=1)
data_combine.shape

(1309, 11)

In [48]:
def _ticket_prefix(ticket):
    """Return the alphabetic prefix of a ticket string, or 'X' if it is all digits.

    Dots and slashes are removed and surrounding whitespace stripped before the
    first space-separated token is taken as the prefix.
    """
    if ticket.isdigit():
        return 'X'
    cleaned = ticket.replace(".", "").replace("/", "").strip()
    return cleaned.split(' ')[0]


data_combine['Ticket_info'] = data_combine['Ticket'].apply(_ticket_prefix)

In [49]:
# Fare: replace missing values with the mean fare of the combined frame.
fare_mean = data_combine['Fare'].mean()
data_combine['Fare'] = data_combine['Fare'].fillna(fare_mean)

In [50]:
# Embarked: fill missing ports with "S".
# (An earlier experiment filled with "C" instead; kept below for reference.)
#data_combine['Embarked'] = data_combine['Embarked'].fillna("C")
data_combine['Embarked'] = data_combine['Embarked'].fillna("S")

In [51]:
# Cabin: reduce each cabin code to its first character and give passengers
# without a recorded cabin the explicit category 'NoCabin'.
#
# A previous experiment (removed) filled missing cabins based on 'Survived';
# that leaks the target into a feature and must not be reintroduced.
#
# The extra `== 'NoCabin'` check keeps the cell idempotent: re-running it would
# otherwise truncate 'NoCabin' to 'N' and create a bogus category.
data_combine["Cabin"] = data_combine["Cabin"].apply(
    lambda cabin: 'NoCabin' if pd.isnull(cabin) or cabin == 'NoCabin' else str(cabin)[0]
)
data_combine["Cabin"].unique()

array(['NoCabin', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [52]:
# Name -> Title.
# 'Title1' is the raw token between ", " and the first "." in the name
# (e.g. "Braund, Mr. Owen Harris" -> "Mr"). 'Title' folds the rare tokens
# into the four groups Mr / Mrs / Miss / Master using the mapping below.
# A dict keeps each rare title next to its replacement, so the pairs cannot
# drift out of alignment the way two parallel lists can.
TITLE_GROUPS = {
    'Mlle': 'Miss',
    'Mme': 'Mrs',
    'Ms': 'Miss',
    'Dr': 'Mr',
    'Major': 'Mr',
    'Lady': 'Mrs',
    'the Countess': 'Mrs',
    'Jonkheer': 'Mr',
    'Col': 'Mr',
    'Rev': 'Mr',
    'Capt': 'Mr',
    'Sir': 'Mr',
    'Don': 'Mr',
    'Dona': 'Mrs',
}

data_combine['Title1'] = (
    data_combine['Name']
    .str.split(", ", expand=True)[1]
    .str.split(".", expand=True)[0]
)
data_combine['Title'] = data_combine['Title1'].replace(TITLE_GROUPS)

# Survival counts per title group (test rows have NaN 'Survived' and are
# therefore excluded from the table).
pd.crosstab(data_combine['Title'], data_combine['Survived'])

Survived,0.0,1.0
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Master,17,23
Miss,55,130
Mr,451,87
Mrs,26,102


In [53]:
# Label Encoding : http://pbpython.com/categorical-encoding.html
# Each listed column is converted to pandas' category dtype and replaced by
# its integer category code.
for column in ['Sex', 'Embarked', 'Pclass', 'Title', 'Cabin', 'Ticket_info']:
    data_combine[column] = data_combine[column].astype('category').cat.codes
data_combine.head(5)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,PassengerId,Pclass,Sex,Survived,Ticket,family,Ticket_info,Title1,Title
0,22.0,7,2,7.25,"Braund, Mr. Owen Harris",1,2,1,0.0,A/5 21171,1,2,Mr,2
1,38.0,2,0,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,0,0,1.0,PC 17599,1,14,Mrs,3
2,26.0,7,2,7.925,"Heikkinen, Miss. Laina",3,2,0,1.0,STON/O2. 3101282,0,31,Miss,1
3,35.0,2,2,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,0,0,1.0,113803,1,36,Mrs,3
4,35.0,7,2,8.05,"Allen, Mr. William Henry",5,2,1,0.0,373450,0,36,Mr,2


In [54]:
# Age: impute missing ages with a random-forest regressor trained on the rows
# whose age is known.
ageColumns = ['Embarked', 'Fare', 'Pclass', 'Sex', 'family', 'Title', 'Cabin', 'Ticket_info']

ageIsNull = data_combine["Age"].isnull()
# Explicit copies: the predicted ages are written into dataAgeNull below, and
# writing into a bare slice of data_combine is ambiguous (view vs. copy).
dataAgeNull = data_combine[ageIsNull].copy()
dataAgeNotNull = data_combine[~ageIsNull].copy()

# Guard so the cell can be re-run: once every age is filled there is nothing
# to predict, and calling predict() on an empty frame raises.
if not dataAgeNull.empty:
    # Exclude training rows lying more than 4 standard deviations from the
    # mean in Fare or in the Ticket_info code.
    # NOTE(review): Ticket_info is a category code, so a distance-from-mean
    # test on it has no obvious meaning -- kept to preserve behaviour; confirm.
    fareDeviation = (dataAgeNotNull["Fare"] - dataAgeNotNull["Fare"].mean()).abs()
    ticketDeviation = (dataAgeNotNull["Ticket_info"] - dataAgeNotNull["Ticket_info"].mean()).abs()
    isOutlier = (
        (fareDeviation > 4 * dataAgeNotNull["Fare"].std())
        | (ticketDeviation > 4 * dataAgeNotNull["Ticket_info"].std())
    )
    remove_outlier = dataAgeNotNull[~isOutlier]

    rfModel_age = RandomForestRegressor(n_estimators=2000, random_state=42)
    rfModel_age.fit(remove_outlier[ageColumns], remove_outlier["Age"])

    ageNullValues = rfModel_age.predict(X=dataAgeNull[ageColumns])
    dataAgeNull["Age"] = ageNullValues

    # Imputed rows first, then the known-age rows, with a fresh 0..n-1 index
    # (same layout the original append + reset_index produced). pd.concat
    # replaces DataFrame.append, which was removed in pandas 2.0.
    data_combine = pd.concat([dataAgeNull, dataAgeNotNull], ignore_index=True)

#print(data_combine.describe())

#print(data_combine.describe())

In [55]:
# Split the combined frame back into train (label present) and test (label
# missing), each ordered by PassengerId, keeping only the model columns.
featureColumns = ['Age', 'Embarked', 'Fare', 'Pclass', 'Sex', 'family', 'Title', 'Ticket_info', 'Cabin']
hasLabel = data_combine['Survived'].notnull()

# 'Survived' stays as the first column of dataTrain; the fitting cell relies
# on that position when it slices features and label with iloc.
dataTrain = data_combine[hasLabel].sort_values(by=["PassengerId"])[['Survived'] + featureColumns]
dataTest = data_combine[~hasLabel].sort_values(by=["PassengerId"])[featureColumns]
dataTrain.head(5)

Unnamed: 0,Survived,Age,Embarked,Fare,Pclass,Sex,family,Title,Ticket_info,Cabin
263,0.0,22.0,2,7.25,2,1,1,2,2,7
264,1.0,38.0,0,71.2833,0,0,1,3,14,2
265,1.0,26.0,2,7.925,2,0,0,1,31,7
266,1.0,35.0,2,53.1,0,0,1,3,36,2
267,0.0,35.0,2,8.05,2,1,0,2,36,7


In [56]:
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# now lives in sklearn.model_selection (already used by the top import cell).
from sklearn.model_selection import train_test_split

# Survival classifier. oob_score=True makes the fitted model expose
# `oob_score_`, which the fitting cell prints; random_state pins the result.
rf = RandomForestClassifier(
    criterion='gini',
    n_estimators=1000,
    min_samples_split=12,
    min_samples_leaf=1,
    oob_score=True,
    random_state=1,
    n_jobs=-1,
)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=12,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=True, random_state=1, verbose=0, warm_start=False)

In [57]:
#X_train, X_test, y_train, y_test = train_test_split(dataTrain.iloc[:, 1:], dataTrain.iloc[:, 0])
#rf.fit(X_train, y_train)
#rf_res =  rf.predict(X_test)
#print(metrics.classification_report(y_test, rf_res))

In [58]:

# Fit on the training frame: column 0 is the 'Survived' label, the remaining
# columns are the features.
rf.fit(dataTrain.iloc[:, 1:], dataTrain.iloc[:, 0])
print("%.4f" % rf.oob_score_)   # out-of-bag score; higher is better

rf_res = rf.predict(dataTest)
print(rf_res.shape)
print(submit['Survived'].shape)

# Write the model's predictions into the submission frame. Previously rf_res
# was computed but never stored, so submit.csv only contained the values
# loaded from gender_submission.csv.
# dataTest is sorted by PassengerId; this positional assignment assumes the
# rows of gender_submission.csv are in the same ascending order -- verify.
assert len(rf_res) == len(submit), "prediction count does not match submission rows"
submit['Survived'] = rf_res.astype(int)
submit.to_csv('submit.csv', index= False)

submit.head(10)

0.8373
(418,)
(418,)


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0
