In [62]:
# In this notebook we will be using the random forest algorithm to predict the survival outcomes of the titanic 
# passengers
import os.path as path
import pandas as pd
import numpy as np
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

data_folder = '/Users/GraysTECH/BigQLabs/portfolio/machine-learning/kaggle-titanic/data/'

files = {"training_file":"train.csv", "testing_file":"test.csv"}

train_file = path.join(data_folder,files["training_file"])
test_file = path.join(data_folder,files["testing_file"])


# We will be using the following columns to predict the outcome
predictors = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked","FamilySize","NameLength","Title"]
titanic = pd.read_csv(train_file)

#adress na's in the age column by assigning the median age 
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())

#Convert the category values of the Sex column to numerical values, encode male to 0 and female to 1
titanic.loc[titanic["Sex"] == "male","Sex"] = 0
titanic.loc[titanic["Sex"] == "female","Sex"] = 1

#address the na's in the embarked column
titanic["Embarked"] = titanic["Embarked"].fillna("S")
titanic.loc[titanic["Embarked"] == "S","Embarked"] = 0
titanic.loc[titanic["Embarked"] == "C","Embarked"] = 1
titanic.loc[titanic["Embarked"] == "Q","Embarked"] = 2

titanic_test = pd.read_csv(test_file)

#adress na's in the age column by assigning the median age of the train dataset
titanic_test["Age"] = titanic_test["Age"].fillna(titanic["Age"].median())

#Convert the category values of the Sex column to numerical values, encode male to 0 and female to 1
titanic_test.loc[titanic_test["Sex"] == "male","Sex"] = 0
titanic_test.loc[titanic_test["Sex"] == "female","Sex"] = 1

#address the na's in the Embarked column of the test data set
titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")
titanic_test.loc[titanic_test["Embarked"] == "S","Embarked"] = 0
titanic_test.loc[titanic_test["Embarked"] == "C","Embarked"] = 1
titanic_test.loc[titanic_test["Embarked"] == "Q","Embarked"] = 2

titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic_test["Fare"].median())

# Feature Engineering
# We can engineer new features based on existing features that can be used to improve the accuracy of the model
# 1. For example the length of the name of a person can be an indicator of the social status of the individual.
# 2. The total number of people in the family which is SibSp+Parch

#The easy way to generate features is to use the apply method of the pandas dataframe

titanic["FamilySize"] = titanic["SibSp"] + titanic["Parch"]
titanic["NameLength"] = titanic["Name"].apply(lambda x:len(x))

titanic_test["FamilySize"] = titanic_test["SibSp"] + titanic_test["Parch"]
titanic_test["NameLength"] = titanic_test["Name"].apply(lambda x:len(x))


In [63]:
# Using regular expressions to retrieve title from the name
import re

def get_title(name):
    # use regular expressions to search for title. Title's always consist of capital and lowercase letters, 
    # and end with a period.
    title_search = re.search(' ([A-Za-z]+)\.',name)
    if title_search:
        return title_search.group(1)
    
    return ""
# Get all the titles and print how often each one occurs
titles = titanic["Name"].apply(lambda x:get_title(x) )
#print(pandas.value_counts(titles))
print(pd.value_counts(titles))

# Map each title to an integer.  Some titles are very rare, and are compressed into the same codes as other titles.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
for k,v in title_mapping.items():
    titles[titles == k] = v
print(pd.value_counts(titles))

#Add in the title column
titanic["Title"] = titles

#feature engineer the test set
titles = titanic_test["Name"].apply(lambda x:get_title(x))
for k,v in title_mapping.items():
    titles[titles == k] = v
titanic_test["Title"] = titles

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Mlle          2
Major         2
Ms            1
Sir           1
Countess      1
Don           1
Capt          1
Lady          1
Mme           1
Jonkheer      1
Name: Name, dtype: int64
1     517
2     183
3     125
4      40
5       7
6       6
7       5
10      3
8       3
9       2
Name: Name, dtype: int64


In [64]:
#initialize the algorithm:
alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=4, min_samples_leaf=2)
#kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
#Compute the accuracy scores for all the cross-validation folds
#scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)

#print(scores)
#print(scores.mean())
alg.fit(titanic[predictors],titanic["Survived"])
#predictions = alg.predict(titanic_test[predictors])
titanic_test[predictors].describe()
#Create a csv file with only the columns required by the Kaggle competition
#submission = pd.DataFrame({"PassengerId":titanic_test["PassengerId"], "Survived":predictions})
#submission.to_csv(path.join(data_folder,"randomforest_1.csv"), index=False)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,FamilySize,NameLength
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,2.26555,29.805024,0.447368,0.392344,35.576535,0.839713,27.483254
std,0.841838,12.667969,0.89676,0.981429,55.850103,1.519072,9.971228
min,1.0,0.17,0.0,0.0,0.0,0.0,13.0
25%,1.0,23.0,0.0,0.0,7.8958,0.0,20.0
50%,3.0,28.0,0.0,0.0,14.4542,0.0,25.0
75%,3.0,35.75,1.0,0.0,31.471875,1.0,30.75
max,3.0,76.0,8.0,9.0,512.3292,10.0,63.0
