Kaggle Titanic Competition 

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import GridSearchCV

Get the data

In [2]:
train_data = pd.read_csv("train_titanic.csv", index_col=0)
train_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
test_data_final = pd.read_csv("test_titanic.csv", index_col=0)
test_data_final.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
train_data.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
test_data_final.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
count,418.0,332.0,418.0,418.0,417.0
mean,2.26555,30.27259,0.447368,0.392344,35.627188
std,0.841838,14.181209,0.89676,0.981429,55.907576
min,1.0,0.17,0.0,0.0,0.0
25%,1.0,21.0,0.0,0.0,7.8958
50%,3.0,27.0,0.0,0.0,14.4542
75%,3.0,39.0,1.0,0.0,31.5
max,3.0,76.0,8.0,9.0,512.3292


In [6]:
pd.isnull(train_data).sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [7]:
age_avg_train = train_data["Age"].mean()
age_std_train = train_data["Age"].std()

In [8]:
train_data["Age"].fillna(np.random.randint(age_avg_train - age_std_train, 
                                           age_avg_train + age_std_train), inplace=True)
train_data["Embarked"].fillna("Q", inplace=True)

In [9]:
pd.isnull(test_data_final).sum()

Pclass        0
Name          0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64

In [10]:
age_avg_test = test_data_final["Age"].mean()
age_std_test = test_data_final["Age"].std()

In [11]:
test_data_final["Age"].fillna(np.random.randint(age_avg_test - age_std_test, 
                                           age_avg_test + age_std_test), inplace=True)
test_data_final["Fare"].fillna(test_data_final["Fare"].median(), inplace=True)

In [12]:
# create a new column to see if who was traveling alone
train_data["n_of_Family_members"] = train_data["SibSp"] + train_data["Parch"]

In [13]:
train_data["n_of_Family_members"].value_counts()

0     537
1     161
2     102
3      29
5      22
4      15
6      12
10      7
7       6
Name: n_of_Family_members, dtype: int64

In [14]:
train_data[["n_of_Family_members", "Survived"]].groupby("n_of_Family_members").sum()

Unnamed: 0_level_0,Survived
n_of_Family_members,Unnamed: 1_level_1
0,163
1,89
2,59
3,21
4,3
5,3
6,4
7,0
10,0


In [15]:
# create a new column to store the info about if person is alone or not 
# 1 represent person who was traveling alone, 0 otherwise
train_data["Alone"] = 1
train_data.loc[train_data["n_of_Family_members"] > 0, "Alone"] = 0

In [16]:
test_data_final["n_of_Family_members"] = test_data_final["SibSp"] + test_data_final["Parch"]

In [17]:
test_data_final["n_of_Family_members"].value_counts()

0     253
1      74
2      57
3      14
4       7
10      4
6       4
5       3
7       2
Name: n_of_Family_members, dtype: int64

In [18]:
test_data_final["Alone"] = 1
test_data_final.loc[test_data_final["n_of_Family_members"] > 0, "Alone"] = 0

In [19]:
# create a new column for titles using regular expressions
train_data["Title"] = train_data["Name"].str.extract(' ([A-Za-z]+)\.')

  


In [20]:
train_data["Title"].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
Sir           1
Capt          1
Mme           1
Don           1
Countess      1
Lady          1
Jonkheer      1
Ms            1
Name: Title, dtype: int64

In [21]:
# clean up and group
train_data["Title"].replace(["Dr", "Rev", "Col", "Major", "Lady", "Countess", 
                            "Sir", "Capt", "Jonkheer", "Don"], "Unusual", inplace=True)
train_data["Title"].replace(["Ms", "Mlle"], "Miss", inplace=True)
train_data["Title"].replace("Mme", "Mrs", inplace=True)

In [22]:
train_data["Title"].value_counts()

Mr         517
Miss       185
Mrs        126
Master      40
Unusual     23
Name: Title, dtype: int64

In [23]:
# categorical ==> numeric
train_data["Title"] = train_data["Title"].map({"Mr": 0, "Miss": 1,
                                                        "Mrs": 2, "Master": 3, "Unusual": 4}).astype(int)

In [24]:
test_data_final["Title"] = test_data_final["Name"].str.extract(' ([A-Za-z]+)\.')

  """Entry point for launching an IPython kernel.


In [25]:
test_data_final["Title"].value_counts()

Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Dr          1
Dona        1
Ms          1
Name: Title, dtype: int64

In [26]:
test_data_final["Title"].replace(["Rev", "Col","Dona", "Dr"], "Unusual", inplace=True)
test_data_final["Title"].replace(["Ms"], "Miss", inplace=True)

In [27]:
test_data_final["Title"].value_counts()

Mr         240
Miss        79
Mrs         72
Master      21
Unusual      6
Name: Title, dtype: int64

In [28]:
test_data_final["Title"] = test_data_final["Title"].map({"Mr": 0, "Miss": 1,
                                                        "Mrs": 2, "Master": 3, "Unusual": 4}).astype(int)

In [29]:
train_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,n_of_Family_members,Alone,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,0,0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,2
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,1,1
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,0,2
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,1,0


In [30]:
test_data_final.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,n_of_Family_members,Alone,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,1,0
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1,0,2
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,1,0
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,1,0
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,2,0,2


In [31]:
# drop the data we do not need 
train_data.drop(["Ticket", "Cabin", "Name", "Parch", 
                 "SibSp", "n_of_Family_members"], axis=1, inplace=True)
test_data_final.drop(["Ticket", "Cabin", "Name", 
                      "Parch", "SibSp", "n_of_Family_members"], axis=1, inplace=True)

In [32]:
train_data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Fare,Embarked,Alone,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,7.25,S,0,0
2,1,1,female,38.0,71.2833,C,0,2
3,1,3,female,26.0,7.925,S,1,1
4,1,1,female,35.0,53.1,S,0,2
5,0,3,male,35.0,8.05,S,1,0


In [33]:
test_data_final.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Fare,Embarked,Alone,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,3,male,34.5,7.8292,Q,1,0
893,3,female,47.0,7.0,S,0,2
894,2,male,62.0,9.6875,Q,1,0
895,3,male,27.0,8.6625,S,1,0
896,3,female,22.0,12.2875,S,0,2


Categorical ==> numeric

In [34]:
train_data["Sex"] = train_data["Sex"].map({"female":0, "male": 1}).astype(int)

In [35]:
train_data['Embarked'] = train_data['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

In [36]:
test_data_final["Sex"] = test_data_final["Sex"].map({"female":0, "male": 1}).astype(int)

In [37]:
test_data_final["Embarked"] = test_data_final["Embarked"].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

In [38]:
train_data["Fare"].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

Grouping

In [39]:
train_data.loc[train_data["Fare"] <= 7.9104, "Fare"] = 0

In [40]:
train_data.loc[(train_data["Fare"] > 7.9104) & (train_data["Fare"] <= 14.4542), "Fare"] = 1

In [41]:
train_data.loc[(train_data["Fare"] > 14.4542) & (train_data["Fare"] <= 31), "Fare"] = 2

In [42]:
train_data.loc[(train_data["Fare"] > 31), "Fare"] = 3

In [43]:
train_data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Fare,Embarked,Alone,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,1,22.0,0.0,0,0,0
2,1,1,0,38.0,3.0,1,0,2
3,1,3,0,26.0,1.0,0,1,1
4,1,1,0,35.0,3.0,0,0,2
5,0,3,1,35.0,1.0,0,1,0


In [44]:
train_data["Fare"].value_counts()

1.0    224
0.0    223
2.0    222
3.0    222
Name: Fare, dtype: int64

In [45]:
test_data_final["Fare"].describe()

count    418.000000
mean      35.576535
std       55.850103
min        0.000000
25%        7.895800
50%       14.454200
75%       31.471875
max      512.329200
Name: Fare, dtype: float64

In [46]:
test_data_final.loc[test_data_final["Fare"] <= 7.9104, "Fare"] = 0
test_data_final.loc[(test_data_final["Fare"] > 7.9104) & (test_data_final["Fare"] <= 14.4542), "Fare"] = 1
test_data_final.loc[(test_data_final["Fare"] > 14.4542) & (test_data_final["Fare"] <= 31), "Fare"] = 2
test_data_final.loc[(test_data_final["Fare"] > 31), "Fare"] = 3

In [47]:
test_data_final.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Fare,Embarked,Alone,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,3,1,34.5,0.0,2,1,0
893,3,0,47.0,0.0,0,0,2
894,2,1,62.0,1.0,2,1,0
895,3,1,27.0,1.0,0,1,0
896,3,0,22.0,1.0,0,0,2


In [48]:
test_data_final["Fare"].value_counts()

0.0    114
3.0    108
2.0     99
1.0     97
Name: Fare, dtype: int64

In [49]:
train_data["Age"].describe()

count    891.000000
mean      31.149461
std       13.324685
min        0.420000
25%       22.000000
50%       32.000000
75%       37.000000
max       80.000000
Name: Age, dtype: float64

In [50]:
train_data.loc[train_data["Age"] <= 22, "Age"] = 0
train_data.loc[(train_data["Age"] > 22) & (train_data["Age"] <= 27), "Age"] = 1
train_data.loc[(train_data["Age"] > 27) & (train_data["Age"] <= 35), "Age"] = 2
train_data.loc[train_data["Age"] > 35, "Age"] = 3

In [51]:
train_data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Fare,Embarked,Alone,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,1,0.0,0.0,0,0,0
2,1,1,0,3.0,3.0,1,0,2
3,1,3,0,1.0,1.0,0,1,1
4,1,1,0,2.0,3.0,0,0,2
5,0,3,1,2.0,1.0,0,1,0


In [52]:
test_data_final["Age"].describe()

count    418.000000
mean      29.805024
std       12.667969
min        0.170000
25%       23.000000
50%       28.000000
75%       35.750000
max       76.000000
Name: Age, dtype: float64

In [53]:
test_data_final.loc[test_data_final["Age"] <= 22, "Age"] = 0
test_data_final.loc[(test_data_final["Age"] > 22) & (test_data_final["Age"] <= 27), "Age"] = 1
test_data_final.loc[(test_data_final["Age"] > 27) & (test_data_final["Age"] <= 35), "Age"] = 2
test_data_final.loc[test_data_final["Age"] > 35, "Age"] = 3

In [54]:
test_data_final.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Fare,Embarked,Alone,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,3,1,2.0,0.0,2,1,0
893,3,0,3.0,0.0,0,0,2
894,2,1,3.0,1.0,2,1,0
895,3,1,1.0,1.0,0,1,0
896,3,0,0.0,1.0,0,0,2


In [55]:
y = train_data["Survived"]
y.head()

PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64

In [56]:
X = train_data.drop(["Survived"], axis=1)

In [57]:
X.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Fare,Embarked,Alone,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3,1,0.0,0.0,0,0,0
2,1,0,3.0,3.0,1,0,2
3,3,0,1.0,1.0,0,1,1
4,1,0,2.0,3.0,0,0,2
5,3,1,2.0,1.0,0,1,0


In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=10)

In [71]:
# parameters for GridSearchCV
param_grid = {"max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
              "max_leaf_nodes": [8, 9, 10, 11, 12, 13, 14, 15, 16]}

In [75]:
# look for the best parameters for RandomForestClassifier
rnd_clf = RandomForestClassifier()
grid_search = GridSearchCV(rnd_clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 'max_leaf_nodes': [8, 9, 10, 11, 12, 13, 14, 15, 16]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [76]:
grid_search.best_params_

{'max_depth': 14, 'max_leaf_nodes': 16}

In [77]:
# rnd_clf_tuned = grid_search.best_estimator_
rnd_clf_tuned = RandomForestClassifier(n_estimators=1000, bootstrap=True, max_depth=14, max_leaf_nodes=16)
rnd_clf_tuned.fit(X_train, y_train)
y_pred_rf = rnd_clf_tuned.predict(X_test)
accuracy_score(y_test, y_pred_rf)

0.84916201117318435

In [78]:
# trying voting classifier
votting_clf = VotingClassifier(estimators=[('rf', rnd_clf_tuned)], voting="soft")

In [79]:
votting_clf.fit(X_train, y_train)
y_pred_votting = votting_clf.predict(X_test)

In [80]:
accuracy_score(y_test, y_pred_votting)

0.84916201117318435

In [66]:
# voting does not perform better than rf, using rf
rnd_clf_tuned.fit(X, y)
y_pred_final = rnd_clf_tuned.predict(test_data_final)

In [67]:
# create submission file 
submission = pd.DataFrame()
submission["PassengerId"] = test_data_final.index
submission["Survived"] = y_pred_final
submission["Survived"].head()

0    0
1    0
2    0
3    0
4    1
Name: Survived, dtype: int64

In [68]:
submission.to_csv("submission.csv", index=False)

the submission file got score of .80382
I did not specify random_state at any step, so the score might vary 