In [1]:
import pandas as pd

### Load Dataset

In [2]:
train = pd.read_csv("data/train.csv", index_col = "PassengerId")
print(train.shape)
train.head()

(891, 11)


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
test = pd.read_csv("data/test.csv", index_col ="PassengerId")

print(test.shape)
test.head()

(418, 10)


Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Preprocessing

### Encording Sex

In [4]:
train["Sex_encord"] = train["Sex"].replace("male", 0 ).replace("female", 1)

print(train.shape)
train[["Sex", "Sex_encord"]].head()

(891, 12)


Unnamed: 0_level_0,Sex,Sex_encord
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,male,0
2,female,1
3,female,1
4,female,1
5,male,0


In [5]:
test["Sex_encord"] = test["Sex"].replace("male", 0 ).replace("female", 1)

print(test.shape)
test[["Sex", "Sex_encord"]].head()

(418, 11)


Unnamed: 0_level_0,Sex,Sex_encord
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
892,male,0
893,female,1
894,male,0
895,male,0
896,female,1


### Fill in missing Fare

In [6]:
test["Fare"] = test["Fare"].fillna(0)

test[test["Fare"].isnull()]

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encord
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


### Encord Embarked

In [7]:
# C == 0, S == 1, Q == 2
# 2 * S = Q?
# S + S = Q?

# One Hot Encording
# True == 1, False == 0
# C == [True, False, False]
# S == [False, True, False]
# Q == [False, False, True]

train["Embarked_C"] = train["Embarked"] == "C"
train["Embarked_S"] = train["Embarked"] == "S"
train["Embarked_Q"] = train["Embarked"] == "Q"
print(train.shape)
train[["Embarked", "Embarked_C", "Embarked_S", "Embarked_Q"]].head()

(891, 15)


Unnamed: 0_level_0,Embarked,Embarked_C,Embarked_S,Embarked_Q
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,S,False,True,False
2,C,True,False,False
3,S,False,True,False
4,S,False,True,False
5,S,False,True,False


In [8]:
test["Embarked_C"] = test["Embarked"] == "C"
test["Embarked_S"] = test["Embarked"] == "S"
test["Embarked_Q"] = test["Embarked"] == "Q"
print(test.shape)
test[["Embarked", "Embarked_C", "Embarked_S", "Embarked_Q"]].head()

(418, 14)


Unnamed: 0_level_0,Embarked,Embarked_C,Embarked_S,Embarked_Q
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
892,Q,False,False,True
893,S,False,True,False
894,Q,False,False,True
895,S,False,True,False
896,S,False,True,False


### Making age section

In [9]:
train.loc[train["Age"] < 15, "AgeType"] = "Young"
train.loc[(train["Age"] >= 15) & (train["Age"] < 30), "AgeType"] = "Medium"
train.loc[train["Age"] >= 30, "AgeType"] = "Old"

train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encord,Embarked_C,Embarked_S,Embarked_Q,AgeType
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,False,True,False,Medium
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,True,False,False,Old
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,False,True,False,Medium
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,False,True,False,Old
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,False,True,False,Old


In [10]:
test

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encord,Embarked_C,Embarked_S,Embarked_Q
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,False,False,True
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,1,False,True,False
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,False,False,True
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,False,True,False
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1,False,True,False
897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.2250,,S,0,False,True,False
898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q,1,False,False,True
899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0000,,S,0,False,True,False
900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C,1,True,False,False
901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.1500,,S,0,False,True,False


In [11]:
test[test["Age"].isnull()]

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encord,Embarked_C,Embarked_S,Embarked_Q
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
902,3,"Ilieff, Mr. Ylio",male,,0,0,349220,7.8958,,S,0,False,True,False
914,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,,0,0,PC 17598,31.6833,,S,1,False,True,False
921,3,"Samaan, Mr. Elias",male,,2,0,2662,21.6792,,C,0,True,False,False
925,3,"Johnston, Mrs. Andrew G (Elizabeth Lily"" Watson)""",female,,1,2,W./C. 6607,23.4500,,S,1,False,True,False
928,3,"Roth, Miss. Sarah A",female,,0,0,342712,8.0500,,S,1,False,True,False
931,3,"Hee, Mr. Ling",male,,0,0,1601,56.4958,,S,0,False,True,False
933,1,"Franklin, Mr. Thomas Parham",male,,0,0,113778,26.5500,D34,S,0,False,True,False
939,3,"Shaughnessy, Mr. Patrick",male,,0,0,370374,7.7500,,Q,0,False,False,True
946,2,"Mangiavacchi, Mr. Serafino Emilio",male,,0,0,SC/A.3 2861,15.5792,,C,0,True,False,False
950,3,"Davison, Mr. Thomas Henry",male,,1,0,386525,16.1000,,S,0,False,True,False


In [12]:
train["Age(fill)"] = train["Age"]
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encord,Embarked_C,Embarked_S,Embarked_Q,AgeType,Age(fill)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,False,True,False,Medium,22.0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,True,False,False,Old,38.0
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,False,True,False,Medium,26.0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,False,True,False,Old,35.0
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,False,True,False,Old,35.0


In [13]:
train.loc[(train["Age(fill)"].isnull()) & (train["Pclass"]== 1), "Age(fill)"] = 38.233441
train.loc[(train["Age(fill)"].isnull()) & (train["Pclass"]== 2), "Age(fill)"] = 29.877630
train.loc[(train["Age(fill)"].isnull()) & (train["Pclass"]== 3), "Age(fill)"] = 25.140620
train.head(10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encord,Embarked_C,Embarked_S,Embarked_Q,AgeType,Age(fill)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,False,True,False,Medium,22.0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,True,False,False,Old,38.0
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,False,True,False,Medium,26.0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,False,True,False,Old,35.0
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,False,True,False,Old,35.0
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,0,False,False,True,,25.14062
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,0,False,True,False,Old,54.0
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,0,False,True,False,Young,2.0
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,1,False,True,False,Medium,27.0
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,1,True,False,False,Young,14.0


### Making test data age section

In [14]:
test["Age(fill)"] = test["Age"]
test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encord,Embarked_C,Embarked_S,Embarked_Q,Age(fill)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,False,False,True,34.5
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1,False,True,False,47.0
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,False,False,True,62.0
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,False,True,False,27.0
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1,False,True,False,22.0


In [15]:
test

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encord,Embarked_C,Embarked_S,Embarked_Q,Age(fill)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,False,False,True,34.5
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,1,False,True,False,47.0
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,False,False,True,62.0
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,False,True,False,27.0
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1,False,True,False,22.0
897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.2250,,S,0,False,True,False,14.0
898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q,1,False,False,True,30.0
899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0000,,S,0,False,True,False,26.0
900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C,1,True,False,False,18.0
901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.1500,,S,0,False,True,False,21.0


In [16]:
test.loc[(train["Age(fill)"].isnull()) & (test["Pclass"]== 1), "Age(fill)"] = 38.233441
test.loc[(train["Age(fill)"].isnull()) & (test["Pclass"]== 2), "Age(fill)"] = 29.877630
test.loc[(train["Age(fill)"].isnull()) & (test["Pclass"]== 3), "Age(fill)"] = 25.140620
test.head(10)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encord,Embarked_C,Embarked_S,Embarked_Q,Age(fill)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,False,False,True,34.5
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1,False,True,False,47.0
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,False,False,True,62.0
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,False,True,False,27.0
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1,False,True,False,22.0
897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S,0,False,True,False,14.0
898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q,1,False,False,True,30.0
899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S,0,False,True,False,26.0
900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C,1,True,False,False,18.0
901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S,0,False,True,False,21.0


In [17]:
test.loc[test["Age"] < 15, "AgeType"] = "Young"
test.loc[(test["Age"] >= 15) & (test["Age"] < 30), "AgeType"] = "Medium"
test.loc[test["Age"] >= 30, "AgeType"] = "Old"

In [18]:
test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encord,Embarked_C,Embarked_S,Embarked_Q,Age(fill),AgeType
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,False,False,True,34.5,Old
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1,False,True,False,47.0,Old
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,False,False,True,62.0,Old
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,False,True,False,27.0,Medium
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1,False,True,False,22.0,Medium


### Encord train data age section

In [19]:
train["Age_Old"] = train["AgeType"] == "Old"
train["Age_Medium"] = train["AgeType"] == "Medium"
train["Age_Young"] = train["AgeType"] == "Young"
print(train.shape)
train[["AgeType", "Age_Old", "Age_Medium", "Age_Young"]].head()

(891, 20)


Unnamed: 0_level_0,AgeType,Age_Old,Age_Medium,Age_Young
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Medium,False,True,False
2,Old,True,False,False
3,Medium,False,True,False
4,Old,True,False,False
5,Old,True,False,False


### Encord test data age section

In [20]:
test["Age_Old"] = test["AgeType"] == "Old"
test["Age_Medium"] = test["AgeType"] == "Medium"
test["Age_Young"] = test["AgeType"] == "Young"
print(test.shape)
test[["AgeType", "Age_Old", "Age_Medium", "Age_Young"]].head()

(418, 19)


Unnamed: 0_level_0,AgeType,Age_Old,Age_Medium,Age_Young
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
892,Old,True,False,False
893,Old,True,False,False
894,Old,True,False,False
895,Medium,False,True,False
896,Medium,False,True,False


### Making Family Size

In [21]:
train["Single"]=(train["SibSp"]== 0) & (train["Parch"]== 0)
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,Sex_encord,Embarked_C,Embarked_S,Embarked_Q,AgeType,Age(fill),Age_Old,Age_Medium,Age_Young,Single
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,...,0,False,True,False,Medium,22.0,False,True,False,False
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,...,1,True,False,False,Old,38.0,True,False,False,False
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,...,1,False,True,False,Medium,26.0,False,True,False,True
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,...,1,False,True,False,Old,35.0,True,False,False,False
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,...,0,False,True,False,Old,35.0,True,False,False,True


In [22]:
train["FamilySize"]=(train["SibSp"]) + (train["Parch"]) + 1
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,Embarked_C,Embarked_S,Embarked_Q,AgeType,Age(fill),Age_Old,Age_Medium,Age_Young,Single,FamilySize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,...,False,True,False,Medium,22.0,False,True,False,False,2
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,...,True,False,False,Old,38.0,True,False,False,False,2
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,...,False,True,False,Medium,26.0,False,True,False,True,1
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,...,False,True,False,Old,35.0,True,False,False,False,2
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,...,False,True,False,Old,35.0,True,False,False,True,1


In [23]:
train.loc[train["FamilySize"] < 2, "FamilyType"] = "Single"
train.loc[(train["FamilySize"] >= 2) & (train["FamilySize"] < 5), "FamilyType"] = "Nuclear"
train.loc[train["FamilySize"] >= 5, "FamilyType"] = "Big"

In [24]:
train[["FamilySize", "FamilyType"]].head(10)

Unnamed: 0_level_0,FamilySize,FamilyType
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2,Nuclear
2,2,Nuclear
3,1,Single
4,2,Nuclear
5,1,Single
6,1,Single
7,1,Single
8,5,Big
9,3,Nuclear
10,2,Nuclear


### Making Family Size for test data

In [25]:
test["Single"]=(test["SibSp"]== 0) & (test["Parch"]== 0)
test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encord,Embarked_C,Embarked_S,Embarked_Q,Age(fill),AgeType,Age_Old,Age_Medium,Age_Young,Single
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,False,False,True,34.5,Old,True,False,False,True
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1,False,True,False,47.0,Old,True,False,False,False
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,False,False,True,62.0,Old,True,False,False,True
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,False,True,False,27.0,Medium,False,True,False,True
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1,False,True,False,22.0,Medium,False,True,False,False


In [26]:
test["FamilySize"]=(test["SibSp"]) + (test["Parch"]) + 1
test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,...,Embarked_C,Embarked_S,Embarked_Q,Age(fill),AgeType,Age_Old,Age_Medium,Age_Young,Single,FamilySize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,...,False,False,True,34.5,Old,True,False,False,True,1
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,...,False,True,False,47.0,Old,True,False,False,False,2
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,...,False,False,True,62.0,Old,True,False,False,True,1
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,...,False,True,False,27.0,Medium,False,True,False,True,1
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,...,False,True,False,22.0,Medium,False,True,False,False,3


In [27]:
test.loc[test["FamilySize"] < 2, "FamilyType"] = "Single"
test.loc[(test["FamilySize"] >= 2) & (test["FamilySize"] < 5), "FamilyType"] = "Nuclear"
test.loc[test["FamilySize"] >= 5, "FamilyType"] = "Big"

In [28]:
test[["FamilySize", "FamilyType"]].head(10)

Unnamed: 0_level_0,FamilySize,FamilyType
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
892,1,Single
893,2,Nuclear
894,1,Single
895,1,Single
896,3,Nuclear
897,1,Single
898,1,Single
899,3,Nuclear
900,1,Single
901,3,Nuclear


### Encording Family Size

In [29]:
train["Family_Single"] = train["FamilyType"] == "Single"
train["Family_Nuclear"] = train["FamilyType"] == "Nuclear"
train["Family_Big"] = train["FamilyType"] == "Big"
print(train.shape)
train[["FamilyType", "Family_Single", "Family_Nuclear", "Family_Big"]].head()

(891, 26)


Unnamed: 0_level_0,FamilyType,Family_Single,Family_Nuclear,Family_Big
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Nuclear,False,True,False
2,Nuclear,False,True,False
3,Single,True,False,False
4,Nuclear,False,True,False
5,Single,True,False,False


### Encording Family Size for test data

In [30]:
test["Family_Single"] = test["FamilyType"] == "Single"
test["Family_Nuclear"] = test["FamilyType"] == "Nuclear"
test["Family_Big"] = test["FamilyType"] == "Big"
print(test.shape)
test[["FamilyType", "Family_Single", "Family_Nuclear", "Family_Big"]].head()

(418, 25)


Unnamed: 0_level_0,FamilyType,Family_Single,Family_Nuclear,Family_Big
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
892,Single,True,False,False
893,Nuclear,False,True,False
894,Single,True,False,False
895,Single,True,False,False
896,Nuclear,False,True,False


## Train

In [31]:
feature_names = ["Pclass", "Sex_encord", "Fare", "Embarked_C", "Embarked_S", "Embarked_Q","Family_Single", "Family_Nuclear", "Family_Big", "Age_Old", "Age_Medium", "Age_Young"]


feature_names 

['Pclass',
 'Sex_encord',
 'Fare',
 'Embarked_C',
 'Embarked_S',
 'Embarked_Q',
 'Family_Single',
 'Family_Nuclear',
 'Family_Big',
 'Age_Old',
 'Age_Medium',
 'Age_Young']

In [32]:
X_train = train[feature_names]
print(X_train.shape)
X_train.head()

(891, 12)


Unnamed: 0_level_0,Pclass,Sex_encord,Fare,Embarked_C,Embarked_S,Embarked_Q,Family_Single,Family_Nuclear,Family_Big,Age_Old,Age_Medium,Age_Young
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,3,0,7.25,False,True,False,False,True,False,False,True,False
2,1,1,71.2833,True,False,False,False,True,False,True,False,False
3,3,1,7.925,False,True,False,True,False,False,False,True,False
4,1,1,53.1,False,True,False,False,True,False,True,False,False
5,3,0,8.05,False,True,False,True,False,False,True,False,False


In [33]:
X_test = test[feature_names]
print(X_test.shape)
X_test.head()

(418, 12)


Unnamed: 0_level_0,Pclass,Sex_encord,Fare,Embarked_C,Embarked_S,Embarked_Q,Family_Single,Family_Nuclear,Family_Big,Age_Old,Age_Medium,Age_Young
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
892,3,0,7.8292,False,False,True,True,False,False,True,False,False
893,3,1,7.0,False,True,False,False,True,False,True,False,False
894,2,0,9.6875,False,False,True,True,False,False,True,False,False
895,3,0,8.6625,False,True,False,True,False,False,False,True,False
896,3,1,12.2875,False,True,False,False,True,False,False,True,False


In [34]:
label_name = "Survived"

y_train=train[label_name]
print(y_train.shape)
y_train.head()

(891,)


PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64

In [35]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()

model

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [36]:
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [37]:
prediction = model.predict(X_test)
print(prediction.shape)
prediction[0:5]

(418,)


array([0, 0, 0, 0, 1], dtype=int64)

In [38]:
submit = pd.read_csv("data/gender_submission.csv", index_col="PassengerId")
submit["Survived"] = prediction

print(submit.shape)
submit.head()

(418, 1)


Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1


In [39]:
submit.to_csv("decision-tree.csv")