In [1]:
# import required lirbaries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Reading the dataset
titanic = pd.read_csv('./titanic/titanic-train.csv')
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [3]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# create a deep copy for the dataset
df = titanic.copy()

In [5]:
# get rid of passengerId, Name and Ticket coulmns since they are not very useful for predicting
df.drop(['PassengerId','Name','Ticket'], axis = 1, inplace = True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [6]:
# data cleaening
# count missing values
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [7]:
# since embarked only has 2 missing values, 
# I can get rid of these two rows without haveing impact on exitsting dataset.
df.dropna(subset=['Embarked'], inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [8]:
# count missing values for Embarked
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      0
dtype: int64

In [9]:
# since Cabin has 687 out of 891 rows missing values.
# it is better to drop the whole column to make our prediction more accuracy.
df.drop('Cabin', axis=1, inplace=True)
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

In [10]:
# count missing again
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

In [11]:
# Deal with the missing values in Age.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')

df_num = df.select_dtypes(include=['int64','float64'])

df_num_fill_median = imputer.fit_transform(df_num)
imputer.statistics_

array([ 0.    ,  3.    , 28.    ,  0.    ,  0.    , 14.4542])

In [12]:
df_num_fill_median

array([[ 0.    ,  3.    , 22.    ,  1.    ,  0.    ,  7.25  ],
       [ 1.    ,  1.    , 38.    ,  1.    ,  0.    , 71.2833],
       [ 1.    ,  3.    , 26.    ,  0.    ,  0.    ,  7.925 ],
       ...,
       [ 0.    ,  3.    , 28.    ,  1.    ,  2.    , 23.45  ],
       [ 1.    ,  1.    , 26.    ,  0.    ,  0.    , 30.    ],
       [ 0.    ,  3.    , 32.    ,  0.    ,  0.    ,  7.75  ]])

In [13]:
# convert to a df
df_num_fill_median = pd.DataFrame(df_num_fill_median, columns = df_num.columns)
df_num_fill_median.isnull().sum()

Survived    0
Pclass      0
Age         0
SibSp       0
Parch       0
Fare        0
dtype: int64

In [14]:
# processing the catergorical data
df_cat = df.select_dtypes(['object'])
df_cat.head()

Unnamed: 0,Sex,Embarked
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S


In [15]:
df_cat.describe()

Unnamed: 0,Sex,Embarked
count,889,889
unique,2,3
top,male,S
freq,577,644


In [16]:
# Ordinal Encoder
# build model to encode all cat values into intergers
from sklearn.preprocessing import OrdinalEncoder

cat_encoder = OrdinalEncoder()
df_cat_encoded = cat_encoder.fit_transform(df_cat)

cat_encoder.categories_

[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]

In [17]:
df_cat_encoded

array([[1., 2.],
       [0., 0.],
       [0., 2.],
       ...,
       [0., 2.],
       [1., 0.],
       [1., 1.]])

In [18]:
# convert numpy array into dataframe.
df_cat_encoded = pd.DataFrame(df_cat_encoded, columns=df_cat.columns)
df_cat_encoded

Unnamed: 0,Sex,Embarked
0,1.0,2.0
1,0.0,0.0
2,0.0,2.0
3,0.0,2.0
4,1.0,2.0
...,...,...
884,1.0,2.0
885,0.0,2.0
886,0.0,2.0
887,1.0,0.0


In [19]:
# One-hot Encoder
from sklearn.preprocessing import OneHotEncoder
onehot_encoder = OneHotEncoder()
df_cat_onehot_encoded = onehot_encoder.fit_transform(df_cat)
onehot_encoder.categories_

[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]

In [20]:
column_names = onehot_encoder.get_feature_names()
column_names

array(['x0_female', 'x0_male', 'x1_C', 'x1_Q', 'x1_S'], dtype=object)

In [21]:
df_cat_onehot_encoded = df_cat_onehot_encoded.toarray()

In [22]:
df_cat_onehot_encoded = pd.DataFrame(df_cat_onehot_encoded, columns=column_names)
df_cat_onehot_encoded.head()

Unnamed: 0,x0_female,x0_male,x1_C,x1_Q,x1_S
0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,1.0


In [23]:
# Building the model.
# set the target
y = df_num_fill_median['Survived']
df_num_fill_median.drop(['Survived'], axis=1, inplace=True)
df_num_fill_median.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  889 non-null    float64
 1   Age     889 non-null    float64
 2   SibSp   889 non-null    float64
 3   Parch   889 non-null    float64
 4   Fare    889 non-null    float64
dtypes: float64(5)
memory usage: 34.9 KB


In [24]:
# traning dataset using ordinal encoder
titanic_train_encoded = pd.concat([df_num_fill_median, df_cat_encoded], axis=1)
titanic_train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    889 non-null    float64
 1   Age       889 non-null    float64
 2   SibSp     889 non-null    float64
 3   Parch     889 non-null    float64
 4   Fare      889 non-null    float64
 5   Sex       889 non-null    float64
 6   Embarked  889 non-null    float64
dtypes: float64(7)
memory usage: 48.7 KB


In [25]:
# traning dataset using onehot encoder
titanic_train_onehot_encoded = pd.concat([df_num_fill_median, df_cat_onehot_encoded], axis=1)
titanic_train_onehot_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Pclass     889 non-null    float64
 1   Age        889 non-null    float64
 2   SibSp      889 non-null    float64
 3   Parch      889 non-null    float64
 4   Fare       889 non-null    float64
 5   x0_female  889 non-null    float64
 6   x0_male    889 non-null    float64
 7   x1_C       889 non-null    float64
 8   x1_Q       889 non-null    float64
 9   x1_S       889 non-null    float64
dtypes: float64(10)
memory usage: 69.6 KB


In [26]:
# train the decision tree model by using ordina encoder 
# criterion=entropy
# all other settings remain the same.
from sklearn.tree import DecisionTreeClassifier
tree_clf1 = DecisionTreeClassifier(criterion='entropy')
tree_clf1.fit(titanic_train_encoded, y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [27]:
# train the decision tree model by using one-hot encoder
# criterion=entropy
# all other settings remain the same.
from sklearn.tree import DecisionTreeClassifier
tree_clf2 = DecisionTreeClassifier(criterion='entropy')
tree_clf2.fit(titanic_train_onehot_encoded, y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [28]:
# train the decision tree model by using one-hot encoder
# criterion= entropy
# max_depth=4
# all other settings remain the same.
from sklearn.tree import DecisionTreeClassifier
tree_clf3 = DecisionTreeClassifier(criterion='entropy',max_depth=4)
tree_clf3.fit(titanic_train_onehot_encoded, y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [29]:
# Making the predictions
# loading the test dataset
titanic_test = pd.read_csv('./titanic/titanic-test.csv')
titanic_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [30]:
# create a deepy copy for titanic test dataset 
df_test = titanic_test.copy()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [31]:
# save PassengerId column for submission
df_test_id = df_test['PassengerId']

In [32]:
df_test_id.head()

0    892
1    893
2    894
3    895
4    896
Name: PassengerId, dtype: int64

In [33]:
# drop the columns
df_test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [34]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       332 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      417 non-null    float64
 6   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB


In [35]:
# test dataset cleanning.
df_test.isnull().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [36]:
mean_imputer = SimpleImputer(strategy='mean')
df_test_num = df_test.select_dtypes(include=['int64','float64'])
df_test_num_fill_mean = mean_imputer.fit_transform(df_test_num)  
df_test_num_fill_mean = pd.DataFrame(df_test_num_fill_mean, columns=df_test_num.columns)  
df_test_num_fill_mean.isnull().sum() 

Pclass    0
Age       0
SibSp     0
Parch     0
Fare      0
dtype: int64

In [37]:
df_test_num_fill_mean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  418 non-null    float64
 1   Age     418 non-null    float64
 2   SibSp   418 non-null    float64
 3   Parch   418 non-null    float64
 4   Fare    418 non-null    float64
dtypes: float64(5)
memory usage: 16.5 KB


In [38]:
# encode the catergorial variables.
df_test_cat = df_test.select_dtypes(include=['object'])
df_test_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sex       418 non-null    object
 1   Embarked  418 non-null    object
dtypes: object(2)
memory usage: 6.7+ KB


In [39]:
# Ordinal Encoder
# First tree
df_test_cat_encoded = cat_encoder.fit_transform(df_test_cat)
# Convert to DataFrame
df_test_cat_encoded = pd.DataFrame(df_test_cat_encoded, columns=df_test_cat.columns)

In [40]:
df_test_cat_encoded.head()

Unnamed: 0,Sex,Embarked
0,1.0,1.0
1,0.0,2.0
2,1.0,1.0
3,1.0,2.0
4,0.0,2.0


In [41]:
# combine all columns together
titanic_test_encoded = pd.concat([df_test_num_fill_mean, df_test_cat_encoded], axis=1)
titanic_test_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    float64
 1   Age       418 non-null    float64
 2   SibSp     418 non-null    float64
 3   Parch     418 non-null    float64
 4   Fare      418 non-null    float64
 5   Sex       418 non-null    float64
 6   Embarked  418 non-null    float64
dtypes: float64(7)
memory usage: 23.0 KB


In [42]:
# make prediction using the first tree
y_hat_ordinal = tree_clf1.predict(titanic_test_encoded)
y_hat_ordinal.shape

(418,)

In [43]:
# convert all my predictions into interger.
y_hat_ordinal.astype(int)

array([0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,

In [44]:
# make the dataframe for submission by combining two columns
tree1_ordinal_submit = pd.DataFrame({
    'PassengerId': df_test_id, 
    'Survived': y_hat_ordinal.astype(int),
})

In [45]:
# save the resulting dataframe as a csv file for Kaggle submission
tree1_ordinal_submit.to_csv('./Assignment6/tree1_submit.csv', index=False) 

In [46]:
# Onehot encoder
# Second tree
df_test_cat_onehot_encoded = onehot_encoder.fit_transform(df_test_cat)

column_names = onehot_encoder.get_feature_names()

# onehot encoder returns a sparse matrix and we convert that to a numpy array
df_test_cat_onehot_encoded = df_test_cat_onehot_encoded.toarray()

# combine all columns together
df_test_cat_onehot_encoded = pd.DataFrame(df_test_cat_onehot_encoded, columns=column_names)
df_test_cat_onehot_encoded.head()


Unnamed: 0,x0_female,x0_male,x1_C,x1_Q,x1_S
0,0.0,1.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,1.0


In [47]:
titanic_test_onehot_encoded = pd.concat([df_test_num_fill_mean, df_test_cat_onehot_encoded], axis=1)
titanic_test_onehot_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Pclass     418 non-null    float64
 1   Age        418 non-null    float64
 2   SibSp      418 non-null    float64
 3   Parch      418 non-null    float64
 4   Fare       418 non-null    float64
 5   x0_female  418 non-null    float64
 6   x0_male    418 non-null    float64
 7   x1_C       418 non-null    float64
 8   x1_Q       418 non-null    float64
 9   x1_S       418 non-null    float64
dtypes: float64(10)
memory usage: 32.8 KB


In [48]:
# make prediction using the second tree
y_hat_onehot = tree_clf2.predict(titanic_test_onehot_encoded)
y_hat_onehot.shape

(418,)

In [49]:
y_hat_onehot.astype(int)

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,

In [50]:
# make the dataframe for submission by combining two columns
tree2_onehot_submit = pd.DataFrame({
    'PassengerId': df_test_id, 
    'Survived': y_hat_onehot.astype(int),
})

In [51]:
tree2_onehot_submit.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1


In [52]:
# save the resulting dataframe as a csv file for Kaggle submission
tree2_onehot_submit.to_csv('./Assignment6/tree2_submit.csv', index=False) 

In [53]:
# Third tree
y_hat_onehot2 = tree_clf3.predict(titanic_test_onehot_encoded)

# make the dataframe for submission by combining two columns
tree3_onehot_submit = pd.DataFrame({
    'PassengerId': df_test_id, 
    'Survived': y_hat_onehot2.astype(int),
})

# save the resulting dataframe as a csv file for Kaggle submission
tree3_onehot_submit.to_csv('./Assignment6/tree3_submit.csv', index=False) 