In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer

In [2]:
df = pd.read_csv("../train.csv", usecols=[1,5,6,7,9])
df

Unnamed: 0,Survived,Age,SibSp,Parch,Fare
0,0,22.0,1,0,7.2500
1,1,38.0,1,0,71.2833
2,1,26.0,0,0,7.9250
3,1,35.0,1,0,53.1000
4,0,35.0,0,0,8.0500
...,...,...,...,...,...
886,0,27.0,0,0,13.0000
887,1,19.0,0,0,30.0000
888,0,,1,2,23.4500
889,1,26.0,0,0,30.0000


In [3]:
df.isnull().sum()

Survived      0
Age         177
SibSp         0
Parch         0
Fare          0
dtype: int64

In [4]:
df.dropna(inplace=True)

In [5]:
df['Family'] = df['SibSp'] + df['Parch']
df

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Family
0,0,22.0,1,0,7.2500,1
1,1,38.0,1,0,71.2833,1
2,1,26.0,0,0,7.9250,0
3,1,35.0,1,0,53.1000,1
4,0,35.0,0,0,8.0500,0
...,...,...,...,...,...,...
885,0,39.0,0,5,29.1250,5
886,0,27.0,0,0,13.0000,0
887,1,19.0,0,0,30.0000,0
889,1,26.0,0,0,30.0000,0


In [6]:
df.drop(columns=['SibSp','Parch'],inplace=True)
df

Unnamed: 0,Survived,Age,Fare,Family
0,0,22.0,7.2500,1
1,1,38.0,71.2833,1
2,1,26.0,7.9250,0
3,1,35.0,53.1000,1
4,0,35.0,8.0500,0
...,...,...,...,...
885,0,39.0,29.1250,5
886,0,27.0,13.0000,0
887,1,19.0,30.0000,0
889,1,26.0,30.0000,0


In [7]:
X = df.iloc[:,1:]
y = df.iloc[:,0]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [9]:
y_train

328    1
73     0
253    0
719    0
666    0
      ..
92     0
134    0
337    1
548    0
130    0
Name: Survived, Length: 571, dtype: int64

# Without Binarization

In [10]:
dtc = DecisionTreeClassifier()

dtc.fit(X_train,y_train)

y_pred = dtc.predict(X_test)

print("Decision Tree Accuracy Score",accuracy_score(y_test,y_pred))

Decision Tree Accuracy Score 0.6153846153846154


#### cross val score

In [12]:
dtc = DecisionTreeClassifier()

print("Decision Tree Cross Val Score",np.mean(cross_val_score(dtc,X,y,scoring='accuracy',cv=10)))

Decision Tree Accuracy Score 0.6555946791862285


# After Binarization

In [14]:
from sklearn.preprocessing import Binarizer

trf = ColumnTransformer(transformers=[
    ('bin',Binarizer(copy=False),['Family'])
],remainder='passthrough')


In [15]:
X_train_transformed =  trf.fit_transform(X_train)
X_test_transformed =  trf.transform(X_test)

In [16]:
dtc = DecisionTreeClassifier()

dtc.fit(X_train_transformed,y_train)

y_pred_trans = dtc.predict(X_test_transformed)
print("Decision Tree Accuracy Score After Transformed",accuracy_score(y_test,y_pred_trans))

Decision Tree Accuracy Score After Transformed 0.6293706293706294


#### cross val Score

In [18]:
X_transformed = trf.fit_transform(X)
dtc = DecisionTreeClassifier()

print("Decision Tree Cross Val Score After Transformed",np.mean(cross_val_score(dtc,X,y,scoring='accuracy')))

Decision Tree Cross Val Score After Transformed 0.6554515906628582
