In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.tree import DecisionTreeClassifier

In [42]:
data = pd.read_csv("data/titanic/train.csv", usecols=['Age',"Fare",'Survived'])
data.sample(5)

Unnamed: 0,Survived,Age,Fare
7,0,2.0,21.075
682,0,20.0,9.225
625,0,61.0,32.3208
510,1,29.0,7.75
832,0,,7.2292


In [43]:
data.isna().sum()

Survived      0
Age         177
Fare          0
dtype: int64

In [44]:
data["Age"].fillna(data["Age"].mean(),inplace=True)

In [45]:
data.isna().sum()

Survived    0
Age         0
Fare        0
dtype: int64

In [46]:
X = data.drop("Survived", axis=1)
y = data["Survived"]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [47]:
clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)
y_preds = clf.predict(X_test)
print("Accuracy Score ",accuracy_score(y_test,y_preds)*100)

Accuracy Score  62.01117318435754


In [48]:
cv_score_clf = cross_val_score(clf,X,y,cv=10,scoring='accuracy')
print("Accuracy Score ",cv_score_clf.mean()*100)

Accuracy Score  65.99500624219725


# Discretization / Binning KBinsDiscretizer(strategy = ?)

* uniform - equal bins size
* quantile - percentile size
* kmeans - clustering centroid

In [49]:

kbin_age = KBinsDiscretizer(n_bins=10,encode='ordinal',strategy='uniform')
kbin_fare = KBinsDiscretizer(n_bins=10,encode='ordinal',strategy='uniform')
tfr = ColumnTransformer([
     ('age_bins',kbin_age,[0]),
     ('fare_bins',kbin_fare,[1])
],remainder='passthrough')

In [50]:
X_train_tfr =  tfr.fit_transform(X_train)
X_test_tfr = tfr.fit_transform(X_test)
clf.fit(X_train_tfr,y_train)
y_preds_tfr = clf.predict(X_test_tfr)
print("Accuracy Score ",accuracy_score(y_test,y_preds_tfr)*100)

Accuracy Score  65.36312849162012


In [51]:
X_tfr =  tfr.fit_transform(X)
cv_score_clf_tfr = cross_val_score(clf,X_tfr,y,cv=10,scoring='accuracy')
print("Accuracy Score ",cv_score_clf_tfr.mean()*100)

Accuracy Score  69.59675405742821


In [52]:
def discretizer(bins,strategy):
    kbin_age = KBinsDiscretizer(n_bins=bins,encode='ordinal',strategy=strategy)
    kbin_fare = KBinsDiscretizer(n_bins=bins,encode='ordinal',strategy=strategy)
    tfr = ColumnTransformer([
        ('age_bins',kbin_age,[0]),
        ('fare_bins',kbin_fare,[1])],remainder='passthrough')
    X_tfr =  tfr.fit_transform(X)
    cv_score_clf_tfr = cross_val_score(clf,X_tfr,y,cv=10,scoring='accuracy')
    print("Strategy :::: ",strategy)
    print("Accuracy Score :::: ",cv_score_clf_tfr.mean()*100)

In [53]:
discretizer(10,'uniform')
discretizer(10,'quantile')
discretizer(10,'kmeans')

Strategy ::::  uniform
Accuracy Score ::::  69.59675405742821
Strategy ::::  quantile
Accuracy Score ::::  68.25468164794009




Strategy ::::  kmeans
Accuracy Score ::::  69.5980024968789


# Binarizer()

In [54]:
from sklearn.preprocessing import Binarizer

In [56]:
data = pd.read_csv('data/titanic/train.csv', usecols=['Age','Fare','SibSp','Parch','Survived'])
data.sample()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare
545,0,64.0,0,0,26.0


In [58]:
data['Family']=data['SibSp']+data['Parch']
data.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Family
0,0,22.0,1,0,7.25,1
1,1,38.0,1,0,71.2833,1
2,1,26.0,0,0,7.925,0
3,1,35.0,1,0,53.1,1
4,0,35.0,0,0,8.05,0


In [59]:
data = data.drop(['SibSp','Parch'],axis=1)
data.head()

Unnamed: 0,Survived,Age,Fare,Family
0,0,22.0,7.25,1
1,1,38.0,71.2833,1
2,1,26.0,7.925,0
3,1,35.0,53.1,1
4,0,35.0,8.05,0


In [60]:
data.isna().sum()

Survived      0
Age         177
Fare          0
Family        0
dtype: int64

In [62]:
data['Age'].fillna(data['Age'].mean(),inplace=True)
data.isna().sum()

Survived    0
Age         0
Fare        0
Family      0
dtype: int64

In [63]:
X = data.drop("Survived",axis=1)
y = data["Survived"]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [64]:
clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)
y_preds = clf.predict(X_test)
print("Accuracy Score ",accuracy_score(y_test,y_preds)*100)

Accuracy Score  67.0391061452514


In [65]:
cv_score_clf = cross_val_score(clf,X,y,cv=10,scoring='accuracy')
print("Accuracy Score ",cv_score_clf.mean()*100)

Accuracy Score  64.87016229712859


In [68]:
tfr = ColumnTransformer([
    ('family_bin',Binarizer(copy=False),[2])
],remainder='passthrough')

In [69]:
X_train_tfr = tfr.fit_transform(X_train)
X_test_tfr =tfr.fit_transform(X_test)

In [70]:
clf.fit(X_train_tfr,y_train)
y_preds_tfr = clf.predict(X_test_tfr)
print("Accuracy Score ",accuracy_score(y_test,y_preds_tfr)*100)

Accuracy Score  66.4804469273743


In [71]:
X_tfr =  tfr.fit_transform(X)
cv_score_clf_tfr = cross_val_score(clf,X_tfr,y,cv=10,scoring='accuracy')
print("Accuracy Score ",cv_score_clf_tfr.mean()*100)

Accuracy Score  65.09612983770286
