# Binarization

In [1]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score

from sklearn.compose import ColumnTransformer

In [2]:
# load dataset
df = sns.load_dataset('titanic')[['survived', 'age', 'fare', 'parch', 'sibsp']]
df.head()

Unnamed: 0,survived,age,fare,parch,sibsp
0,0,22.0,7.25,0,1
1,1,38.0,71.2833,0,1
2,1,26.0,7.925,0,0
3,1,35.0,53.1,0,1
4,0,35.0,8.05,0,0


In [3]:
# create a new feature 'family' by combining 'sibsp' and 'parch'
df['family'] = df['sibsp'] + df['parch']

In [4]:
df.head()

Unnamed: 0,survived,age,fare,parch,sibsp,family
0,0,22.0,7.25,0,1,1
1,1,38.0,71.2833,0,1,1
2,1,26.0,7.925,0,0,0
3,1,35.0,53.1,0,1,1
4,0,35.0,8.05,0,0,0


In [4]:

# drop rows with missing values
df.dropna(inplace=True)

In [5]:
# split dataset into features and target variable
X = df.drop(columns=['survived'])
y = df['survived']

In [6]:

# split data into training and testing sets
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Without binarization

In [16]:
# Initialize a Decision Tree classifier
clf = DecisionTreeClassifier()

# Train the classifier on the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the model's accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy without Binarization: {accuracy:.6f}")

Accuracy without Binarization: 0.615385


In [8]:
# cross validation score
np.mean(cross_val_score(DecisionTreeClassifier(),X,y,cv=10,scoring='accuracy'))

np.float64(0.6415101721439751)

# Applying Binarization

In [7]:
from sklearn.preprocessing import Binarizer

In [17]:
# Applying Binarization with column transformer
trf = ColumnTransformer([
    ('bin',Binarizer(copy=False),['family'])
],remainder='passthrough')

In [18]:

# Transform the training and testing data
X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)

In [None]:
# Convert the transformed training data into a DataFrame with meaningful column names
pd.DataFrame(X_train_trf,columns=['family', 'age_bin1', 'age_bin2', 'fare_bin1', 'fare_bin2'])

Unnamed: 0,family,age_bin1,age_bin2,fare_bin1,fare_bin2
0,1.0,31.0,20.5250,1.0,1.0
1,1.0,26.0,14.4542,0.0,1.0
2,1.0,30.0,16.1000,0.0,1.0
3,0.0,33.0,7.7750,0.0,0.0
4,0.0,25.0,13.0000,0.0,0.0
...,...,...,...,...,...
566,1.0,46.0,61.1750,0.0,1.0
567,0.0,25.0,13.0000,0.0,0.0
568,0.0,41.0,134.5000,0.0,0.0
569,1.0,33.0,20.5250,1.0,1.0


In [21]:
# Initialize a Decision Tree classifier
clf = DecisionTreeClassifier()

# Train the classifier on the transformed training data
clf.fit(X_train_trf, y_train)

# Make predictions on the transformed test data
y_pred2 = clf.predict(X_test_trf)

# Evaluate the accuracy of the model on the test set
accuracy_score(y_test, y_pred2)

0.6363636363636364

In [22]:
# Transform the features using the fitted transformer
X_trf = trf.fit_transform(X)

# Evaluate a Decision Tree classifier with 10-fold cross-validation and compute the mean accuracy
np.mean(cross_val_score(DecisionTreeClassifier(), X_trf, y, cv=10, scoring='accuracy'))

np.float64(0.640160406885759)