In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The dataset path read later in the notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Section 1: Kaggle Data Reading and Preprocessing

In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from IPython.display import display

# Location of the cardio training CSV on the mounted Drive.
cardio_train_data_path = "/content/drive/MyDrive/Machine Learning Dr. Inas/cardio_train.csv"

# Load the semicolon-delimited file, discard the "id" column (irrelevant feature),
# and keep only rows whose blood-pressure readings fall strictly inside the
# accepted ranges (outlier removal). The original row index is preserved.
df = (
    pd.read_csv(cardio_train_data_path, sep=";")
    .drop(columns=["id"])
    .loc[lambda frame: (frame["ap_hi"] > 80) & (frame["ap_hi"] < 200)]
    .loc[lambda frame: (frame["ap_lo"] > 40) & (frame["ap_lo"] < 160)]
)

# Discretise the continuous columns into integer bin codes (labels=False).
# age is divided by 365.25 first -- presumably converting days to years.
# height and weight use five equal-width bins computed on the filtered rows;
# the remaining columns use hand-picked bin edges.
df = df.assign(
    age=lambda frame: pd.cut(
        frame["age"] / 365.25, bins=[0, 5, 15, 25, 40, 60, 80, 100], labels=False
    ),
    height=lambda frame: pd.cut(frame["height"], bins=5, labels=False),
    weight=lambda frame: pd.cut(frame["weight"], bins=5, labels=False),
    ap_hi=lambda frame: pd.cut(
        frame["ap_hi"], bins=[40, 80, 120, 130, 140, 180, 300], labels=False
    ),
    ap_lo=lambda frame: pd.cut(
        frame["ap_lo"], bins=[40, 60, 80, 90, 120, 180, 200], labels=False
    ),
)

# Features are every column except the target "cardio".
X = df.drop("cardio", axis=1)
y = df.loc[:, "cardio"]

# 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Show the processed frame, then its summary statistics as the cell output.
display(df)
df.describe()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,4,2,2,1,1,1,1,1,0,0,1,0
1,4,1,2,1,3,2,3,1,0,0,1,1
2,4,1,2,1,2,1,3,1,0,0,0,1
3,4,2,2,1,4,3,1,1,0,0,1,1
4,4,1,2,1,1,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,4,2,2,1,1,1,1,1,1,0,1,0
69996,5,1,2,3,3,2,2,2,0,0,1,1
69997,4,2,3,2,4,2,3,1,0,1,0,1
69998,5,1,2,1,3,1,1,2,0,0,0,1


Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,68478.0,68478.0,68478.0,68478.0,68478.0,68478.0,68478.0,68478.0,68478.0,68478.0,68478.0,68478.0
mean,4.154575,1.348535,2.140556,1.164695,1.81597,1.321782,1.364,1.225255,0.08778,0.053316,0.803528,0.494407
std,0.430432,0.47651,0.354059,0.416004,1.10966,0.667589,0.678572,0.571283,0.282977,0.224665,0.397332,0.499972
min,3.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,4.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,4.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,4.0,2.0,2.0,1.0,3.0,2.0,1.0,1.0,0.0,0.0,1.0,1.0
max,5.0,2.0,4.0,4.0,5.0,4.0,3.0,3.0,1.0,1.0,1.0,1.0


# Section 2: Results of The Decision Tree from Scratch Implementation

In [42]:
import numpy as np
from collections import Counter
from sklearn.metrics import accuracy_score
import time

class ID3_tree:
  """Multiway ID3 decision tree for discrete-valued features.

  The fitted tree is stored in ``self.tree`` as nested dictionaries of the form
  ``{feature: {value: subtree_or_label}}``; a leaf is a bare class label.
  """

  def __init__(self, max_depth=0):
    """Create an unfitted tree.

    Args:
      max_depth: maximum number of splits on any root-to-leaf path. With the
        default of 0 the fitted tree is a single leaf holding the majority class.
    """
    self.max_depth = max_depth
    self.tree = None
    # Majority class of the labels given to fit(); predict_sample() falls back
    # to it when a sample carries a feature value never seen in training.
    self.default_class = None

  @staticmethod
  def _majority_class(labels):
    """Return the most frequent label (first encountered wins on ties)."""
    return Counter(labels).most_common(1)[0][0]

  def calc_entropy(self, labels):
    """Return the Shannon entropy (base 2) of the label distribution."""
    values, counts = np.unique(labels, return_counts=True)
    probabilities = counts / np.sum(counts)
    # np.unique only reports labels that occur, so no probability is zero here.
    entropy = -np.sum(probabilities * np.log2(probabilities))
    return entropy

  def calc_information_gain(self, X, y, feature):
    """Return the information gain obtained by splitting (X, y) on `feature`.

    Args:
      X: DataFrame of features.
      y: labels aligned with the rows of X.
      feature: name of the column of X to split on.
    """
    entropy_s = self.calc_entropy(y)
    column = X[feature]

    # Weighted entropy that remains after partitioning by each distinct value.
    rem = 0
    for value in np.unique(column):
      subset_y = y[column == value]
      rem += (len(subset_y) / len(y)) * self.calc_entropy(subset_y)

    return entropy_s - rem

  def find_best_feature(self, X, y, features):
    """Return the feature in `features` with the highest information gain.

    Ties are resolved in favour of the later feature (comparison is `>=`).
    Returns None when `features` is empty.
    """
    best_ig = -np.inf
    best_feature = None
    for feature in features:
      ig = self.calc_information_gain(X, y, feature)
      if ig >= best_ig:
        best_ig = ig
        best_feature = feature
    return best_feature

  def build_tree(self, X, y, features, depth=0):
    """Recursively build the tree for the rows in (X, y).

    Args:
      X: DataFrame of features for the current node.
      y: pandas Series of labels aligned with X.
      features: list of feature names still available for splitting.
      depth: depth of the current node (the root is 0).

    Returns:
      A class label for a leaf, otherwise ``{feature: {value: subtree}}``.
    """
    # Pure node: every row carries the same label.
    if len(set(y)) == 1:
      return y.iloc[0]
    # Nothing left to split on, or depth budget exhausted: majority vote.
    if not features or depth >= self.max_depth:
      return self._majority_class(y)

    best_feature = self.find_best_feature(X, y, features)
    remaining_features = [f for f in features if f != best_feature]
    tree = {best_feature: {}}

    for value in X[best_feature].unique():
      # Compute the row mask once and reuse it for both X and y.
      mask = X[best_feature] == value
      x_subset = X[mask]
      y_subset = y[mask]

      if len(x_subset) == 0:
        # Reached only when `value` never compares equal to itself (NaN).
        tree[best_feature][value] = self._majority_class(y)
      else:
        tree[best_feature][value] = self.build_tree(
          x_subset, y_subset, remaining_features, depth + 1
        )

    return tree

  def fit(self, X, y):
    """Fit the tree on DataFrame X and label Series y.

    Besides building ``self.tree`` this records the majority training class in
    ``self.default_class`` so prediction never depends on notebook globals.
    """
    features = X.columns.tolist()
    self.default_class = self._majority_class(y) if len(y) else None
    self.tree = self.build_tree(X, y, features)

  def predict_sample(self, sample, tree):
    """Predict the label of one sample by walking `tree`.

    Args:
      sample: mapping/Series from feature name to value.
      tree: a (sub)tree as produced by build_tree, or a leaf label.

    Returns:
      The predicted label. If the sample holds a value that has no branch in
      the tree, the majority class recorded by fit() is returned.
    """
    if not isinstance(tree, dict):
      return tree

    feature = next(iter(tree))
    value = sample[feature]

    if value not in tree[feature]:
      # Use this model's own training majority instead of a global `y_train`.
      return self.default_class

    return self.predict_sample(sample, tree[feature][value])

  def predict(self, X):
    """Predict a label for every row of DataFrame X using the fitted tree."""
    return X.apply(lambda row: self.predict_sample(row, self.tree), axis=1)


# Train the from-scratch ID3 tree on the training split and time it.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() is wall-clock time and can jump or be coarse.
start_time = time.perf_counter()
id3_tree = ID3_tree(max_depth=5)
id3_tree.fit(X_train, y_train)
id3_train_time = time.perf_counter() - start_time

# Time prediction on the held-out test split.
start_time = time.perf_counter()
id3_pred = id3_tree.predict(X_test)
id3_pred_time = time.perf_counter() - start_time

# accuracy_score takes (y_true, y_pred) in that order.
id3_accuracy = accuracy_score(y_test, id3_pred)
print(f"Testing Accuracy of ID3 from Scratch is {id3_accuracy:.3f}")
print(f"Training Accuracy of ID3 from Scratch is {accuracy_score(y_train, id3_tree.predict(X_train)):.3f}")
print(f"Training Time of ID3 from Scratch is {id3_train_time:.3f}")
print(f"Prediction Time of ID3 from Scratch is {id3_pred_time:.3f}")


Testing Accuracy of ID3 from Scratch is 0.732
Training Accuracy of ID3 from Scratch is 0.730
Training Time of ID3 from Scratch is 3.386
Prediction Time of ID3 from Scratch is 0.733


# Section 3: Results of The Scikit-learn Library Implementation

In [43]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import time

# Train scikit-learn's entropy-criterion decision tree with the same depth limit
# as the from-scratch model, timing it with the monotonic high-resolution clock.
start_time = time.perf_counter()
dt = DecisionTreeClassifier(criterion="entropy", max_depth=5, random_state=0)
dt.fit(X_train, y_train)
sikit_learn_train_time = time.perf_counter() - start_time

# Time prediction on the held-out test split.
start_time = time.perf_counter()
y_pred = dt.predict(X_test)
sikit_learn_pred_time = time.perf_counter() - start_time

# accuracy_score takes (y_true, y_pred) in that order.
print(f"Testing Accuracy of ID3 from sikit learn is {accuracy_score(y_test, y_pred):.3f}")
print(f"Training Accuracy of ID3 from sikit learn is {accuracy_score(y_train, dt.predict(X_train)):.3f}")
print(f"Training Time of ID3 from sikit learn is {sikit_learn_train_time:.3f}")
print(f"Prediction Time of ID3 from sikit learn is {sikit_learn_pred_time:.3f}")

Testing Accuracy of ID3 from sikit learn is 0.732
Training Accuracy of ID3 from sikit learn is 0.727
Training Time of ID3 from sikit learn is 0.052
Prediction Time of ID3 from sikit learn is 0.004


# Section 4: From-Scratch and Scikit-learn Implementations on the Part 3 Student Data

In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Small student dataset: five binary attributes and the binary target "A",
# one list entry per student (14 students).
data = {
    "Early registration":      [1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1],
    "Finished homework II":    [1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0],
    "Senior":                  [0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0],
    "Likes Coffee":            [0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1],
    "Liked The Last homework": [1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0],
    "A":                       [1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1],
}
df = pd.DataFrame(data=data)

# Separate the predictors from the target column "A".
X_student = df.drop("A", axis=1)
y_student = df.loc[:, "A"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
(
    X_train_student,
    X_test_student,
    y_train_student,
    y_test_student,
) = train_test_split(
    X_student,
    y_student,
    test_size=0.2,
    random_state=0,
)


# Train the from-scratch ID3 tree on the student training split and time it.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() is wall-clock time and can jump or be coarse.
start_time = time.perf_counter()
id3_tree = ID3_tree(max_depth=5)
id3_tree.fit(X_train_student, y_train_student)
id3_train_time = time.perf_counter() - start_time

# Time prediction on the held-out student rows.
start_time = time.perf_counter()
id3_pred_student = id3_tree.predict(X_test_student)
id3_pred_time = time.perf_counter() - start_time

# accuracy_score takes (y_true, y_pred) in that order.
id3_accuracy = accuracy_score(y_test_student, id3_pred_student)
print(f"Accuracy of ID3 from Scratch is {id3_accuracy:.3f}")
print(f"Training Time of ID3 from Scratch is {id3_train_time:.3f}")
print(f"Prediction Time of ID3 from Scratch is {id3_pred_time:.3f}")




# Train scikit-learn's entropy-criterion decision tree on the student training
# split, timing it with the monotonic high-resolution clock (perf_counter)
# rather than wall-clock time.time(), which is ill-suited to millisecond spans.
start_time = time.perf_counter()
dt = DecisionTreeClassifier(criterion="entropy", max_depth=5, random_state=0)
dt.fit(X_train_student, y_train_student)
sikit_learn_train_time = time.perf_counter() - start_time

# Time prediction on the held-out student rows.
start_time = time.perf_counter()
y_pred = dt.predict(X_test_student)
sikit_learn_pred_time = time.perf_counter() - start_time

print(f"\nAccuracy of ID3 from sikit learn is {accuracy_score(y_test_student, y_pred):.3f}")
print(f"Training Accuracy of ID3 from sikit learn is {accuracy_score(y_train_student, dt.predict(X_train_student)):.3f}")
print(f"Training Time of ID3 from sikit learn is {sikit_learn_train_time:.3f}")
print(f"Prediction Time of ID3 from sikit learn is {sikit_learn_pred_time:.3f}")


Accuracy of ID3 from Scratch is 0.333
Training Time of ID3 from Scratch is 0.023
Prediction Time of ID3 from Scratch is 0.001

Accuracy of ID3 from sikit learn is 0.333
Training Accuracy of ID3 from sikit learn is 1.000
Training Time of ID3 from sikit learn is 0.003
Prediction Time of ID3 from sikit learn is 0.001
