In [1]:
import time
import tracemalloc
import psutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import confusion_matrix, classification_report
from decision_tree_classifier import DecisionTreeClassifier
import scienceplots

In [2]:
# Apply the "grid", "notebook" and "science" style sheets to every figure.
# NOTE(review): these style names are presumably registered by `import scienceplots` above — confirm.
plt.style.use(["grid", "notebook", "science"])

In [3]:
# Load the dataset from a CSV file located in the working directory.
df = pd.read_csv("dataset.csv")

# Features: every column except the first and the last. Label: the last column.
# NOTE(review): the skipped first column is presumably a row identifier — confirm against the CSV.
X, y = df.iloc[:, 1:-1], df.iloc[:, -1]


In [4]:
# Sanity check: show the last rows of the feature frame.
X.tail()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3
149,5.9,3.0,5.1,1.8


In [5]:
# Sanity check: show the last rows of the label series.
y.tail()

145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: Species, dtype: object

In [6]:
# Score every feature against the class label with the f_classif test.
# k="all" keeps every column, so the cell works for any number of features
# (a hard-coded k larger than the column count would raise). Nothing is dropped
# here; only the per-feature p-values are inspected.
best_features = SelectKBest(f_classif, k="all").fit(X, y)
p_values = pd.Series(best_features.pvalues_, index=X.columns)

# Ascending order: the feature with the smallest p-value comes first.
p_values.sort_values()

PetalLengthCm    3.051976e-91
PetalWidthCm     4.376957e-85
SepalLengthCm    1.669669e-31
SepalWidthCm     1.327917e-16
dtype: float64

In [7]:
# Convert the pandas objects to NumPy arrays; the cells below index them with
# integer index arrays (X[train_index]).
# np.asarray is used instead of .to_numpy() so the cell is idempotent: running it
# again when X and y are already arrays is a no-op rather than an AttributeError.
X, y = np.asarray(X), np.asarray(y)

In [8]:
def accuracy(test, predictions):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    test : array-like
        True labels.
    predictions : array-like
        Predicted labels, same shape as ``test``.

    Returns
    -------
    float
        Share of positions where both inputs agree, in ``[0, 1]``.
        ``nan`` when the inputs are empty (there is nothing to score).

    Raises
    ------
    ValueError
        If ``test`` and ``predictions`` do not have the same shape.
    """
    # Coerce to arrays first: comparing two plain lists with `==` yields a single
    # bool, which would silently produce a wrong score.
    test = np.asarray(test)
    predictions = np.asarray(predictions)

    if test.shape != predictions.shape:
        raise ValueError(
            f"shape mismatch: test has shape {test.shape}, "
            f"predictions has shape {predictions.shape}"
        )

    # Avoid a division by zero (and the accompanying warning) on empty input.
    if test.size == 0:
        return float("nan")

    return float(np.mean(test == predictions))

In [9]:
# 5-fold stratified cross-validation; shuffling with a fixed seed keeps the
# folds reproducible between runs.
stratified_kfold = StratifiedKFold(n_splits=5, random_state=3, shuffle=True)

# One dict of metrics per fold.
results_entropy = []

for train_index, test_index in stratified_kfold.split(X, y):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # NOTE(review): the default criterion is presumably entropy (hence the
    # result name) — confirm in decision_tree_classifier.
    dt = DecisionTreeClassifier()
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)

    results_entropy.append({
        "accuracy": accuracy(y_test, y_pred),
        "confusion_matrix": confusion_matrix(y_test, y_pred),
        "classification_report": classification_report(y_test, y_pred)
    })

In [10]:
# Same splitter configuration as the previous run, so both criteria are
# evaluated on identical folds.
stratified_kfold = StratifiedKFold(n_splits=5, random_state=3, shuffle=True)

# One dict of metrics per fold.
results_gini = []

# Note: `dt`, `X_train`, `y_train` and `X_test` keep their last-fold values after
# the loop; the runtime cells further down reuse them.
for train_index, test_index in stratified_kfold.split(X, y):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    dt = DecisionTreeClassifier(criterion="gini")
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)

    results_gini.append({
        "accuracy": accuracy(y_test, y_pred),
        "confusion_matrix": confusion_matrix(y_test, y_pred),
        "classification_report": classification_report(y_test, y_pred)
    })

### Accuracy

In [11]:
# Turn the per-fold metric dicts into DataFrames (one row per fold).
results_gini_df = pd.DataFrame(results_gini)
results_entropy_df = pd.DataFrame(results_entropy)

In [12]:
# Per-fold metrics for the gini run.
results_gini_df

Unnamed: 0,accuracy,confusion_matrix,classification_report
0,0.866667,"[[10, 0, 0], [0, 8, 2], [0, 2, 8]]",precision recall f1-score...
1,0.933333,"[[9, 1, 0], [0, 10, 0], [0, 1, 9]]",precision recall f1-score...
2,0.9,"[[10, 0, 0], [0, 8, 2], [0, 1, 9]]",precision recall f1-score...
3,1.0,"[[10, 0, 0], [0, 10, 0], [0, 0, 10]]",precision recall f1-score...
4,0.966667,"[[10, 0, 0], [0, 9, 1], [0, 0, 10]]",precision recall f1-score...


In [25]:
# Add up the per-fold confusion matrices of the gini run into one pooled matrix.
# Summing along axis 0 works for any number of classes, and it avoids a loop
# variable named `confusion_matrix`, which would shadow the sklearn function
# imported at the top and break any later re-run of the cross-validation cells.
matrix = np.sum(results_gini_df["confusion_matrix"].tolist(), axis=0).tolist()

In [26]:
# Confusion matrix pooled over all folds (computed in the previous cell).
matrix

[[49, 1, 0], [0, 45, 5], [0, 4, 46]]

In [23]:
# Print the text report of every fold. The loop variable is deliberately not
# called `classification_report`: that name belongs to the sklearn function
# imported at the top, and rebinding it would break a re-run of the
# cross-validation cells.
for fold_report in results_gini_df["classification_report"]:
    print(fold_report)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       0.80      0.80      0.80        10
 Iris-virginica       0.80      0.80      0.80        10

       accuracy                           0.87        30
      macro avg       0.87      0.87      0.87        30
   weighted avg       0.87      0.87      0.87        30

                 precision    recall  f1-score   support

    Iris-setosa       1.00      0.90      0.95        10
Iris-versicolor       0.83      1.00      0.91        10
 Iris-virginica       1.00      0.90      0.95        10

       accuracy                           0.93        30
      macro avg       0.94      0.93      0.93        30
   weighted avg       0.94      0.93      0.93        30

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       0.89      0.80      0.84        10
 Iris-virginica       0.

In [13]:
# Mean accuracy over all folds of the gini run.
results_gini_df["accuracy"].mean()

0.9333333333333333

In [14]:
# Per-fold metrics for the entropy run.
results_entropy_df

Unnamed: 0,accuracy,confusion_matrix,classification_report
0,0.866667,"[[10, 0, 0], [0, 8, 2], [0, 2, 8]]",precision recall f1-score...
1,0.966667,"[[10, 0, 0], [0, 10, 0], [0, 1, 9]]",precision recall f1-score...
2,0.9,"[[10, 0, 0], [0, 8, 2], [0, 1, 9]]",precision recall f1-score...
3,0.966667,"[[10, 0, 0], [0, 9, 1], [0, 0, 10]]",precision recall f1-score...
4,0.966667,"[[10, 0, 0], [0, 9, 1], [0, 0, 10]]",precision recall f1-score...


In [27]:
# Add up the per-fold confusion matrices of the entropy run into one pooled matrix.
# Summing along axis 0 works for any number of classes, and it avoids a loop
# variable named `confusion_matrix`, which would shadow the sklearn function
# imported at the top and break any later re-run of the cross-validation cells.
matrix = np.sum(results_entropy_df["confusion_matrix"].tolist(), axis=0).tolist()

In [28]:
# Confusion matrix pooled over all folds (computed in the previous cell).
matrix

[[50, 0, 0], [0, 44, 6], [0, 4, 46]]

In [15]:
# Mean accuracy over all folds of the entropy run.
results_entropy_df["accuracy"].mean()

0.9333333333333333

### Runtime and memory usage

In [16]:
# Time 100 repeated fit() calls and record each duration in milliseconds.
# `dt`, `X_train` and `y_train` are not defined in this cell: they are whatever
# the most recently executed cross-validation loop left behind (its last fold).
runtime = []

for _ in range(100):
    start = time.perf_counter()
    dt.fit(X_train, y_train)
    stop = time.perf_counter()
    # perf_counter() reports seconds; convert to milliseconds.
    runtime.append((stop - start) * 1000)

In [17]:
# Collect the fit timings into a single-column DataFrame.
runtime_df = pd.DataFrame(runtime, columns=["Milliseconds"])

In [18]:
# Summary statistics of the fit timings.
runtime_df.describe()

Unnamed: 0,Milliseconds
count,100.0
mean,51.533961
std,5.582829
min,45.2493
25%,48.522125
50%,50.47925
75%,52.532775
max,79.6804


In [19]:
# Time 100 repeated predict() calls and record each duration in milliseconds.
# This rebinds `runtime`, discarding the fit timings collected earlier.
# `dt` and `X_test` come from earlier cells (the fitted tree and the last
# cross-validation fold's test split), not from this one.
runtime = []

for _ in range(100):
    start = time.perf_counter()
    dt.predict(X_test)
    stop = time.perf_counter()
    # perf_counter() reports seconds; convert to milliseconds.
    runtime.append((stop - start) * 1000)

In [20]:
# Summary statistics of the predict timings (replaces the earlier `runtime_df`).
runtime_df = pd.DataFrame(runtime, columns=["Milliseconds"])
runtime_df.describe()

Unnamed: 0,Milliseconds
count,100.0
mean,0.045439
std,0.01664
min,0.0357
25%,0.0364
50%,0.0382
75%,0.04675
max,0.1431


In [21]:

# Read tracemalloc's counters before tracing is started (start() is only called
# in the next cell). The recorded output is 0 for both values, so this cell
# only confirms that nothing is being traced yet.
first_size, first_peak = tracemalloc.get_traced_memory()
tracemalloc.reset_peak()
print(f"{first_size=}, {first_peak=}")

first_size=0, first_peak=0


In [22]:
# Measure the memory allocated while constructing and fitting one tree.
# get_traced_memory() returns a (current, peak) tuple in bytes, counted from
# the moment tracing started.
# `X_train` and `y_train` come from the last cross-validation fold run earlier.
tracemalloc.start()
try:
    dt = DecisionTreeClassifier()
    dt.fit(X_train, y_train)
    print(tracemalloc.get_traced_memory())
finally:
    # Always stop tracing, even if fit() raises, so the tracing overhead does
    # not leak into cells that run afterwards.
    tracemalloc.stop()

(4781, 18933)
