# Assignment #4 : Decision Tree & Ensemble Algorithms

## STD IDs: 9931155 & 40133693

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from tabulate import tabulate
from DesicionTree import HDDT, preprocess, bagging, adaboost
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
import warnings

# Silence every warning category so library notices do not clutter the
# metric tables printed by the cells below.
warnings.filterwarnings('ignore')

# Fixed seed so that "Restart Kernel -> Run All" reproduces the same tables.
# scikit-learn falls back to NumPy's global RNG when `random_state` is None,
# which is how `train_test_split` / the sklearn classifiers are called below.
# NOTE(review): the project-local DesicionTree helpers are presumably driven by
# NumPy's global RNG as well -- confirm, and seed `random` too if they use it.
SEED = 42
np.random.seed(SEED)

# Dataset locations (relative to the notebook's working directory).
DS_Covid19HDDT = "Dataset/Covid19HDDT.csv"  # Part A data
DS_Covid = "Dataset/Covid.csv"              # Part B data

# Pruning grid explored in Part A, Step 2.
max_depth = [2, 3, 4, 5]
cut_off = [10, 50, 500]

# `n_iteration` is forwarded to the HDDT / bagging runners and is also the
# number of repeated train/test splits in the AdaBoost baseline comparison.
n_iteration = 10

# Threshold handed to preprocess.remove_correlated_with_label_by_hellinger.
threshold = 0.3


## Part A (Decision Tree):  Dataset: Covid19HDDT.csv

### Step 1:

#### Convert the data to a two-class data set by keeping the smallest class as minority and the rest as majority 

In [2]:
# Step 1 (binary): load Covid19HDDT and turn it into a two-class problem.
features, classes = preprocess.read_dataset(DS_Covid19HDDT)

# Smallest class becomes the minority, everything else the majority
# (presumably encoded as 0 / 1, going by the helper's name -- TODO confirm).
binary_classes = preprocess.minority_0_majority_1(classes)

# Hellinger-based feature filtering against the binary label; the exact
# criterion lives in DesicionTree.preprocess.
features = preprocess.remove_correlated_with_label_by_hellinger(
    data=features,
    label=binary_classes,
    threshold=threshold,
)

# run_HDDT returns a dict of metrics (Precision / Recall / F-measure / G-mean).
binary_metrics = HDDT.run_HDDT(features, binary_classes, n_iteration=n_iteration)
print(binary_metrics)


{'Precision': 0.81729, 'Recall': 0.53409, 'F-measure': 0.64581, 'G-mean': 0.66057}


#### Handle the multiclass data with OVO (One Versus One) 

In [3]:
# Step 1 (multiclass, One-Versus-One): reload the data with its original labels.
features, classes = preprocess.read_dataset(DS_Covid19HDDT)

# Hellinger-based feature filtering against the multiclass label; the exact
# criterion lives in DesicionTree.preprocess.
features = preprocess.remove_correlated_with_label_by_hellinger(
    data=features,
    label=classes,
    threshold=threshold,
)

# HDDT.OVO returns a dict of metrics (Precision / Recall / F-measure / G-mean).
ovo_metrics = HDDT.OVO(features, classes, n_iteration=n_iteration)
print(ovo_metrics)

{'Precision': 0.96895, 'Recall': 0.96579, 'F-measure': 0.96737, 'G-mean': 0.96737}


#### Handle the multiclass data with OVA (One Versus All)

In [4]:
# Step 1 (multiclass, One-Versus-All): reload the data with its original labels.
features, classes = preprocess.read_dataset(DS_Covid19HDDT)

# Hellinger-based feature filtering against the multiclass label; the exact
# criterion lives in DesicionTree.preprocess.
features = preprocess.remove_correlated_with_label_by_hellinger(
    data=features,
    label=classes,
    threshold=threshold,
)

# HDDT.OVA returns a dict of metrics (Precision / Recall / F-measure / G-mean).
ova_metrics = HDDT.OVA(features, classes, n_iteration=n_iteration)
print(ova_metrics)

{'Precision': 0.96814, 'Recall': 0.9667, 'F-measure': 0.96741, 'G-mean': 0.96742}


### Step 2:

#### Repeat both of the experiments from the previous parts with pruned HDDT trees.

In [5]:
# Step 2: evaluate pruned HDDT (One-Versus-All) over a max_depth x cut_off grid.
# NOTE(review): only the OVA experiment is repeated here, while the section
# heading mentions both experiments -- confirm whether OVO should be added.
features, classes = preprocess.read_dataset(DS_Covid19HDDT)
features = preprocess.remove_correlated_with_label_by_hellinger(
    data=features,
    label=classes,
    threshold=threshold,
)

# One (depth x cut-off) grid per reported metric, keyed by the metric's name
# in the dict returned by HDDT.OVA.
metric_names = ("Precision", "Recall", "F-measure", "G-mean")
grid_shape = (len(max_depth), len(cut_off))
metric_grids = {name: np.empty(grid_shape) for name in metric_names}

for row, depth in enumerate(max_depth):
    for col, cut in enumerate(cut_off):
        scores = HDDT.OVA(
            data=features,
            label=classes,
            n_iteration=n_iteration,
            max_depth=depth,
            cut_off=cut,
        )
        for name in metric_names:
            metric_grids[name][row, col] = scores[name]

# Print one table per metric: rows are max_depth values, columns are cut_off
# values. Every title after the first is preceded by a blank line.
for position, name in enumerate(metric_names):
    title = name if position == 0 else "\n" + name
    print(title)
    print(
        tabulate(
            metric_grids[name],
            headers=cut_off,
            showindex=max_depth,
            tablefmt="fancy_grid",
        )
    )


Precision
╒════╤═════════╤═════════╤═════════╕
│    │      10 │      50 │     500 │
╞════╪═════════╪═════════╪═════════╡
│  2 │ 0.97494 │ 0.97262 │ 0.97374 │
├────┼─────────┼─────────┼─────────┤
│  3 │ 0.96974 │ 0.97024 │ 0.9715  │
├────┼─────────┼─────────┼─────────┤
│  4 │ 0.97077 │ 0.97081 │ 0.97088 │
├────┼─────────┼─────────┼─────────┤
│  5 │ 0.96981 │ 0.96954 │ 0.96919 │
╘════╧═════════╧═════════╧═════════╛

Recall
╒════╤═════════╤═════════╤═════════╕
│    │      10 │      50 │     500 │
╞════╪═════════╪═════════╪═════════╡
│  2 │ 0.96958 │ 0.97288 │ 0.97106 │
├────┼─────────┼─────────┼─────────┤
│  3 │ 0.97171 │ 0.97148 │ 0.96937 │
├────┼─────────┼─────────┼─────────┤
│  4 │ 0.96695 │ 0.96582 │ 0.96667 │
├────┼─────────┼─────────┼─────────┤
│  5 │ 0.96568 │ 0.96689 │ 0.96724 │
╘════╧═════════╧═════════╧═════════╛

F-measure
╒════╤═════════╤═════════╤═════════╕
│    │      10 │      50 │     500 │
╞════╪═════════╪═════════╪═════════╡
│  2 │ 0.97222 │ 0.9727  │ 0.97234 │
├────┼───

## Part B (Ensemble Learning): Dataset: Covid.csv

#### Implement Bagging, a bootstrap ensemble algorithm, for imbalanced data.

With HDDT base learner

In [6]:
# Part B, bagging with the HDDT base learner.
covid_x, covid_y = preprocess.read_dataset(DS_Covid)
covid_y[covid_y == -1] = 0  # recode the -1 class as 0 (in place)
covid_x = preprocess.fill_missing_value(data=covid_x, method="mode")
covid_x = preprocess.remove_correlated_with_label_by_hellinger(
    data=covid_x,
    label=covid_y,
    threshold=threshold,
)

# Values passed as `T` to the bagging model; one table row per value.
ensemble_sizes = [11, 31, 51, 101]

# Column order of both result tables, and the matching keys in the two dicts
# returned by run_bagging.
metric_headers = ["Precision", "Recall", "F-measure", "G-mean"]
avg_keys = ("Avg-Precision", "Avg-Recall", "Avg-F-measure", "Avg-G-mean")
std_keys = ("Std-Precision", "Std-Recall", "Std-F-measure", "Std-G-mean")

avg_table = np.empty((len(ensemble_sizes), len(metric_headers)))
std_table = np.empty((len(ensemble_sizes), len(metric_headers)))

print("bagging with base learner HDDT")
for row, size in enumerate(ensemble_sizes):
    ensemble = bagging(T=size, base_learner="HDDT")
    avg, std = ensemble.run_bagging(
        data=covid_x, label=covid_y, n_iteration=n_iteration
    )
    avg_table[row] = [avg[key] for key in avg_keys]
    std_table[row] = [std[key] for key in std_keys]

for title, table in (
    ("Average metrics", avg_table),
    ("Standard deviation metrics", std_table),
):
    print(title)
    print(
        tabulate(
            table,
            headers=metric_headers,
            showindex=ensemble_sizes,
            tablefmt="fancy_grid",
        )
    )

bagging with base learner HDDT
Average metrics
╒═════╤═════════════╤══════════╤═════════════╤══════════╕
│     │   Precision │   Recall │   F-measure │   G-mean │
╞═════╪═════════════╪══════════╪═════════════╪══════════╡
│  11 │     0.56887 │  0.50644 │     0.52452 │  0.53094 │
├─────┼─────────────┼──────────┼─────────────┼──────────┤
│  31 │     0.80327 │  0.52444 │     0.62583 │  0.64446 │
├─────┼─────────────┼──────────┼─────────────┼──────────┤
│  51 │     0.8295  │  0.53099 │     0.64248 │  0.66102 │
├─────┼─────────────┼──────────┼─────────────┼──────────┤
│ 101 │     0.5272  │  0.50633 │     0.51216 │  0.51444 │
╘═════╧═════════════╧══════════╧═════════════╧══════════╛
Standard deviation metrics
╒═════╤═════════════╤══════════╤═════════════╤══════════╕
│     │   Precision │   Recall │   F-measure │   G-mean │
╞═════╪═════════════╪══════════╪═════════════╪══════════╡
│  11 │     0.2008  │  0.01539 │     0.0821  │  0.09435 │
├─────┼─────────────┼──────────┼─────────────┼──────────

With Built-in base learner

In [7]:
# Part B, bagging with the "Built-in" base learner.
covid_x, covid_y = preprocess.read_dataset(DS_Covid)
covid_y[covid_y == -1] = 0  # recode the -1 class as 0 (in place)
covid_x = preprocess.fill_missing_value(data=covid_x, method="mode")
covid_x = preprocess.remove_correlated_with_label_by_hellinger(
    data=covid_x,
    label=covid_y,
    threshold=threshold,
)

# Values passed as `T` to the bagging model; one table row per value.
ensemble_sizes = [11, 31, 51, 101]

# Column order of both result tables, and the matching keys in the two dicts
# returned by run_bagging.
metric_headers = ["Precision", "Recall", "F-measure", "G-mean"]
avg_keys = ("Avg-Precision", "Avg-Recall", "Avg-F-measure", "Avg-G-mean")
std_keys = ("Std-Precision", "Std-Recall", "Std-F-measure", "Std-G-mean")

avg_table = np.empty((len(ensemble_sizes), len(metric_headers)))
std_table = np.empty((len(ensemble_sizes), len(metric_headers)))

print("bagging with base learner Built-in")
for row, size in enumerate(ensemble_sizes):
    print("T = ", size)  # progress marker
    ensemble = bagging(T=size, base_learner="Built-in")
    avg, std = ensemble.run_bagging(
        data=covid_x, label=covid_y, n_iteration=n_iteration
    )
    avg_table[row] = [avg[key] for key in avg_keys]
    std_table[row] = [std[key] for key in std_keys]

for title, table in (
    ("Average metrics", avg_table),
    ("Standard deviation metrics", std_table),
):
    print(title)
    print(
        tabulate(
            table,
            headers=metric_headers,
            showindex=ensemble_sizes,
            tablefmt="fancy_grid",
        )
    )

bagging with base learner Built-in
T =  11
T =  31
T =  51
T =  101
Average metrics
╒═════╤═════════════╤══════════╤═════════════╤══════════╕
│     │   Precision │   Recall │   F-measure │   G-mean │
╞═════╪═════════════╪══════════╪═════════════╪══════════╡
│  11 │     0.57522 │  0.7808  │     0.66196 │  0.66993 │
├─────┼─────────────┼──────────┼─────────────┼──────────┤
│  31 │     0.57733 │  0.79948 │     0.67004 │  0.67916 │
├─────┼─────────────┼──────────┼─────────────┼──────────┤
│  51 │     0.5809  │  0.81007 │     0.67633 │  0.68584 │
├─────┼─────────────┼──────────┼─────────────┼──────────┤
│ 101 │     0.5742  │  0.78845 │     0.66383 │  0.67252 │
╘═════╧═════════════╧══════════╧═════════════╧══════════╛
Standard deviation metrics
╒═════╤═════════════╤══════════╤═════════════╤══════════╕
│     │   Precision │   Recall │   F-measure │   G-mean │
╞═════╪═════════════╪══════════╪═════════════╪══════════╡
│  11 │     0.01716 │  0.06236 │     0.03473 │  0.03722 │
├─────┼────────────

## Method II: Adaboost with UnderSampling

In [10]:
# Method II: compare a plain decision tree, the project's AdaBoost and
# scikit-learn's AdaBoost on an under-sampled version of Covid.csv, then study
# how the project's AdaBoost behaves for different ensemble sizes.

METRIC_HEADERS = ["Precision", "Recall", "F-measure", "G-mean"]
TEST_SIZE = 0.3
BASELINE_T = 11  # ensemble size used for the three-way baseline comparison


def _mean_metrics(rows):
    """Stack per-run metric rows and return their column-wise mean, rounded to 5 decimals."""
    return np.mean(np.vstack(rows), axis=0).round(5)


def _stratified_split(data, label):
    """Draw one stratified train/test split; returns (x_train, x_test, y_train, y_test)."""
    return train_test_split(data, label, test_size=TEST_SIZE, stratify=label)


def _fit_and_score(model, split):
    """Fit `model` on the training part of `split` and score its test predictions."""
    x_train, x_test, y_train, y_test = split
    model.fit(x_train, y_train)
    return preprocess.accuracy(model.predict(x_test), y_test)


def _adaboost_table(data, label, ensemble_sizes, n_runs):
    """Return one row of run-averaged metrics per ensemble size for the project's AdaBoost."""
    table_rows = []
    for size in ensemble_sizes:
        run_scores = []
        for _ in range(n_runs):
            # Split first, then build the model, so random draws happen in a fixed order.
            split = _stratified_split(data, label)
            run_scores.append(_fit_and_score(adaboost(T=size), split))
        table_rows.append(_mean_metrics(run_scores))
    return np.vstack(table_rows)


data, label = preprocess.read_dataset(DS_Covid)
label[label == -1] = 0  # recode the -1 class as 0 (in place)
data = preprocess.fill_missing_value(data=data, method="mode")

data = preprocess.remove_correlated_with_label_by_hellinger(
    data=data, label=label, threshold=threshold
)

# Balance the imbalanced data with under-sampling.
# NOTE(review): resampling happens BEFORE the train/test split below. If
# bootstrap_with_under_sampling draws with replacement, duplicated rows can
# land in both train and test; the test set is also no longer imbalanced.
# Confirm this is the intended protocol.
data, label = preprocess.bootstrap_with_under_sampling(data, label)

# --- Baseline comparison: every model is scored on the same split per run ---
baseline_models = (
    ("DT", DecisionTreeClassifier),
    ("My AdaBoost", lambda: adaboost(BASELINE_T)),
    ("Built-in AdaBoost", AdaBoostClassifier),
)
baseline_scores = {name: [] for name, _ in baseline_models}

for _ in range(n_iteration):
    split = _stratified_split(data, label)
    for name, make_model in baseline_models:
        baseline_scores[name].append(_fit_and_score(make_model(), split))

for name, _ in baseline_models:
    print(f"Accuracy {name}:\n{_mean_metrics(baseline_scores[name])}\n")

# --- Ensemble-size study, averaged over 10 and then 15 random splits ---
n_classifiers = [11, 31, 51, 101]

# Compute both tables before printing either.
adaboost_tables = {
    n_runs: _adaboost_table(data, label, n_classifiers, n_runs)
    for n_runs in (10, 15)
}

for position, (n_runs, table) in enumerate(adaboost_tables.items()):
    lead = "" if position == 0 else "\n"
    print(
        f"{lead}Accuracy Adaboost with {n_runs} iteration for different number of classifiers"
    )
    print(
        tabulate(
            table,
            headers=METRIC_HEADERS,
            tablefmt="fancy_grid",
            showindex=n_classifiers,
        )
    )


Accuracy DT:
[0.54927 0.54833 0.5488  0.5488 ]

Accuracy My AdaBoost:
[0.56092 0.56    0.56046 0.56046]

Accuracy Built-in AdaBoost:
[0.57981 0.57667 0.57822 0.57823]

Accuracy Adaboost with 10 iteration for different number of classifiers
╒═════╤═════════════╤══════════╤═════════════╤══════════╕
│     │   Precision │   Recall │   F-measure │   G-mean │
╞═════╪═════════════╪══════════╪═════════════╪══════════╡
│  11 │     0.57012 │  0.565   │     0.56752 │  0.56754 │
├─────┼─────────────┼──────────┼─────────────┼──────────┤
│  31 │     0.55567 │  0.55333 │     0.5545  │  0.5545  │
├─────┼─────────────┼──────────┼─────────────┼──────────┤
│  51 │     0.54412 │  0.54333 │     0.54372 │  0.54372 │
├─────┼─────────────┼──────────┼─────────────┼──────────┤
│ 101 │     0.57417 │  0.57167 │     0.57291 │  0.57291 │
╘═════╧═════════════╧══════════╧═════════════╧══════════╛

Accuracy Adaboost with 15 iteration for different number of classifiers
╒═════╤═════════════╤══════════╤═════════════╤═══