# Assignment #4: Decision Tree & Ensemble Algorithms

## STD IDs: 9931155 & 40133693

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
from tabulate import tabulate
from DesicionTree import HDDT, preprocess, bagging, adaboost
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# --- Experiment configuration ------------------------------------------------
# Dataset locations, relative to the notebook's working directory.
DS_Covid19HDDT = "Dataset/Covid19HDDT.csv"
DS_Covid = "Dataset/Covid.csv"

# Pruning grid: the pruned-HDDT experiment evaluates every
# (max_depth, cut_off) combination of these two lists.
max_depth = [2, 3, 4, 5]
cut_off = [10, 50, 500]

# Number of repetitions: passed as `n_iteration` to the experiment helpers and
# used as the loop count of the AdaBoost comparison.
n_iteration = 10

# Fixed seed so that "Restart Kernel -> Run All" reproduces the reported numbers.
# This seeds NumPy's global RNG, which scikit-learn falls back on whenever
# `random_state` is left as None (train_test_split, DecisionTreeClassifier,
# AdaBoostClassifier).
# NOTE(review): whether the DesicionTree helpers also draw from NumPy's global
# RNG is not visible from this notebook -- confirm in that module.
SEED = 42
np.random.seed(SEED)



## Part A (Decision Tree):  Dataset: Covid19HDDT.csv

### Step 1:

#### Convert the data to a two-class data set by keeping the smallest class as minority and the rest as majority 

In [4]:
# Two-class experiment on Covid19HDDT.csv.
features, raw_label = preprocess.read_dataset(DS_Covid19HDDT)

# Relabel into two classes (per the section heading: smallest class -> minority,
# everything else -> majority).
binary_label = preprocess.minority_0_majority_1(raw_label)

# Feature filtering against the binary label.
# NOTE(review): presumably drops features whose Hellinger distance to the label
# reaches `threshold` -- confirm in preprocess.
features = preprocess.remove_correlated_with_label_by_hellinger(
    data=features,
    label=binary_label,
    threshold=1,
)

# The helper returns a dict of metrics (see the printed output below).
binary_metrics = HDDT.run_HDDT(features, binary_label, n_iteration=n_iteration)
print(binary_metrics)


{'Precision': 0.90583, 'Recall': 0.89103, 'F-measure': 0.8983, 'G-mean': 0.89836}


#### Handle the multiclass data with OVO (One Versus One) 

In [3]:
# Multiclass experiment using the one-versus-one (OVO) scheme.
ovo_features, ovo_label = preprocess.read_dataset(DS_Covid19HDDT)

# Feature filtering against the original multiclass label.
ovo_features = preprocess.remove_correlated_with_label_by_hellinger(
    data=ovo_features,
    label=ovo_label,
    threshold=1,
)

# The helper returns a dict of metrics (see the printed output below).
ovo_metrics = HDDT.OVO(ovo_features, ovo_label, n_iteration=n_iteration)
print(ovo_metrics)

  - np.sqrt(n_value_negative_label / n_negative_label)


{'Precision': 1.0, 'Recall': 1.0, 'F-measure': 1.0, 'G-mean': 1.0}


#### Handle the multiclass data with OVA (One Versus All)

In [4]:
# Multiclass experiment using the one-versus-all (OVA) scheme.
ova_features, ova_label = preprocess.read_dataset(DS_Covid19HDDT)

# Feature filtering against the original multiclass label.
ova_features = preprocess.remove_correlated_with_label_by_hellinger(
    data=ova_features,
    label=ova_label,
    threshold=1,
)

# The helper returns a dict of metrics (see the printed output below).
ova_metrics = HDDT.OVA(ova_features, ova_label, n_iteration=n_iteration)
print(ova_metrics)

  - np.sqrt(n_value_negative_label / n_negative_label)


{'Precision': 1.0, 'Recall': 1.0, 'F-measure': 1.0, 'G-mean': 1.0}


### Step 2:

#### Repeat both of the experiments in the previous parts with the pruned HDDT trees.

In [5]:
# Pruned one-versus-all HDDT: sweep every (max_depth, cut_off) combination and
# tabulate each metric with depths as rows and cut-offs as columns.
features, target = preprocess.read_dataset(DS_Covid19HDDT)
features = preprocess.remove_correlated_with_label_by_hellinger(
    data=features,
    label=target,
    threshold=1,
)

metric_names = ("Precision", "Recall", "F-measure", "G-mean")
grid_shape = (len(max_depth), len(cut_off))

# One (depth x cut-off) grid per metric; every entry is filled by the sweep.
metric_grids = {name: np.empty(grid_shape) for name in metric_names}

for row, depth in enumerate(max_depth):
    for col, cutoff in enumerate(cut_off):
        scores = HDDT.OVA(
            data=features,
            label=target,
            n_iteration=n_iteration,
            max_depth=depth,
            cut_off=cutoff,
        )
        for name in metric_names:
            metric_grids[name][row, col] = scores[name]

# Print one table per metric; every title after the first starts with a blank
# line so consecutive tables are visually separated.
for position, name in enumerate(metric_names):
    title = name if position == 0 else "\n" + name
    print(title)
    print(
        tabulate(
            metric_grids[name],
            headers=cut_off,
            showindex=max_depth,
            tablefmt="fancy_grid",
        )
    )


  - np.sqrt(n_value_negative_label / n_negative_label)


Precision
╒════╤══════╤══════╤═══════╕
│    │   10 │   50 │   500 │
╞════╪══════╪══════╪═══════╡
│  2 │    1 │    1 │     1 │
├────┼──────┼──────┼───────┤
│  3 │    1 │    1 │     1 │
├────┼──────┼──────┼───────┤
│  4 │    1 │    1 │     1 │
├────┼──────┼──────┼───────┤
│  5 │    1 │    1 │     1 │
╘════╧══════╧══════╧═══════╛

Recall
╒════╤══════╤══════╤═══════╕
│    │   10 │   50 │   500 │
╞════╪══════╪══════╪═══════╡
│  2 │    1 │    1 │     1 │
├────┼──────┼──────┼───────┤
│  3 │    1 │    1 │     1 │
├────┼──────┼──────┼───────┤
│  4 │    1 │    1 │     1 │
├────┼──────┼──────┼───────┤
│  5 │    1 │    1 │     1 │
╘════╧══════╧══════╧═══════╛

F-measure
╒════╤══════╤══════╤═══════╕
│    │   10 │   50 │   500 │
╞════╪══════╪══════╪═══════╡
│  2 │    1 │    1 │     1 │
├────┼──────┼──────┼───────┤
│  3 │    1 │    1 │     1 │
├────┼──────┼──────┼───────┤
│  4 │    1 │    1 │     1 │
├────┼──────┼──────┼───────┤
│  5 │    1 │    1 │     1 │
╘════╧══════╧══════╧═══════╛

G-mean
╒════╤

## Part B (Ensemble Learning): Dataset: Covid.csv

#### Implement a bootstrap ensemble algorithm (Bagging) for imbalanced data.

With HDDT base learner

In [6]:
# Bagging on Covid.csv with the HDDT base learner.
features, target = preprocess.read_dataset(DS_Covid)
target[target == -1] = 0  # recode the -1 class as 0
features = preprocess.fill_missing_value(data=features, method="mode")
features = preprocess.remove_correlated_with_label_by_hellinger(
    data=features,
    label=target,
    threshold=1,
)

# Values passed as `T` to the bagging model.
# NOTE(review): presumably the number of base learners -- confirm in bagging.
T = [11, 31, 51, 101]

metric_names = ["Precision", "Recall", "F-measure", "G-mean"]
avg_keys = ["Avg-Precision", "Avg-Recall", "Avg-F-measure", "Avg-G-mean"]
std_keys = ["Std-Precision", "Std-Recall", "Std-F-measure", "Std-G-mean"]

# One row per value of T, one column per metric.
avg_metrics = np.empty((len(T), 4))
std_metrics = np.empty((len(T), 4))

print("bagging with base learner HDDT")
for row, t_value in enumerate(T):
    ensemble = bagging(T=t_value, base_learner="HDDT")
    avg, std = ensemble.run_bagging(
        data=features, label=target, n_iteration=n_iteration
    )
    avg_metrics[row] = [avg[key] for key in avg_keys]
    std_metrics[row] = [std[key] for key in std_keys]

# Same table layout for both summaries.
for title, table in (
    ("Average metrics", avg_metrics),
    ("Standard deviation metrics", std_metrics),
):
    print(title)
    print(
        tabulate(
            table,
            headers=metric_names,
            showindex=T,
            tablefmt="fancy_grid",
        )
    )

bagging with base learner HDDT


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Average metrics
╒═════╤═════════════╤══════════╤═════════════╤══════════╕
│     │   Precision │   Recall │   F-measure │   G-mean │
╞═════╪═════════════╪══════════╪═════════════╪══════════╡
│  11 │     0.51858 │  0.50155 │     0.50271 │  0.5063  │
├─────┼─────────────┼──────────┼─────────────┼──────────┤
│  31 │     0.77522 │  0.5291  │     0.62191 │  0.63678 │
├─────┼─────────────┼──────────┼─────────────┼──────────┤
│  51 │     0.81984 │  0.52277 │     0.63244 │  0.65148 │
├─────┼─────────────┼──────────┼─────────────┼──────────┤
│ 101 │     0.72778 │  0.51611 │     0.59408 │  0.60774 │
╘═════╧═════════════╧══════════╧═════════════╧══════════╛
Standard deviation metrics
╒═════╤═════════════╤══════════╤═════════════╤══════════╕
│     │   Precision │   Recall │   F-measure │   G-mean │
╞═════╪═════════════╪══════════╪═════════════╪══════════╡
│  11 │     0.1503  │  0.00505 │     0.05713 │  0.06715 │
├─────┼─────────────┼──────────┼─────────────┼──────────┤
│  31 │     0.17562 │  0.0161

With Built-in base learner

In [7]:
# Bagging on Covid.csv with the "Built-in" base learner.
features, target = preprocess.read_dataset(DS_Covid)
target[target == -1] = 0  # recode the -1 class as 0
features = preprocess.fill_missing_value(data=features, method="mode")
features = preprocess.remove_correlated_with_label_by_hellinger(
    data=features,
    label=target,
    threshold=1,
)

# Values passed as `T` to the bagging model.
# NOTE(review): presumably the number of base learners -- confirm in bagging.
T = [11, 31, 51, 101]

metric_names = ["Precision", "Recall", "F-measure", "G-mean"]
avg_keys = ["Avg-Precision", "Avg-Recall", "Avg-F-measure", "Avg-G-mean"]
std_keys = ["Std-Precision", "Std-Recall", "Std-F-measure", "Std-G-mean"]

# One row per value of T, one column per metric.
avg_metrics = np.empty((len(T), 4))
std_metrics = np.empty((len(T), 4))

print("bagging with base learner Built-in")
for row, t_value in enumerate(T):
    print("T = ", t_value)  # progress marker for each ensemble run
    ensemble = bagging(T=t_value, base_learner="Built-in")
    avg, std = ensemble.run_bagging(
        data=features, label=target, n_iteration=n_iteration
    )
    avg_metrics[row] = [avg[key] for key in avg_keys]
    std_metrics[row] = [std[key] for key in std_keys]

# Same table layout for both summaries.
for title, table in (
    ("Average metrics", avg_metrics),
    ("Standard deviation metrics", std_metrics),
):
    print(title)
    print(
        tabulate(
            table,
            headers=metric_names,
            showindex=T,
            tablefmt="fancy_grid",
        )
    )

bagging with base learner Built-in
T =  11
T =  31
T =  51
T =  101
Average metrics
╒═════╤═════════════╤══════════╤═════════════╤══════════╕
│     │   Precision │   Recall │   F-measure │   G-mean │
╞═════╪═════════════╪══════════╪═════════════╪══════════╡
│  11 │     0.57333 │  0.79104 │     0.66443 │  0.67325 │
├─────┼─────────────┼──────────┼─────────────┼──────────┤
│  31 │     0.57541 │  0.79082 │     0.66555 │  0.67427 │
├─────┼─────────────┼──────────┼─────────────┼──────────┤
│  51 │     0.58201 │  0.81419 │     0.67844 │  0.6882  │
├─────┼─────────────┼──────────┼─────────────┼──────────┤
│ 101 │     0.57507 │  0.78957 │     0.66508 │  0.67364 │
╘═════╧═════════════╧══════════╧═════════════╧══════════╛
Standard deviation metrics
╒═════╤═════════════╤══════════╤═════════════╤══════════╕
│     │   Precision │   Recall │   F-measure │   G-mean │
╞═════╪═════════════╪══════════╪═════════════╪══════════╡
│  11 │     0.01494 │  0.05754 │     0.03094 │  0.03356 │
├─────┼────────────

## Method II: AdaBoost with Under-Sampling

In [9]:
# Compare a single decision tree, the project's AdaBoost and scikit-learn's
# AdaBoost on Covid.csv over `n_iteration` random stratified 70/30 splits.
features, target = preprocess.read_dataset(DS_Covid)
target[target == -1] = 0  # recode the -1 class as 0
features = preprocess.fill_missing_value(data=features, method="mode")
features = preprocess.remove_correlated_with_label_by_hellinger(
    data=features,
    label=target,
    threshold=1,
)

# Balance the classes by under-sampling.
# NOTE(review): the resampling runs before the train/test split, so the
# held-out 30% comes from the resampled set rather than from the original
# class distribution -- confirm this is the intended evaluation protocol.
features, target = preprocess.bootstrap_with_under_sampling(features, target)

# (report name, factory) pairs. Factories are called inside the loop so each
# model is constructed immediately before it is fitted, in this order.
learners = (
    ("DT", lambda: DecisionTreeClassifier()),
    ("My AdaBoost", lambda: adaboost(11)),
    ("Built-in AdaBoost", lambda: AdaBoostClassifier()),
)
score_history = {name: [] for name, _ in learners}

for _ in range(n_iteration):
    # Fresh stratified split for every repetition.
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, stratify=target
    )

    for name, make_model in learners:
        estimator = make_model()
        estimator.fit(x_train, y_train)
        y_pred = estimator.predict(x_test)
        score_history[name].append(preprocess.accuracy(y_pred, y_test))

# Average the per-split score rows column-wise; the empty (0, 4) seed keeps the
# stacked array's width and dtype fixed.
for name, _ in learners:
    stacked = np.vstack([np.empty((0, 4)), *score_history[name]])
    mean_scores = np.mean(stacked, axis=0).round(5)
    print(f"Accuracy {name}:\n{mean_scores}\n")


Accuracy DT:
[0.54206 0.54167 0.54186 0.54186]

Accuracy My AdaBoost:
[0.55947 0.55833 0.5589  0.5589 ]

Accuracy Built-in AdaBoost:
[0.5991  0.59667 0.59788 0.59788]

