# Assignment #4 : Decision Tree & Ensemble Algorithms

## STD IDs: 9931155 & 40133693

In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from tabulate import tabulate
from DesicionTree import DT, preprocess

DS_path="Dataset/Covid19HDDT.csv"

max_depth=[2, 3, 4, 5]
cut_off=[10, 50, 500]
n_iteration=10

data, label = preprocess.read_dataset(DS_path)


## Part A (Decision Tree):  Dataset: Covid19HDDT.csv

### Step 1:

#### Convert the data to a two-class data set by keeping the smallest class as minority and the rest as majority 

In [2]:
data, label = preprocess.read_dataset(DS_path)

label_minority_majority = preprocess.minority_0_majority_1(label)
data = preprocess.remove_correlated_with_label_by_hellinger(
    data=data, label=label_minority_majority, threshold=0.3
)

acc = DT.run_HDDT(
    data,
    label_minority_majority,
    n_iteration=n_iteration,
)

print(acc)


{'Precision': 0.92412, 'Recall': 0.92412, 'F-measure': 0.92412, 'AUC': 0.53514, 'G-mean': 0.92412}


#### Handle the multiclass data with OVO (One Versus One) 

In [2]:
data, label = preprocess.read_dataset(DS_path)
data = preprocess.remove_correlated_with_label_by_hellinger(
    data=data, label=label, threshold=0.3
)


acc = DT.OVO(
    data,
    label,
    n_iteration=n_iteration,
)

print(acc)

  - np.sqrt(n_value_negative_label / n_negative_label)


{'Precision': 0.96442, 'Recall': 0.96442, 'F-measure': 0.96442, 'G-mean': 0.96442}


#### Handle the multiclass data with OVA (One Versus All)

In [2]:
data, label = preprocess.read_dataset(DS_path)
data = preprocess.remove_correlated_with_label_by_hellinger(
    data=data, label=label, threshold=0.3
)


acc = DT.OVA(
    data,
    label,
    n_iteration=n_iteration,
)

print(acc)

  - np.sqrt(n_value_negative_label / n_negative_label)


{'Precision': 0.9641, 'Recall': 0.9641, 'F-measure': 0.9641, 'G-mean': 0.9641}


### Step 2:

#### Repeat  both  of  the  experiments  in  previous  parts  with  the  pruned  HDDT  trees. 

In [3]:
data, label = preprocess.read_dataset(DS_path)
data = preprocess.remove_correlated_with_label_by_hellinger(
    data=data, label=label, threshold=0.3
)

acc_metrics_precision = np.empty((4, 3))
acc_metrics_recall = np.empty((4, 3))
acc_metrics_fmeasure = np.empty((4, 3))
acc_metrics_gmean = np.empty((4, 3))

for i in range(len(max_depth)):
    for j in range(len(cut_off)):
        acc_metrics = DT.OVA(
            data=data,
            label=label,
            n_iteration=n_iteration,
            max_depth=max_depth[i],
            cut_off=cut_off[j],
        )
        acc_metrics_precision[i, j] = acc_metrics["Precision"]
        acc_metrics_recall[i, j] = acc_metrics["Recall"]
        acc_metrics_fmeasure[i, j] = acc_metrics["F-measure"]
        acc_metrics_gmean[i, j] = acc_metrics["G-mean"]

# Print table of result using tabulate
print("Precision")
print(
    tabulate(
        acc_metrics_precision,
        headers=cut_off,
        showindex=max_depth,
        tablefmt="fancy_grid",
    )
)

print("\nRecall")
print(
    tabulate(
        acc_metrics_recall,
        headers=cut_off,
        showindex=max_depth,
        tablefmt="fancy_grid",
    )
)

print("\nF-measure")
print(
    tabulate(
        acc_metrics_fmeasure,
        headers=cut_off,
        showindex=max_depth,
        tablefmt="fancy_grid",
    )
)

print("\nG-mean")
print(
    tabulate(
        acc_metrics_gmean,
        headers=cut_off,
        showindex=max_depth,
        tablefmt="fancy_grid",
    )
)


  - np.sqrt(n_value_negative_label / n_negative_label)


Precision
╒════╤═════════╤═════════╤═════════╕
│    │      10 │      50 │     500 │
╞════╪═════════╪═════════╪═════════╡
│  2 │ 0.96851 │ 0.96952 │ 0.96897 │
├────┼─────────┼─────────┼─────────┤
│  3 │ 0.9674  │ 0.96663 │ 0.96621 │
├────┼─────────┼─────────┼─────────┤
│  4 │ 0.96498 │ 0.96535 │ 0.9653  │
├────┼─────────┼─────────┼─────────┤
│  5 │ 0.9654  │ 0.96451 │ 0.96508 │
╘════╧═════════╧═════════╧═════════╛

Recall
╒════╤═════════╤═════════╤═════════╕
│    │      10 │      50 │     500 │
╞════╪═════════╪═════════╪═════════╡
│  2 │ 0.96851 │ 0.96952 │ 0.96897 │
├────┼─────────┼─────────┼─────────┤
│  3 │ 0.9674  │ 0.96663 │ 0.96621 │
├────┼─────────┼─────────┼─────────┤
│  4 │ 0.96498 │ 0.96535 │ 0.9653  │
├────┼─────────┼─────────┼─────────┤
│  5 │ 0.9654  │ 0.96451 │ 0.96508 │
╘════╧═════════╧═════════╧═════════╛

F-measure
╒════╤═════════╤═════════╤═════════╕
│    │      10 │      50 │     500 │
╞════╪═════════╪═════════╪═════════╡
│  2 │ 0.96851 │ 0.96952 │ 0.96897 │
├────┼───

Run the HDDT algorithm for each max_depth and cut_off size and print accuracy table for each of them

In [1]:
label_minority_majority = preprocess.minority_0_majority_1(label)

for depth in max_depth:
    for cut in cut_off:
        print(f"max_depth: {depth}, cut_off: {cut}\n")
        DT.run_HDDT(
            data,
            label_minority_majority,
            n_iteration=n_iteration,
            max_depth=depth,
            cut_off=cut,
        )
        print(f"{'='*50}")

print(f"max_depth: None, cut_off: None\n")
DT.run_HDDT(
    data,
    label_minority_majority,
    n_iteration=n_iteration,
)


NameError: name 'preprocess' is not defined

In [7]:
classes = [0, 1, 2]


# Run OVO
for i in range(len(classes)):
    for j in range(i + 1, len(classes)):
        print(f"OVO {classes[i]} vs {classes[j]}:")
        OVO_data, OVO_label = preprocess.OVO(
            data=data, label=label, l1=classes[i], l2=classes[j]
        )
        run_HDDT(
            data=OVO_data,
            label=OVO_label,
            n_iteration=n_iteration,
        )
        print(f"{'='*50}")


# Run OVA
for i in range(len(classes)):
    print(f"OVA {classes[i]}:")
    OVA_label = preprocess.OVA(
        data=data, label=label, l1=classes[i]
    )
    run_HDDT(
        data=data,
        label=OVA_label,
        n_iteration=n_iteration,
    )
    print(f"{'='*50}")


OVO 0 vs 1:
avg Precision: 0.9165
avg Recall: 0.9165
avg F-measure: 0.9165
avg AUC: 0.9002
avg G-mean: 0.9165
OVO 0 vs 2:
avg Precision: 1.0
avg Recall: 1.0
avg F-measure: 1.0
avg AUC: 1.0
avg G-mean: 1.0
OVO 1 vs 2:
avg Precision: 1.0
avg Recall: 1.0
avg F-measure: 1.0
avg AUC: 1.0
avg G-mean: 1.0
OVA 0:


TypeError: OVA() got an unexpected keyword argument 'data'