In [784]:
import pandas as pd
import numpy as np
np.set_printoptions(threshold=np.inf)

In [785]:
data = pd.read_csv('advertisement.csv')
print(data['labels'].to_string())

0                            electronics clothing sports
1                                       furniture beauty
2                       clothing electronics food sports
3                                                   food
4                                                   home
5                               sports electronics books
6                              beauty furniture clothing
7                                           books beauty
8                           electronics food home sports
9                                         furniture food
10                                         clothing home
11                                            food books
12                                                  home
13                            sports furniture food home
14                                  beauty clothing food
15                                 books clothing sports
16                      electronics furniture food books
17                     furnitur

In [786]:
data.dtypes

age                   int64
gender               object
income              float64
education            object
married                bool
children              int64
city                 object
occupation           object
purchase_amount     float64
most bought item     object
labels               object
dtype: object

In [787]:
for column in data.columns:
    unique_values = data[column].nunique()
    print(column, unique_values)

age 47
gender 2
income 1000
education 4
married 2
children 4
city 969
occupation 11
purchase_amount 1000
most bought item 24
labels 397


In [788]:
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split
from skmultilearn.problem_transform import LabelPowerset
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, multilabel_confusion_matrix, hamming_loss
from prettytable import PrettyTable


class DecisionTree:
    def __init__(self, criterion="entropy", max_depth=10, max_features=7):
        self.criterion = criterion
        self.max_depth = max_depth
        self.max_features = max_features

    def set_hyperparameter(self, criterion, max_depth, max_features):
        self.criterion = criterion
        self.max_depth = max_depth
        self.max_features = max_features

    def PowerSet_classifier(self, X_train, X_test, y_train, y_test):
        model = LabelPowerset(classifier=DecisionTreeClassifier(
            criterion=self.criterion, max_depth=self.max_depth, max_features=self.max_features))
        model.fit(X_train.values, y_train)
        y_pred = model.predict(X_test.values)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(
            y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(
            y_test, y_pred, average="weighted", zero_division=0)
        f1_micro = f1_score(y_test, y_pred, average="micro", zero_division=0)
        f1_macro = f1_score(y_test, y_pred, average="macro", zero_division=0)
        cm = multilabel_confusion_matrix(y_test, y_pred)
        cm = np.sum(cm, axis=0)
        return accuracy, precision, recall, f1_micro, f1_macro, cm

    def MultiOutput_classifier(self, X_train, X_test, y_train, y_test):
        model = MultiOutputClassifier(DecisionTreeClassifier(
            criterion=self.criterion, max_depth=self.max_depth, max_features=self.max_features))
        model.fit(X_train.values, y_train)
        y_pred = model.predict(X_test.values)
        accuracy = 1-hamming_loss(y_test, y_pred)
        precision = precision_score(
            y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(
            y_test, y_pred, average="weighted", zero_division=0)
        f1_micro = f1_score(y_test, y_pred, average="micro", zero_division=0)
        f1_macro = f1_score(y_test, y_pred, average="macro", zero_division=0)
        cm = multilabel_confusion_matrix(y_test, y_pred)
        cm = np.sum(cm, axis=0)
        return accuracy, precision, recall, f1_micro, f1_macro, cm

In [789]:
x = data.iloc[:, :10]
y = data["labels"]

string_labels = ["gender", "married",
                 "education", "occupation", "most bought item"]

x = pd.get_dummies(x, columns=string_labels, dtype=int)
label_encoder = LabelEncoder()
x["city"] = label_encoder.fit_transform(x["city"])

y = y.str.split()
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2)

dt = DecisionTree()

In [790]:
criterion = ["gini", "entropy"]
max_depth = [3, 5, 10, 20, 30]
max_features = [3, 5, 7, 9, 11]

table_powerset = PrettyTable(["(Criterion,Max_Depth,Max_Features)", "Accuracy",
                             "Precision", "Recall", "F1_micro", "F1_macro", "Confusion_Matrix"])
table_multioutput = PrettyTable(["(Criterion,Max_Depth,Max_Features)", "Accuracy",
                                "Precision", "Recall", "F1_micro", "F1_macro", "Confusion_Matrix"])

rank_powerset = {}
rank_multioutput = {}

for crit in criterion:
    for depth in max_depth:
        for features in max_features:

            dt.set_hyperparameter(crit, depth, features)

            measures_powerset = dt.PowerSet_classifier(
                X_train, X_test, y_train, y_test)
            accuracy_powerset = measures_powerset[0]
            precision_powerset = measures_powerset[1]
            recall_powerset = measures_powerset[2]
            f1_micro_powerset = measures_powerset[3]
            f1_macro_powerset = measures_powerset[4]
            cm_powerset = measures_powerset[5]

            rank_powerset[(crit, depth, features)] = f1_macro_multioutput
            table_powerset.add_row([(crit, depth, features), accuracy_powerset, precision_powerset,
                                   recall_powerset, f1_micro_powerset, f1_macro_powerset, cm_powerset])

            measures_multioutput = dt.MultiOutput_classifier(
                X_train, X_test, y_train, y_test)
            accuracy_multioutput = measures_multioutput[0]
            precision_multioutput = measures_multioutput[1]
            recall_multioutput = measures_multioutput[2]
            f1_micro_multioutput = measures_multioutput[3]
            f1_macro_multioutput = measures_multioutput[4]
            cm_multioutput = measures_multioutput[5]

            rank_multioutput[(crit, depth, features)] = f1_macro_multioutput
            table_multioutput.add_row([(crit, depth, features), accuracy_multioutput, precision_multioutput,
                                      recall_multioutput, f1_micro_multioutput, f1_macro_multioutput, cm_multioutput])

print(table_powerset)
print(table_multioutput)

+------------------------------------+----------+---------------------+---------------------+---------------------+---------------------+------------------+
| (Criterion,Max_Depth,Max_Features) | Accuracy |      Precision      |        Recall       |       F1_micro      |       F1_macro      | Confusion_Matrix |
+------------------------------------+----------+---------------------+---------------------+---------------------+---------------------+------------------+
|           ('gini', 3, 3)           |   0.02   | 0.44963768115942027 | 0.16756756756756758 | 0.24441524310118265 | 0.13983054210262524 |    [[932 113]    |
|                                    |          |                     |                     |                     |                     |    [462  93]]    |
|           ('gini', 3, 5)           |   0.03   |  0.2681881181881182 | 0.17477477477477477 |  0.2490372272143774 |  0.1532594394523202 |    [[918 127]    |
|                                    |          |         

In [791]:
rank_powerset = dict(sorted(rank_powerset.items(),
                     key=lambda x: x[1], reverse=True))
rank_multioutput = dict(sorted(rank_multioutput.items(),
                        key=lambda x: x[1], reverse=True))

# print(rank_powerset)
print("Powerset:")
for i in range(3):
    print((list(rank_powerset.keys()))[i])
print()
print("MultiOutput:")
for i in range(3):
    print((list(rank_multioutput.keys()))[i])

best_powerset = (list(rank_powerset.keys()))[0]
best_multioutput = (list(rank_multioutput.keys()))[0]

Powerset:
('entropy', 20, 11)
('gini', 30, 3)
('gini', 20, 9)

MultiOutput:
('entropy', 20, 9)
('gini', 20, 11)
('gini', 20, 7)


In [792]:
from sklearn.model_selection import KFold

k = 8
kf = KFold(n_splits=k)
model_powerset = LabelPowerset(classifier=DecisionTreeClassifier(
    criterion=best_multioutput[0], max_depth=best_multioutput[1], max_features=best_multioutput[2]))
accuracy_score_powerset = []
for train_index, test_index in kf.split(X_train):
    X_train_kfold, X_test_kfold = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_train_kfold, y_test_kfold = y_train[train_index], y_train[test_index]
    model_powerset.fit(X_train_kfold.values, y_train_kfold)
    pred_values = model_powerset.predict(X_test_kfold.values)
    acc = accuracy_score(pred_values, y_test_kfold)
    accuracy_score_powerset.append(acc)
avg_accuracy_powerset = sum(accuracy_score_powerset)/k
print("Powerset:")
print("Accuarcy of each fold :", accuracy_score_powerset)
print("Average accuracy :", avg_accuracy_powerset)
print()
k = 8
kf = KFold(n_splits=k)
model_multioutput = MultiOutputClassifier(DecisionTreeClassifier(
    criterion=best_multioutput[0], max_depth=best_multioutput[1], max_features=best_multioutput[2]))
accuracy_score_multioutput = []
for train_index, test_index in kf.split(x):
    X_train, X_test = x.iloc[train_index, :], x.iloc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    model_multioutput.fit(X_train.values, y_train)
    pred_values = model_multioutput.predict(X_test.values)
    acc = accuracy_score(pred_values, y_test)
    accuracy_score_multioutput.append(acc)
avg_accuracy_multioutput = sum(accuracy_score_multioutput)/k
print("MultiOutput:")
print("Accuarcy of each fold :", accuracy_score_multioutput)
print("Average accuracy :", avg_accuracy_multioutput)

Powerset:
Accuarcy of each fold : [0.01, 0.01, 0.06, 0.02, 0.0, 0.01, 0.01, 0.01]
Average accuracy : 0.01625

MultiOutput:
Accuarcy of each fold : [0.032, 0.072, 0.024, 0.024, 0.048, 0.024, 0.032, 0.056]
Average accuracy : 0.039
