# Benchmarking Hierarchical versus Flat Classification

In [1]:
%matplotlib inline
from pylab import rcParams
rcParams['figure.figsize'] = 20, 9
from IPython.display import Image

In [2]:
from dao import DataAccess, LabelGetter
from pickle import load

In [3]:
import pandas as pd

Fixes a pipeline problem I have yet to resolve.

In [284]:
from pipelines.helpers import ExplodingRecordJoiner

XX = ExplodingRecordJoiner(user=[
                'created_at',
                'favourites_count',
                'followers_count',
                'friends_count',
                'statuses_count',
                'verified'
            ]).fit_transform(X)

# Notice that there are some labels of "alcohol:1" without first person labels, we'll fix this later but for not this hack will remove that data

In [287]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    XX, y, test_size=0.33, random_state=42)

In [288]:
# Remove labels that are alcohol but dont contain first person infromation
X_test = X_test[X_test["labels"] != {"alcohol":1}]
y_test = y_test[X_test["labels"] != {"alcohol":1}]

label_map = {
    1: "first_person_looking",
    2: "first_person_reflecting",
    0: "first_person_casual",
    3: "first_person_heavy"
}

def produce_named_labels(label_dict):
    if label_dict["alcohol"] == 0:
        return (0, "non_drinking")
    elif "first_person" not in label_dict:
        return None
    
    if label_dict["first_person"] == 0:
        return (0, "alcohol_related")
    
    return (label_dict["first_person_level"], label_map[label_dict["first_person_level"]])

In [292]:
y = X_test.get("labels").apply(produce_named_labels)

# Constructing Heirarchy

Loading Classifiers

In [321]:
from classification import Taxonomy

In [320]:
clf_alch = load(open(
    "./clfs/LogisticRegression|label:alcohol|accuracy:0.7970909090909091|f1:0.8367466354593329|feature:['text', 'user', 'age']"
    , "rb+"))
clf_frst = load(open(
    "./clfs/LogisticRegression|label:first_person|accuracy:0.6640826873385013|f1:0.721030042918455|feature:['text', 'user', 'age']"
    , "rb+"))
clf_fstl = load(open(
    "./clfs/LogisticRegression|label:first_person_level|accuracy:0.4703196347031963|f1:0.46356955515952386|feature:['text']"
    , "rb+"
    ))

### Construct taxonomy

root = Taxonomy(
    "alcohol",
    {
        0:"non_drinking",
        1:"drinking"
    },
    clf_alch["clf"]
)

first_person = Taxonomy(
    "first_person",
    {
        0:"alcohol_related",
        1:"first_person"
    },
    clf_frst["clf"]
)

first_person_level = Taxonomy(
    "first_person_level",
    {
        0:"first_person_casual",
        1:"first_person_looking",
        2:"first_person_reflecting",
        3:"first_person_heavy"
    },
    clf_fstl["clf"]
)

first_person.add_children({1: first_person_level})
root.add_children({1: first_person})

### Load Flat

In [329]:
clf_flat = load(open(
    "./clfs/LogisticRegression|label:flat|accuracy:0.6749090909090909|f1:0.6193087578156977|feature:['text']"
    ,"rb+"))

In [341]:
y_test_flat = LabelGetter(X_test).get_flatlabels()[1]

In [342]:
y_pred_flat = clf_flat.predict(X_test)

### Prediction and Performance

In [346]:
y_test_tree = y_test.apply(ans2int.get)

In [347]:
y_pred_tree = root.predict(X_test, deep=True).apply(ans2int.get)

0    501
1    410
dtype: int64
1    257
0    153
dtype: int64
1    105
2     77
0     57
3     18
dtype: int64


In [None]:
ans2int = {
(0, "non_drinking"):0,
(0, "alcohol_related"):1,
(1, "first_person_looking"):3,
(2, "first_person_reflecting"):4,
(0, "first_person_casual"):2,
(3, "first_person_heavy"):5
}

# Reports

In [358]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score

In [355]:
# flat
print(classification_report(y_pred_flat, y_test_flat))
print(confusion_matrix(y_pred_flat, y_test_flat))

             precision    recall  f1-score   support

          0       0.82      0.86      0.84       485
          1       0.84      0.37      0.51       418
          2       0.03      1.00      0.06         2
          3       0.06      0.83      0.12         6
          4       0.00      0.00      0.00         0
          5       0.00      0.00      0.00         0

avg / total       0.82      0.63      0.68       911

[[416  30  19  16   1   3]
 [ 93 153  43  58  53  18]
 [  0   0   2   0   0   0]
 [  1   0   0   5   0   0]
 [  0   0   0   0   0   0]
 [  0   0   0   0   0   0]]


In [356]:
# Tree
print(classification_report(y_pred_tree, y_test_tree))
print(confusion_matrix(y_pred_tree, y_test_tree))

             precision    recall  f1-score   support

          0       0.58      0.59      0.58       501
          1       0.20      0.24      0.22       153
          2       0.03      0.04      0.03        57
          3       0.11      0.09      0.10       105
          4       0.15      0.10      0.12        77
          5       0.00      0.00      0.00        18

avg / total       0.38      0.38      0.38       911

[[294  92  30  47  28  10]
 [ 80  37  17  12   5   2]
 [ 25  16   2   4   5   5]
 [ 62  19   6   9   8   1]
 [ 37  18   7   4   8   3]
 [ 12   1   2   3   0   0]]


In [357]:
# Tree
print("flat", accuracy_score(y_pred_flat, y_test_flat))
print("tree", accuracy_score(y_pred_tree, y_test_tree))

flat 0.632272228321
tree 0.384193194292


In [362]:
# Tree
print("flat", f1_score(y_pred_flat, y_test_flat, average="weighted"))
print("tree", f1_score(y_pred_tree, y_test_tree, average="weighted"))

flat 0.679692781942
tree 0.38050475817
