# References
https://pypi.org/project/sklearn-hierarchical-classification/
https://github.com/globality-corp/sklearn-hierarchical-classification/blob/develop/examples/classify_digits.py


# Connect to Shared Drive

In [None]:
# Mount Google Drive into the Colab VM and switch into the shared data folder.
from google.colab import drive
drive.mount('/content/drive/') 

# Note that we don't use '!' here, so we can use IPython's builtin cd utility. Else we'd be running cd in a subshell, which is pointless.
%cd "/content/drive/Shareddrives/Thesis/Data"

# Check if we can see our data
!ls

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/Shareddrives/Thesis/Data
a.dot
All_Amazon_100k_root_svm_title_desc.pkl
All_Amazon_Meta_chunks
All_Amazon_r375k_hierarchical_title_desc.pkl
All_Amazon_r750k_desc_cleaned.feather
All_Amazon_r750k_empty_dropped_cat_cleaned.feather
All_Amazon_r750k_empty_dropped_nonstrip.feather
All_Amazon_r750k_trimmed_cat.feather
ama_cat.jpeg
ama_cat.png
Amazon_Arts_Crafts_and_Sewing
Amazon_Books
Amazon_Clothing_Shoes_and_Jewelry
Amazon_Electronics
Amazon_Grocery_and_Gourmet_Food
Amazon_Industrial_and_Scientific
Amazon_Marketing
Amazon_Musical_Instruments
Amazon_Pet_Supplies
a.png
a.txt
Electronics_tfidf_sgd_leaf.pkl
graph.png
graph.txt
MetaDatas_Unzipped
OutputFile.png
Pet_Supplies.hdf5
Smol_Amazon
test-output
Walmart_Marketing


# Import common libraries

In [None]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from random import shuffle

!pip install swifter
!pip install sklearn-hierarchical-classification==1.3.2

import swifter
from sklearn_hierarchical_classification.classifier import HierarchicalClassifier
from sklearn_hierarchical_classification.constants import ROOT
from sklearn_hierarchical_classification.metrics import h_fbeta_score, multi_labeled



# Notebook settings



In [None]:
### Dataset configuration
# The parquet folder. It should be located inside datasets/.
DATASET_NAME   = 'Electronics.parquet'
# How many leading levels of each category path are used: it is passed as the
# depth limit when building the tree/hierarchy and selects the target label
# (index DEPTH - 1, or the last level for shorter paths).
DEPTH = 2 
# The input text column(s) selected from the dataframe as features (X).
FEATURE_COLS = ['title']
### Checkpoint configuration
# Whether to train from scratch or to load a checkpoint
TRAIN_FROM_SCRATCH=True
# Checkpoint iteration to load if not training from scratch
# NOTE(review): not read by any cell visible in this notebook - confirm it is still needed.
LOAD_ITERATION=0
# Last or best results from that iteration?
# NOTE(review): not read by any cell visible in this notebook - confirm it is still needed.
LOAD_BEST=True

### System configuration
# Will try to use your NVIDIA GPU if one is available. Set to False to force CPU computation
# NOTE(review): not read by any cell visible in this notebook.
PREFER_GPU         = True
# If you don't have the huggingface transformers library installed, flip this to True.
# You only need to do this once. Once DistilBERT has been downloaded, it will be cached in your system's default user cache folder.
# Once it is cached, please set this to False to avoid redownloads.
# NOTE(review): not read by any cell visible in this notebook.
INSTALL_DISTILBERT = False

# Import data
Here we'll finally be using the randomly-sampled 750k-row subset.

In [None]:
# Load the dataset into a pandas DataFrame. The original call used `dd`, which is
# never imported in this notebook; the following cells rely on pandas-only
# behaviour (`.iloc[0]`, `.sample(random_state=...)`, `.drop(index)`), so pandas
# is the reader that matches the rest of the notebook.
data = pd.read_parquet('../../datasets/{}'.format(DATASET_NAME))
# Show the first row without truncating long text columns.
with pd.option_context('display.max_colwidth', None):
    print(data.iloc[0])

category                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           [Camera & Photo, Video Surveillance, Surveillance Systems, Surveillance DVR Kits]
title                                                                                                                                                                                              

In [None]:
# Keep only the text columns (title, description) and the category path for now
data = data[['title', 'description', 'category']]

# Build full category tree
Normally this would be passed in, but since we don't have one, we'll have to generate it from the data.

We'll generate both an AnyTree (for visualisation) and a hierarchy for `sklearn-hierarchical-classification`.

## Import anytree stuff

In [None]:
!apt install libgraphviz-dev
!pip install graphviz
!python -m pip install anytree

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libgraphviz-dev is already the newest version (2.40.1-2).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.


In [None]:
from anytree import Node, RenderTree
from anytree.dotexport import RenderTreeGraph
from anytree.exporter import DotExporter, UniqueDotExporter
from IPython.display import Image
from functools import reduce
import cv2
from google.colab.patches import cv2_imshow
#Guide: https://github.com/c0fec0de/anytree

def make_forest(classes,depth=None):
  """Build an anytree tree of all category paths, for visualisation.

  Args:
    classes: iterable of category paths; each path is a sliceable sequence of
      category names ordered from the top level downwards.
    depth: maximum number of leading levels to take from each path. ``None``
      keeps every level.

  Returns:
    The root ``Node`` (named "Root"); every distinct path prefix found in
    ``classes`` is a chain of descendants below it.
  """
  ama_cat = Node("Root") #Root
  for cat_list in classes:
    root = ama_cat #Restart from the top for every path
    for cat in cat_list[0:depth]:
      # A level that repeats the name of the node we are standing on is
      # collapsed into that node instead of becoming its own child.
      if cat == root.name:
        continue
      # Children are matched by exact name. A substring test would merge
      # distinct siblings such as 'Video' and 'Video Surveillance'.
      existing = next((child for child in root.children if child.name == cat), None)
      root = existing if existing is not None else Node(cat, parent = root)
  return ama_cat

# sklearn-hierarchical-classification hierarchy format
# Since list searches are slow, we'll build a dictionary of sets as an intermediate format, 
# then convert over.
def make_hierarchy(classes, depth=None, verbose=False):
  """Build the parent -> children mapping used as a class hierarchy.

  Args:
    classes: iterable of category paths; each path is a sliceable sequence of
      category names ordered from the top level downwards. Empty paths are
      skipped.
    depth: maximum number of leading levels to take from each path. ``None``
      (the default) keeps every level.
    verbose: when true, print every non-empty path as it is processed.

  Returns:
    A dict mapping each parent name to a list of its child names. The ``ROOT``
    key holds the top-level categories. Only nodes that have at least one
    child (plus ``ROOT``) appear as keys. Child order within a list is not
    defined, because children are collected in sets.
  """
  children = {ROOT: set()}
  for path in classes:
    # len() rather than truthiness: paths may be numpy arrays, whose truth
    # value is ambiguous.
    if len(path) == 0:
      continue
    if verbose: print(path)
    limit = len(path) if depth is None else min(depth, len(path))
    children[ROOT].add(path[0])
    # Record every parent -> child edge along the truncated path, not only
    # the last one, so intermediate levels stay connected for depth > 2.
    # A single-level path yields no edges (and therefore no self-loop).
    for parent, child in zip(path[:limit - 1], path[1:limit]):
      children.setdefault(parent, set()).add(child)

  return {parent: list(kids) for parent, kids in children.items()}

In [None]:
# One category path per product; display the first path as a sanity check.
classes = data['category']
classes.iloc[0]

array(['Camera & Photo', 'Video Surveillance', 'Surveillance Systems',
       'Surveillance DVR Kits'], dtype=object)

In [None]:
# Render the category tree, truncated to DEPTH levels, as text.
print(RenderTree(make_forest(classes, DEPTH))) #demo forest

Node('/Root')
├── Node('/Root/Camera & Photo')
│   ├── Node('/Root/Camera & Photo/Video Surveillance')
│   ├── Node('/Root/Camera & Photo/Accessories')
│   ├── Node('/Root/Camera & Photo/Binoculars & Scopes')
│   ├── Node('/Root/Camera & Photo/Bags & Cases')
│   ├── Node('/Root/Camera & Photo/Lighting & Studio')
│   ├── Node('/Root/Camera & Photo/Film Photography')
│   ├── Node('/Root/Camera & Photo/Digital Cameras')
│   ├── Node('/Root/Camera & Photo/Tripods & Monopods')
│   ├── Node('/Root/Camera & Photo/Lenses')
│   ├── Node('/Root/Camera & Photo/Flashes')
│   ├── Node('/Root/Camera & Photo/Projectors')
│   ├── Node('/Root/Camera & Photo/Underwater Photography')
│   ├── Node('/Root/Camera & Photo/Printers & Scanners')
│   ├── Node('/Root/Camera & Photo/Camera Cases')
│   ├── Node('/Root/Camera & Photo/Batteries & Chargers')
│   └── Node('/Root/Camera & Photo/Surveillance Cameras')
├── Node('/Root/eBook Readers & Accessories')
│   ├── Node('/Root/eBook Readers & Accessories/eBook Rea

In [None]:
# Parent -> children mapping (truncated to DEPTH levels) that is later handed
# to HierarchicalClassifier as its class_hierarchy; displayed for inspection.
classifier_hierarchy = make_hierarchy(classes, DEPTH)
classifier_hierarchy

{'<ROOT>': ['Accessories & Supplies',
  'Computers & Accessories',
  'Headphones',
  'eBook Readers & Accessories',
  'Television & Video',
  'Home Audio',
  'Portable Audio & Video',
  'Wearable Technology',
  'Accessories',
  'Car & Vehicle Electronics',
  'Camera & Photo',
  'GPS, Finders & Accessories',
  'Security & Surveillance',
  'Service Plans'],
 'Accessories': ['Surveillance Camera Lenses'],
 'Accessories & Supplies': ['Audio & Video Accessories',
  'Mounts',
  'Cord Management',
  'Telephone Accessories',
  'Blank Media',
  'Cables',
  'DVD-RW Discs',
  'Batteries, Chargers & Accessories',
  'Office Electronics Accessories',
  'Power Strips & Surge Protectors'],
 'Camera & Photo': ['Lenses',
  'Lighting & Studio',
  'Printers & Scanners',
  'Video',
  'Tripods & Monopods',
  'Film Photography',
  'Surveillance Cameras',
  'Video Surveillance',
  'Digital Cameras',
  'Accessories',
  'Flashes',
  'Binoculars & Scopes',
  'Underwater Photography',
  'Camera Cases',
  'Batteri

# Train-test data preparation
## Splitting

In [None]:
# Split configuration. Whatever is left after the train and validation
# fractions becomes the test set.
FULL_SET = True
TRAIN_SET_RATIO = 0.8
VAL_SET_RATIO = 0.1
# Keep the seed fixed so that runs sample the same rows and stay comparable.
RANDOM_SEED = 123

# When FULL_SET is off, work on a 25% random sample of the data instead.
small_data = None if FULL_SET else data.sample(frac = 0.25, random_state=RANDOM_SEED)
_pool = data if FULL_SET else small_data

# Draw the training rows, then everything else is shared by validation/test.
train_set = _pool.sample(frac = TRAIN_SET_RATIO, random_state=RANDOM_SEED)
val_test_set = _pool.drop(train_set.index)

# The validation fraction is expressed relative to the non-training remainder.
val_set = val_test_set.sample(frac = VAL_SET_RATIO / (1-TRAIN_SET_RATIO), random_state=RANDOM_SEED)
test_set = val_test_set.drop(val_set.index)

# Give each split a fresh 0..n-1 index.
train_set, val_set, test_set = (
    part.reset_index(drop=True) for part in (train_set, val_set, test_set)
)

for part in (train_set, val_set, test_set):
    print(part.shape)

(481622, 3)
(60203, 3)
(60202, 3)


## Separating X and y

In [None]:

def _label_at_depth(row):
  """Return the category at index DEPTH - 1, or the last one for shorter paths."""
  return row[min(DEPTH - 1, len(row) - 1)]

# Features are the columns listed in FEATURE_COLS; the target is one category
# name per row, taken from the category path.
X_train, X_test = train_set[FEATURE_COLS], test_set[FEATURE_COLS]
y_train = train_set['category'].swifter.apply(_label_at_depth)
y_test = test_set['category'].swifter.apply(_label_at_depth)

# Check dimensionalities
print(X_train.values.shape)
print(y_train.values.shape)

Pandas Apply:   0%|          | 0/481622 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/60202 [00:00<?, ?it/s]

(481622, 1)
(481622,)


# Build pipeline
## Helper classes and libraries

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from tempfile import mkdtemp
cachedir = mkdtemp()

class ColumnSelector(BaseEstimator, TransformerMixin):
  """Pipeline step that picks one column out of the incoming dataframe.

  Args:
    key: the column label used to index the input in ``transform``.
    verbose: when true, ``transform`` prints which column is being selected.
  """

  def __init__(self, key, verbose=False):
    self.verbose = verbose
    self.key = key

  def transform(self, df):
    """Return ``df[self.key]``."""
    column = self.key
    if self.verbose:
      print('Selecting column',column)
    return df[column]

  def fit(self, x, y=None):
    """Nothing to learn; returns the transformer itself."""
    return self

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

# These can't be put inside the class since they don't have _unload(), which prevents
# joblib from correctly parallelising the class if included.
# ColumnStemmer.stem_and_concat reads both names as module-level globals.
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))

class ColumnStemmer(BaseEstimator, TransformerMixin):
  """Pipeline step that tokenises and stems every text in a column.

  Relies on the module-level ``stemmer`` and ``stop_words`` objects.

  Args:
    verbose: when true, ``transform`` prints the name of the column it stems.
  """

  def __init__(self, verbose=False):
    self.verbose = verbose

  def fit(self, x, y=None):
    """Nothing to learn; returns the transformer itself."""
    return self

  def stem_and_concat(self, text):
    """Tokenise ``text``, stem the tokens, and join them with single spaces.

    Tokens found in ``stop_words`` are kept unchanged (not removed); every
    other token is replaced by its stem.
    """
    pieces = [
      token if token in stop_words else stemmer.stem(token)
      for token in word_tokenize(text)
    ]
    return ' '.join(pieces)

  def transform(self, series):
    """Apply ``stem_and_concat`` to every element of ``series`` via swifter."""
    if self.verbose:
      print('Stemming column', series.name)
    return series.swifter.apply(self.stem_and_concat)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Base classifier
For use at every node. Currently a linear classifier trained with SGD using the modified Huber loss and balanced class weights.

In [None]:
from sklearn import svm, linear_model
import joblib
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
import time

# Estimator trained at each node of the hierarchy: a linear model fitted
# with SGD.
node_estimator = linear_model.SGDClassifier(
    class_weight='balanced', # To fix our gross category example count imbalance
    loss='modified_huber', # Good perf in papers
)
bclf = make_pipeline(node_estimator)

## Hierarchical pipeline
Takes the base classifier and constructs a tree of classifiers.

In [None]:
# Wrap the base estimator in a hierarchical classifier that follows the
# parent -> children mapping built earlier.
clf = HierarchicalClassifier(class_hierarchy=classifier_hierarchy, base_estimator=bclf)

## Main pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from tempfile import mkdtemp
cachedir = mkdtemp()

def get_main_pipeline(clf, *, text_col='title', min_df=50, verbose=True):
  """Assemble the full text-classification pipeline around ``clf``.

  Steps, in order: select one text column, stem it, turn it into TF-IDF
  features, then hand those features to ``clf``.

  Args:
    clf: the final estimator placed at the end of the pipeline.
    text_col: name of the dataframe column to use as input text.
    min_df: ``min_df`` value passed to ``TfidfVectorizer``.
    verbose: whether the selector and stemmer steps print progress messages.

  Returns:
    An unfitted ``Pipeline`` with steps named 'selector', 'stemmer', 'tfidf'
    and 'clf'.
  """
  return Pipeline([
    ('selector', ColumnSelector(key=text_col, verbose=verbose)),
    ('stemmer', ColumnStemmer(verbose=verbose)),
    ('tfidf', TfidfVectorizer(min_df=min_df)),

    # Final estimator, fitted on the TF-IDF features of the selected column
    ('clf', clf),
  ])
    

## Training time

In [None]:
import os

# Checkpoints for this dataset live in their own folder next to the notebook's
# working directory. exist_ok avoids an error when the cell is re-run.
folder_name = 'checkpoints-' + DATASET_NAME
os.makedirs(folder_name, exist_ok=True)
PICKLE_PATH = './{}/current.pkl'.format(folder_name)


if TRAIN_FROM_SCRATCH:
  clf_sgd = get_main_pipeline(clf)

  start = time.time()
  clf_sgd.fit(X_train, y_train)
  end = time.time()

  # time.time() measures seconds, so the elapsed value is reported in seconds.
  print('Hierarchy took', end - start, 's')
  joblib.dump(clf_sgd, PICKLE_PATH)
else:
  # NOTE: joblib.load unpickles arbitrary objects; only load checkpoint files
  # that this notebook (or another trusted source) produced.
  clf_sgd = joblib.load(PICKLE_PATH)

Selecting column title
Stemming column title


Pandas Apply:   0%|          | 0/481622 [00:00<?, ?it/s]

Hierarchy took 514.923938035965 ms


# Test run

In [None]:
# Predict on the first 30 test rows and show them next to the true labels.
preview = test_set.head(30)
prediction = clf_sgd.predict(preview)
result = pd.DataFrame(data={
    'name': preview['title'],
    'desc': preview.description,
    'actual': y_test.head(30),
    'predicted': prediction,
})
result

Selecting column title
Stemming column title


Pandas Apply:   0%|          | 0/30 [00:00<?, ?it/s]

Unnamed: 0,name,desc,actual,predicted
0,One Hot Summer,A zesty tale. (Publishers Weekly)<br /><br />G...,eBook Readers,Security Sensors
1,Kelby Training DVD: Adobe Photoshop CS5 Crash ...,DVD,Computer Accessories & Peripherals,DVD Players & Recorders
2,Natural Order,"Good, sharp, vivid writing.... When he hits th...",eBook Readers,Laptop Accessories
3,The Runes of the Earth : The Last Chronicles o...,"In 1977, Stephen Donaldson changed the face of...",eBook Readers,eBook Readers
4,Nook Hd + 9-Inch Groovy Protective Stand Cover...,"Nook HD protective stand cover slim, smart and...",Tablet Accessories,Covers
5,Nook Vara Stylus 2-in-1,IT'S BEST PRODUCTS THAT A LOT PEOPLE LOVE IT.,Bundles,Covers
6,NOOK HD+ Protective Cover Fits 9 inch NOOK HD+...,"An open box, unused.",Covers,Covers
7,Barnes & Noble Jonathan Adler Book Reader Cove...,Barnes & Noble Jonathan Adler Book Reader Cove...,Covers,Covers
8,Shmirshky: think inside the box,"<span>""It's sort of like having your funniest ...",eBook Readers,Bags & Cases
9,Passing the Nevada Math Proficiency Exam,This book was written to help students pass th...,Telephone Accessories,eBook Readers


# Evaluation

In [None]:
# from sklearn import metrics

# test_output = clf_sgd.predict(X_test)

# print("Accuracy:", metrics.accuracy_score(y_test, test_output))
# # Model Precision: what percentage of positive tuples are labeled as such?
# print("Precision:", metrics.precision_score(y_test, test_output, average='weighted'))

In [None]:
from sklearn import metrics, preprocessing

# One-hot encode the true labels against the classifier's class list so they
# can be compared with the per-class scores.
y_avg = preprocessing.label_binarize(y_test, classes=clf_sgd.classes_)
# Each of the two calls below runs the whole pipeline on X_test.
score = clf_sgd.predict_proba(X_test)
test_output = clf_sgd.predict(X_test)

accuracy = metrics.accuracy_score(y_test, test_output)
print("Accuracy:", accuracy)

# Model Precision: what percentage of positive tuples are labeled as such?
weighted_precision = metrics.precision_score(y_test, test_output, average='weighted')
print("Precision:", weighted_precision)

micro_average_precision = metrics.average_precision_score(y_avg, score, average="micro")
print("Average Precision Score: ", micro_average_precision)

Selecting column title
Stemming column title


Pandas Apply:   0%|          | 0/60202 [00:00<?, ?it/s]

Selecting column title
Stemming column title


Pandas Apply:   0%|          | 0/60202 [00:00<?, ?it/s]

Accuracy: 0.7380485698149564


  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.7067970687643473
Average Precision Score:  0.41491435142837707
