In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [4]:
# Run the extract_data script to transform the data into a dataframe
# %run -i 'extract_data.py'

In [5]:
# Load the published product listings into a DataFrame and preview the first 10 rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the CSV was
# saved together with its index — confirm whether downstream code relies on it.
csv_path = 'data/published_data.csv'
data = pd.read_csv(csv_path)
data.head(10)

Unnamed: 0,Title,Brand Name,Material,Color,Category
0,#7 luka doncic euroleague basketball jersey wh...,,polyester,,
1,#yo tambien skateboarding t-shirts print #meto...,,cotton,,
2,(ootdty)10pcs smell plastic worm soft bait art...,ootdty,,,
3,-10 degree winter outdoor camping tent cotton ...,,,,cotton sleeping bag
4,0.4mm-1.2mm yoga rubber resistance bands 50cmx...,,rubber,,
5,0.6# 0.8#jof 150m 8 strands 100% pe braided fi...,jof,,,
6,0.8#-8# 500m nylon fishing line super strong ...,,nylon,,
7,0.8m outdoor activity pvc inflatable bumper bu...,,pvc,,
8,1 cookware set outdoor stove oven cooking hiki...,,,,gas stove
9,1 deck pvc poker waterproof plastic playing ca...,,plastic,,


In [4]:
# Use tagger to tag the data with BIO labels.
# bio_tag returns two parallel sequences: the tokenised titles (`sentences`) and
# their per-token BIO labels (`tags`); an example pair is printed further down.
from utils.sequence_tagger import Tagger
bio_tagger = Tagger()
sentences, tags = bio_tagger.bio_tag(data)

In [5]:
# Sanity check on data: there must be one tag sequence per sentence, and each
# sentence must carry exactly one tag per token.
# The list lengths are compared first because zip() stops at the shorter input
# and would otherwise hide trailing, unpaired entries.
assert len(sentences) == len(tags), (
    f"sentence/tag count mismatch: {len(sentences)} sentences vs "
    f"{len(tags)} tag sequences"
)
mismatched = [
    i for i, (s, t) in enumerate(zip(sentences, tags)) if len(s) != len(t)
]
# Fail loudly instead of printing, so a Run All cannot go on to train on
# misaligned token/tag pairs.
assert not mismatched, (
    f"{len(mismatched)} sentences have a different number of tokens and tags; "
    f"first offending indices: {mismatched[:10]}"
)

In [6]:
# Number of tagged sentences and number of tag sequences (the two should be equal)
len(sentences), len(tags)

(12722, 12722)

In [7]:
# Show one tagged product description: its tokens first, then the matching BIO labels
example_idx = 63
for sequence in (sentences, tags):
    print(sequence[example_idx])

['10', 'pcs/set', 'thick', 'golf', 'iron', 'headcover', 'pu', 'leather', 'golf', 'head', 'cover', 'with', 'heart', 'pattern', 'for', 'closure', '3-pw', 'club', 'protect', 'cover', 'with', 'gift']
['O', 'O', 'O', 'O', 'O', 'O', 'B-Material', 'I-Material', 'I-Material', 'I-Material', 'I-Material', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [8]:
# Process sequences for the sequential models.
# The same preprocessor instance is reused below to build the word, tag and
# character arrays, and is later handed to the models and evaluation helpers.
from preprocessing.sequences import SequencePreprocessor
sequence_processor = SequencePreprocessor()

In [9]:
# Word-level model inputs for the train / dev / test splits; the cell displays their shapes
word_splits = tuple(sequence_processor.process_word_sequences(sentences, tags))
X_train, X_dev, X_test = word_splits
tuple(split.shape for split in word_splits)

((8905, 61), (1908, 61), (1909, 61))

In [10]:
# Tag-level targets for the train / dev / test splits; the cell displays their shapes.
# NOTE(review): the targets come from a second, separate call on the same inputs, so
# X_* and y_* only line up row-for-row if both calls split the data identically —
# confirm in SequencePreprocessor.
tag_splits = tuple(sequence_processor.process_tag_sequences(sentences, tags))
y_train, y_dev, y_test = tag_splits
tuple(split.shape for split in tag_splits)

((8905, 61), (1908, 61), (1909, 61))

In [11]:
# Character-level inputs for the train / dev / test splits.
# The argument shows up as the size of the last axis of the result (see the shape
# displayed by this cell) — presumably the number of characters kept per token; confirm.
chars_per_token = 10
char_splits = tuple(sequence_processor.process_characters(chars_per_token))
X_train_char, X_dev_char, X_test_char = char_splits
X_train_char.shape

(8905, 61, 10)

In [12]:
# Build the matrix of pretrained GloVe embeddings from the preprocessor's
# token count and word tokenizer
from utils.embeddings import GloveEmbeddings
glove_embeddings = GloveEmbeddings()
token_count = sequence_processor.token_num
tokenizer = sequence_processor.word_tokenizer
embedding_matrix = glove_embeddings.create_embeddings(token_count, tokenizer)

In [13]:
from tf2crf import ModelWithCRFLoss
from models.sequence_models import LstmCrf
from models.sequence_models import OpenTag
from models.sequence_models import OpenBrandCNN

In [14]:
# Define lstm_crf model.
# This is the base network only; the next cell wraps it in ModelWithCRFLoss
# before compiling and training.
lstm_crf = LstmCrf(sequence_processor, embedding_matrix)

In [15]:
# Wrap the base LSTM network in tf2crf's ModelWithCRFLoss and compile the wrapper.
# NOTE(review): the wrapper is named for supplying its own CRF loss — confirm whether
# the `loss=` / `metrics=` passed to compile() are actually used by it.
lstm_crf_compile_args = {
    'loss': 'sparse_categorical_crossentropy',
    'optimizer': 'adam',
    'metrics': ['sparse_categorical_accuracy'],
}
lstm_crf_model = ModelWithCRFLoss(lstm_crf, sparse_target=True)
lstm_crf_model.compile(**lstm_crf_compile_args)

In [14]:
# Define the early-stopping criterion shared by every fit() call below: stop once
# the monitored value has not improved for 3 epochs and restore the best weights.
# NOTE(review): 'val_loss_val' is presumably the key under which the tf2crf wrapper
# logs its validation loss — confirm against the installed tf2crf version.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss_val',
    patience=3,
    restore_best_weights=True,
)

In [17]:
# Train the LSTM-CRF model for at most 30 epochs, validating on the dev split;
# the early-stopping callback may end the run sooner.
lstm_crf_model.fit(
    X_train,
    y_train,
    validation_data=(X_dev, y_dev),
    epochs=30,
    batch_size=32,
    callbacks=[early_stopping_cb],
)

Epoch 1/30


  return py_builtins.overload_of(f)(*args)


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30


<keras.callbacks.History at 0x7fb4eaae7790>

In [15]:
from models.sequence_eval import evaluate_model
from models.sequence_eval import evaluate_open_brand

In [19]:
# Evaluate the trained LSTM-CRF model; the helper prints a per-entity
# precision/recall/F1 report followed by a single overall score (see output).
evaluate_model(lstm_crf_model, sequence_processor)

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       Brand       0.96      0.96      0.96      1420
    Category       0.64      0.64      0.64       149
       Color       0.47      0.35      0.40        82
    Material       0.73      0.78      0.75       438
           _       0.00      0.00      0.00         0

   micro avg       0.86      0.88      0.87      2089
   macro avg       0.56      0.55      0.55      2089
weighted avg       0.87      0.88      0.87      2089

0.8709907341411263


In [20]:
# Build the OpenTag base network from the same preprocessor and embedding matrix
# used for the LSTM-CRF model; it is wrapped with ModelWithCRFLoss in the next cell.
open_tag = OpenTag(sequence_processor, embedding_matrix)

In [21]:
# Wrap the OpenTag network in tf2crf's ModelWithCRFLoss and compile the wrapper.
# The name `open_tag` is rebound to the wrapper here, so this cell is not idempotent:
# re-run the cell that builds OpenTag before re-running this one, otherwise the
# wrapper itself gets wrapped again.
open_tag_compile_args = {
    'loss': 'sparse_categorical_crossentropy',
    'optimizer': 'adam',
    'metrics': ['sparse_categorical_accuracy'],
}
open_tag = ModelWithCRFLoss(open_tag, sparse_target=True)
open_tag.compile(**open_tag_compile_args)

In [22]:
# Train the OpenTag model for at most 30 epochs, validating on the dev split;
# the early-stopping callback may end the run sooner.
open_tag.fit(
    X_train,
    y_train,
    validation_data=(X_dev, y_dev),
    epochs=30,
    batch_size=32,
    callbacks=[early_stopping_cb],
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30


<keras.callbacks.History at 0x7fb4ea0834c0>

In [23]:
# Evaluate the trained OpenTag model with the same helper used for the LSTM-CRF
# model, so the two reports are directly comparable.
evaluate_model(open_tag, sequence_processor)

              precision    recall  f1-score   support

       Brand       0.95      0.97      0.96      1420
    Category       0.65      0.70      0.68       149
       Color       0.52      0.48      0.50        82
    Material       0.74      0.76      0.75       438
           _       0.00      0.00      0.00         0

   micro avg       0.87      0.89      0.88      2089
   macro avg       0.57      0.58      0.58      2089
weighted avg       0.87      0.89      0.88      2089

0.8758882046423495


In [16]:
# Build the OpenBrand CNN base network, wrap it in tf2crf's ModelWithCRFLoss and
# compile the wrapper. The base network keeps its own name so `openbrand_cnn`
# only ever refers to the wrapped, trainable model.
openbrand_base = OpenBrandCNN(sequence_processor, embedding_matrix)
openbrand_compile_args = {
    'loss': 'sparse_categorical_crossentropy',
    'optimizer': 'adam',
    'metrics': ['sparse_categorical_accuracy'],
}
openbrand_cnn = ModelWithCRFLoss(openbrand_base, sparse_target=True)
openbrand_cnn.compile(**openbrand_compile_args)

In [18]:
# Train the OpenBrand CNN model on paired word-level and character-level inputs
# for at most 30 epochs; the early-stopping callback may end the run sooner.
train_inputs = [X_train, X_train_char]
dev_inputs = [X_dev, X_dev_char]
openbrand_cnn.fit(
    train_inputs,
    y_train,
    validation_data=(dev_inputs, y_dev),
    epochs=30,
    batch_size=32,
    callbacks=[early_stopping_cb],
)

Epoch 1/30

  return py_builtins.overload_of(f)(*args)


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30


<keras.callbacks.History at 0x7fe197305430>

In [19]:
# Evaluate the trained OpenBrand CNN model with its dedicated helper.
# NOTE(review): presumably a separate helper is needed because this model takes
# both word and character inputs — confirm in models.sequence_eval.
evaluate_open_brand(openbrand_cnn, sequence_processor)

              precision    recall  f1-score   support

       Brand       0.95      0.97      0.96      1420
    Category       0.72      0.69      0.70       149
       Color       0.60      0.45      0.51        82
    Material       0.76      0.79      0.77       438

   micro avg       0.89      0.89      0.89      2089
   macro avg       0.76      0.72      0.74      2089
weighted avg       0.88      0.89      0.89      2089

Overall f1: 0.8886765408504539
