In [1]:
import pandas as pd

products = pd.read_csv("amazon_products.csv")

In [2]:
products.head()

Unnamed: 0,title,brand,asin
0,FLT&reg; Laptop AC Adapter/Power Supply/Charge...,FLT,6666666038
1,uxcell Flexible Neck Black Three Blade Compute...,uxcell,7884139057
2,"IKEA - UPPT&Auml;CKA Backpack, dark gray, yell...",IKEA,9178910897
3,"Princeton Eo2010 21"" Monitor (Pc/Mac)",Princeton,B00001MXZ7
4,"ViewSonic E790 19"" Monitor",ViewSonic,B00004TS2P


In [3]:
UNIQUE_BRANDS = products["brand"].unique()
len(UNIQUE_BRANDS)

1301

## Split data

Split data info test and trainval subsets

In [4]:
import random

random.seed(2022)
random.shuffle(UNIQUE_BRANDS)


TEST_SPLIT_FRAC = 0.15
TEST_SPLIT = int(TEST_SPLIT_FRAC * len(UNIQUE_BRANDS))

TEST_DATA_BRANDS = UNIQUE_BRANDS[:TEST_SPLIT]
TRAINVAL_DATA_BRANDS = UNIQUE_BRANDS[TEST_SPLIT:]

test_dataframe = products[products["brand"].isin(TEST_DATA_BRANDS)]
trainval_dataframe = products[products["brand"].isin(TRAINVAL_DATA_BRANDS)]

## Build dataset

- Clean the data
- Transform data for model training

In [5]:
import re

def transform(text: str, brand: str):
    text = re.sub('&.*;', '', text)
    start_index = text.find(brand)
    if start_index == -1:
        print(f"[ERROR] No brand `{brand}` in `{text}`")
        return None
    end_index = len(brand)
    return (text, {"entities" : [(start_index, end_index, "BRAND")]})

def build_data_from_dataframe(dataframe):
    data = []
    for row in dataframe.itertuples(index=False):
        sample = transform(row.title, row.brand)
        if sample is None:
            continue
        data.append(sample)
    return data

TRAIN_DATA = build_data_from_dataframe(trainval_dataframe)

TEST_DATA = build_data_from_dataframe(test_dataframe)

[ERROR] No brand `Game Time` in ` 3 Color Backlit LED Illuminated Professional USB Gaming Keyboard (JW-200)`


In [6]:
TRAIN_DATA[:5]

[('FLT Laptop AC Adapter/Power Supply/Charger+US Power Cord for Toshiba Satellite',
  {'entities': [(0, 3, 'BRAND')]}),
 ('uxcell Flexible Neck Black Three Blade Computer USB Desk Fan',
  {'entities': [(0, 6, 'BRAND')]}),
 ('Princeton Eo2010 21" Monitor (Pc/Mac)', {'entities': [(0, 9, 'BRAND')]}),
 ('ViewSonic E790 19" Monitor', {'entities': [(0, 9, 'BRAND')]}),
 ('ViewSonic E-790B 19 Monitor', {'entities': [(0, 9, 'BRAND')]})]

In [7]:
import spacy

spacy.__version__

'2.3.7'

In [8]:
# Import requirements
import random
from spacy.util import minibatch, compounding
from spacy.util import get_words_and_spaces
from spacy.tokens import Doc, DocBin
from pathlib import Path

# CREATE BLANK MODEL
pretrained_model = None # "en_core_web_md"
if pretrained_model:
    nlp = spacy.load(pretrained_model)
    print("Loaded model: ", pretrained_model)
else:
    nlp = spacy.blank("en")
    print("Create blank 'en' model")

if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe("ner")

# ADD LABEL
ner.add_label("BRAND")

# TRAINING THE MODEL
# Disable pipeline components which we don't need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes):

    # reset and initialize the weights randomly – but only if we're training new blank model 
    if pretrained_model is None:
        nlp.begin_training()
    # Training for 30 iterations
    for iteration in range(200):
        # shuufling examples  before every iteration
        random.shuffle(TRAIN_DATA)
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        # for text, annotations in TRAIN_DATA:
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,
                annotations,
                drop=0.5,  # dropout - make it harder to memorise data
                losses=losses,
            )
        print(f"[{iteration}] Batch loss {losses}")

Create blank 'en' model
[0] Batch loss {'ner': 1450.4714119942191}
[1] Batch loss {'ner': 1549.3395661883408}
[2] Batch loss {'ner': 1256.3288791306627}
[3] Batch loss {'ner': 1198.7894069816498}
[4] Batch loss {'ner': 871.8195317934872}
[5] Batch loss {'ner': 718.0835352186884}
[6] Batch loss {'ner': 718.012841829665}
[7] Batch loss {'ner': 736.2810548618392}
[8] Batch loss {'ner': 460.033027812419}
[9] Batch loss {'ner': 517.558310427669}
[10] Batch loss {'ner': 464.6186634904239}
[11] Batch loss {'ner': 348.35220317545514}
[12] Batch loss {'ner': 320.4880800608878}
[13] Batch loss {'ner': 322.2307480936164}
[14] Batch loss {'ner': 287.07962707856115}
[15] Batch loss {'ner': 335.78631916897024}
[16] Batch loss {'ner': 236.60022061812614}
[17] Batch loss {'ner': 229.27335658112798}
[18] Batch loss {'ner': 217.1399670161359}
[19] Batch loss {'ner': 287.1779490305446}
[20] Batch loss {'ner': 191.11962758012277}
[21] Batch loss {'ner': 143.51020512510166}
[22] Batch loss {'ner': 213.7745

In [9]:
from pathlib import Path
output_dir = Path("pretrained_models/en_conrad_ner")
if not output_dir.exists():
    output_dir.mkdir(parents=True, exist_ok=True)
nlp.to_disk(output_dir)
print("Saving model to", output_dir)

Saving model to pretrained_models/en_conrad_ner


In [10]:
print("Loading model from", output_dir)
nlp2 = spacy.load(output_dir)
accurate_predictions = 0
for text, annotations in TEST_DATA:
    doc = nlp2(text)
    predictions = [(ent.text, ent.label_) for ent in doc.ents]
    gold = [(text[start:end], label) for start, end, label in annotations["entities"]]

    if set(predictions) == set(gold):
        print("Correct:", predictions)
        accurate_predictions += 1
    else:
        print('Wrong: Prediction', predictions, " -- Gold", gold)

print("Accuracy", accurate_predictions/len(TEST_DATA))

Loading model from pretrained_models/en_conrad_ner
Correct: [('IKEA', 'BRAND')]
Correct: [('Labtec', 'BRAND')]
Correct: [('APC', 'BRAND')]
Wrong: Prediction []  -- Gold [('IBM', 'BRAND')]
Wrong: Prediction []  -- Gold [('D-Link', 'BRAND')]
Correct: [('NETGEAR', 'BRAND')]
Correct: [('Rikaline', 'BRAND')]
Correct: [('Alpine', 'BRAND')]
Correct: [('Net Nanny', 'BRAND')]
Correct: [('Pocketec', 'BRAND')]
Correct: [('Targus', 'BRAND')]
Correct: [('3Com', 'BRAND')]
Wrong: Prediction [('Targus Mobile', 'BRAND')]  -- Gold [('Targus', 'BRAND')]
Correct: [('Samsonite', 'BRAND')]
Wrong: Prediction [('LAKELAND', 'BRAND')]  -- Gold [('', 'BRAND')]
Correct: [('Nokia', 'BRAND')]
Correct: [('Maxent', 'BRAND')]
Correct: [('DSI', 'BRAND')]
Correct: [('ByteRunner', 'BRAND')]
Correct: [('APACER', 'BRAND')]
Correct: [('Samsonite', 'BRAND')]
Correct: [('TrackItBack', 'BRAND')]
Correct: [('iLuv', 'BRAND')]
Wrong: Prediction [('Nuo', 'BRAND')]  -- Gold [('Chloe', 'BRAND')]
Correct: [('BandRich', 'BRAND')]
Wron