In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt

from spacy.tokens import DocBin
import srsly
import spacy
import spacy
from spacy.training import offsets_to_biluo_tags

pd.set_option('display.max_colwidth', None)

## Train-test-val split

In [2]:
import os
os.chdir('/Users/anthony/Projects/retailer_nlp_challenger/data')

# read csv
brand_cleaned = pd.read_csv('brand_offer_cleaned.csv')
brand_cleaned.sample(10)

Unnamed: 0,BRAND,OFFER
240,RAOS,"Raos, Pasta OR Sauce, spend 10"
43,ALFAROS,"Sara Lee or Alfaros Artesano bread, buy 5"
232,VIZZY,"Coors Light, Miller Lite OR VIZZY 12 packs, buy 5"
72,GEORGES FARMERS MARKET,"Georges Farmers Market Chicken Wings, at Sams Club"
128,KRADLE,"Kradle, select varieties, spend 20 at Walmart"
52,RED GOLD,Red Gold Tomato Juice
23,BUTTERBALL,"Butterball, select varieties, spend 10 at King Soopers"
142,PROMISED LAND DAIRY,"Promised Land Dairy Milk, 52-ounce"
168,GORTONS,Gortons at select retailers
229,BARILLA,"Barilla pasta, select varieties, buy 4"


In [3]:
# split data into train, test, and validation (80%, 10%, 10%)
train = brand_cleaned.sample(frac=0.8, random_state=42)
dev = brand_cleaned.drop(train.index)
val = train.sample(frac=0.125, random_state=42)

In [4]:
train.sample(5)

Unnamed: 0,BRAND,OFFER
153,STAR MARKET,Spend 120 at Star Market
7,ZAXBYS,Spend 20 at Zaxbys
1,GOOD HUMOR,Good Humor Viennetta Frozen Vanilla Cake
170,SHAWS,Spend 300 at Shaws
202,BACK TO THE ROOTS,Back to the Roots Microgreens Grow Kit OR Seed Refill at Walmart


In [5]:
print(f'train size: {train.shape}')
print(f'dev size: {dev.shape}')
print(f'val size: {val.shape}')

train size: (222, 2)
dev size: (56, 2)
val size: (28, 2)


## Convert csv to json

In [6]:
# Generate a list of [start_char,end_char, label] of given label, tagname is the name of the label
# Spacy example: 
# [('Kanye love Paris',{'entities': [0,5,PERSON]})]
def label_tagger(text, label, tag="BRAND"):
    text = text.lower()
    label = label.lower().strip()
    if text.find(label) != -1:
        start_index = text.find(label)
        last_index = start_index + len(label)
        return [start_index,last_index,tag]
    else:
        print(text)
        print(f"{label} not found in the given sentence")

In [7]:
# Generate train dict
ner_training = []
for i,row in enumerate(train.itertuples(),1):
    offer = row.OFFER
    brand = row.BRAND
    brand_entity = label_tagger(offer,brand)
    train_datapoint = (offer, {'entities': [brand_entity]})
    ner_training.append(train_datapoint)

with open('train.json','w') as f:
  json.dump(ner_training, f)

ner_training[:2]

[('Hidden Valley Ranch Salad Dressing OR Secret Sauce, select varieties',
  {'entities': [[0, 19, 'BRAND']]}),
 ('Sign up for McAlisters Deli Rewards, tap for details',
  {'entities': [[12, 27, 'BRAND']]})]

In [8]:
# Generate train dict
ner_dev = []
for i,row in enumerate(dev.itertuples(),1):
    offer = row.OFFER 
    brand = row.BRAND
    brand_entity = label_tagger(offer, brand)
    dev_datapoint = (offer, {'entities': [brand_entity]})
    ner_dev.append(dev_datapoint)

with open('dev.json','w') as f:
  json.dump(ner_dev, f)

ner_dev[:2]

[('Spend 10 at Subway', {'entities': [[12, 18, 'BRAND']]}),
 ('Gillette Venus  for Pubic Hair & Skin spend 20',
  {'entities': [[0, 14, 'BRAND']]})]

In [9]:
# Generate train dict
ner_validation = []
for i,row in enumerate(val.itertuples(),1):
    offer = row.OFFER 
    brand = row.BRAND
    brand_entity = label_tagger(offer, brand)
    val_datapoint = (offer, {'entities': [brand_entity]})
    ner_validation.append(val_datapoint)

with open('val.json','w') as f:
  json.dump(ner_validation, f)

ner_validation[:2]

[('Shop 2 times at Randalls', {'entities': [[16, 24, 'BRAND']]}),
 ('Gillette Venus  for Pubic Hair & Skin, spend 20',
  {'entities': [[0, 14, 'BRAND']]})]

In [10]:
# also write them into csv
# train.to_csv('train.csv',index=False)
# dev.to_csv('test.csv',index=False)
# val.to_csv('val.csv',index=False)

## JSON to spaCy binary

In [11]:
def convert(lang, input_path, output_path):
    nlp = spacy.blank(lang)
    db = DocBin()
    skipped = 0
    for text, annot in srsly.read_json(input_path):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label)
            if span is None:
                skipped+=1
                msg = f"Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' does not align with token boundaries:\n\n{repr(text)}\n"
                print(msg)
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    print('### Totally skipped docs:', skipped)
    db.to_disk(output_path)

In [13]:
!pwd

/Users/anthony/Projects/retailer_nlp_challenger/data


In [14]:
convert("en","train.json","/Users/anthony/Projects/retailer_nlp_challenger/corpus/train.spacy")

Skipping entity [16, 22, BRAND] in the following text because the character span 'Zaxbys' does not align with token boundaries:

'Order online at Zaxbys.com'

Skipping entity [0, 5, BRAND] in the following text because the character span 'Pepsi' does not align with token boundaries:

'PepsiCo Variety Pack, select varieties, at Amazon Storefront'

Skipping entity [0, 5, BRAND] in the following text because the character span 'Pepsi' does not align with token boundaries:

'PepsiCo Beverage, 7.5-ounce 10 pack, select varieties, at Amazon Storefront'

Skipping entity [12, 17, BRAND] in the following text because the character span 'Chewy' does not align with token boundaries:

'Spend 50 at Chewy.com'

Skipping entity [0, 5, BRAND] in the following text because the character span 'Aleve' does not align with token boundaries:

'AleveX at Walmart'

### Totally skipped docs: 5


In [15]:
convert("en","dev.json","/Users/anthony/Projects/retailer_nlp_challenger/corpus/dev.spacy")

### Totally skipped docs: 0
