In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt

from spacy.tokens import DocBin
import srsly
import spacy
import spacy
from spacy.training import offsets_to_biluo_tags

pd.set_option('display.max_colwidth', None)

# Prep

In [2]:
import os
os.chdir('/Users/anthony/Projects/retailer_nlp_challenger/data')

# read csv
brand_cleaned = pd.read_csv('brand_offer_cleaned.csv')
brand_cleaned.sample(10)

Unnamed: 0,BRAND,OFFER,idx
212,TOM THUMB,Spend 115 at Tom Thumb,298
65,COOKED PERFECT,"Cooked Perfect Meatballs, Homestyle OR Turkey, at Walmart",92
207,DICKEYS BARBECUE PIT,Spend 20 at Dickeys Barbecue Pit,291
125,WELCHS FRUIT SNACKS,"Welchs Fruit Snacks, select varieties, 6 count, buy 2",176
200,CHOSEN FOODS,Chosen Foods Dressings,284
220,GLAD,Glad ForceFlex Max Strength Trash Bags,306
172,SHAWS,Spend 300 at Shaws,248
161,STUBBORN SODA,"Stubborn Soda OR Bundaberg Ginger Beer, select varieties, at Amazon Storefront",228
146,COSTCO,When you join Costco as a Gold Star Member New Members Only,206
241,BACK TO THE ROOTS,Back to the Roots Grow Hydroponic Grow Kit OR Refill Bundle at Walmart,331


In [3]:
# read csv
retailer_cleaned = pd.read_csv('retailer_offer_cleaned.csv')
retailer_cleaned.sample(10)

Unnamed: 0,RETAILER,OFFER,idx
120,SAFEWAY,Spend 210 at Safeway,262
48,TOM THUMB,Shop 2 times at Tom Thumb,99
137,TOM THUMB,Spend 115 at Tom Thumb,298
18,WALMART,"Gortons Air Fried Butterfly Shrimp, at Walmart",36
14,ZAXBYS,Order online at Zaxbys.com,30
50,SAFEWAY,Shop 2 times at Safeway,103
122,ALBERTSONS,Spend 110 at Albertsons,264
55,WALMART,AleveX at Walmart,117
49,WALMART,"Purex laundry detergent, select varieties, at Walmart",101
44,WALMART,"Back to the Roots Soils, select varieties and sizes, at Walmart",86


In [4]:
# use brand_cleaned to left join retailer_cleaned (cols: idx and retailer)
merged = pd.merge(brand_cleaned, retailer_cleaned[['idx','RETAILER']], on='idx', how='left')
merged.sample(10)

Unnamed: 0,BRAND,OFFER,idx,RETAILER
185,KLONDIKE,Klondike Cones at Walmart,263,WALMART
94,5 GUM,"RESPAWN by 5 GUM, at Walmart",136,WALMART
117,SARA LEE,"Sara Lee bread select varieties, buy 2",164,
177,KRADLE,"Kradle, select varieties, online at Amazon",254,AMAZON
234,BURTS BEES,"Burts Bees Sensitive Lotions and Creams, select varieties, at Walmart",324,WALMART
208,TGI FRIDAYS,Spend 25 at TGI Fridays,292,TGI FRIDAYS
113,STAR MARKET,Spend 90 at Star Market,159,STAR MARKET
196,TOSTITOS,Tostitos Toppers,277,
176,VIZZY,"Coors Light, Miller Lite OR VIZZY 12 packs, buy 2",253,
274,BACK TO THE ROOTS,"Back to the Roots Organic 3-In-1 Seed Starting Mix 12 quart, at Walmart or Target",372,TARGET


# Brand

## Train-test-val split

In [5]:
df_merge = merged.copy()
df_merge['RETAILER'] = df_merge['RETAILER'].fillna('').astype(str)

In [6]:
# split data into train, test, and validation (80%, 10%, 10%)
train = df_merge.sample(frac=0.8, random_state=42)
dev = df_merge.drop(train.index)
val = train.sample(frac=0.05, random_state=42)

# val is seperated from train, so it should be dropped from train
# rows in train that has the same idx as val (the feature, not the index) should be dropped
train = train[~train['idx'].isin(val['idx'])]

In [7]:
print(f'train size: {train.shape}')
print(f'dev size: {dev.shape}')
print(f'val size: {val.shape}')

train size: (217, 4)
dev size: (57, 4)
val size: (11, 4)


## Convert csv to json

Sometimes you have:
1. Retailer: ABC, OFFER: save $10 at ABC.com
2. Retailer: ABC.com, OFFER: save $10 at ABC
3.  Retailer: ABC.com, OFFER: save $10 at ABC.com
4.  Retailer: ABC, OFFER: save $10 at ABC

3 & 4 easy to fix. How do we fix 1 & 2? We can use EntityRuler.

Another headache is that some retailers also have their same-name brands. For example, Sam's Club. When a customer only searches "Sam's Club", we won't be able to know whether it is a brand or a retailer. This leads to the problem of span labeling. That is, `('Spend 15 at Burger King', [[12, 23, 'BRAND'], [12, 23, 'RETAILER'])` is labeling the same entity at the same time. This is a problem of ambiguity, which is making this task more complex and might be better to solve it in the similarity calculation stage. I am making an assumption here that these retailer-owned brands should be annotated as `RETAILER` during labeling.

Based on the EDA notebook, we know that the retailer-owned-brand is only sold by the retailers themselves. Therefore, when a customer search "Sam's Club", we

In [8]:
# Generate a list of [start_char,end_char, label] of given label, tagname is the name of the label
# Spacy example: 
# ("Tokyo Tower is 333m tall.", [(0, 11, "BUILDING")])

def label_tagger(text, label, tag):
    """Takes in a string, returns the tuple of (start_char, end_char, label)"""
    text = text.lower()
    label = label.lower().strip()
    if text.find(label) != -1:
        start_index = text.find(label)
        last_index = start_index + len(label)
        return [start_index,last_index,tag]
    else:
        print(text)
        print(f"{label} not found in the given sentence")

def df_labeller(df):
    """Convert dataframe to a list of tuples"""
    ner_output = []
    for i,row in enumerate(df.itertuples(),1):
        offer, brand, retailer = row.OFFER, row.BRAND, row.RETAILER
        brand_label = label_tagger(offer, brand, "BRAND")
        if (len(retailer) > 0) and (retailer != brand): # we filled NaN with empty string, need to filter out 
            retailer_label = label_tagger(offer, retailer, "RETAILER") 
            ner_datapoint = (offer, [brand_label, retailer_label])
        else:
            ner_datapoint = (offer, [brand_label])
        ner_output.append(ner_datapoint)
    return ner_output

In [9]:
# data convert
train_labeled = df_labeller(train)
dev_labeled = df_labeller(dev)
val_labeled = df_labeller(val)

In [10]:
train_labeled[:10]

[('DOVE Chocolate, select sizes, buy 1', [[0, 14, 'BRAND']]),
 ('The Rustik Oven bread', [[4, 15, 'BRAND']]),
 ('When you join Costco as an Executive Member New Members Only',
  [[14, 20, 'BRAND']]),
 ('Sargento Sliced Cheese, spend 18', [[0, 8, 'BRAND']]),
 ('Sargento Sliced Cheese, spend 12', [[0, 8, 'BRAND']]),
 ('Barilla pasta, select varieties, buy 4', [[0, 7, 'BRAND']]),
 ('Spend 15 at Burger King', [[12, 23, 'BRAND']]),
 ('Gortons at select retailers', [[0, 7, 'BRAND']]),
 ('Sara Lee bread, select varieties, buy 2 at Walmart',
  [[0, 8, 'BRAND'], [43, 50, 'RETAILER']]),
 ('Any Albertsons receipt', [[4, 14, 'BRAND']])]

In [11]:
# save data
ents_label = {}
ents_label['train'] = train_labeled
ents_label['dev'] = dev_labeled
ents_label['val'] = val_labeled

with open('ents_label.json', 'w') as fp:
    json.dump(ents_label, fp)

# also save val for later use
val.to_csv('val.csv',index=False)

In [12]:
# also write them into csv
# train.to_csv('train.csv',index=False)
# dev.to_csv('test.csv',index=False)
# val.to_csv('val.csv',index=False)

## JSON to spaCy binary

In [15]:
# read json
with open('ents_label.json') as json_file:
    ents_label_dict = json.load(json_file)

train_input = ents_label_dict['train']
dev_input = ents_label_dict['dev']
val_input = ents_label_dict['val']

In [16]:
import spacy
from spacy.tokens import DocBin

# test = df_labeller(train[train.RETAILER.eq("COSTCO")])
# test = train_input[:20]

def conver_data_to_spacy(data, to_path, lang='en'):
    nlp = spacy.blank(lang)
    db = DocBin()
    skipped = 0
    for text, annotations in data:
        doc = nlp(text)
        ents = []
        for start, end, label in annotations:
            span = doc.char_span(start, end, label=label)
            if span is None:
                skipped += 1
                msg = f"Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' does not align with token boundaries:\n\n{repr(text)}\n"
                print(msg)
            else:
                # Check if the span overlaps with any existing entity spans
                overlaps = [ent for ent in ents if ent.start < span.end and ent.end > span.start]
                if overlaps:
                    # Remove any overlapping entity spans
                    for ent in overlaps:
                        ents.remove(ent)
                        msg = f"Removed overlapping entity [{ent.start}, {ent.end}, {ent.label_}] in the following text:\n\n{repr(text)}\n"
                        print(msg)
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    print('### Totally skipped docs:', skipped)
    db.to_disk(to_path)

In [17]:
path = "/Users/anthony/Projects/retailer_nlp_challenger/corpus/"
conver_data_to_spacy(train_input, path+"train.spacy")

Skipping entity [0, 5, BRAND] in the following text because the character span 'Pepsi' does not align with token boundaries:

'PepsiCo Variety Pack, select varieties, at Amazon Storefront'

Skipping entity [16, 22, BRAND] in the following text because the character span 'Zaxbys' does not align with token boundaries:

'Order online at Zaxbys.com'

Skipping entity [0, 5, BRAND] in the following text because the character span 'Pepsi' does not align with token boundaries:

'PepsiCo Beverage, 7.5-ounce 10 pack, select varieties, at Amazon Storefront'

Skipping entity [12, 17, BRAND] in the following text because the character span 'Chewy' does not align with token boundaries:

'Spend 50 at Chewy.com'

Skipping entity [35, 40, RETAILER] in the following text because the character span 'Chewy' does not align with token boundaries:

'Kradle, select products, online at Chewy.com'

Skipping entity [0, 5, BRAND] in the following text because the character span 'Aleve' does not align with token b

In [18]:
path = "/Users/anthony/Projects/retailer_nlp_challenger/corpus/"
conver_data_to_spacy(dev_input, path+"dev.spacy")

Removed overlapping entity [3, 6, BRAND] in the following text:

'Spend 35 at Fresh Thyme Market'

### Totally skipped docs: 0
