In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt

from spacy.tokens import DocBin
import srsly
import spacy
import spacy
from spacy.training import offsets_to_biluo_tags

pd.set_option('display.max_colwidth', None)

## Train-test-val split

In [3]:
import os
os.chdir('./data')

# read csv
brand_cleaned = pd.read_csv('brand_offer_cleaned.csv')
brand_cleaned.sample(10)

Unnamed: 0,BRAND,OFFER
230,GATORADE,"GATORLYTE OR GATORADE Fit Single Serve Bottle, select varieties"
19,BUTTERBALL,"Butterball, select varieties, Spend 10 at Ruler Foods"
171,FRESH THYME MARKET,Spend 50 at Fresh Thyme Market
24,FLONASE,"Flonase Allergy Relief, at Walmart"
42,SARA LEE,"Sara Lee bread, select varieties, buy 2 at Walmart"
154,VONS,Shop 2 times at Vons
0,BEYOND MEAT,"Beyond Meat Plant-Based products, spend 25"
239,BUTTERBALL,"Butterball, select varieties, spend 10 at Pick n Save OR Metro Market"
7,ZAXBYS,Spend 20 at Zaxbys
161,STAR MARKET,Spend 220 at Star Market


In [4]:
# split data into train, test, and validation (80%, 10%, 10%)
train = brand_cleaned.sample(frac=0.8, random_state=42)
test = brand_cleaned.drop(train.index)
val = train.sample(frac=0.125, random_state=42)

In [5]:
train.sample(5)

Unnamed: 0,BRAND,OFFER
217,GLAD,Glad ForceFlex Max Strength Trash Bags
41,PEPSI,"PepsiCo Beverage, 7.5-ounce 10 pack, select varieties, at Amazon Storefront"
182,SAFEWAY,Spend 210 at Safeway
240,RAOS,"Raos, Pasta OR Sauce, spend 10"
253,BARILLA,"Barilla pasta, select varieties, buy 2"


In [6]:
print(f'train size: {train.shape}')
print(f'test size: {test.shape}')
print(f'val size: {val.shape}')

train size: (222, 2)
test size: (56, 2)
val size: (28, 2)


## Convert csv to json

In [7]:
# Generate a list of [start_char,end_char, label] of given label, tagname is the name of the label
# Spacy example: 
# [('Kanye love Paris',{'entities': [0,5,PERSON]})]
def label_tagger(text, label, tag="BRAND"):
    text = text.lower()
    label = label.lower().strip()
    if text.find(label) != -1:
        start_index = text.find(label)
        last_index = start_index + len(label)
        return [start_index,last_index,tag]
    else:
        print(text)
        print(f"{label} not found in the given sentence")

In [15]:
# Generate train dict
ner_training = []
for i,row in enumerate(train.itertuples(),1):
    offer = row.OFFER
    brand = row.BRAND
    brand_entity = label_tagger(offer,brand)
    train_datapoint = (offer, {'entities': [brand_entity]})
    ner_training.append(train_datapoint)

with open('train.json','w') as f:
  json.dump(ner_training, f)

ner_training[:2]

[('Hidden Valley Ranch Salad Dressing OR Secret Sauce, select varieties',
  {'entities': [[0, 19, 'BRAND']]}),
 ('Sign up for McAlisters Deli Rewards, tap for details',
  {'entities': [[12, 27, 'BRAND']]})]

In [26]:
# Generate train dict
ner_testing = []
for i,row in enumerate(test.itertuples(),1):
    offer = row.OFFER 
    brand = row.BRAND
    brand_entity = label_tagger(offer, brand)
    test_datapoint = (offer, {'entities': [brand_entity]})
    ner_testing.append(test_datapoint)

with open('test.json','w') as f:
  json.dump(ner_testing, f)

ner_testing[:2]

[('Spend 10 at Subway', {'entities': [[12, 18, 'BRAND']]}),
 ('Gillette Venus  for Pubic Hair & Skin spend 20',
  {'entities': [[0, 14, 'BRAND']]})]

In [27]:
# Generate train dict
ner_validation = []
for i,row in enumerate(val.itertuples(),1):
    offer = row.OFFER 
    brand = row.BRAND
    brand_entity = label_tagger(offer, brand)
    val_datapoint = (offer, {'entities': [brand_entity]})
    ner_validation.append(val_datapoint)

with open('validation.json','w') as f:
  json.dump(ner_validation, f)

ner_validation[:2]

[('Shop 2 times at Randalls', {'entities': [[16, 24, 'BRAND']]}),
 ('Gillette Venus  for Pubic Hair & Skin, spend 20',
  {'entities': [[0, 14, 'BRAND']]})]