In [1]:
import pandas as pd
import numpy as np

from spacy.tokens import DocBin
import srsly
import spacy
from spacy.training import offsets_to_biluo_tags

# Importing as module.
import en_fetch_ner_spacy_tsf
nlp = en_fetch_ner_spacy_tsf.load()

import os
path = "/Users/anthony/Projects/retailer_nlp_challenger/data"
os.chdir(path)

pd.set_option('display.max_colwidth', None)


If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current 'transformers' and 'spacy-transformers' versions. For more details and available updates, run: python -m spacy validate


## Load Operation Data

In [2]:
import pickle, json

path1 = "brand_belong_category_dict.json"
path2 = "product_upper_category_dict.json"
path3 = "offered_brands.pkl"
path4 = "offer_retailer.csv"

with open(path1, 'r') as f:
    brand_belong_category_dict = json.load(f)

with open(path2, 'rb') as f:
    category_dict = json.load(f)

with open(path3, 'rb') as f:
    offered_brands = pickle.load(f)

In [3]:
df_offers_brand_retailer = pd.read_csv(path4)
df_offers_brand_retailer.head()

Unnamed: 0,OFFER,RETAILER,BRAND
0,Spend $50 on a Full-Priced new Club Membership,SAMS CLUB,SAMS CLUB
1,"Beyond Meat® Plant-Based products, spend $25",,BEYOND MEAT
2,Good Humor Viennetta Frozen Vanilla Cake,,GOOD HUMOR
3,"Butterball, select varieties, spend $10 at Dillons Food Store",DILLONS FOOD STORE,BUTTERBALL
4,"GATORADE® Fast Twitch®, 12-ounce 12 pack, at Amazon Storefront*",AMAZON,GATORADE


In [4]:
category_dict

{'ALCOHOL': ['Beer',
  'Malt Beverages',
  'Hard Seltzers, Sodas, Waters, Lemonades & Teas',
  'Hard Ciders',
  'Wine',
  'Spirits'],
 'ANIMALS & PET SUPPLIES': ['Dog Supplies'],
 'BABY & TODDLER': ['Baby Bathing',
  'Potty Training',
  'Baby Safety',
  'Diapering',
  'Baby Health'],
 'BEAUTY': ['Cosmetic Tools', 'Makeup', 'Nail Care', 'Body Fragrances'],
 'BEVERAGES': ['Coffee',
  'Meal Replacement Beverages',
  'Vegetable Juices',
  'Drink Mixes',
  'Tea',
  'Energy Drinks',
  'Water',
  'Fruit Juices',
  'Carbonated Soft Drinks'],
 'CANDY': ['Gum'],
 'DAIRY': ['Cream',
  'Dairy Alternatives',
  'Dairy',
  'Yogurt',
  'Milk',
  'Cheese',
  'Butter & Margarine',
  'Eggs',
  'Refrigerated Doughs'],
 'DELI & BAKERY': ['Leafy Salads', 'Deli Counter', 'Bakery', 'Prepared Meals'],
 'FROZEN': ['Frozen Fruits',
  'Frozen Desserts',
  'Frozen Sides',
  'Frozen Meals',
  'Frozen',
  'Frozen Vegetables',
  'Ice',
  'Frozen Breads & Doughs',
  'Frozen Pizza & Pizza Snacks',
  'Frozen Breakfast',

In [5]:
brand_belong_category_dict

{'ADULT INCONTINENCE': ['POISE',
  'DEPEND',
  'ALWAYS',
  'WELLNESS',
  'TENA',
  'FIORE',
  'ONE BY POISE'],
 'ALFREDO & WHITE PASTA SAUCE': ['CLASSICO',
  'PREGO',
  'RAGU',
  "RAO'S",
  'KROGER',
  'SIMPLE TRUTH',
  'PUBLIX',
  'HEB'],
 'BABY BATHING': ['BABY SOFT',
  'PREDO',
  'GERBER',
  'ENFAMIL',
  "EARTH'S BEST",
  'HAPPY BABY',
  'PAMPERS',
  'SPROUT',
  "MOTT'S",
  'JOHNSONS',
  'SHEA MOISTURE',
  'AVEENO BABY',
  'BABY DOVE',
  'HAPPY TOT',
  'PLUM ORGANICS',
  'SIMILAC',
  'THE HONEST CO.',
  'NANIT',
  'SIMPLE TRUTH',
  'MIKU',
  'BEECH NUT',
  'BABY SOFT EXPRESSIONS',
  'ORGANICS HAPPY BABY',
  'A+D',
  'MUNCHKIN',
  'VIVVI & BLOOM',
  'ENFAGROW',
  'HORIZON (WHITEWAVE FOODS)',
  "DR BROWN'S",
  'GREENWISE',
  'NATURE BABY CARE',
  'NUTRAMIGEN',
  'BABYGANICS',
  'HOT-KID',
  'CEREBELLY',
  'BABY LOVE',
  'AVEENO',
  'SMART CARE',
  "LUV N' CARE",
  'LANSINOH',
  'PIPETTE'],
 'BABY HEALTH': ['GERBER',
  'ENFAMIL',
  "EARTH'S BEST",
  'EVENFLO (EVENFLO FEEDING INC)',
  '

Assumptions:
- search input will be simple sentences or keywords (e.g. `beer`, or `beers discount Amazon`)
- users won't serach upper_category because the APP already has UI for them to filter
- `PRODUCT_CATEGORY` are not to spefici brands, so a customer might search `beer` or `white beer`
- the search functions I am using will clean the text to lower case. This is because I don't know if the `PRODUCT_CATEGORY` data gives me the write permission

Issues with dataset:
- `Alchol` should be renamed as drinks. It contains more than alcholic drinks `[Beer, Malt Beverages, Hard Seltzers, Sodas, Waters, Lemonades & Teas, Hard Ciders, Wine, Spirits]`

## Helper functions

In [6]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

additional_stop_words = {'pack'}
stop_words.update(additional_stop_words)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anthony/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# helper functions
from typing import List, Dict, Tuple
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def single_text_cleaner(text: str, remove_stopwords: bool=False, upper_case: bool = False, remove_punctuation: bool=True) -> str:
    """Clean one single text input. By default it will convert text to lower case"""
    if upper_case:
        text = text.upper()
    else:
        text = text.lower()
    if remove_punctuation:
        text = re.sub(r'[^a-z\s]', '', text)
    if remove_stopwords:
        words = text.split()
        words = [word for word in words if word not in stop_words]
        text = ' '.join(words)
    return text

def list_text_cleaner(texts: List[str], upper_case: bool = False, remove_stopwords: bool = False, remove_punctuation: bool=True) -> List[str]:
    """Takes in a list of strings and returns a list of cleaned strings without stop words. 
    Current tasks: 
    - remove non-alphabetical characters
    - converting to lower cases
    - remove stop words (optional)"""
    cleaned_texts = [single_text_cleaner(text, remove_stopwords, upper_case, remove_punctuation) for text in texts]
    return cleaned_texts

def match_product_category(s1: list[str], s2: list[str]) -> str:
    """Find if items of a list is in one list of product categories"""
    return next((p for c in s1 for p in s2 if c in p), None) # this will stop after finding first match, which saves time

def find_category(search_input: str, search_dict: Dict) -> str:
    """Find the category of a search input based on a dictionary of categories
    Args:
    - search_input: a string
    - search_dict: a dictionary of product categories
    """
    search_list = list_text_cleaner(re.split(r'[,\s]+', search_input), remove_stopwords=True)
    search_list = [c for c in search_list if len(c)>0] # sometimes there are empty strings
    matched_category = False
    for k, v in search_dict.items():
        v = list_text_cleaner(v, remove_punctuation=False)
        search_results = match_product_category(search_list, v)
        if search_results is not None:
            matched_category = True
            return k, search_results
        else:
            # print(f'Function find_category: No category {k} has matched for input: {search_input}') 
            continue
    if not matched_category:
        print(f'Function find_category: No category has matched for input: {search_input}')
        return None
    

def check_entity(search_input) -> bool:
    """Takes in a search input and checks if it contains any entities"""
    doc = nlp(search_input)
    if len(doc.ents) > 0:
        return doc
    else:
        return False

def get_cosine_sim(input_text: str, texts: List[str]) -> pd.DataFrame:
    """Calculate the cosine similarity of the input text against a list of texts
    Takes in:
    - input_text: a string
    - texts: a list of strings
    Returns a dataframe with two columns: Sentence Text and Cosine Similarity Score
    """
    input_text_cleaned = list_text_cleaner([input_text], remove_stopwords=True)[0]
    cleaned_texts = list_text_cleaner(texts, remove_stopwords=True)
    all_texts = [input_text_cleaned] + cleaned_texts
    vectors = get_vectors(*all_texts)
    sim_matrix = cosine_similarity(vectors)
    # Get the similarity scores of the input_text against all other texts
    sim_scores = sim_matrix[0, 1:]
    data = {'OFFER': texts, 'Cosine Similarity Score': sim_scores}
    df = pd.DataFrame(data)
    df = df.sort_values(by='Cosine Similarity Score', ascending=False).reset_index(drop=True)
    return df

def get_vectors(*strs: str) -> np.ndarray:
    text = list(strs)
    vectorizer = CountVectorizer()
    vectorizer.fit(text)
    return vectorizer.transform(text).toarray()

def jaccard_similarity(s1: List[str], s2: List[str]) -> float:
    """Takes in two lists and returns the Jaccard similarity score (3 digits)"""
    intersection = set(s1).intersection(set(s2))
    n = len(intersection)
    score = round(n / (len(s1) + len(s2) - n), 3)
    return score

def get_jaccard_sim(input_text: str, texts: List[str]) -> pd.DataFrame:
    """Calculate the Jaccard similarity of the input text against a list of texts
    Takes in:
    - input_text: a string
    - texts: a list of strings
    Returns a dataframe with two columns: Sentence Text and Jaccard Similarity Score
    """
    cleaned_input_text = list_text_cleaner([input_text], remove_stopwords=True)[0].split()
    cleaned_texts = list_text_cleaner(texts, remove_stopwords=True)
    
    jaccard_scores = [jaccard_similarity(cleaned_input_text, text.split()) for text in cleaned_texts]
    
    data = {'OFFER': texts, 'Jaccard Similarity Score': jaccard_scores}
    df = pd.DataFrame(data)
    # sort based on the similarity score
    df = df.sort_values(by='Jaccard Similarity Score', ascending=False).reset_index(drop=True)
    return df

def find_column(df: pd.DataFrame, keyword: str) -> str:
    """Function to find the first column containing a specific keyword. Note that we assume there will only be one score at most for a similarity score dataframe"""
    cols = [col for col in df.columns if keyword.lower() in col.lower()]
    return cols[0] if cols else None

def extract_similar_offers(data: pd.DataFrame, threshold: float = 0.0) -> pd.DataFrame:
    """Takes in the results from get_cosine_sim() and get_jaccard_sim(); returns a dataframe of similar offers with scores > threshold"""
    score = find_column(data, 'score')
    similar_offers = data[data[score] >= threshold]
    return similar_offers

def category_to_brand(category: str, offered_brands: List, brand_belong_category_dict: Dict) -> List[str]:
    """Use case: when a user searches for a category, we return a list of brands in that category"""
    # checks if the category is in the dictionary keys
    if category.upper() in brand_belong_category_dict.keys():
        search_brands = brand_belong_category_dict[category.upper()] # becase all keys are in upper case
        result = list(set(search_brands) & set(offered_brands))
        print(f"Function category_to_brand | Found {category} in offered brand") 
        return result 
    else:
        print(f"Function category_to_brand | No offered brand is found in {category}")
        return None

class CatchErros(Exception):
    class ParamsInputError(Exception):
        pass
    class SearchFailedError(Exception):
        pass
    class UnknownError(Exception):
        pass


def offer_finder_by_category(search_input: str, search_category_tuple: Tuple, category_dict: Dict, offered_brands: List, 
                             brand_belong_category_dict: Dict, score: str, threshold: float = 0.0) -> pd.DataFrame:
    """Find offers based on a category identified from search input.
    Args:
    - search_input: a string
    - search_category_tuple: a tuple of (upper_category, product_category)
    - category_dict: a dictionary of categories. Keys are upper categories and values are lists of product categories
    - offered_brands:  a dataframe of offers (OFFER, BRAND, RETAILER) that are avaialble in our database
    - brand_belong_category_dict: a dictionary of brands and the categories they belong to
    - score: a string of either 'cosine' or 'jaccard'
    - threshold: a float between 0 and 1

    Returns a dataframe of similar offers, ordered by highest score
    """
    # we assume people just search one category at a time
    # search_category_tuple = find_category(search_input, category_dict)
    product_category, upper_category = search_category_tuple[1], search_category_tuple[0] # ('Alcohol', 'beer')
    print(f'Function offer_finder_by_category | Found items:\n- Search input: {search_input}\n- Product category: {product_category}\n- Upper category: {upper_category}')
    potential_brands = category_to_brand(product_category, offered_brands, brand_belong_category_dict)
    if potential_brands is not None:
        potential_offers = offers[offers['BRAND'].isin(potential_brands)]['OFFER'].tolist()
        if score == 'cosine':
            cos_sim_score = get_cosine_sim(search_input, potential_offers)
            output = extract_similar_offers(cos_sim_score, threshold)
        elif score == 'jaccard':
            jaccard_sim_score = get_jaccard_sim(search_input, potential_offers)
            output = extract_similar_offers(jaccard_sim_score, threshold)
        elif score not in ['cosine', 'jaccard']:
            raise ValueError(f'Please enter a valid score: cosine or jaccard; Not {score}')
        else: # this means something else is worng
            raise UnknownError(f'Something must be broken. Please try again.')
        return output
    else:
        potential_product_categories = category_dict[upper_category]
        msg = f'{product_category} is not found. Do you wanna take a look at these similar offers in {upper_category}?\n We have: {potential_product_categories}' # we can still calculate similarity but this is computationally expensive
        print(msg)
        return None

def offer_finder_by_entity(search_input: str, entities: Tuple, offers_data: pd.DataFrame, score: str, threshold: float=0.0) -> pd.DataFrame:
    """Find offers based on entities identified from search input.
    Args:
    - search_input: a string
    - entities: a tuple of entities
    - offers_data: a dataframe of offers (OFFER, BRAND, RETAILER) that are avaialble in our database
    - score: a string of either 'cosine' or 'jaccard'
    - threshold: a float between 0 and 1

    Returns a dataframe of similar offers, ordered by highest score
    """
    collects = [] # collect all the results if there are more than one entity
    for ent in entities:
        ent_name, ent_label = ent.text, ent.label_
        print(f'Function offer_finder_by_entity | Found entity: {ent_name} with label: {ent_label}')
        # filter offers by entity
        df_tmp = offers_data[offers_data[ent_label.upper()] == ent_name.upper()]
        if df_tmp.shape[0] > 0:
            print(f'Function offer_finder_by_entity | Found {df_tmp.shape[0]} offer(s) for the brand/retailer: {ent_name}')
            potential_offers = df_tmp['OFFER'].drop_duplicates().tolist()
            if score == 'cosine':
                cos_sim_score = get_cosine_sim(search_input, potential_offers)
                output = extract_similar_offers(cos_sim_score, threshold)
            elif score == 'jaccard':
                jaccard_sim_score = get_jaccard_sim(search_input, potential_offers)
                output = extract_similar_offers(jaccard_sim_score, threshold)
            elif score not in ['cosine', 'jaccard']:
                raise ValueError(f'Please enter a valid score: cosine or jaccard; Not {score}')
            else: # this means something else is worng
                raise UnknownError(f'Something must be broken. Please try again.')
            collects.append(output)
        else:
            print(f'Function offer_finder_by_entity | No offer is found for the brand/retailer: {ent_name}')

    if len(collects) > 0:
        final_output = pd.concat(collects, ignore_index=True)# they should be using the same similarity score
        score = find_column(collects[0], 'score') 
        final_output = final_output.sort_values(by=score, ascending=False).reset_index(drop=True) # sort final_output by score
        return final_output
    elif len(collects) == 1:
        return collects[0]
    else:
        print('###'*5 + 'FINAL SEARCH RESULTS' + '###'*5)
        print('Function offer_finder_by_entity | No offer is found for any of the entities.')
        return None


def main(search_input: str, offers: pd.DataFrame, category_dict: Dict, brand_belong_category_dict: Dict, score: str, score_threshold: float = 0.0):
    """Main function. Takes in a serach_input and decide whether it can find entities or not. Then excecute the appropriate functions
    Inputs:
    - search_input: a string that a user enters
    - offers: a dataframe of offers (OFFER, BRAND, RETAILER) that are avaialble in our database
    - category_dict: a dictionary of categories. Keys are upper categories and values are lists of product categories
    - brand_belong_category_dict: a dictionary of brands and the categories they belong to
    - score: a string of either 'cosine' or 'jaccard'
    - score_threshold: a float between 0 and 1

    Returns a dataframe of similar offers, ordered by highest score
    """
    print(f'Function main | Search input: {search_input}')
    check_ent = check_entity(search_input)
    if not check_entity(search_input): # no entities found
       # check category
       cat_check = find_category(search_input, category_dict)
       if cat_check is None:
           raise SearchFailedError('No brand/retailer/category is found. Please try again.')
       else:
            # we assume people just search one category at a time
            cat_tuple = cat_check # ('Alcohol', 'beer')
            search_results = offer_finder_by_category(search_input, cat_tuple, category_dict, offers, brand_belong_category_dict, score, score_threshold)
            return search_results
    else:
        entities = check_ent.ents # entities will be a tuple anyways
        print(f'Found {len(entities)} entity object(s) in the search input.')
        search_results = offer_finder_by_entity(search_input, entities, offers, score, score_threshold)
        return search_results
            

## Playground to test

### Main Function Test

In [30]:
test = "jaja"

main(test, df_offers_brand_retailer, category_dict, brand_belong_category_dict, 'cosine', 0.0)

Function main | Search input: jaja
Found 1 entity object(s) in the search input.
Function offer_finder_by_entity | Found entity: jaja with label: BRAND
Function offer_finder_by_entity | No offer is found for the brand/retailer: jaja
###############FINAL SEARCH RESULTS###############
Function offer_finder_by_entity | No offer is found for any of the entities.


In [8]:
# test = "Simply Spiked Lemonade 12 pack at Walmart"
test = "EGGLIFE 12 pack"

main(test, df_offers_brand_retailer, category_dict, brand_belong_category_dict, 'cosine', 0.0)

Function main | Search input: EGGLIFE 12 pack
Found 1 entity object(s) in the search input.
Function offer_finder_by_entity | Found entity: EGGLIFE with label: BRAND
Function offer_finder_by_entity | Found 1 offer(s) for the brand/retailer: EGGLIFE


Unnamed: 0,OFFER,Cosine Similarity Score
0,Egglife Egg White Wraps at Aldi,0.447214


In [9]:
test = "Simply Spiked Lemonade 12 pack at Walmart"
# test = "EGGLIFE 12 pack"

main(test, df_offers_brand_retailer, category_dict, brand_belong_category_dict, 'cosine', 0.3)

Function main | Search input: Simply Spiked Lemonade 12 pack at Walmart
Found 2 entity object(s) in the search input.
Function offer_finder_by_entity | Found entity: Simply Spiked with label: BRAND
Function offer_finder_by_entity | Found 4 offer(s) for the brand/retailer: Simply Spiked
Function offer_finder_by_entity | Found entity: Walmart with label: RETAILER
Function offer_finder_by_entity | Found 44 offer(s) for the brand/retailer: Walmart


Unnamed: 0,OFFER,Cosine Similarity Score
0,Simply Spiked™ Lemonade 12 pack+,0.866025
1,Simply Spiked™ 12 pack+,0.707107
2,Simply Spiked™ Signature Peach 12 pack+,0.5
3,"Simply Spiked™ Signature Peach 12-pack or larger, buy 2",0.408248
4,"Arber, at Walmart",0.353553
5,AleveX™ at Walmart,0.353553


In [10]:
# test = "Sara Lee bread, select varieties, buy 2 at Walmart"
test = "Back to the Roots Garden Soil, 1 cubic foot, at Lowe's Home Improvement"

main(test, df_offers_brand_retailer, category_dict, brand_belong_category_dict, 'cosine', 0.3)

Function main | Search input: Back to the Roots Garden Soil, 1 cubic foot, at Lowe's Home Improvement
Found 2 entity object(s) in the search input.
Function offer_finder_by_entity | Found entity: Back to the Roots with label: BRAND
Function offer_finder_by_entity | Found 15 offer(s) for the brand/retailer: Back to the Roots
Function offer_finder_by_entity | Found entity: Lowe's Home Improvement with label: RETAILER
Function offer_finder_by_entity | No offer is found for the brand/retailer: Lowe's Home Improvement


Unnamed: 0,OFFER,Cosine Similarity Score
0,"Back to the Roots Garden Soil, 1 cubic foot, at Lowe's Home Improvement",1.0
1,"Back to the Roots Potting Mix, 1 cubic foot, at Lowe's Home Improvement",0.777778
2,"Back to the Roots Soils, select varieties and sizes, at Lowes",0.377964
3,"Back to the Roots Soils, select varieties, at Walmart or Lowes",0.377964
4,Back to the Roots Grow Kits at Walmart or The Home Depot,0.377964
5,"Back to the Roots Dry Plant Food, 5 pounds, at The Home Depot",0.353553
6,"Back to the Roots Seeds, at Walmart",0.333333
7,"Back to the Roots Raised Bed Gardening Kit with Soil, Seeds and Plant Food, at Target",0.301511


In [11]:
test = "Back to the Roots Garden Soil, 1 cubic foot, at Lowe's Home Improvement"

main(test, df_offers_brand_retailer, category_dict, brand_belong_category_dict, 'jaccard', 0.1)

Function main | Search input: Back to the Roots Garden Soil, 1 cubic foot, at Lowe's Home Improvement
Found 2 entity object(s) in the search input.
Function offer_finder_by_entity | Found entity: Back to the Roots with label: BRAND
Function offer_finder_by_entity | Found 15 offer(s) for the brand/retailer: Back to the Roots
Function offer_finder_by_entity | Found entity: Lowe's Home Improvement with label: RETAILER
Function offer_finder_by_entity | No offer is found for the brand/retailer: Lowe's Home Improvement


Unnamed: 0,OFFER,Jaccard Similarity Score
0,"Back to the Roots Garden Soil, 1 cubic foot, at Lowe's Home Improvement",1.0
1,"Back to the Roots Potting Mix, 1 cubic foot, at Lowe's Home Improvement",0.636
2,"Back to the Roots Soils, select varieties and sizes, at Lowes",0.231
3,"Back to the Roots Soils, select varieties, at Walmart or Lowes",0.231
4,Back to the Roots Grow Kits at Walmart or The Home Depot,0.231
5,"Back to the Roots Dry Plant Food, 5 pounds, at The Home Depot",0.214
6,"Back to the Roots Seeds, at Walmart",0.182
7,"Back to the Roots Raised Bed Gardening Kit with Soil, Seeds and Plant Food, at Target",0.176
8,"Back to the Roots Organic Kits and Planters, at Target",0.154
9,"Back to the Roots Soils, select varieties, at Walmart",0.154


In [12]:
test = "Membership at Costco"

main(test, df_offers_brand_retailer, category_dict, brand_belong_category_dict, 'cosine', 0.2)

Function main | Search input: Membership at Costco
Found 1 entity object(s) in the search input.
Function offer_finder_by_entity | Found entity: Costco with label: BRAND
Function offer_finder_by_entity | Found 2 offer(s) for the brand/retailer: Costco


Unnamed: 0,OFFER,Cosine Similarity Score
0,When you join Costco as an Executive Member* (New Members Only),0.288675
1,When you join Costco as a Gold Star Member* (New Members Only),0.267261


In [13]:
main(test, df_offers_brand_retailer, category_dict, brand_belong_category_dict, 'jaccard', 0.1)

Function main | Search input: Membership at Costco
Found 1 entity object(s) in the search input.
Function offer_finder_by_entity | Found entity: Costco with label: BRAND
Function offer_finder_by_entity | Found 2 offer(s) for the brand/retailer: Costco


Unnamed: 0,OFFER,Jaccard Similarity Score
0,When you join Costco as an Executive Member* (New Members Only),0.143
1,When you join Costco as a Gold Star Member* (New Members Only),0.125


### Functions test

In [14]:
# test case: find category

category_dict = brand_belong_category_dict

test1 = "White German Beer"
test2 = ["Beer", "Malt Beverages", "Hard Seltzers", "Sodas", "Waters", "Lemonades & Teas", "Hard Ciders", "Wine", "Spirits"]

find_category(test1, category_dict)


('BABY BATHING', 'horizon (whitewave foods)')

In [15]:
test_text = "Klondike Cones at Walmart"
# test_text = "Klondike Cones"
# test_test = "M&M's candy discount at Target"


doc = nlp(test_text)
spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

In [16]:
# [(ent.text, ent.label_) for ent in text.ents]

# len(check_entity(test_text).ents)

# type(check_entity(test_text).ents)

for ent in doc.ents:
    print(f'Entity: {ent.text} | Label: {ent.label_}')

Entity: Klondike | Label: BRAND
Entity: Walmart | Label: RETAILER


In [17]:
# test on Cosine Similarity for search_offer comparison

input_text = "Membership at Costco"
texts = ["Sara Lee bread, select varieties, buy 2 at Walmart", 
         "Back to the Roots Garden Soil, 1 cubic foot, at Lowe's Home Improvement", 
         "When you join Costco as a Gold Star Member", 
         "this is the fourth text"]

cosine_sim_scores = get_cosine_sim(input_text, texts)
cosine_sim_scores

Unnamed: 0,OFFER,Cosine Similarity Score
0,When you join Costco as a Gold Star Member,0.316228
1,"Sara Lee bread, select varieties, buy 2 at Walmart",0.0
2,"Back to the Roots Garden Soil, 1 cubic foot, at Lowe's Home Improvement",0.0
3,this is the fourth text,0.0


In [18]:
# test on Jaccard Similarity for search_offer comparison

input_text = "Membership at Costco"
texts = ["Sara Lee bread, select varieties, buy 2 at Walmart", 
         "Back to the Roots Garden Soil, 1 cubic foot, at Lowe's Home Improvement", 
         "When you join Costco as a Gold Star Member", 
         "this is the fourth text"]

result = get_jaccard_sim(input_text, texts)
result

Unnamed: 0,OFFER,Jaccard Similarity Score
0,When you join Costco as a Gold Star Member,0.167
1,"Sara Lee bread, select varieties, buy 2 at Walmart",0.0
2,"Back to the Roots Garden Soil, 1 cubic foot, at Lowe's Home Improvement",0.0
3,this is the fourth text,0.0


In [19]:
# test case on extract_similar_offers
extract_similar_offers(get_jaccard_sim(input_text, texts))

Unnamed: 0,OFFER,Jaccard Similarity Score
0,When you join Costco as a Gold Star Member,0.167
1,"Sara Lee bread, select varieties, buy 2 at Walmart",0.0
2,"Back to the Roots Garden Soil, 1 cubic foot, at Lowe's Home Improvement",0.0
3,this is the fourth text,0.0


In [20]:
extract_similar_offers(get_cosine_sim(input_text, texts))

Unnamed: 0,OFFER,Cosine Similarity Score
0,When you join Costco as a Gold Star Member,0.316228
1,"Sara Lee bread, select varieties, buy 2 at Walmart",0.0
2,"Back to the Roots Garden Soil, 1 cubic foot, at Lowe's Home Improvement",0.0
3,this is the fourth text,0.0


In [21]:
# df_offers = pd.read_csv(path + 'brand_offer_cleaned.csv')
# df_offers.sample(10)

### Similarity match test

Operation data test

In [22]:
print(f'offered_brands: {type(offered_brands)}')
print(f'brand_belong_category_dict: {type(brand_belong_category_dict)}')

offered_brands: <class 'list'>
brand_belong_category_dict: <class 'dict'>


In [23]:
offered_brands[:5]

['BEYOND MEAT', 'GOOD HUMOR', 'BUTTERBALL', 'GATORADE', 'DR PEPPER']

In [24]:
brand_belong_category_dict['Baby Bathing'.upper()][:5]

['BABY SOFT', 'PREDO', 'GERBER', 'ENFAMIL', "EARTH'S BEST"]

In [25]:
# test case: I have a category, now I want to find all the brands (with offers) that belong to this category
# so I need to find a intersection of two lists

category = 'Packaged Meals & Sides'
search_brands = brand_belong_category_dict[category.upper()]

# find the intersection of two lists
result = list(set(search_brands) & set(offered_brands))
result

['BACK TO THE ROOTS', 'MCALISTERS DELI']

## Build Category Dictionary

As checks (mannually and EDA), there're no cases for one product_category that belongs to more than one categories. So, we can build a simple category dictionary.

In [26]:
df_categories = pd.read_csv('categories.csv')
print(df_categories.shape)

# rename "IS_CHILD_CATEGORY_TO" to "UPPER_CATEGORY"
df_categories.rename(columns={'IS_CHILD_CATEGORY_TO': 'UPPER_CATEGORY'}, inplace=True)
display(df_categories.head())

# group by UPPER_CATEGORY and get DISTINCT PRODUCT_CATEGORY
grouped_df = df_categories.groupby('UPPER_CATEGORY')['PRODUCT_CATEGORY'].unique().reset_index()
print(grouped_df.shape)
display(grouped_df.head())

(118, 3)


Unnamed: 0,CATEGORY_ID,PRODUCT_CATEGORY,UPPER_CATEGORY
0,1f7d2fa7-a1d7-4969-aaf4-1244f232c175,Red Pasta Sauce,Pasta Sauce
1,3e48a9b3-1ab2-4f2d-867d-4a30828afeab,Alfredo & White Pasta Sauce,Pasta Sauce
2,09f3decc-aa93-460d-936c-0ddf06b055a3,Cooking & Baking,Pantry
3,12a89b18-4c01-4048-94b2-0705e0a45f6b,Packaged Seafood,Pantry
4,2caa015a-ca32-4456-a086-621446238783,Feminine Hygeine,Health & Wellness


(23, 2)


Unnamed: 0,UPPER_CATEGORY,PRODUCT_CATEGORY
0,Alcohol,"[Beer, Malt Beverages, Hard Seltzers, Sodas, Waters, Lemonades & Teas, Hard Ciders, Wine, Spirits]"
1,Animals & Pet Supplies,[Dog Supplies]
2,Baby & Toddler,"[Baby Bathing, Potty Training, Baby Safety, Diapering, Baby Health]"
3,Beauty,"[Cosmetic Tools, Makeup, Nail Care, Body Fragrances]"
4,Beverages,"[Coffee, Meal Replacement Beverages, Vegetable Juices, Drink Mixes, Tea, Energy Drinks, Water, Fruit Juices, Carbonated Soft Drinks]"


In [27]:
# generate a dictionary of UPPER_CATEGORY (key) and PRODUCT_CATEGORY (value)'
category_dict = {row['UPPER_CATEGORY']: row['PRODUCT_CATEGORY'].tolist() for index, row in grouped_df.iterrows()}

# change all keys in category_dict to upper case
category_dict = {k.upper(): v for k, v in category_dict.items()}

# clear all non-str values for all keys, values in category_dict
print(f'Original length of category_dict: {len(category_dict)}')
category_dict_cleaned = {k: [i for i in v if isinstance(i, str)] for k, v in category_dict.items()}
print(f'Cleaned length of category_dict: {len(category_dict_cleaned)}')

# write to json
# import json
# with open('product_upper_category_dict.json', 'w') as fp:
#     json.dump(category_dict_cleaned, fp)

Original length of category_dict: 23
Cleaned length of category_dict: 23


In [28]:
df_brand_to_category = pd.read_csv('brand_category.csv')
df2 = df_brand_to_category
print(df2.shape)
# display(df2.head())

# rename "IS_CHILD_CATEGORY_TO" to "UPPER_CATEGORY"
df2.rename(columns={'BRAND_BELONGS_TO_CATEGORY': 'PRODUCT_CATEGORY'}, inplace=True)
display(df2.head())

# group by UPPER_CATEGORY and get DISTINCT PRODUCT_CATEGORY
grouped_df = df2.groupby('PRODUCT_CATEGORY')['BRAND'].unique().reset_index()
print(grouped_df.shape)
display(grouped_df.head())

(9906, 3)


Unnamed: 0,BRAND,PRODUCT_CATEGORY,RECEIPTS
0,CASEYS GEN STORE,Tobacco Products,2950931
1,CASEYS GEN STORE,Mature,2859240
2,EQUATE,Hair Removal,893268
3,PALMOLIVE,Bath & Body,542562
4,DAWN,Bath & Body,301844


(118, 2)


Unnamed: 0,PRODUCT_CATEGORY,BRAND
0,Adult Incontinence,"[POISE, DEPEND, ALWAYS, WELLNESS, TENA, FIORE, ONE BY POISE]"
1,Alfredo & White Pasta Sauce,"[CLASSICO, PREGO, RAGU, RAO'S, KROGER, SIMPLE TRUTH, PUBLIX, HEB]"
2,Baby Bathing,"[BABY SOFT, PREDO, GERBER, ENFAMIL, EARTH'S BEST, HAPPY BABY, PAMPERS, SPROUT, MOTT'S, JOHNSONS, SHEA MOISTURE, AVEENO BABY, BABY DOVE, HAPPY TOT, PLUM ORGANICS, SIMILAC, THE HONEST CO., NANIT, SIMPLE TRUTH, MIKU, BEECH NUT, BABY SOFT EXPRESSIONS, ORGANICS HAPPY BABY, A+D, MUNCHKIN, VIVVI & BLOOM, ENFAGROW, HORIZON (WHITEWAVE FOODS), DR BROWN'S, GREENWISE, NATURE BABY CARE, NUTRAMIGEN, BABYGANICS, HOT-KID, CEREBELLY, BABY LOVE, AVEENO, SMART CARE, LUV N' CARE, LANSINOH, PIPETTE]"
3,Baby Health,"[GERBER, ENFAMIL, EARTH'S BEST, EVENFLO (EVENFLO FEEDING INC), AVENT]"
4,Baby Safety,[PAMPERS]


In [29]:
# generate a dictionary of UPPER_CATEGORY (key) and PRODUCT_CATEGORY (value)'
brand_belong_category_dict = {row['PRODUCT_CATEGORY']: row['BRAND'].tolist() for index, row in grouped_df.iterrows()}

# change all keys in category_dict to upper case
brand_belong_category_dict = {k.upper(): v for k, v in category_dict.items()}

# clear all non-str values for all keys, values in category_dict
print(f'Original length of brand_belong_category_dict: {len(brand_belong_category_dict)}')
brand_belong_category_dict_cleaned = {k: [i for i in v if isinstance(i, str)] for k, v in brand_belong_category_dict.items()}
print(f'Cleaned length of category_dict: {len(brand_belong_category_dict_cleaned)}')

# # write to json
# import json
# with open('brand_belong_category_dict.json', 'w') as fp:
#     json.dump(brand_belong_category_dict_cleaned, fp)

Original length of brand_belong_category_dict: 23
Cleaned length of category_dict: 23
