In [1]:
import pandas as pd
import numpy as np

from spacy.tokens import DocBin
import srsly
import spacy
from spacy.training import offsets_to_biluo_tags

pd.set_option('display.max_colwidth', None)

In [2]:
# Importing as module.
# `nlp` is the module-level pipeline that check_entity() calls on every search input.
import en_fetch_ner_spacy_tsf
nlp = en_fetch_ner_spacy_tsf.load()


If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current 'transformers' and 'spacy-transformers' versions. For more details and available updates, run: python -m spacy validate


In [3]:
import os, pickle, json

# These two objects are what the offer-finder helpers take as
# `brand_belong_category_dict` and `offered_brands`.
path1 = "data/brand_belong_category_dict.json"
path2 = "data/offered_brands.pkl"

# dict: product category name -> list of brand names
with open(path1, 'r') as f:
    brand_belong_category_dict = json.load(f)

# list of brand names
# NOTE(review): pickle.load can execute arbitrary code stored in the file; only load a trusted artifact.
with open(path2, 'rb') as f:
    offered_brands = pickle.load(f)

In [12]:
# Offer text with its RETAILER and BRAND columns; RETAILER is empty for some rows.
# offer_finder_by_entity() filters this frame by the upper-cased entity label (BRAND / RETAILER).
df_offers_brand_retailer = pd.read_csv('data/offer_retailer.csv')
df_offers_brand_retailer.head()

Unnamed: 0,OFFER,RETAILER,BRAND
0,Spend $50 on a Full-Priced new Club Membership,SAMS CLUB,SAMS CLUB
1,"Beyond Meat® Plant-Based products, spend $25",,BEYOND MEAT
2,Good Humor Viennetta Frozen Vanilla Cake,,GOOD HUMOR
3,"Butterball, select varieties, spend $10 at Dillons Food Store",DILLONS FOOD STORE,BUTTERBALL
4,"GATORADE® Fast Twitch®, 12-ounce 12 pack, at Amazon Storefront*",AMAZON,GATORADE


Assumptions:
- search input will be simple sentences or keywords (e.g. `beer`, or `beers discount Amazon`)
- users won't search by upper_category because the app already has a UI for filtering on it
- `PRODUCT_CATEGORY` values are not specific to brands, so a customer might search `beer` or `white beer`
- the search functions I am using will clean the text to lower case. This is because I don't know if I have write permission for the `PRODUCT_CATEGORY` data

Notes about main() function:
- if we have a category and we want to recommend offers within the same category, we have to know what BRANDS one category has and then filter out the offers associated with the brands

Issues with dataset:
- `Alcohol` should be renamed to something like `Drinks`. It contains more than alcoholic drinks: `[Beer, Malt Beverages, Hard Seltzers, Sodas, Waters, Lemonades & Teas, Hard Ciders, Wine, Spirits]`

## Helper functions

In [4]:
# Build the module-level `stop_words` set that single_text_cleaner() reads
# when it is called with remove_stopwords=True.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# additional_stop_words = {'my', 'additional', 'stopword'}
# stop_words.update(additional_stop_words)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anthony/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
# helper functions
import re
from typing import Dict, List, Optional, Tuple

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def single_text_cleaner(text: str, remove_stopwords: bool=False, upper_case: bool = False, remove_punctuation: bool=True) -> str:
    """Normalise the case of one string and optionally strip punctuation and stop words.

    Args:
        text: raw input string.
        remove_stopwords: drop whitespace-separated tokens that appear in the
            module-level ``stop_words`` set.
        upper_case: return upper-case text instead of the default lower-case text.
        remove_punctuation: delete every character that is not an ASCII letter
            or whitespace (digits are removed as well).

    Returns:
        The cleaned string.
    """
    text = text.upper() if upper_case else text.lower()
    if remove_punctuation:
        # Keep letters of both cases: a lower-case-only class erased the whole
        # string whenever upper_case=True.
        text = re.sub(r'[^a-zA-Z\s]', '', text)
    if remove_stopwords:
        # Lower each token for the lookup so upper-cased text is filtered too
        # (presumably stop_words holds lower-case words; no-op for lower-case text).
        text = ' '.join(word for word in text.split() if word.lower() not in stop_words)
    return text

def list_text_cleaner(texts: List[str], upper_case: bool = False, remove_stopwords: bool = False, remove_punctuation: bool=True) -> List[str]:
    """Apply single_text_cleaner to every string in ``texts`` and return the results in order.

    With the defaults each string is lower-cased and stripped of
    non-alphabetical characters; stop words are only removed when
    ``remove_stopwords`` is True.
    """
    cleaned = []
    for raw in texts:
        cleaned.append(
            single_text_cleaner(
                raw,
                remove_stopwords=remove_stopwords,
                upper_case=upper_case,
                remove_punctuation=remove_punctuation,
            )
        )
    return cleaned

def match_product_category(s1: list[str], s2: list[str]) -> str:
    """Return the first entry of ``s2`` that contains an item of ``s1`` as a substring.

    Items of ``s1`` are tried in order, and for each one ``s2`` is scanned in
    order; the search stops at the first hit. Returns None when nothing matches.
    """
    for term in s1:
        for candidate in s2:
            if term in candidate:
                return candidate
    return None

def find_category(search_input: str, search_dict: Dict) -> Optional[Tuple[str, str]]:
    """Find which category a search input refers to.

    The input is split on commas/whitespace and cleaned to lower case; each
    dictionary value list is lower-cased (punctuation kept) and scanned with
    match_product_category.

    Args:
        search_input: free-text search string.
        search_dict: mapping of upper category -> list of product categories.

    Returns:
        ``(upper_category, matched_product_category)`` for the first dictionary
        entry that matches (the product category is the cleaned, lower-case
        form), or None when no entry matches.
    """
    cleaned_tokens = list_text_cleaner(re.split(r'[,\s]+', search_input), remove_stopwords=False)
    # Drop empty tokens: '' is a substring of every string and would "match" the first category.
    search_list = [token for token in cleaned_tokens if token]
    for upper_category, product_categories in search_dict.items():
        candidates = list_text_cleaner(product_categories, remove_punctuation=False)
        search_results = match_product_category(search_list, candidates)
        if search_results is not None:
            return upper_category, search_results
    # Only give up after every entry was checked; returning from inside the
    # loop meant nothing past the first dictionary key was ever searched.
    print(f'Function find_category: No category is matched for input: {search_input}')
    return None

def check_entity(search_input) -> bool:
    """Run the module-level ``nlp`` pipeline on ``search_input`` and look for entities.

    Returns the parsed document when it has at least one entity, otherwise
    False (so the result is not always a bool, despite the annotation).
    """
    parsed = nlp(search_input)
    has_entities = len(parsed.ents) > 0
    return parsed if has_entities else False

def get_cosine_sim(input_text: str, texts: List[str]) -> pd.DataFrame:
    """Score ``input_text`` against each entry of ``texts`` with cosine similarity.

    Both sides are cleaned (stop words removed) and turned into count vectors
    via get_vectors before scoring.

    Args:
        input_text: the query string.
        texts: candidate strings to compare against.

    Returns:
        A dataframe with columns ``OFFER`` (the original, uncleaned texts) and
        ``Cosine Similarity Score``, sorted by score in descending order.
    """
    query = list_text_cleaner([input_text], remove_stopwords=True)[0]
    candidates = list_text_cleaner(texts, remove_stopwords=True)
    # The query is passed first so it occupies row 0 of the count matrix.
    count_matrix = get_vectors(query, *candidates)
    # Row 0, columns 1..: the query against every candidate.
    scores = cosine_similarity(count_matrix)[0, 1:]
    ranked = pd.DataFrame({'OFFER': texts, 'Cosine Similarity Score': scores})
    return ranked.sort_values(by='Cosine Similarity Score', ascending=False).reset_index(drop=True)

def get_vectors(*strs: str) -> np.ndarray:
    """Fit a fresh CountVectorizer on the given strings and return their dense count matrix.

    The result has one row per input string, in the order the strings were passed.
    """
    documents = [*strs]
    return CountVectorizer().fit_transform(documents).toarray()

def jaccard_similarity(s1: List[str], s2: List[str]) -> float:
    """Return the Jaccard similarity of two token lists, rounded to 3 digits.

    Both lists are treated as sets, so repeated tokens do not affect the score.

    Args:
        s1: first list of tokens.
        s2: second list of tokens.

    Returns:
        ``|s1 ∩ s2| / |s1 ∪ s2|`` rounded to three decimals, or 0.0 when both
        lists are empty.
    """
    first, second = set(s1), set(s2)
    # Size the union from the sets: using the list lengths counted duplicates
    # and understated the score.
    union_size = len(first | second)
    if union_size == 0:
        # Two empty inputs share nothing; avoid dividing by zero.
        return 0.0
    return round(len(first & second) / union_size, 3)

def get_jaccard_sim(input_text: str, texts: List[str]) -> pd.DataFrame:
    """Score ``input_text`` against each entry of ``texts`` with Jaccard similarity.

    Both sides are cleaned (stop words removed) and split on whitespace before
    being compared with jaccard_similarity.

    Args:
        input_text: the query string.
        texts: candidate strings to compare against.

    Returns:
        A dataframe with columns ``OFFER`` (the original, uncleaned texts) and
        ``Jaccard Similarity Score``, sorted by score in descending order.
    """
    query_tokens = list_text_cleaner([input_text], remove_stopwords=True)[0].split()

    scores = []
    for cleaned in list_text_cleaner(texts, remove_stopwords=True):
        scores.append(jaccard_similarity(query_tokens, cleaned.split()))

    ranked = pd.DataFrame({'OFFER': texts, 'Jaccard Similarity Score': scores})
    # highest score first
    return ranked.sort_values(by='Jaccard Similarity Score', ascending=False).reset_index(drop=True)

def find_column(df: pd.DataFrame, keyword: str) -> str:
    """Return the name of the first column containing ``keyword`` (case-insensitive), or None.

    Similarity dataframes are assumed to carry at most one score column.
    """
    needle = keyword.lower()
    matches = list(filter(lambda name: needle in name.lower(), df.columns))
    if not matches:
        return None
    return matches[0]

def extract_similar_offers(data: pd.DataFrame, threshold: float = 0.0) -> pd.DataFrame:
    """Filter a similarity dataframe down to rows whose score is >= ``threshold``.

    ``data`` is the output of get_cosine_sim() or get_jaccard_sim(); the score
    column is located with find_column(data, 'score'). The comparison is
    inclusive, so rows scoring exactly ``threshold`` are kept.
    """
    score_column = find_column(data, 'score')
    meets_threshold = data[score_column] >= threshold
    return data.loc[meets_threshold]

def category_to_brand(category: str, offered_brands: List, brand_belong_category_dict: Dict) -> Optional[List[str]]:
    """Return the offered brands that belong to a product category.

    Use case: when a user searches for a category, we return the brands in
    that category that also appear in ``offered_brands``.

    Args:
        category: product category name, in any letter case.
        offered_brands: brand names that currently have offers.
        brand_belong_category_dict: mapping of product category -> brand names.

    Returns:
        The brands present in both lists (order not guaranteed), or None when
        the category is unknown or none of its brands is offered.
    """
    wanted = category.upper()
    search_brands = brand_belong_category_dict.get(wanted)
    if search_brands is None:
        # Keys are not guaranteed to be upper case (other cells index this dict
        # with Title-Case keys), so fall back to a case-insensitive scan.
        search_brands = next(
            (brands for key, brands in brand_belong_category_dict.items() if key.upper() == wanted),
            None,
        )
    if search_brands is None:
        # Unknown category: report it the same way as "no offered brand" instead of raising KeyError.
        print(f"Function category_to_brand | Category {category} is not in the brand/category dictionary")
        return None

    result = list(set(search_brands) & set(offered_brands))
    if len(result) > 0:
        print(f"Function category_to_brand | Found {category} in offered brand")
        return result
    print(f"Function category_to_brand | No offered brand is found in {category}")
    return None

# def offer_finder_by_score(df1: pd.DataFrame, df2: pd.DataFrame, threshold: float = 0.0) -> pd.DataFrame:
#     """Part of the main function when we calculated two scores (cos and jaccard). 
#     Takes in two dataframes and return a dataframe of offers with the top 10 rows with highest score from each dataframe"""
#     offer1 = extract_similar_offers(df1, threshold)['OFFER']
#     offer2 = extract_similar_offers(df2, threshold)['OFFER']
#     output = pd.concat([df1, df2])
#     return output

class CatchErros(Exception):
    """Container exception that groups the search-related error types below."""

    class ParamsInputError(Exception):
        """Nested error type; by its name, intended for invalid parameter input."""

    class SearchFailedError(Exception):
        """Nested error type; by its name, intended for searches that find nothing."""


def offer_finder_by_category(search_input: str, search_category_tuple: Tuple, offers: pd.DataFrame, category_dict: Dict, offered_brands: List,
                             brand_belong_category_dict: Dict, score: str, threshold: float = 0.0) -> Optional[pd.DataFrame]:
    """Find offers for the product category matched by a search input.

    We assume people search one category at a time.

    Args:
        search_input: the raw search string; also used as the similarity query.
        search_category_tuple: ``(upper_category, product_category)`` as returned
            by find_category, e.g. ``('Alcohol', 'beer')``.
        offers: dataframe with at least ``BRAND`` and ``OFFER`` columns.
        category_dict: mapping of upper category -> product categories, used
            for the fallback suggestion message.
        offered_brands: brand names that currently have offers.
        brand_belong_category_dict: mapping of product category -> brand names.
        score: similarity measure, ``'cosine'`` or ``'jaccard'``.
        threshold: minimum score passed on to extract_similar_offers.

    Returns:
        The similarity dataframe filtered by ``threshold``, or None when no
        offered brand belongs to the product category.

    Raises:
        ValueError: if ``score`` is neither ``'cosine'`` nor ``'jaccard'``.
    """
    # Validate first so a bad score is reported regardless of whether brands are found.
    if score not in ('cosine', 'jaccard'):
        raise ValueError(f'Please enter a valid score: cosine or jaccard; Not {score}')

    upper_category, product_category = search_category_tuple[0], search_category_tuple[1]
    print(f'Function offer_finder_by_category | Found items:\n- Search input: {search_input}\n- Product category: {product_category}\n- Upper category: {upper_category}')

    potential_brands = category_to_brand(product_category, offered_brands, brand_belong_category_dict)
    if potential_brands is None:
        potential_product_categories = category_dict[upper_category]
        # we could still calculate similarity here, but that is computationally expensive
        msg = f'{product_category} is not found. Do you wanna take a look at these similar offers in {upper_category}?\n We have: {potential_product_categories}'
        print(msg)
        return None

    potential_offers = offers[offers['BRAND'].isin(potential_brands)]['OFFER'].tolist()
    if score == 'cosine':
        sim_scores = get_cosine_sim(search_input, potential_offers)
    else:
        sim_scores = get_jaccard_sim(search_input, potential_offers)
    return extract_similar_offers(sim_scores, threshold)

def offer_finder_by_entity(search_input: str, entities: Tuple, offers_data: pd.DataFrame, score: str, threshold: float=0.0) -> Optional[pd.DataFrame]:
    """Find offers based on entities identified from the search input.

    Entities are tried in order. For each one, ``offers_data`` is filtered on
    the column named after the upper-cased entity label, keeping rows equal to
    the upper-cased entity text. The first entity with matching offers
    determines the result.

    Args:
        search_input: the raw search string; also used as the similarity query.
        entities: iterable of objects exposing ``.text`` and ``.label_``.
        offers_data: dataframe with an ``OFFER`` column plus one column per
            entity label.
        score: similarity measure, ``'cosine'`` or ``'jaccard'``.
        threshold: minimum score passed on to extract_similar_offers.

    Returns:
        The similarity dataframe (filtered by ``threshold``) for the first
        entity that has offers, or None when no entity has any.

    Raises:
        ValueError: if ``score`` is neither ``'cosine'`` nor ``'jaccard'``.
    """
    # Validate first so a bad score is reported regardless of what the entities match.
    if score not in ('cosine', 'jaccard'):
        raise ValueError(f'Please enter a valid score: cosine or jaccard; Not {score}')

    for ent in entities:
        ent_name, ent_label = ent.text, ent.label_
        print(f'Function offer_finder_by_entity | Found entity: {ent_name} with label: {ent_label}')
        column = ent_label.upper()
        if column not in offers_data.columns:
            # A label with no matching column cannot be used as a filter; try the next entity.
            print(f'Function offer_finder_by_entity | No column {column} in offers data for entity: {ent_name}')
            continue
        # filter offers by entity
        df_tmp = offers_data[offers_data[column] == ent_name.upper()]
        if df_tmp.shape[0] == 0:
            # Keep going: a later entity (e.g. the retailer) may still have offers.
            print(f'Function offer_finder_by_entity | No offer is found for the brand/retailer: {ent_name}')
            continue
        print(f'Function offer_finder_by_entity | Found {df_tmp.shape[0]} offer(s) for the brand/retailer: {ent_name}')
        potential_offers = df_tmp['OFFER'].drop_duplicates().tolist()
        if score == 'cosine':
            sim_scores = get_cosine_sim(search_input, potential_offers)
        else:
            sim_scores = get_jaccard_sim(search_input, potential_offers)
        return extract_similar_offers(sim_scores, threshold)
    return None

# def main(search_input: str, offers: pd.DataFrame, category_dict: Dict, score_threshold: float = 0.0):
#     """Main function. Takes in a serach_input and decide whether it can find entities or not. Then excecute the appropriate functions"""
#     check_ent = check_entity(search_input)
#     if check_entity(search_input) is None:
#        # check category
#        cat_check = find_category(search_input, category_dict)
#        if cat_check is None:
#            raise SearchFailedError('No brand/retailer/category is found. Please try again.')
#        else:
#             # we assume people just search one category at a time
#             cat_tuple = cat_check # ('Alcohol', 'beer')
#             search_results = offer_finder_by_category(search_input, category_dict, cat_tuple, score_threshold)           
#             return search_results
#     else:
#         entities = check_ent.ents # entities will be a tuple anyways
#         print(f'Found {len(entities)} entity object(s) in the search input.')
#         search_results = offer_finder_by_entity(entities, offers, score_threshold)
            
            

In [13]:
df_offers_brand_retailer.head()

Unnamed: 0,OFFER,RETAILER,BRAND
0,Spend $50 on a Full-Priced new Club Membership,SAMS CLUB,SAMS CLUB
1,"Beyond Meat® Plant-Based products, spend $25",,BEYOND MEAT
2,Good Humor Viennetta Frozen Vanilla Cake,,GOOD HUMOR
3,"Butterball, select varieties, spend $10 at Dillons Food Store",DILLONS FOOD STORE,BUTTERBALL
4,"GATORADE® Fast Twitch®, 12-ounce 12 pack, at Amazon Storefront*",AMAZON,GATORADE


In [None]:
# text = "Klondike Cones at Walmart"
# test = "Simply Spiked Lemonade 12 pack"
test  = "Lemonade 12 pack"


# # check_entity(text)
# cat_tuple = find_category(test, category_dict)
# # category_to_brand(cat_tuple[1], offered_brands, brand_belong_category_dict)

# # offer_finder_by_category(test, cat_tuple, df_offers, category_dict, offered_brands, brand_belong_category_dict, 'jaccard', 0.0)
# offer_finder_by_category(test, cat_tuple, df_offers, category_dict, offered_brands, brand_belong_category_dict, 'cosine', 0.0)

In [None]:
# NOTE(review): this cell cannot run on a fresh kernel — `main` only exists as a
# commented-out draft in the helper cell, and no cell above assigns `df_offers`
# or `category_dict`.
test = "Simply Spiked Lemonade 12 pack"
main(test, df_offers, category_dict, 0.0)

In [31]:
# Entity-based search demo: detect entities, then rank that entity's offers by cosine similarity.
test = "Simply Spiked Lemonade 12 pack"
# check_entity returns False when no entity is found; `.ents` below would then fail.
check_ent = check_entity(test)
entities = check_ent.ents

offer_finder_by_entity(test, entities, df_offers_brand_retailer, 'cosine')

Function offer_finder_by_entity | Found entity: Simply Spiked with label: BRAND
Function offer_finder_by_entity | Found 4 offer(s) for the brand/retailer: Simply Spiked


Unnamed: 0,OFFER,Cosine Similarity Score
0,Simply Spiked™ Lemonade 12 pack+,1.0
1,Simply Spiked™ 12 pack+,0.866025
2,Simply Spiked™ Signature Peach 12 pack+,0.67082
3,"Simply Spiked™ Signature Peach 12-pack or larger, buy 2",0.566947


## Playground to test

### Functions test

In [310]:
# test case: find category

# NOTE(review): relies on `category_dict` already being in the kernel namespace —
# no cell above this one assigns it. `test2` is not used by the call below.
test1 = "White German Beer"
test2 = ["Beer", "Malt Beverages", "Hard Seltzers", "Sodas", "Waters", "Lemonades & Teas", "Hard Ciders", "Wine", "Spirits"]

find_category(test1, category_dict)


('Alcohol', 'beer')

In [380]:
test_text = "Klondike Cones at Walmart"
# test_text = "Klondike Cones"
# test_test = "M&M's candy discount at Target"


doc = nlp(test_text)
spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

In [381]:
# [(ent.text, ent.label_) for ent in text.ents]

# len(check_entity(test_text).ents)

# type(check_entity(test_text).ents)

for ent in doc.ents:
    print(f'Entity: {ent.text} | Label: {ent.label_}')

Entity: Klondike | Label: BRAND
Entity: Walmart | Label: RETAILER


In [128]:
# test on Cosine Similarity for search_offer comparison

input_text = "Membership at Costco"
texts = ["Sara Lee bread, select varieties, buy 2 at Walmart", 
         "Back to the Roots Garden Soil, 1 cubic foot, at Lowe's Home Improvement", 
         "When you join Costco as a Gold Star Member", 
         "this is the fourth text"]

cosine_sim_scores = get_cosine_sim(input_text, texts)
cosine_sim_scores

Unnamed: 0,OFFER,Cosine Similarity Score
0,When you join Costco as a Gold Star Member,0.316228
1,"Sara Lee bread, select varieties, buy 2 at Walmart",0.0
2,"Back to the Roots Garden Soil, 1 cubic foot, at Lowe's Home Improvement",0.0
3,this is the fourth text,0.0


In [129]:
cosine_sim_scores["OFFER"]

0                                 When you join Costco as a Gold Star Member
1                         Sara Lee bread, select varieties, buy 2 at Walmart
2    Back to the Roots Garden Soil, 1 cubic foot, at Lowe's Home Improvement
3                                                    this is the fourth text
Name: OFFER, dtype: object

In [309]:
# test on Jaccard Similarity for search_offer comparison

input_text = "Membership at Costco"
texts = ["Sara Lee bread, select varieties, buy 2 at Walmart", 
         "Back to the Roots Garden Soil, 1 cubic foot, at Lowe's Home Improvement", 
         "When you join Costco as a Gold Star Member", 
         "this is the fourth text"]

result = get_jaccard_sim(input_text, texts)
result

Unnamed: 0,OFFER,Jaccard Similarity Score
0,When you join Costco as a Gold Star Member,0.167
1,"Sara Lee bread, select varieties, buy 2 at Walmart",0.0
2,"Back to the Roots Garden Soil, 1 cubic foot, at Lowe's Home Improvement",0.0
3,this is the fourth text,0.0


In [308]:
# test case on extract_similar_offers
extract_similar_offers(get_jaccard_sim(input_text, texts))

Unnamed: 0,OFFER,Jaccard Similarity Score
0,When you join Costco as a Gold Star Member,0.167


In [307]:
extract_similar_offers(get_cosine_sim(input_text, texts))

Unnamed: 0,OFFER,Cosine Similarity Score
0,When you join Costco as a Gold Star Member,0.316228


In [73]:
# df_offers = pd.read_csv(path + 'brand_offer_cleaned.csv')
# df_offers.sample(10)

### Similarity match test

In [171]:
# read pickle and json files that we will use

import pickle
import json

# NOTE(review): repeats the load already done near the top of the notebook
# (same paths, same variable names).
path1 = "data/brand_belong_category_dict.json"
path2 = "data/offered_brands.pkl"

# dict: product category name -> list of brand names
with open(path1, 'r') as f:
    brand_belong_category_dict = json.load(f)

# list of brand names
# NOTE(review): pickle.load can execute arbitrary code stored in the file; only load a trusted artifact.
with open(path2, 'rb') as f:
    offered_brands = pickle.load(f)

In [170]:
# change all keys in brand_belong_category_dict to upper case
# brand_belong_category_dict = {k.upper(): v for k, v in brand_belong_category_dict.items()}
# brand_belong_category_dict

# import json
# with open('data/brand_belong_category_dict.json', 'w') as fp:
#     json.dump(brand_belong_category_dict, fp)

Operation data test

In [6]:
print(f'offered_brands: {type(offered_brands)}')
print(f'brand_belong_category_dict: {type(brand_belong_category_dict)}')

offered_brands: <class 'list'>
brand_belong_category_dict: <class 'dict'>


In [7]:
offered_brands[:5]

['BEYOND MEAT', 'GOOD HUMOR', 'BUTTERBALL', 'GATORADE', 'DR PEPPER']

In [8]:
brand_belong_category_dict['Baby Bathing'][:5]

['BABY SOFT', 'PREDO', 'GERBER', 'ENFAMIL', "EARTH'S BEST"]

In [9]:
# test case: I have a category, now I want to find all the brands (with offers) that belong to this category
# so I need to find a intersection of two lists

# The dictionary is indexed here with a Title-Case key, exactly as stored.
category = 'Packaged Meals & Sides'
search_brands = brand_belong_category_dict[category]

# find the intersection of two lists
# (set-based, so the order of `result` is not guaranteed)
result = list(set(search_brands) & set(offered_brands))
result

['MCALISTERS DELI', 'BACK TO THE ROOTS']

## Build Category Dictionary

Based on manual checks and EDA, no product_category belongs to more than one upper category, so we can build a simple category dictionary.

In [11]:
# Build `category_dict` (UPPER_CATEGORY -> list of PRODUCT_CATEGORY names), the
# lookup that find_category() and offer_finder_by_category() take as input.
df_categories = pd.read_csv('data/categories.csv')
print(df_categories.shape)

# rename "IS_CHILD_CATEGORY_TO" to "UPPER_CATEGORY"
# (reassign instead of inplace=True so the cell does not mutate a frame in place)
df_categories = df_categories.rename(columns={'IS_CHILD_CATEGORY_TO': 'UPPER_CATEGORY'})
display(df_categories.head())

# group by UPPER_CATEGORY and get DISTINCT PRODUCT_CATEGORY
grouped_df = df_categories.groupby('UPPER_CATEGORY')['PRODUCT_CATEGORY'].unique().reset_index()
print(grouped_df.shape)
display(grouped_df.head())

# generate a dictionary of UPPER_CATEGORY (key) and PRODUCT_CATEGORY (value);
# other cells pass `category_dict` around, so it has to be defined here.
category_dict = {
    upper_category: product_categories.tolist()
    for upper_category, product_categories in zip(grouped_df['UPPER_CATEGORY'], grouped_df['PRODUCT_CATEGORY'])
}

(118, 3)


Unnamed: 0,CATEGORY_ID,PRODUCT_CATEGORY,UPPER_CATEGORY
0,1f7d2fa7-a1d7-4969-aaf4-1244f232c175,Red Pasta Sauce,Pasta Sauce
1,3e48a9b3-1ab2-4f2d-867d-4a30828afeab,Alfredo & White Pasta Sauce,Pasta Sauce
2,09f3decc-aa93-460d-936c-0ddf06b055a3,Cooking & Baking,Pantry
3,12a89b18-4c01-4048-94b2-0705e0a45f6b,Packaged Seafood,Pantry
4,2caa015a-ca32-4456-a086-621446238783,Feminine Hygeine,Health & Wellness


(23, 2)


Unnamed: 0,UPPER_CATEGORY,PRODUCT_CATEGORY
0,Alcohol,"[Beer, Malt Beverages, Hard Seltzers, Sodas, Waters, Lemonades & Teas, Hard Ciders, Wine, Spirits]"
1,Animals & Pet Supplies,[Dog Supplies]
2,Baby & Toddler,"[Baby Bathing, Potty Training, Baby Safety, Diapering, Baby Health]"
3,Beauty,"[Cosmetic Tools, Makeup, Nail Care, Body Fragrances]"
4,Beverages,"[Coffee, Meal Replacement Beverages, Vegetable Juices, Drink Mixes, Tea, Energy Drinks, Water, Fruit Juices, Carbonated Soft Drinks]"
