In [1]:
import pandas as pd
import numpy as np

from spacy.tokens import DocBin
import srsly
import spacy
from spacy.training import offsets_to_biluo_tags

# Lift pandas' column-width limit so long cell values are displayed in full
# instead of being truncated.
pd.set_option('display.max_colwidth', None)

In [91]:
# Import the packaged pipeline as a module and call its load() to create `nlp`,
# the object the entity helper and the playground cells call on search text.
import en_fetch_ner_spacy_tsf
nlp = en_fetch_ner_spacy_tsf.load()

  raise IOError(Errors.E052.format(path=path.parent))

If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current 'transformers' and 'spacy-transformers' versions. For more details and available updates, run: python -m spacy validate


In [2]:
import os

# Directory holding the CSV files. Set the RETAILER_NLP_DATA_DIR environment
# variable to use another location; when it is unset (or empty) the original
# local path is used, so existing runs behave exactly as before.
# os.path.join(..., "") guarantees a trailing separator, which matters because
# the loading cell builds file names by plain concatenation (path + 'categories.csv').
path = os.path.join(
    os.environ.get("RETAILER_NLP_DATA_DIR")
    or "/Users/anthony/Projects/retailer_nlp_challenger/data/",
    "",
)

# # read csv
# df = pd.read_csv('brand_offer_cleaned.csv')
# df.sample(10)

## Build Category Dictionary

Based on manual checks and EDA, no `PRODUCT_CATEGORY` belongs to more than one upper category, so we can build a simple category dictionary.

In [3]:
# Read the category table from the data directory and show its (rows, columns).
df_categories = pd.read_csv(f"{path}categories.csv")
print(df_categories.shape)

(118, 3)


In [4]:
# Rename "IS_CHILD_CATEGORY_TO" to "UPPER_CATEGORY", the column name the
# grouping cell uses. The renamed frame is rebound instead of mutated with
# inplace=True; re-running the cell is harmless because rename ignores a
# source column that is no longer present.
df_categories = df_categories.rename(columns={'IS_CHILD_CATEGORY_TO': 'UPPER_CATEGORY'})
df_categories.head()

Unnamed: 0,CATEGORY_ID,PRODUCT_CATEGORY,UPPER_CATEGORY
0,1f7d2fa7-a1d7-4969-aaf4-1244f232c175,Red Pasta Sauce,Pasta Sauce
1,3e48a9b3-1ab2-4f2d-867d-4a30828afeab,Alfredo & White Pasta Sauce,Pasta Sauce
2,09f3decc-aa93-460d-936c-0ddf06b055a3,Cooking & Baking,Pantry
3,12a89b18-4c01-4048-94b2-0705e0a45f6b,Packaged Seafood,Pantry
4,2caa015a-ca32-4456-a086-621446238783,Feminine Hygeine,Health & Wellness


Assumptions:
- search input will be simple sentences or keywords (e.g. `beer`, or `beers discount Amazon`)
- users won't search by `UPPER_CATEGORY` because the app already has a UI for filtering on it
- `PRODUCT_CATEGORY` values are not specific to brands, so a customer might search `beer` or `white beer`
- the search functions I am using lowercase the text at search time instead of modifying the stored data, because I don't know whether I have write permission on the `PRODUCT_CATEGORY` data

In [9]:
# group by UPPER_CATEGORY and get DISTINCT PRODUCT_CATEGORY
grouped_df = df_categories.groupby('UPPER_CATEGORY')['PRODUCT_CATEGORY'].unique().reset_index()
print(grouped_df.shape)
grouped_df

(23, 2)


Unnamed: 0,UPPER_CATEGORY,PRODUCT_CATEGORY
0,Alcohol,"[Beer, Malt Beverages, Hard Seltzers, Sodas, Waters, Lemonades & Teas, Hard Ciders, Wine, Spirits]"
1,Animals & Pet Supplies,[Dog Supplies]
2,Baby & Toddler,"[Baby Bathing, Potty Training, Baby Safety, Diapering, Baby Health]"
3,Beauty,"[Cosmetic Tools, Makeup, Nail Care, Body Fragrances]"
4,Beverages,"[Coffee, Meal Replacement Beverages, Vegetable Juices, Drink Mixes, Tea, Energy Drinks, Water, Fruit Juices, Carbonated Soft Drinks]"
5,Candy,[Gum]
6,Dairy,"[Cream, Dairy Alternatives, Dairy, Yogurt, Milk, Cheese, Butter & Margarine, Eggs, Refrigerated Doughs]"
7,Deli & Bakery,"[Leafy Salads, Deli Counter, Bakery, Prepared Meals]"
8,Frozen,"[Frozen Fruits, Frozen Desserts, Frozen Sides, Frozen Meals, Frozen, Frozen Vegetables, Ice, Frozen Breads & Doughs, Frozen Pizza & Pizza Snacks, Frozen Breakfast, Frozen Plant-Based Meat, Frozen Appetizers]"
9,Frozen Meat,"[Frozen Turkey, Frozen Chicken, Frozen Beef, Frozen Seafood]"


In [78]:
# Build the lookup dictionary: UPPER_CATEGORY (key) -> list of its PRODUCT_CATEGORY values.
category_dict = {
    upper_category: product_categories.tolist()
    for upper_category, product_categories in zip(
        grouped_df['UPPER_CATEGORY'], grouped_df['PRODUCT_CATEGORY']
    )
}

In [100]:
# helper functions
import re
from typing import Dict, List, Optional, Tuple

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def list_text_cleaner(text:List[str]) -> List[str]:
    """Normalise every string in ``text`` and return the results as a new list.

    Each string is lower-cased, then every character outside ``a``-``z`` is
    replaced by a space (one space per replaced character), so
    ``"Lemonades & Teas"`` becomes ``"lemonades   teas"``.
    """
    return [re.sub(r'[^a-z]', ' ', raw.lower()) for raw in text]

def jaccard_similarity(s1:list[str], s2:list[str]) -> float:
    """Return the Jaccard similarity of two token lists, rounded to 3 digits.

    Both lists are treated as sets, so repeated items count once:
    ``|s1 ∩ s2| / |s1 ∪ s2|``. The union is computed on the sets rather than
    from the list lengths, otherwise duplicates would inflate the denominator.

    Returns 0.0 when both inputs are empty (nothing to compare) instead of
    dividing by zero.
    """
    first, second = set(s1), set(s2)
    union = first | second
    if not union:
        return 0.0
    return round(len(first & second) / len(union), 3)

def match_product_category(s1: list[str], s2: list[str]) -> Optional[str]:
    """Return the first product category that contains one of the search terms.

    Terms from ``s1`` are tried in order; for each term the categories in
    ``s2`` are scanned in order and the first category that contains the term
    as a substring is returned. The search stops at the first hit.

    Empty or whitespace-only terms are skipped: they are a substring of almost
    any category and would otherwise produce a meaningless match.

    Returns ``None`` when no term matches any category.
    """
    for term in s1:
        if not term.strip():
            continue
        for product_category in s2:
            if term in product_category:
                return product_category
    return None

def find_category(search_input: str, search_dict: Dict) -> Optional[Tuple[str, str]]:
    """Find the first category in ``search_dict`` that matches the search input.

    The input is split on whitespace and each token is cleaned with
    ``list_text_cleaner``. Tokens that are blank after cleaning (e.g. a lone
    "&") are dropped, because a blank token would match any category name
    that contains a space. The values of ``search_dict`` are cleaned the same
    way before matching, so the comparison is case-insensitive.

    Returns a ``(key, matched_cleaned_product_category)`` tuple for the first
    dictionary entry with a match (dictionary order), or ``None`` when nothing
    matches or the input has no usable tokens.
    """
    search_list = [
        token for token in list_text_cleaner(search_input.split()) if token.strip()
    ]
    if not search_list:
        return None
    for k, v in search_dict.items():
        search_results = match_product_category(search_list, list_text_cleaner(v))
        if search_results is not None:
            return k, search_results
    return None

def check_entity(search_input) -> bool:
    """Run the ``nlp`` pipeline on ``search_input`` and report whether it found entities.

    Returns the processed doc when it holds at least one entity, otherwise
    ``False``. Note that the success value is the doc itself, not ``True``,
    despite the ``bool`` annotation.
    """
    parsed = nlp(search_input)
    return parsed if len(parsed.ents) > 0 else False

def get_cosine_sim(input_text: str, texts: List[str]) -> pd.DataFrame:
    """Score ``input_text`` against every string in ``texts`` by cosine similarity.

    All strings are vectorised together with ``get_vectors``. Returns a
    DataFrame with one row per entry of ``texts`` and the columns
    'Sentence Text' and 'Cosine Similarity Score'.
    """
    corpus = [input_text] + texts
    pairwise = cosine_similarity(get_vectors(*corpus))
    # Row 0 belongs to input_text; skip column 0, its similarity with itself.
    return pd.DataFrame(
        {'Sentence Text': texts, 'Cosine Similarity Score': pairwise[0, 1:]}
    )

def get_vectors(*strs: str) -> np.ndarray:
    """Turn the given strings into token-count vectors.

    A fresh ``CountVectorizer`` learns its vocabulary from exactly these
    strings; the result is a dense array with one row per input string.
    """
    documents = list(strs)
    return CountVectorizer().fit_transform(documents).toarray()


# def analyze_search_input(search_input: str):
#     """Main function. Takes in a search_input and decides whether it contains entities, then executes the appropriate functions."""
#     if check_entity(search_input) is None:
#        #xxxx
#     else:
#         entities = check_entity(search_input)
#         #

## Playground to test

In [90]:
# Playground: category lookup for a two-word query.

test1 = "Beer Tea"
test2 = [
    "Beer",
    "Malt Beverages",
    "Hard Seltzers",
    "Sodas",
    "Waters",
    "Lemonades & Teas",
    "Hard Ciders",
    "Wine",
    "Spirits",
]

# Cleaned query tokens, kept for trying match_product_category by hand (see below).
search_list = list_text_cleaner(test1.split())

# match_product_category(search_list, test2)

# for k, v in category_dict.items():
#     search_results = match_product_category(search_list, v)
#     if search_results is not None:
#          print(k, search_results)

find_category(test1, category_dict)

('Alcohol', 'beer')

In [99]:
# Run the pipeline on a sample query and render the entities it detected.
text = nlp("M&M's candy discount at Target")
spacy.displacy.render(text, style="ent", jupyter=True) # display in Jupyter

In [101]:
# NOTE: `text` is the doc produced by the previous cell, not a raw string;
# check_entity passes it through nlp again and returns the doc when entities are present.
check_entity(text)

M&M's candy discount at Target

In [108]:
# List every entity found in `text` as a (text, label) pair.
[(ent.text, ent.label_) for ent in text.ents]

[('M&M', 'BRAND'), ('Target', 'RETAILER')]

In [10]:
input_text = "this is the input text"
texts = ["this is the first text", "this is the second text", "this is the third text", "this is the fourth text"]

# get_cosine_sim already returns a DataFrame with the 'Sentence Text' and
# 'Cosine Similarity Score' columns, so its result is used directly instead of
# being wrapped as a column inside a second DataFrame.
cosine_sim_scores = get_cosine_sim(input_text, texts)
df = cosine_sim_scores

df


Unnamed: 0,Sentence Text,Cosine Similarity Score
0,this is the first text,0.8
1,this is the second text,0.8
2,this is the third text,0.8
3,this is the fourth text,0.8
