## Import necessary packages and define helper functions

In [23]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import wordnet
import spacy
from numpy import dot
from numpy.linalg import norm
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
def remove_stopwords(line, stopword_set):
    new_words = []
    
    for word in word_tokenize(line):
        if word.lower() in stopword_set:
            continue
        new_words.append(word.lower())
    
    return ' '.join(new_words)

def word_count(lst):
    d = {}
    for line in lst:
        for word in line.split():
            d[word] = d.get(word,0) + 1
    return d


# https://gaurav5430.medium.com/using-nltk-for-lemmatizing-sentences-c1bfff963258
def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return lemmatized_sentence

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None


In [3]:
def find_cat(df, col, idx_lst):
    for i in idx_lst:
        string = df.loc[i,col]
        if regex_bottom.search(string):
            df.loc[i,'product_category'] = 'bottom'
        elif regex_shoe.search(string):
            df.loc[i,'product_category'] = 'shoes'
        elif regex_accessory.search(string):
            df.loc[i,'product_category'] = 'accessory'
        elif regex_top.search(string):
            df.loc[i,'product_category'] = 'top'
        else:
            df.loc[i,'product_category'] = 'other'
    return df
    

## Data Cleaning for the product documents

In [4]:
# read in the excel for products, replace null values with word 'unknown', combined features details and description together
df = pd.read_excel('Behold+product+data+04262021.xlsx')
df.fillna(value='Unknown',inplace=True)
features = ['details', 'description']
df['combined'] = df[features].apply(lambda col: ' '.join(col.astype(str)), axis=1)

In [5]:
# Assign each product with a product category using regex
regex = re.compile(r'[^\w\s\d]')
df['brand_cat'] = df['brand_category'].apply(lambda x: regex.sub(' ', x))

bottoms = r'(?i)\b(pants?|jeans?|skirts?|shorts?|bottoms?|trousers?|legs|leggings)\b'
shoes = r'(?i)\b(shoes?|heels?|sandals?|wedges?|boots?|booties?|uggs?|flats?|skates|flip-flops?|brogues?|skates?|jackboots?|sneakers?|slippers?)\b'
accessory = r'(?i)\b(case|scarf|scarves|handbags?|purse|clutch|bags?|muffs?|wristlets?|baguettes?|totes?|backpacks?|hats?|masks?|jewelry|earrings?|necklaces?|rings?|watch|bracelets?)\b'
top = r'(?i)\b(shirts?|coats?|jackets?|sweaters?|tops?|collars?|sweatshirts?|sleeves?|tshirts?|tanks?)\b'

regex_bottom = re.compile(bottoms)
regex_shoe = re.compile(shoes)
regex_accessory = re.compile(accessory)
regex_top = re.compile(top)

# first assign category based on the brand_category feature
all_idx = list(df.index)
df_withcat = find_cat(df, 'brand_cat', all_idx)
# if brand_category features did not contain useful information, assign category using details and description
other_idx = list(df_withcat[df_withcat['product_category']=='other'].index)
df_withcat = find_cat(df_withcat, 'combined', other_idx)

In [6]:
# clean the text features for each product
X = df[['combined']]
regex = re.compile(r'[^\w\s\d]')
X['cleaned'] = X['combined'].apply(lambda x: regex.sub(' ', x))

# remove stopwords in the text features
stp = set(stopwords.words('english'))
added = ['unknown','½ï']
stp.update(added)
a = X['cleaned'].apply(lambda x: remove_stopwords(x, stp))
X['removed'] = a

# lemmatize the features. We chose lemmatization over stemming to keep the meaning of the words
lemmatizer = WordNetLemmatizer()
a = X['removed'].apply(lambda x: lemmatize_sentence(x))
b = a.apply(lambda x: ' '.join(x))

# futher clean the features by using regex to remove single character words and digits
regex = re.compile(r'\d')
b = b.apply(lambda x: regex.sub(' ',x))
regex_let = re.compile(r'\b\w\b')
b = b.apply(lambda x: regex_let.sub('',x))
df['text_feature'] = b

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['cleaned'] = X['combined'].apply(lambda x: regex.sub(' ', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['removed'] = a


## Final dataframe used for all search querys

In [7]:
# this is the cleanned dataframe that used for search query. Since this dataframe is identical to all querys, 
# so we did not include data cleaning before in our search query function to save run time 
df_cleaned = df[['product_id','text_feature','product_category']]

## Search query function

In [8]:
def search(user_query: str):
    # perform datacleaning for the input query
    regex = re.compile(r'[^\w\s\d]')
    query = regex.sub(' ', user_query)
    query_df = pd.DataFrame({'product_id':['query'],'text_feature':[query]},index=[61355])
    df_withquery = pd.concat([df_cleaned,query_df])
    
    # use TF-IDF to vectorize the product table, max_features is set for 1000 to avoid noise
    products = df_withquery['text_feature'].values
    vectorizer = TfidfVectorizer(max_features=1000)
    X = vectorizer.fit_transform(products)
    tfidf_df = pd.DataFrame(X.toarray(), columns= vectorizer.get_feature_names(), index=df_withquery.index)
    
    # calculate the tf-idf vector for the input query
    query_tfidf = tfidf_df.iloc[61355]
    
    # generate a simialrity table to reflect similarity between the input query and all avaliable products
    similarity_lst = {}
    for i in range(len(tfidf_df)-1):
        product = tfidf_df.iloc[i]
        if (norm(product)*norm(query_tfidf)) == 0:
            similarity_lst[i] = 0
            continue
        cos_sim = dot(product, query_tfidf)/(norm(product)*norm(query_tfidf))
        similarity_lst[i] = cos_sim
    d = list(sorted(similarity_lst.items(), key=lambda item: item[1], reverse=True))
    
    # for each category, we return one product with highest cosine similarity. 
    bottom = 'Unknow'
    shoes = 'Unknow'
    accessory = 'Unknow'
    top = 'Unknow'
    
    for i in range(10000):
        product_idx = d[i][0]
        cat = df_cleaned.loc[product_idx,'product_category']
        if bottom == 'Unknow' and cat =='bottom':
            bottom = (df.loc[product_idx,'name'], df.loc[product_idx,'product_id'])
        if shoes == 'Unknow' and cat =='shoes':
            shoes = (df.loc[product_idx,'name'], df.loc[product_idx,'product_id'])
        if accessory== 'Unknow' and cat =='accessory':
            accessory = (df.loc[product_idx,'name'], df.loc[product_idx,'product_id'])
        if top== 'Unknow' and cat =='top':
            top = (df.loc[product_idx,'name'], df.loc[product_idx,'product_id'])
        if (bottom != 'Unknow') and (shoes != 'Unknow') and (accessory != 'Unknow') and (top != 'Unknow'):
            break
    
    result_dic = {'bottom':bottom, 'shoes':shoes, 'accessory':accessory, 'top':top}
    return result_dic

In [9]:
# example query and results
query = 'black slim jean with white dot'
dic = search(query)
dic

{'bottom': ('Asymmetric Skirt', '01EMPK82BDK6E6S4KBRP77RRAC'),
 'shoes': ('Ikat Dot  V-neck caftan', '01EWC89HRBR5N9Z6JNKGFAZAD2'),
 'accessory': ('Musgrave', '01EEBHSPQXAN74785CGB7GKPZS'),
 'top': ('Dot Top', '01ECAZH7666P4KPPFGPV3PE7FB')}