## Import necessary packages and define helper functions

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import wordnet
import spacy
from numpy import dot
from numpy.linalg import norm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
nlp = spacy.load("en_core_web_md")

In [3]:
def remove_stopwords(line, stopword_set):
    new_words = []
    
    for word in word_tokenize(line):
        if word.lower() in stopword_set:
            continue
        new_words.append(word.lower())
    
    return ' '.join(new_words)

def word_count(lst):
    d = {}
    for line in lst:
        for word in line.split():
            d[word] = d.get(word,0) + 1
    return d


# https://gaurav5430.medium.com/using-nltk-for-lemmatizing-sentences-c1bfff963258
def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return lemmatized_sentence

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None
    

def query_cat(query):
    outfit_dict = {}
    for word in word_tokenize(query):
        if word in bottom_lst:
            outfit_dict['bottom'] = outfit_dict.get('bottom', 0) + 1
        if word in one_piece_lst:
            outfit_dict['onepiece'] = outfit_dict.get('onepiece', 0) + 1
        if word in shoe_lst:
            outfit_dict['shoe'] = outfit_dict.get('shoe', 0) + 1
        if word in top_lst:
            outfit_dict['top'] = outfit_dict.get('top', 0) + 1
        if word in accessory_lst:
            outfit_dict['accessory'] = outfit_dict.get('accessory', 0) + 1

    if not outfit_dict:
        query_cat = 'Unknown'
    else:
        query_d = list(sorted(outfit_dict.items(), key=lambda item: item[1], reverse=True))
        query_cat = query_d[0][0]
    return query_cat

def find_most_sim(query_cat, d):
    if query_cat == 'Unknown':
        most_sim = d[0][0]

    else:
        for i in range(len(d)-1):
            product_idx = d[i][0]
            cat = df_cleaned.loc[product_idx,'outfit_item_type']
            if cat == query_cat:
                most_sim = product_idx
                break
    return most_sim


## Read in data and merge dataframes

In [4]:
df_outfit = pd.read_csv('outfit_combinations USC.csv')

dic = {'accessory1':'accessory','accessory2':'accessory',
       'accessory3':'accessory', 'bottom':'bottom', 'top':'top', 
       'shoe':'shoe', 'onepiece':'onepiece'}

a = df_outfit['outfit_item_type'].map(dic)
df_outfit['outfit_item_type'] = a

df_product = pd.read_excel('Behold+product+data+04262021.xlsx')

df_combined = df_outfit.merge(df_product, how = 'left', left_on = 'product_id', right_on = 'product_id')
df_final = df_combined[['product_id','outfit_id','outfit_item_type',
                        'product_full_name','description', 'details']]

df_final.fillna(value='Unknown',inplace=True)
features = ['product_full_name','details', 'description']
df_final['combined'] = df_final[features].apply(lambda col: ' '.join(col.astype(str)), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['combined'] = df_final[features].apply(lambda col: ' '.join(col.astype(str)), axis=1)


## Data Cleaning for the product documents

In [5]:
# clean the text features for each product
X = df_final[['combined']]
regex = re.compile(r'[^\w\s\d]')
X['cleaned'] = X['combined'].apply(lambda x: regex.sub(' ', x))

# remove stopwords in the text features
stp = set(stopwords.words('english'))
added = ['unknown','½ï']
stp.update(added)
a = X['cleaned'].apply(lambda x: remove_stopwords(x, stp))
X['removed'] = a

# lemmatize the features. We chose lemmatization over stemming to keep the meaning of the words
lemmatizer = WordNetLemmatizer()
a = X['removed'].apply(lambda x: lemmatize_sentence(x))
b = a.apply(lambda x: ' '.join(x))

# futher clean the features by using regex to remove single character words and digits
regex = re.compile(r'\d')
b = b.apply(lambda x: regex.sub(' ',x))
regex_let = re.compile(r'\b\w\b')
b = b.apply(lambda x: regex_let.sub('',x))
df_final['text_feature'] = b

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['cleaned'] = X['combined'].apply(lambda x: regex.sub(' ', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['removed'] = a
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['text_feature'] = b


## Final dataframe used for all search querys

In [6]:
# this is the cleanned dataframe that used for search query. Since this dataframe is identical to all querys, 
# so we did not include data cleaning before in our search query function to save run time 
df_cleaned = df_final[['outfit_id','product_id','text_feature','outfit_item_type']]

## Predict outfit category for query


In [7]:
bottom_lst = ['pant', 'skirt', 'jean', 'trouser', 'short', 'tight', 'legging', 'culotte',
              'bootcut', 'leg', 'palazzo', 'bottom', 'ankle', 'capri', 'waistband',
              'waist', 'highwaist', 'beltless', 'straight']

one_piece_lst = ['dress', 'piece', 'jumpsuit', 'piece', 'bodysuit', 'slipdress', 'shirtdress',
                 'onepiece', 'onesie', 'gown', 'robe', 'romper', 'shortall', 'minidress',
                 'caftan', 'tunic', 'georgette', 'coverall', 'bikini', 'kimono', 'sundress',
                 'boilersuit']

shoe_lst = ['shoe', 'boot', 'sneaker', 'heel', 'foot', 'sandal', 'slipper', 
            'flip', 'flop', 'bootie', 'toe', 'pump', 'trainer', 'platform', 'oxford', 
            'mule', 'brogue', 'loafer', 'moccasin', 'flat', 'derby', 'slingback', 'clog',
            'heighten', 'shoetie']

top_lst = ['tank', 'top', 'blouse', 'shirt', 'tee', 'vest', 'blazer', 'crop', 
           'hoodie', 'hood', 'sweat', 'turtleneck', 'cardigan', 'camisole', 'sweatshirt', 
           'neck', 'sleeve', 'cami', 'boatneck', 'vneck', 'jersey', 'sleeves', 'tanktop',
           'sweatpants', 'bustier', 'sleeveless', 'neckline', 'crewneck', 'longsleeve']

accessory_lst = ['satchel', 'clutch', 'bag', 'tote', 'jacket', 'coat', 'scarf', 
                 'bra', 'bralette', 'backpack', 'briefcase', 'purse', 'panty', 
                 'thong', 'belt', 'hat', 'bralett', 'hobo', 'eye', 'sunglasses', 'bib', 
                 'accessory', 'sunglass', 'lens', 'trench', 'wallet', 'earring', 'barrette',
                 'pullover', 'photo', 'card', 'band', 'felt', 'hand', 'necklace',
                 'shearling', 'cream', 'lip', 'balm', 'parka', 'mask', 'bracelet',
                 'sock', 'glasswear', 'cape', 'suit', 'bandana', 'lenses', 'lingerie',
                 'collar', 'apron', 'tie', 'strap', 'ring', 'napkin', 'shawl', 'sweater', 
                 'beret', 'sapphire', 'crossbody', 'neckband', 'headband', 'headgear',
                 'outerwear', 'wrist', 'cap', 'shirtjacket', 'windbreaker', 'glove', 'mitt',
                 'bangle', 'obi', 'stud', 'earing', 'overcoat', 'trenchcoat', 'watch', 'anklet',
                 'mitts', 'choker', 'pin', 'gloves']

others_lst = ['towel', 'vase', 'chair', 'candle', 'photo', 'card', 'book', 'lamp', 
              'pottery', 'plate', 'salt', 'pillow', 'table', 'bench', 'bed', 'table',
              'couch', 'baby', 'basket', 'crochet', 'coverlet', 'upholster', 'cushion',
              'makeup', 'ceramic', 'soap', 'antique', 'sofa', 'footbed', 'goblet', 'skateboard',
              'quilt', 'washcloth', 'comb', 'fragrance', 'mat', 'swimwear', 'swimsuit', 'wetsuit',
              'pillowcase', 'perfume', 'enamel', 'insole', 'shower', 'furniture', 'toiletry',
              'pilowcases']

In [8]:
def search(user_query: str):
    # perform datacleaning for the input query
    regex = re.compile(r'[^\w\s\d]')
    query = regex.sub(' ', user_query)
    lst = []
    for word in word_tokenize(query):
        if word not in stp:
            lst.append(word)
    query = ' '.join(lst)
    query_lst = lemmatize_sentence(query)
    query = ' '.join(query_lst)
    query_df = pd.DataFrame({'product_id':['query'],'text_feature':[query]},index=[5291])
    df_withquery = pd.concat([df_cleaned,query_df])
    
    # use TF-IDF to vectorize the product table, max_features is set for 1000 to avoid noise
    products = df_withquery['text_feature'].values
    vectorizer = TfidfVectorizer(max_features=1000)
    X = vectorizer.fit_transform(products)
    tf_idf_lookup_table = pd.DataFrame(X.toarray(), columns= vectorizer.get_feature_names(), index=df_withquery.index)
    
    DOCUMENT_SUM_COLUMN = "DOCUMENT_TF_IDF_SUM"

    # sum the tf idf scores for each document
    tf_idf_lookup_table[DOCUMENT_SUM_COLUMN] = tf_idf_lookup_table.sum(axis=1)
    available_tf_idf_scores = tf_idf_lookup_table.columns # a list of all the columns we have
    available_tf_idf_scores = set(map( lambda x: x.lower(), available_tf_idf_scores)) # lowercase everything
    
    products_vectors = []
    
    for idx, product in enumerate(products): # iterate through each review
        tokens = nlp(product) # have spacy tokenize the review text

        # initially start a running total of tf-idf scores for a document
        total_tf_idf_score_per_document = 0
        # start a running total of initially all zeroes (300 is picked since that is the word embedding size used by word2vec)
        running_total_word_embedding = np.zeros(300) 
        for token in tokens: # iterate through each token
        # if the token has a pretrained word embedding it also has a tf-idf score
            if token.has_vector and token.text.lower() in available_tf_idf_scores:
                tf_idf_score = tf_idf_lookup_table.loc[idx, token.text.lower()]
                running_total_word_embedding += tf_idf_score * token.vector
                total_tf_idf_score_per_document += tf_idf_score
                
        # divide the total embedding by the total tf-idf score for each document
        # print(total_tf_idf_score_per_document)
       
        document_embedding = running_total_word_embedding / max(1,total_tf_idf_score_per_document)
        products_vectors.append(document_embedding.tolist())

    # generate a dictionary contains the similarity between query and each product 
    query_vec = products_vectors[-1]
    similarity_lst = {}
    for i in range(len(products_vectors)-1):
        product = products_vectors[i]
        if (norm(product)*norm(query_vec)) == 0:
            similarity_lst[i] = 0
            continue
        cos_sim = dot(product, query_vec)/(norm(product)*norm(query_vec))
        similarity_lst[i] = cos_sim

    d = list(sorted(similarity_lst.items(), key=lambda item: item[1], reverse=True))
    
    # before matching the product, we first use domain knowledge to determine which category of clothes the user is trying to search for
    cat_find = query_cat(query)
    # Based on the category, we return the product with highest similarity
    found_idx = find_most_sim(cat_find, d)
    re_outfit_id = df_final.iloc[found_idx]['outfit_id']
    # Then, we return the whole outfit which contain the product selected in last step
    returned_df = df_final[df_final['outfit_id'] == re_outfit_id][['outfit_item_type', 'product_id','product_full_name']]
    returned_df = returned_df.set_index('outfit_item_type')
    final_dict = returned_df.groupby(level=0).apply(lambda x: x.to_dict('r')).to_dict()
    return final_dict

## Example query

In [9]:
# Example Query
query = 'large size, straight leg pant with a white dot'
dic = search(query)
dic

{'accessory': [{'product_id': '01DVPMF2X9M22VBARKC5G6FXB7',
   'product_full_name': 'Le Riviera leather shoulder bag'},
  {'product_id': '01DVPNB5C3973WN8R7W7YVRRKT', 'product_full_name': '#NAME?'},
  {'product_id': '01DVPNB5C3973WN8R7W7YVRRKT',
   'product_full_name': '+ Pernille Teisbaek Clara oversized belted faux fur coat'}],
 'bottom': [{'product_id': '01DVMERT64RSJQBBJS9973N7HF',
   'product_full_name': 'Femme Hi Spikes high-rise straight-leg jeans'}],
 'shoe': [{'product_id': '01DVCTFR5MA1ZDKTAFS4VG4VW4',
   'product_full_name': 'Cabria leather ankle boots'}],
 'top': [{'product_id': '01DT50PZ3D0RXNZFDSGTWMVXZW',
   'product_full_name': 'Boy striped cotton-jersey T-shirt'}]}

In [10]:
# Example Query
query = 'yellow onepiece for beach with pink flower'
dic = search(query)
dic

{'accessory': [{'product_id': '01DPEHS0XH9PDD1GH5ZE4P43A2',
   'product_full_name': 'Cassi Belt Bag'},
  {'product_id': '01DPGV0TFFJ720BT3F8ADN4V7P',
   'product_full_name': "Women's 2011 Icon trench"}],
 'onepiece': [{'product_id': '01DPD4R5X5TQCWTVTC2AEAFC10',
   'product_full_name': 'Ida Dress'}],
 'shoe': [{'product_id': '01DPKNCMSFAWF2HVQSRHHXDV0K',
   'product_full_name': 'Virginia Boot'}]}

In [11]:
# Example Query
query = 'slim fitting, straight leg pant with a center back zipper and slightly cropped leg'
dic = search(query)
dic

{'bottom': [{'product_id': '01E223GQFQFHBZR1106AE2VKJ3',
   'product_full_name': 'Wide Leg Ankle Trousers'}],
 'shoe': [{'product_id': '01E1JM43NQ3H17PB22EV3074NX',
   'product_full_name': 'Visa Mule'}],
 'top': [{'product_id': '01E223E4WZNM9BW7A6XCQMJ965',
   'product_full_name': 'Silk Button-Up Shirt'}]}