## Building Query Vector

### Data Preprocessing

In [1]:
# load data
import pandas as pd

# Columns removed from the raw product sheet immediately after loading.
dropped_product_columns = ['brand_name', 'created_at', 'brand_canonical_url',
                           'product_active', 'brand_description']

df = pd.read_excel('data/Behold+product+data+04262021.xlsx')
feature_vectors = pd.read_csv('data/feature_vectors.csv', index_col=0)
df = df.drop(columns=dropped_product_columns)
tags = pd.read_csv('data/usc_additional_tags USC.csv')
brands = pd.read_csv('data/behold_brands USC.csv')

In [2]:
# concat information in different groups
# Every text column is NaN-filled and cast to str, then appended left to
# right with a single space between neighbours.
info_columns = ['brand', 'brand_category', 'name', 'details', 'description']
df['all_info'] = df[info_columns[0]].fillna('').astype('str')
for info_column in info_columns[1:]:
    df['all_info'] = df['all_info'] + ' ' + df[info_column].fillna('').astype('str')

def remove_punctuations(text):
    ''' Replace every listed punctuation mark (and the literal word
        "Unknown") in `text` with a single space and return the result. '''
    noise_tokens = ('\n', ',', '.', '!', '"', '*', '(', ')', '\\', '@', '#', '/', '\xa0',
                    ':', '_', '>', '<', ';', '|', '&', '?', '^', 'Unknown')
    cleaned = text
    for token in noise_tokens:
        # each occurrence becomes one space; runs of spaces are not collapsed
        cleaned = cleaned.replace(token, ' ')
    return cleaned
    
# strip punctuation from the combined text, one row at a time
df['all_info'] = df['all_info'].map(remove_punctuations)

### Create Features for Brand Names

In [3]:
# create one-hot-encoding for top 50 brand
# df2 keeps the untouched brand names; a later cell copies them back onto df['brand'].
df2 = df.copy()
top50_col = list(df['brand'].value_counts()[:50].index)
# every brand outside the 50 most frequent ones collapses into 'other_brands'
df['brand'] = df['brand'].map(lambda x: str(x) if x in top50_col else 'other_brands')


# create vectors for words in the brand name
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words=None)
X = vectorizer.fit_transform(df['brand'])
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    brand_feature_names = vectorizer.get_feature_names_out()
else:
    brand_feature_names = vectorizer.get_feature_names()
brand_vector = pd.DataFrame(X.toarray(), columns=brand_feature_names)

# combine the brand vector
# NOTE(review): concat(axis=1) aligns on the index. feature_vectors keeps the
# index stored in its CSV while brand_vector gets a fresh 0..n-1 index —
# confirm the two line up, otherwise rows are padded with NaN.
final_vectors = pd.concat([feature_vectors, brand_vector], axis=1)

### Categorize All Products

In [4]:
# transforming all detailed types into accessory
# The original 'accessory' column is kept under the name 'other_accessory';
# the new 'accessory' column is the sum of it and the four detailed types.
final_vectors = final_vectors.rename(columns={'accessory': 'other_accessory'})
accessory_parts = ['other_accessory', 'bag', 'intimate', 'outerwear', 'jewelry']
accessory_total = final_vectors[accessory_parts[0]]
for accessory_part in accessory_parts[1:]:
    accessory_total = accessory_total + final_vectors[accessory_part]
final_vectors['accessory'] = accessory_total

# map all columns to product dataset
# restore the original brand names saved in df2 before the top-50 bucketing
df['brand'] = df2['brand']
category_columns = ['uncategorized', 'accessory', 'top', 'bottom', 'shoe', 'onepiece']
df = pd.concat([df, final_vectors[category_columns]], axis=1)

In [5]:
# save results
# final_vectors: the loaded feature vectors plus the brand word counts and the
# aggregated 'accessory' column built in the cells above.
final_vectors.to_csv('data/final_vectors.csv')
# df: the product table with the six category columns appended.
df.to_csv('data/rec_product.csv')

### Build a word vector for all other keywords

In [6]:
# create vectors for the keywords found in the combined product text (all_info)
from sklearn.feature_extraction.text import CountVectorizer
# token_pattern skips tokens that start with a digit and keeps words of 2+
# characters; min_df=300 keeps only words present in at least 300 products.
vectorizer = CountVectorizer(stop_words='english', token_pattern=r'(?u)\b(?![0-9])\w\w+\b', min_df=300)
X = vectorizer.fit_transform(df['all_info'])
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    word_feature_names = vectorizer.get_feature_names_out()
else:
    word_feature_names = vectorizer.get_feature_names()
word_vector = pd.DataFrame(X.toarray(), columns=word_feature_names)
all_vectors = pd.concat([word_vector,final_vectors], axis=1)
all_vectors.shape

(61355, 1722)

In [7]:
# persist the combined keyword vectors + final_vectors table built in the previous cell
all_vectors.to_csv('data/all_vectors.csv')

## Recommendation Building Process/Logic

- Recommendation System:

- Stage I. Extract Product ID

    0. Query Feature Extraction Functions
        - Testing and Checking the accuracy of Feature Extraction Functions
        - Unify all feature extraction functions: input query -> output feature vector
        - One additional word vector data based on all information in the description
    
    1. Custom Outfit Combination Assignment:
        - Extract existing outfit information from outfit dataset and make assignments
    
    2. Query Part I. Apply Cosine Similarity Measure: return the one with highest similarity
        - Using customized feature vector dataframe as reference, apply cosine similarity measure
    
       
- Stage II. Find Best Match Outfit
    
    3. if product_id match pre-existing outfit combination > return the outfit result
    
    4. if no match is found in step 3 > search for the most appropriate outfit in the custom outfit dictionary
    
    5. return the outfit found in step 3 or 4

## Customized Outfit Dictionary

In [None]:
# clothing category extraction
def create_clothing_category_feature(df):
    '''Tag every product with a main clothing category and a sub-category.

    Scans `df['product_full_name']` for clothing keywords and writes two
    columns onto `df` in place:

    - `mode_category`: the category (top, bottom, shoe, bag, accessory,
      onepiece, outerwear, jewelry, intimate) with the most keyword hits.
      Ties go to the category listed first; '' when nothing matches.
    - `sub_category`: the most frequent keyword as it appears in the text
      (e.g. 'skirt'). Ties go to the keyword seen first; '' when nothing
      matches.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `product_full_name` column. Cells that are not
        strings (e.g. NaN) are treated as containing no keywords.

    Returns
    -------
    None
        `df` is modified in place.
    '''

    # load libraries
    import re
    from collections import Counter

    # One pattern per main category. Insertion order doubles as the
    # tie-break order used by max() below, so do not reorder.
    category_patterns = {
        'top': r'(?i)\b(t(?:-)?shirt)s?\b|\b(shirt)s?\b|\b(blouse)s?\b|\b(tank)s?\b|\b(top)s?\b|'
               r'\b(suit)s?\b|\b(sweatshirt)s?\b|\b(knitwear)s?\b|\b(vest)s?\b|\b(sweater)s?\b|'
               r'\b(cardigan)s?\b|\b(tee)s?\b|\b(hoodie)s?\b|\b(camisole)s?\b|\b(turtleneck)s?\b',
        'bottom': r'(?i)\b(jean)s?\b|\b(short)s?\b|\b(pant)s?\b|\b(skirt)s?\b|\b(skort)s?\b|\b(sweatpant)s?\b|'
                  r'\b(legging)s?\b|\b(trouser)s?\b|\b(bottom)s?\b|\b(jogger)s?\b|\b(tight)s?\b|\b(crop)s?\b|\b(leg)s?\b',
        'shoe': r'(?i)\b(shoe)s?\b|\b(sneaker)s?\b|\b(footie)s?\b|\b(footwear)s?\b|\b(pump)s?\b|'
                r'\b(flat)s?\b|\b(heel)s?\b|\b(boot)s?\b|\b(bootie)s?\b|\b(loafer)s?\b|'
                r'\b(mule)s?\b|\b(sandal)s?\b|\b(slipper)s?\b|\b(wedge)s?\b|\b(slide)s?\b|\b(slingback)s?\b',
        'bag': r'(?i)\b(bag)s?\b|\b(handbag)s?\b|\b(shoulderbag)s?\b|\b(tote)s?\b|\b(clutch)(?:es)?\b|'
               r'\b(luggage)s?\b|\b(belt\s?bag)s?\b|\b(beach\s?bag)s?\b|\b(backpack)s?\b|\b(satchel)s?\b|'
               r'\b(briefcase)s?\b|\b(pouch)(?:es)?\b',
        'accessory': r'(?i)\b(scarf|scarves)\b|\b(hat)s?\b|\b(belt)s?\b|\b(sunglass)es?\b|\b(glove)s?\b|'
                     r'\b(keychain)s?\b|\b(keyring)s?\b|\b(tie)s?\b|\b(phone\s?case)s?\b|'
                     r'\b(glass)(?:es)?\b|\b(umbrella)s?\b|\b(frame)s?\b|\b(wallet)s?\b|\b(face\s?mask)s?\b|'
                     r'\b(helmet)s?\b|\b(shawl)s?\b',
        'onepiece': r'(?i)\b(dress)(?:es)?\b|\b(jumpsuit)s?\b|\b(gown)s?\b|\b(robe)s?\b|'
                    r'\b(shirtdress)(?:es)?\b|\b(bodysuit)s?\b',
        'outerwear': r'(?i)\b(?:top)?(coat)s?\b|\b(jacket)s?\b|\b(parka)s?\b|\b(trench)(?:es)?\b|'
                     r'\b(raincoat)s?\b|\b(overcoat)s?\b|\b(blazer)s?\b',
        'jewelry': r'(?i)\b(bracelet)s?\b|\b(brooch)(?:es)?\b|\b(pin)s?\b|\b(cufflink)s?\b|\b(earring)s?\b|'
                   r'\b(necklace)s?\b|\b(ring)s?\b',
        # (linger)(?:ies?)? also accepts "lingerie"/"lingeries" while the
        # captured keyword stays 'linger'
        'intimate': r'(?i)\b(underwear)s?\b|\b(bra)s?\b|\b(sock)s?\b|\b(sleepwear)s?\b|\b(loungewear)s?\b|'
                    r'\b(boxer)s?\b|\b(brief)s?\b|\b(linger)(?:ies?)?\b|\b(pantie)s?\b',
    }
    # compiled once here instead of once per product row
    compiled_category_patterns = {name: re.compile(pattern)
                                  for name, pattern in category_patterns.items()}

    # Every alternative must be separated by '|': two adjacent alternatives
    # without it fuse into a single pattern that can never match.
    all_category_regex = re.compile(
        r'(?i)\b(t(?:-)?shirt)s?\b|\b(shirt)s?\b|\b(blouse)s?\b|\b(tank)s?\b|'
        r'\b(suit)s?\b|\b(sweatshirt)s?\b|\b(knitwear)s?\b|\b(vest)s?\b|'
        r'\b(sneaker)s?\b|\b(footie)s?\b|\b(footwear)s?\b|\b(pump)s?\b|'
        r'\b(flat)s?\b|\b(heel)s?\b|\b(boot)s?\b|\b(bootie)s?\b|\b(loafer)s?\b|'
        r'\b(mule)s?\b|\b(sandal)s?\b|\b(slipper)s?\b|\b(wedge)s?\b|'
        r'\b(jean)s?\b|\b(short)s?\b|\b(pant)s?\b|\b(skirt)s?\b|\b(skort)s?\b|'
        r'\b(legging)s?\b|\b(trouser)s?\b|\b(jogger)s?\b|\b(sweater)s?\b|'
        r'\b(bag)s?\b|\b(handbag)s?\b|\b(shoulderbag)s?\b|\b(tote)s?\b|\b(clutch)(?:es)?\b|'
        r'\b(luggage)s?\b|\b(belt\s?bag)s?\b|\b(beach\s?bag)s?\b|\b(backpack)s?\b|'
        r'\b(scarf|scarves)\b|\b(hat)s?\b|\b(belt)s?\b|\b(sunglass)(?:es)?\b|\b(glove)s?\b|'
        r'\b(keychain)s?\b|\b(keyring)s?\b|\b(tie)s?\b|\b(phone\s?case)s?\b|'
        r'\b(glass)(?:es)?\b|\b(umbrella)s?\b|\b(frame)s?\b|\b(wallet)s?\b|\b(face\s?mask)s?\b|'
        r'\b(helmet)s?\b|\b(dress)(?:es)?\b|\b(jumpsuit)s?\b|\b(gown)s?\b|\b(robe)s?\b|'
        r'\b(coat)s?\b|\b(jacket)s?\b|\b(parka)s?\b|\b(trench)(?:es)?\b|\b(raincoat)s?\b|'
        r'\b(overcoat)s?\b|\b(bracelet)s?\b|\b(brooch)(?:es)?\b|\b(pin)s?\b|\b(cufflink)s?\b|'
        r'\b(earring)s?\b|\b(necklace)s?\b|\b(ring)s?\b|\b(underwear)s?\b|\b(bra)s?\b|'
        r'\b(sock)s?\b|\b(sleepwear)s?\b|\b(loungewear)s?\b|\b(boxer)s?\b|\b(brief)s?\b|'
        r'\b(linger)(?:ies?)?\b|\b(pantie)s?\b|\b(satchel)s?\b|\b(tee)s?\b|'
        r'\b(tight)s?\b|\b(cardigan)s?\b|\b(hoodie)s?\b|\b(sweatpant)s?\b|\b(slide)s?\b|'
        r'\b(shirtdress)(?:es)?\b|\b(blazer)s?\b|\b(crop)s?\b|\b(leg)s?\b|\b(briefcase)s?\b|'
        r'\b(shawl)s?\b|\b(camisole)s?\b|\b(bodysuit)s?\b|\b(turtleneck)s?\b|\b(pouch)(?:es)?\b|'
        r'\b(slingback)s?\b'
    )

    def unfold_regex(category_find):
        # findall() yields one tuple per match with a single non-empty group;
        # keep only that captured keyword.
        return [group for match in category_find for group in match if group != '']

    def find_mode_category(product_info):
        # missing names (NaN after a left merge) carry no keywords
        if not isinstance(product_info, str):
            return ''

        category_word_count_dict = {name: len(pattern.findall(product_info))
                                    for name, pattern in compiled_category_patterns.items()}
        if sum(category_word_count_dict.values()) == 0:
            return ''
        # max() returns the first category reaching the highest count
        return max(category_word_count_dict, key=category_word_count_dict.get)

    def find_sub_category(product_info):
        if not isinstance(product_info, str):
            return ''

        word_counter = Counter(unfold_regex(all_category_regex.findall(product_info)))
        if len(word_counter) == 0:
            return ''
        # max() keeps the earliest-seen keyword when counts tie
        return max(word_counter, key=word_counter.get)

    df['mode_category'] = df['product_full_name'].apply(find_mode_category)
    df['sub_category'] = df['product_full_name'].apply(find_sub_category)

In [None]:
# merge outfit with category & subcategory findings
# NOTE(review): `outfit` and `feature_cols` are not defined in any cell visible
# above — confirm where they are loaded before running this on a fresh kernel.
# This also rebinds `df`, replacing the product table built in the earlier cells.
df = outfit.merge(feature_cols, how='left', on='product_id')
# Adds the `mode_category` and `sub_category` columns to df in place; the call returns nothing.
create_clothing_category_feature(df)
df.head(3)

In [None]:
# unify all accessory
df['outfit_item_type'] = df['outfit_item_type']\
                .apply(lambda x: 'accessory' if x in ['accessory1', 'accessory2','accessory3'] else x)

# merge all same type columns
# Each cell is wrapped in a one-element list so that `+` between the Series
# concatenates the labels row by row. Only binary `+` may be used here: a
# unary `+` applied to a Series of lists raises TypeError.
df['category'] = df['outfit_item_type'].apply(lambda x: [x.lower()]) \
                    + df['clothing_category'].fillna('').apply(lambda x: [x.lower()]) \
                    + df['mode_category'].fillna('').apply(lambda x: [x.lower()])

df['sub_cat'] = df['clothing_subcategory'].fillna('').apply(lambda x: [x.lower()])\
                    + df['sub_category'].fillna('').apply(lambda x: [x.lower()])

# drop duplicates
# dict.fromkeys removes repeats while keeping first-seen order, so the joined
# label is the same on every run (set() order depends on string hashing).
df['sub_cat'] = df['sub_cat'].apply(lambda x: " ".join(dict.fromkeys(x)))

df['category'] = df['category'].apply(lambda x: " ".join(dict.fromkeys(x)))

# drop irrelevant columns
df.drop(columns=['clothing_category', 'clothing_subcategory',
                 'mode_category', 'sub_category'], inplace=True)

In [None]:
# create combination for main category + subcategory
df['final'] = df['outfit_item_type'] + '-' + df['sub_cat'].str.strip()

# format outfit combinations into one string
# one lower-cased string per outfit_id, then whitespace runs collapsed to single spaces
outfit_text = df.groupby("outfit_id")['final'].apply(lambda X: ' '.join(X.str.lower()))
sub_df = outfit_text.apply(lambda text: " ".join(text.split()))
combo = pd.DataFrame(sub_df).reset_index()

In [None]:
# implement trigram collocation to find best combinations
from nltk.collocations import TrigramCollocationFinder, TrigramAssocMeasures

# each outfit string becomes one token list ("document") for the finder
outfit_documents = [outfit_string.split() for outfit_string in combo['final']]
collocation_finder = TrigramCollocationFinder.from_documents(outfit_documents)
measures = TrigramAssocMeasures()
# keep up to 10000 trigrams ranked by raw frequency
result = collocation_finder.nbest(measures.raw_freq, 10000)
result[:10]

- Logic for Outfit Combination Dictionary:
    - First, we extracted popular outfit decisions from the outfit dataset
    - The idea is to build outfits for products that do not yet appear in the outfit dataset by reusing the experts' pairing logic
    - The granularity we chose was at subcategory level. 
    
- Subcategory Collocation Extraction Process
    - We extracted subcategory of each product and create unique identifiers as `main category - subcategory`
    - Then, based on the trigram collocation results, we identified the most frequent outfit combination for each `main category - subcategory` combination
    - For instance, if experts put together `bottom-skirt` most frequently with `top-shirt`, `shoe-boot`, `accessory-tote`, we would create identical outfit recommendation for other `bottom-skirt` products using the same `main category - subcategory` combination

In [None]:
# extract outfit combination recommendation based on prior experts decisions
import re
for x in df['final'].unique():
    print('\n')
    print('-----------------------------')
    print(f'For {x} subcategory:')
    # NOTE(review): acc_output is initialised but never filled or printed.
    bot_output = top_output = shoe_output = acc_output = one_output = ''
    for i in result:
        # only trigrams that contain this exact label are of interest
        if x not in i:
            continue
        trigram_text = " ".join(i)
        # first matching branch wins; each slot keeps only its first trigram
        if 'bottom' in trigram_text and bot_output == '':
            bot_output = i
        elif 'top' in trigram_text and top_output == '':
            top_output = i
        elif 'shoe' in trigram_text and shoe_output == '':
            shoe_output = i
        elif 'onepiece' in trigram_text and one_output == '':
            one_output = i

    print('bottom: ', bot_output, '\n',
          'top: ', top_output, '\n',
          'shoe: ', shoe_output, '\n',
          'onepiece: ', one_output)

In [None]:
# final dictionary for all possible subcategory outfit combination
# Keys are `main category-subcategory` labels (the same format as df['final']);
# each value lists the companion labels recommended alongside that item.
# NOTE(review): presumably transcribed by hand from the trigram exploration
# output above — confirm before regenerating.
cat_subcat_dict = {
    'top-tank': ['bottom-short', 'shoe-sneaker', 'accessory-backpack'],
    'top-blouse': ['bottom-leg', 'shoe-boot', 'accessory-tie'],
    'top-shirt': ['bottom-leg', 'shoe-boot'],
    'top-sweater': ['bottom-leg', 'shoe-heel', 'accessory-clutch'],
    'top-tie': ['bottom-leg', 'shoe-boot', 'accessory-coat'],
    'top-cardigan': ['bottom-leg', 'shoe-heel', 'accessory-coat'],
    'top-short': ['bottom-short', 'shoe-slide', 'accessory-handbag'],
    'top-camisole': ['bottom-leg', 'shoe-boot', 'accessory-cardigan'],
    'top-tee': ['bottom-skirt', 'shoe-slide', 'accessory-handbag'],
    'top-suit': ['bottom-skirt', 'shoe-sneaker', 'accessory-shirt'],
    'top-turtleneck': ['bottom-pant', 'shoe-heel', 'accessory-blazer'],
    'top-dress': ['bottom-trench', 'shoe-slingback', 'accessory-heel'],
    'top-jumpsuit': ['shoe-boot'],
    'top-scarf': ['bottom-skirt', 'shoe-pump', 'accessory-coat'],
    'top-blazer': ['bottom-leg', 'shoe-slingback', 'accessory-briefcase'],
    'top-hoodie': ['bottom-short', 'shoe-sandal', 'accessory-tote'],
    'top-frame': ['bottom-leg', 'accessory-jacket'],
    'top-belt': ['bottom-leg', 'shoe-slingback', 'accessory-handbag'],
    'top-sweatshirt': ['bottom-pant', 'shoe-slide', 'accessory-tote'],
    'top-skirt': ['bottom-leg', 'shoe-pump', 'accessory-scarf'],
    'top-bodysuit': ['bottom-skirt', 'shoe-heel', 'accessory-tie'],
    'top-crop': ['bottom-short', 'shoe-slide', 'accessory-jean'],
    'top-bra': ['bottom-short', 'shoe-sneaker', 'accessory-backpack'],
    'bottom-skirt': ['top-shirt', 'shoe-boot', 'accessory-cardigan'],
    'bottom-pant': ['top-shirt', 'shoe-slide', 'accessory-cardigan'],
    'bottom-crop': ['top-shirt', 'shoe-boot', 'accessory-sweater'],
    'bottom-trouser': ['top-shirt', 'shoe-heel', 'accessory-jacket'],
    'bottom-leg': ['top-shirt', 'shoe-boot', 'accessory-tie'],
    'bottom-jean': ['top-shirt', 'shoe-boot', 'accessory-coat'],
    'bottom-jacket': ['shoe-boot'],
    'bottom-turtleneck': ['top-dress'],
    'bottom-dress': ['top-turtleneck', 'shoe-bootie', 'accessory-trench'],
    'bottom-jogger': ['top-tank', 'shoe-slide', 'accessory-hoodie'],
    'bottom-sweatpant': ['top-tank', 'shoe-sandal', 'accessory-hoodie'],
    'bottom-legging': ['top-shirt', 'shoe-sneaker', 'accessory-sweatshirt'],
    'bottom-short': ['top-tank', 'shoe-slide', 'accessory-sweatshirt'],
    'bottom-boot': ['top-shirt', 'shoe-pump', 'accessory-blazer'],
    'bottom-tight': ['top-tank', 'shoe-sneaker', 'accessory-sweatshirt'],
    'shoe-pump': ['top-turtleneck', 'accessory-cardigan', 'bottom-leg'],
    'shoe-flat': ['top-turtleneck', 'bottom-skirt', 'accessory-sunglass'],
    'shoe-heel': ['top-tee', 'bottom-leg', 'accessory-tote', 'onepiece-dress'],
    'shoe-boot': ['top-shirt', 'bottom-leg', 'accessory-jacket', 'onepiece-dress'],
    'shoe-slingback': ['top-belt', 'bottom-jean', 'accessory-scarf', 'onepiece-dress'],
    'shoe-frame': ['accessory-tote', 'onepiece-dress'],
    'shoe-flat sandal': ['top-blouse'],
    'shoe-bootie': ['top-cardigan', 'bottom-pant', 'accessory-cardigan', 'onepiece-dress'],
    'shoe-sandal': ['top-turtleneck', 'bottom-skirt', 'accessory-tote', 'onepiece-dress'],
    'shoe-sneaker': ['top-sweatshirt', 'bottom-leg', 'accessory-hoodie', 'onepiece-bodysuit'],
    'shoe-heel mule': ['top-shirt', 'accessory-handbag'],
    'shoe-tote': ['top-turtleneck', 'accessory-heel'],
    'shoe-slide': ['top-tee', 'bottom-pant', 'accessory-handbag', 'onepiece-dress'],
    'shoe-loafer': ['top-sweater', 'bottom-jean', 'accessory-handbag'],
    'shoe-heel pump': ['top-turtleneck', 'bottom-leg'],
    'shoe-mule': ['top-sweater', 'bottom-leg', 'accessory-coat', 'onepiece-dress'],
    'shoe-heel loafer': ['top-belt', 'bottom-skirt', 'accessory-tote', 'accessory-sunglass'],
    'shoe-heel bootie': ['top-shirt', 'bottom-leg', 'accessory-jacket', 'onepiece-dress'],
    'shoe-tie': ['accessory-scarf', 'bottom-skirt', 'top-tee'],
    'shoe-heel slide': ['accessory-coat', 'bottom-leg', 'accessory-handbag'],
    'shoe-slingback sandal': ['top-shirt', 'bottom-jean'],
    'shoe-heel flat': ['top-turtleneck', 'bottom-leg', 'accessory-handbag', 'accessory-sunglass'],
    'shoe-slide sandal': ['top-shirt', 'bottom-skirt', 'accessory-jacket', 'onepiece-dress'],
    'accessory-satchel': ['bottom-pant', 'top-tank', 'shoe-pump'],
    'accessory-clutch': ['bottom-leg', 'top-sweater', 'shoe-boot', 'onepiece-dress'],
    'accessory-tote': ['bottom-jean', 'top-sweatshirt', 'shoe-mule', 'onepiece-dress'],
    'accessory-jacket': ['bottom-pant', 'shoe-boot', 'onepiece-dress'],
    'accessory-sweater': ['bottom-jean', 'top-camisole', 'shoe-heel'],
    'accessory-handbag': ['bottom-skirt', 'top-belt', 'shoe-sneaker'],
    'accessory-trench': ['bottom-leg', 'top-turtleneck', 'shoe-sandal', 'onepiece-dress'],
    'accessory-coat': ['bottom-jean', 'top-sweater', 'shoe-boot', 'onepiece-dress'],
    'accessory-cardigan': ['bottom-leg', 'top-shirt', 'shoe-boot', 'onepiece-dress'],
    'accessory-scarf': ['bottom-leg', 'top-belt', 'shoe-boot', 'onepiece-bodysuit'],
    'accessory-wallet': ['shoe-heel', 'onepiece-dress'],
    'accessory-shirt': ['bottom-skirt', 'top-sweater', 'shoe-mule', 'onepiece-dress'],
    'accessory-blazer': ['bottom-leg', 'top-turtleneck', 'shoe-sandal', 'onepiece-dress'],
    'accessory-shawl': ['bottom-skirt', 'top-blouse', 'shoe-boot', 'onepiece-dress'],
    'accessory-tie': ['bottom-leg', 'top-blouse', 'shoe-pump'],
    'accessory-briefcase': ['bottom-skirt', 'top-tee', 'shoe-slingback'],
    'accessory-backpack': ['bottom-jean', 'top-tank', 'shoe-sneaker', 'onepiece-bodysuit'],
    'accessory-hoodie': ['bottom-pant', 'top-tank', 'shoe-sneaker'],
    'accessory-bra': ['bottom-leg', 'top-sweatshirt', 'shoe-sneaker'],
    'accessory-camisole': ['bottom-leg', 'top-turtleneck', 'shoe-pump', 'onepiece-jumpsuit'],
    'accessory-tee': ['bottom-jean', 'top-shirt', 'shoe-sneaker'],
    'accessory-vest': ['bottom-skirt', 'top-t-shirt', 'shoe-boot'],
    'accessory-tank': ['bottom-skirt', 'top-shirt', 'shoe-pump'],
    'accessory-jean': ['bottom-skirt', 'top-crop', 'shoe-flat'],
    'accessory-frame': ['bottom-skirt', 'top-sweater', 'shoe-mule'],
    'accessory-pouch': ['bottom-jean', 'top-sweater', 'shoe-boot', 'onepiece-dress'],
    'accessory-turtleneck': ['bottom-leg', 'shoe-boot', 'onepiece-dress'],
    'accessory-sunglass': ['shoe-pump', 'onepiece-dress'],
    'accessory-dress': ['bottom-leg', 'top-blouse', 'shoe-sandal'],
    'accessory-belt': ['bottom-leg', 'top-turtleneck', 'shoe-bootie', 'onepiece-dress'],
    'accessory-suit': ['bottom-skirt', 'shoe-boot'],
    'accessory-sweatshirt': ['bottom-tight', 'top-tee', 'shoe-sneaker'],
    'onepiece-dress': ['shoe-boot'],
    'onepiece-jumpsuit': ['shoe-boot', 'accessory-jacket'],
    'onepiece-shirt': ['accessory-cardigan', 'shoe-slide'],
    'onepiece-bodysuit': ['accessory-scarf', 'shoe-slide'],
    'onepiece-leg': ['accessory-jacket', 'shoe-boot'],
    'onepiece-tie': ['shoe-pump', 'accessory-turtleneck'],
    'onepiece-dress shirt': ['accessory-tote', 'shoe-flat'],
    'onepiece-shirtdress': ['accessory-tote', 'shoe-flat'],
    'onepiece-short': ['accessory-sunglass', 'shoe-pump']
}

## Recommendation System

### Query Feature Extraction Functions

In [8]:
# custom query vector
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# unfold function
def unfold_regex(category_find):
    '''Flatten re.findall() results into a list of the non-empty items.

    Each element of `category_find` is iterated (a tuple of groups yields
    its groups) and every item that is not the empty string is kept, in order.
    '''
    return [captured for match in category_find for captured in match if captured != '']

# clothing category function
def create_clothing_category_feature(user_query):
    '''Return the main clothing category mentioned most often in a query.

    Parameters
    ----------
    user_query : str
        Free-text query to scan for clothing keywords.

    Returns
    -------
    list of str
        A one-element list holding the category with the most keyword hits
        (top, shoe, bottom, bag, accessory, onepiece, outerwear, jewelry or
        intimate). Ties go to the category listed first. `['']` when no
        keyword is found.
    '''

    # One pattern per category. Insertion order doubles as the tie-break
    # order used by max() below, so do not reorder. Every alternative must be
    # separated by '|': two adjacent alternatives without it fuse into a
    # single pattern that can never match.
    category_patterns = {
        'top': r'(?i)\b(t(?:-)?shirt)s?\b|\b(shirt)s?\b|\b(blouse)s?\b|\b(tank)s?\b|\b(top)s?\b|'
               r'\b(suit)s?\b|\b(sweatshirt)s?\b|\b(knitwear)s?\b|\b(vest)s?\b|\b(sweater)s?\b|'
               r'\b(cardigan)s?\b|\b(tee)s?\b|\b(hoodie)s?\b|\b(camisole)s?\b|\b(turtleneck)s?\b',
        'shoe': r'(?i)\b(shoe)s?\b|\b(sneaker)s?\b|\b(footie)s?\b|\b(footwear)s?\b|\b(pump)s?\b|'
                r'\b(flat)s?\b|\b(heel)s?\b|\b(boot)s?\b|\b(bootie)s?\b|\b(loafer)s?\b|'
                r'\b(mule)s?\b|\b(sandal)s?\b|\b(slipper)s?\b|\b(wedge)s?\b|\b(slide)s?\b|\b(slingback)s?\b',
        'bottom': r'(?i)\b(jean)s?\b|\b(short)s?\b|\b(pant)s?\b|\b(skirt)s?\b|\b(skort)s?\b|\b(sweatpant)s?\b|'
                  r'\b(legging)s?\b|\b(trouser)s?\b|\b(bottom)s?\b|\b(jogger)s?\b|\b(tight)s?\b|\b(crop)s?\b|\b(leg)s?\b',
        'bag': r'(?i)\b(bag)s?\b|\b(handbag)s?\b|\b(shoulderbag)s?\b|\b(tote)s?\b|\b(clutch)(?:es)?\b|'
               r'\b(luggage)s?\b|\b(belt\s?bag)s?\b|\b(beach\s?bag)s?\b|\b(backpack)s?\b|\b(satchel)s?\b|'
               r'\b(briefcase)s?\b|\b(pouch)(?:es)?\b',
        'accessory': r'(?i)\b(scarf|scarves)\b|\b(hat)s?\b|\b(belt)s?\b|\b(sunglass)es?\b|\b(glove)s?\b|'
                     r'\b(keychain)s?\b|\b(keyring)s?\b|\b(tie)s?\b|\b(phone\s?case)s?\b|'
                     r'\b(glass)(?:es)?\b|\b(umbrella)s?\b|\b(frame)s?\b|\b(wallet)s?\b|\b(face\s?mask)s?\b|'
                     r'\b(helmet)s?\b|\b(shawl)s?\b',
        'onepiece': r'(?i)\b(dress)(?:es)?\b|\b(jumpsuit)s?\b|\b(gown)s?\b|\b(robe)s?\b|'
                    r'\b(shirtdress)(?:es)?\b|\b(bodysuit)s?\b',
        'outerwear': r'(?i)\b(?:top)?(coat)s?\b|\b(jacket)s?\b|\b(parka)s?\b|\b(trench)(?:es)?\b|'
                     r'\b(raincoat)s?\b|\b(overcoat)s?\b|\b(blazer)s?\b',
        'jewelry': r'(?i)\b(bracelet)s?\b|\b(brooch)(?:es)?\b|\b(pin)s?\b|\b(cufflink)s?\b|\b(earring)s?\b|'
                   r'\b(necklace)s?\b|\b(ring)s?\b',
        # (linger)(?:ies?)? also accepts "lingerie"/"lingeries"
        'intimate': r'(?i)\b(underwear)s?\b|\b(bra)s?\b|\b(sock)s?\b|\b(sleepwear)s?\b|\b(loungewear)s?\b|'
                    r'\b(boxer)s?\b|\b(brief)s?\b|\b(linger)(?:ies?)?\b|\b(pantie)s?\b',
    }

    # findall() returns one entry per keyword occurrence, so its length is the hit count
    category_word_count_dict = {name: len(re.findall(pattern, user_query))
                                for name, pattern in category_patterns.items()}

    if sum(category_word_count_dict.values()) == 0:
        return ['']

    # max() returns the first category reaching the highest count
    return [max(category_word_count_dict, key=category_word_count_dict.get)]

# detailed clothing category function
def find_sub_category(user_query):
    '''Return every clothing sub-category keyword found in a query.

    Parameters
    ----------
    user_query : str
        Free-text query to scan.

    Returns
    -------
    list of str
        The captured keywords in order of appearance, as written in the
        query (plural endings are not part of the capture). Empty list when
        nothing matches.
    '''

    # Every alternative must be separated by '|': two adjacent alternatives
    # without it fuse into a single pattern that can never match.
    # (linger)(?:ies?)? also accepts "lingerie"/"lingeries" while the captured
    # keyword stays 'linger'.
    all_category_regex = r'(?i)\b(t(?:-)?shirt)s?\b|\b(shirt)s?\b|\b(blouse)s?\b|\b(tank)s?\b|'\
                            r'\b(suit)s?\b|\b(sweatshirt)s?\b|\b(knitwear)s?\b|\b(vest)s?\b|'\
                            r'\b(sneaker)s?\b|\b(footie)s?\b|\b(footwear)s?\b|\b(pump)s?\b|'\
                            r'\b(flat)s?\b|\b(heel)s?\b|\b(boot)s?\b|\b(bootie)s?\b|\b(loafer)s?\b|'\
                            r'\b(mule)s?\b|\b(sandal)s?\b|\b(slipper)s?\b|\b(wedge)s?\b|'\
                            r'\b(jean)s?\b|\b(short)s?\b|\b(pant)s?\b|\b(skirt)s?\b|\b(skort)s?\b|'\
                            r'\b(legging)s?\b|\b(trouser)s?\b|\b(jogger)s?\b|\b(sweater)s?\b|'\
                            r'\b(bag)s?\b|\b(handbag)s?\b|\b(shoulderbag)s?\b|\b(tote)s?\b|\b(clutch)(?:es)?\b|'\
                            r'\b(luggage)s?\b|\b(belt\s?bag)s?\b|\b(beach\s?bag)s?\b|\b(backpack)s?\b|'\
                            r'\b(scarf|scarves)\b|\b(hat)s?\b|\b(belt)s?\b|\b(sunglass)(?:es)?\b|\b(glove)s?\b|'\
                            r'\b(keychain)s?\b|\b(keyring)s?\b|\b(tie)s?\b|\b(phone\s?case)s?\b|'\
                            r'\b(glass)(?:es)?\b|\b(umbrella)s?\b|\b(frame)s?\b|\b(wallet)s?\b|\b(face\s?mask)s?\b|'\
                            r'\b(helmet)s?\b|\b(dress)(?:es)?\b|\b(jumpsuit)s?\b|\b(gown)s?\b|\b(robe)s?\b|'\
                            r'\b(coat)s?\b|\b(jacket)s?\b|\b(parka)s?\b|\b(trench)(?:es)?\b|\b(raincoat)s?\b|'\
                            r'\b(overcoat)s?\b|\b(bracelet)s?\b|\b(brooch)(?:es)?\b|\b(pin)s?\b|\b(cufflink)s?\b|'\
                            r'\b(earring)s?\b|\b(necklace)s?\b|\b(ring)s?\b|\b(underwear)s?\b|\b(bra)s?\b|'\
                            r'\b(sock)s?\b|\b(sleepwear)s?\b|\b(loungewear)s?\b|\b(boxer)s?\b|\b(brief)s?\b|'\
                            r'\b(linger)(?:ies?)?\b|\b(pantie)s?\b|\b(satchel)s?\b|\b(tee)s?\b|'\
                            r'\b(tight)s?\b|\b(cardigan)s?\b|\b(hoodie)s?\b|\b(sweatpant)s?\b|\b(slide)s?\b|'\
                            r'\b(shirtdress)(?:es)?\b|\b(blazer)s?\b|\b(crop)s?\b|\b(leg)s?\b|\b(briefcase)s?\b|'\
                            r'\b(shawl)s?\b|\b(camisole)s?\b|\b(bodysuit)s?\b|\b(turtleneck)s?\b|\b(pouch)(?:es)?\b|' \
                            r'\b(slingback)s?\b'

    # findall() yields one tuple per match with a single non-empty group;
    # keep only that captured keyword.
    subcategory_find = [keyword
                        for match in re.findall(all_category_regex, user_query)
                        for keyword in match
                        if keyword != '']

    return subcategory_find



# gender feature function
def create_gender_feature(user_query):
    '''Return the gender group a query refers to most often.

    Parameters
    ----------
    user_query : str
        Free-text query to scan for gender keywords (matched
        case-insensitively, like the other feature extractors).

    Returns
    -------
    list of str
        A one-element list: 'female', 'male' or 'kid', whichever has the
        most keyword hits. Ties go to the group listed first. `['']` when
        no keyword is found.
    '''

    # gender regex patterns; insertion order is the tie-break order for max()
    gender_patterns = {
        # jewelr(?:y|ies)? accepts "jewelry" as well as "jewelries"
        'female': r'(?i)\b(women|ladies|lady|woman|female|dress(?:es)?|skorts?|skirts?|blouses?|girls?|jewelr(?:y|ies)?|heels?)\b',
        'male': r'(?i)\b(man|men|gentleman|gentlemen)\b',
        'kid': r'(?i)\b(kids?|child(?:ren)?|girls?|boys?|babies|baby|infants?|toddlers?|teenagers?|youths?)\b',
    }

    gender_word_count_dict = {gender: len(re.findall(pattern, user_query))
                              for gender, pattern in gender_patterns.items()}

    if sum(gender_word_count_dict.values()) == 0:
        return ['']

    # max() returns the first group reaching the highest count
    return [max(gender_word_count_dict, key=gender_word_count_dict.get)]


# color feature function
def create_color_feature(user_query): # str -> list of str
    '''Return every color word found in a query.

    Parameters
    ----------
    user_query : str
        Free-text query to scan (case-insensitive).

    Returns
    -------
    list of str
        The matched color words in order of appearance, spelled as they
        appear in the query. Empty list when no color is mentioned.
    '''

    # define color pattern keywords
    # The alternation sits inside a group so that both \b anchors apply to
    # every color; otherwise "tan" would be found inside "tank" or "standing".
    color_pattern = r'(?i)\b(?:Red|Orange|Yellow|Green|Blue|Purple|White|Black|Brown|Magenta|Tan|Olive|Navy|Turquoise|Silver|Lime|Teal|Indigo|Violet|Pink|Gray|Beige|Burgundy|Golden|Cyan|Aquamarine)\b'

    # find all color words from given info
    color_result = re.findall(color_pattern, user_query)

    return color_result

def create_location_feature(user_query):
    '''Return the location words found in a query.

    Full place names are collected first, then the short abbreviations;
    each group keeps its order of appearance in `user_query`.
    '''

    # define location keywords
    place_name_pattern = r'(?i)\b(usa|italy|ethiopia|china|peru|los angeles|nyc|spain|new york|portugal|india|america|kenya|turkey|brazil|ghana|italian|morocco|france|vietnam|germany|lima|mexico|argentina|japan|brooklyn|madagascar|bali|prc|poland)\b'
    # NOTE(review): (?i) makes this also match the ordinary words "us" and
    # "la" — confirm whether the abbreviations were meant to be case-sensitive.
    abbreviation_pattern = r'(?i)\b(LA|US|JP)\b'

    # find all location words from given info
    return re.findall(place_name_pattern, user_query) + re.findall(abbreviation_pattern, user_query)

def create_tags_feature(user_query):
    """Find the known product tags mentioned in a free-text query.

    Parameters
    ----------
    user_query : str
        Raw query text.

    Returns
    -------
    list of str
        Every tag found as a whole word (or whole phrase), in order of
        appearance and with the casing used in the query. When several tags
        could match at the same position the longest one is reported.
    """

    tags_words = ['stripe vertical', 'shoulder bags', 'highneck', 'romantic', 'crossbody', 'sweatshirt', 
                  'denim', 'cotton blend', 'straightregular', 'buttonedback', 'backzipper', 'multiprint', 
                  'sportsbra', 'joggerssweatpants', 'henley', 'sheath', 'dropwaist', 'mohair', 'skinny', 
                  'jerseyknit', 'short at waistline', 'cold weather', 'corduroy', 'tank', 'splitneck', 'vneck',
                  'camisole', 'tiedye', 'shawlcollar', 'straight regular', 'shell', 'satincharmeuse', 
                  'lightbrowns', 'waist', 'roundtoe', 'bandcollar', 'sandals', 'buttondown', 'above waistline', 
                  'plus', 'purewool', 'sleeve', 'flap', 'kitten', 'businesscasualdress', 'down', 'bucketbags', 
                  'classic', 'strap', 'belted', '5pocketpantnondenim', 'boot', 'floral', 'mid length at hips', 
                  'squaretoe', 'edgy', 'tote', 'booties', 'fauxshearling', 'bucket', 'faux shearling', 
                  'lightbrown', 'straight', 'cone', 'oneshoulder', 'squareneck', 'culotte', 'beachbags', 
                  'hoodie', 'pinstripe', 'cases', 'faux leather', 'regular', 'blacks', 'zipup', 'hook', 
                  'twisted', 'fauxfur', 'crepedechine', 'sidezip', 'halfwayzipper', 'stripevertical', 
                  'suede', 'boots', 'tall', 'buttonfront', 'open', 'purecotton', 'trackpants', 'shortsleeve', 
                  'platform', 'sleeveless', 'cut', 'sweatshirthoodie', 'halfwaybuttoned', 'swraps', 'logo', 
                  'zipflywithbutton', 'pumpsheels', 'burgundies', 'twill', 'shawl collar', 'tweed', 
                  'midlengthathips', 'joggers', 'crepe de chine', 'aline', 'buckle', 'satchels', 'wedge', 
                  'capri', 'business casual', 'blouse', 'glam', 'bodycon', 'knit', 'jewel', 'purelinen', 
                  'mandarincollar', 'color block', 'tiefront', 'heels', 'fleece', 'crossbodybags', 'poncho', 
                  'wrap', 'wedges', 'duster', 'opentoe', 'vest', 'retro', 'patent leather', 'turtleneck', 
                  'short sleeve', 'light brown', 'button', 'zipfly', 'grays', 'satin charmeuse', 'multi print',
                  'funnel', 'pantsleggings', 'wideleg', 'halter', 'pinks', 'clasp', 'women', 'silk blend', 
                  'mandarin', 'casualdress', 'velcro', 'scarveswraps', 'stripehorizontal', 'dark brown', 
                  'cropped', 'daytonight', 'canvas', 'block', 'stiletto', 'surplice', 'golds', 'geometric', 
                  'faux fur', 'empirewaist', 'coldweather', 'nightout', 'maternity', 'boyfriend', 'elastane', 
                  'narrow', 'draped', 'laceup', 'silvers', 'baggy', 'sneakersathletic', 'pussybow', 
                  'casual dress', 'animal', 'drawstring', 'casual', 'cowlneck', 'lock', 'inwardcurve', 
                  'under8', 'oranges', 'shirtdress', 'tieback', 'tropical', 'sateen', 'walletscardcases', 
                  'linen blend', 'buttoned', 'belowhips', 'multi', 'day to night', 'highover9', 'plungeneck', 
                  'tee', 'platformflatform', 'tailored', 'polo', 'fauxleather', 'bags', 'linenblend', 
                  'sweatpants', 'slingback', 'androgynous', 'purples', 'sweetheart', 'tie', 'blazerdress', 
                  'purecashmere', 'slit', 'blazerscoatsjackets', 'anklestrap', 'handbags', 'closedtoe', 
                  'strapless', 'pants', 'mules', 'skirts', 'semi fitted', 'low', 'totebags', 'snap', 
                  'fittedtailored', 'sneakers athletic', 'openfront', 'puffsleeve', 'crewneck', 'synthetic',
                  'raisedsole', 'magnetic', 'zip', 'darkbrowns', 'hobobags', 'long', 'cardigan', 'calfhair', 
                  'beach bags', 'pointedtoe', 'maxi', 'peeptoe', 'laces', 'silkblend', 'wide', 'paisley',
                  'shortatwaistline', 'bustier', 'shorts', 'flats', 'colorblock', 'stripe', 
                  'blazers coats jackets', 'laptopsbriefcases', 'peplum', 'beltbagsfannypack', 'patentleather',
                  'cold', 'slipdress', 'shoulderbags', 'round toe', 'croptop', 'cargo', 'shearling', 'shoulder',
                  'midcalf', 'snaps', 'back', 'collar', 'blues', 'leggings', 'hookandloop', 'relaxed', 'greens',
                  'sweater', 'chambray', 'hookloop', 'tshirtdress', 'zipper', 'tieneck', 'longsleeve', 
                  'cashmereblend', 'longbelowhips', 'slim', 'weekend', 'halfway buttoned', 'oversized', 
                  'reds', 'slippers', 'sunglasses', 'modal', 'tiered', 'whites', 'belts', 'sports bra', 
                  'cottonblend', 'high', 'boho', 'stripe horizontal', 'sundress', 'backpacks', 'beiges', 
                  'scarve', 'graphic', 'fannypack', 'sweaterdress', 'flare', 'dots', 'boatneck', 'nondenim', 
                  'businesscasual', 'empire', 'woolblend', 'slides', 'laptops briefcases', 'houndstooth', 
                  'capsleeve', 'pure linen', 'flatform', 'athleisure', 'ponyhair', 'scoopneck', 'calf hair', 
                  'denimjeans', 'none', 'clutchespouches', 'front', 'coldshoulder', 'bodysuit', 
                  'zipflywithhook', 'buttonedfront', 'spandex', 'collared', 'puff', 'vacation', 'abstract', 
                  'side', 'snakeskin', 'yellows', 'monogram', 'keyhole', 'closed toe', 'designer', 
                  'long sleeve', 'backzip', 'chenille', 'asymmetrical', 'fitted', 'tunic', 
                  'croppedabovewaistline', 'active', 'cap', 'pointed', 'camouflage', 'toe', 'frontzip', 
                  'bootcut', 'backless', 'halfway zipper', 'tie dye', 'mockneck', 'offshoulder', 'shift', 
                  'trousers', 'puresilk', 'walletscard', 'workout', 'pumps', 'ankle', 'mulesslides', 
                  'semifitted', '5pocketpant', 'trapezeswing', 'mid89', 'modern', 'work', 'flat', 'plaid', 
                  'belt']

    # De-duplicate, then order longest-first (alphabetical within a length) so
    # the pattern is identical on every run and a phrase such as 'tie dye' is
    # preferred over its prefix 'tie'. A bare set() has no stable order, which
    # made the alternation - and therefore the matches - vary between runs.
    ordered_tags = sorted(set(tags_words), key=lambda word: (-len(word), word))

    # create tags regex; the group makes both \b anchors apply to every tag
    # (ungrouped, only the first and last alternatives were anchored and tags
    # matched inside longer words, e.g. 'cap' in 'captain')
    super_regex = r"(?i)\b(?:" + "|".join(re.escape(word) for word in ordered_tags) + r")\b"

    # find all tags words from query
    tags_result = re.findall(super_regex, user_query)

    return tags_result

def create_material_feature(user_query):
    """Return the material words found in a query.

    Matching is case-insensitive and whole-word; results are listed in order
    of appearance with the casing used in the query.
    """
    # known material keywords
    material_terms = [
        'acetate', 'acrylic', 'alpaca', 'calf', 'cashmere','chiffon', 'cotton','kidskin',
        'lamb', 'lambskin', 'leather','linen', 'lyocell','mercerized', 'merino','nylon',
        'organic','peruvian', 'pima','poly', 'polyamide','polyester', 'polyurethane',
        'ramie', 'rayon','rubber', 'silk','supima', 'tencel', 'triacetate', 'uv', 'uva',
        'velvet', 'virgin', 'viscose', 'wood', 'wool','rose sylk', 'cotton',
    ]

    alternation = "|".join(material_terms)
    finder = re.compile(r"(?i)\b(" + alternation + r")\b")

    # group 1 is the whole keyword, which is what findall() would hand back
    return [hit.group(1) for hit in finder.finditer(user_query)]

def word_vectorizer(user_query):
    """Turn a query into a one-row bag-of-words DataFrame.

    Tokens are words of two or more characters that do not start with a
    digit; English stop words are dropped.

    Parameters
    ----------
    user_query : str
        Raw query text.

    Returns
    -------
    pd.DataFrame
        One row of token counts with one column per token. When the query
        yields no tokens at all (empty, or stop words only) the frame has one
        row and no columns instead of raising.
    """
    vectorizer = CountVectorizer(stop_words='english', token_pattern=r'(?u)\b(?![0-9])\w\w+\b')
    try:
        X = vectorizer.fit_transform([user_query])
    except ValueError as err:
        # scikit-learn reports a query without any usable token as
        # "empty vocabulary; ..."; treat that as "no words" rather than an error.
        if 'empty vocabulary' not in str(err):
            raise
        return pd.DataFrame(index=[0])

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer its replacement and fall back only on older installations.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    word_vector = pd.DataFrame(X.toarray(), columns=get_names())

    return word_vector

### Custom Outfit Combination Assignment

In [9]:
# Load the product table, the curated outfits and the per-product category labels.
import pandas as pd

outfit_df = pd.read_csv('data/outfit_combinations USC.csv')

# Missing labels are filled on the label table itself, before it is joined.
feature_columns = pd.read_csv('data/feature_columns.csv', index_col=0).fillna(
    value={'clothing_category': 'uncategorized', 'clothing_subcategory': ''}
)

# Left join: every product is kept, with its labels attached where available.
df = pd.read_csv('data/rec_product.csv', index_col=0).merge(
    feature_columns, how='left', on='product_id'
)

# Lower-cased "<category>-<subcategory>" key used for the rule-based outfit lookup.
df = df.assign(
    cat_subcat=lambda frame: frame['clothing_category'].str.lower() + '-' + frame['clothing_subcategory'].str.lower()
)

In [10]:
# define a function to find all existing outfit combination for products
def outfit_combo(outfit, product_df):
    """Attach one curated outfit to every product that appears in an outfit.

    For each distinct `product_id` in `outfit` that is also present in
    `product_df`, the outfit_id with the most companion rows is chosen and its
    other items are collected into a dict
    ``{outfit_item_type: "<product_full_name> (<product_id>)"}``. Items with
    the same item type as the product itself are left out, and when an outfit
    holds several items of one type the last one wins. The product itself is
    then placed first in the dict under its `clothing_category`.

    Parameters
    ----------
    outfit : pd.DataFrame
        Reads columns `outfit_id`, `product_id`, `outfit_item_type` and
        `product_full_name`. NOTE: `outfit_item_type` is overwritten on the
        frame that is passed in (accessory1/2/3 become 'accessory').
    product_df : pd.DataFrame
        Reads columns `product_id`, `name` and `clothing_category`.

    Returns
    -------
    pd.DataFrame
        Columns `product_id`, `name`, `clothing_category`, `outfit`, one row
        per product that has a curated outfit; `outfit` holds the dict.
    """
    # change all accessories to 'accessory'
    outfit['outfit_item_type'] = outfit['outfit_item_type'].\
                        apply(lambda x: 'accessory' if x in ['accessory1','accessory2', 'accessory3'] else x)

    # Keep only outfit rows whose product exists in product_df. Only the first
    # three columns are kept; the code below needs them to be outfit_id,
    # product_id and outfit_item_type, so this relies on the column order of
    # `outfit`.
    merged = outfit.merge(product_df, how='inner', on='product_id').iloc[:,:3]
    # Pair every such product (_x) with every item (_y) of the same outfit.
    merged2 = merged.merge(outfit, how='inner', on='outfit_id')
    # Drop pairs of equal item type (this also drops the product paired with itself).
    merged3 = merged2.loc[merged2['outfit_item_type_x'] != merged2['outfit_item_type_y'],]
    product_id = outfit['product_id'].drop_duplicates()
    # Result table, grown one row per product inside the loop below.
    new_df = pd.DataFrame(0, columns = ['product_id'], index=range(1))
    row = 0
    for p_id in product_id:
        # NOTE: `df` is a local here and is unrelated to the notebook-level `df`.
        df = merged3.loc[merged3['product_id_x'] == p_id,]
        if len(df) == 0:
            new_df.loc[row, 'product_id'] = p_id
            new_df.loc[row, 'outfit'] = 'NO MATCH'
        else:
            # first entry of value_counts(): the outfit with the most companion rows
            outfitid = df['outfit_id'].value_counts().index[0]
            temp = df.loc[df['outfit_id']==outfitid,['product_id_y','outfit_item_type_y','product_full_name']].reset_index(drop=True)
            outfit_dict = dict()
            
            # one entry per item type; a later row of the same type overwrites an earlier one
            for i in range(temp.shape[0]):
                outfit_dict[temp.loc[i,'outfit_item_type_y']] = str(temp.loc[i,'product_full_name'])+' ('+str(temp.loc[i, 'product_id_y'])+')'
                
            new_df.loc[row,'product_id']=p_id
            # The dict is wrapped in a list; append_self() below unwraps it with [0].
            # NOTE(review): how pandas stores a list assigned to a single cell may
            # depend on the pandas version - confirm when upgrading.
            new_df.loc[row, 'outfit'] = [outfit_dict]

        row += 1

    df_with_outfit = new_df[new_df['outfit']!='NO MATCH']
    
    product_df = product_df.merge(df_with_outfit, how='left', on='product_id')

    # NOTE: this local shadows the function's own name inside its body.
    outfit_combo = product_df[product_df['outfit'].notna()][['product_id','name',
                                                             'clothing_category','outfit']].reset_index(drop=True)
    
    # add in product
    def append_self(df):
        """Return the row's outfit dict with the row's own product placed first.

        `df` is a single row of the table above (called via apply(axis=1)).
        """
        self_dict = df['outfit'][0]
        order_dict = {}
        # assumes `name` and `product_id` are strings (plain concatenation) - TODO confirm
        order_dict[df['clothing_category']] = df['name']+' ('+df['product_id']+')'
        for key in self_dict:
            if key not in order_dict:
                order_dict[key] = self_dict[key]
        return order_dict

    outfit_combo['outfit'] = outfit_combo.apply(append_self, axis=1)
    
    return outfit_combo

# Build the curated-outfit table and save it. From this line on the name
# `outfit_combo` refers to the resulting DataFrame, no longer to the function.
outfit_combo = outfit_combo(outfit=outfit_df, product_df=df)
outfit_combo.to_excel(excel_writer='data/outfit_comb.xlsx')

### Query Function Part I. Product ID Extraction

In [11]:
def search_product(user_query: str):
    """Return the product_id of the catalogue item that best matches a query.

    The query is turned into a 0/1 vector over the columns of
    'data/all_vectors.csv' (category, gender, colour, location, tag, material
    and sub-category keywords plus every remaining word of the query). The
    products of the detected clothing category are then ranked by cosine
    similarity to that vector.

    Parameters
    ----------
    user_query : str
        Free-text description of the wanted product.

    Returns
    -------
    The `product_id` value of the best-scoring product. Ties go to the product
    that comes first in the file. If nothing in the query maps onto a feature
    column every score is 0 and the first product is returned.

    Notes
    -----
    Both CSV files are re-read on every call. Rows are addressed by position,
    which assumes 'rec_product.csv' and 'all_vectors.csv' list the same
    products in the same order under a 0..n-1 index.
    """
    import numpy as np  # the original imported numpy locally as well

    # load existing feature vectors
    product = pd.read_csv('data/rec_product.csv', index_col=0)
    final_vectors = pd.read_csv('data/all_vectors.csv', index_col=0)

    # query vector: a flat array plus a column-name -> position lookup
    column_position = {name: pos for pos, name in enumerate(final_vectors.columns)}
    query = np.zeros(len(final_vectors.columns))

    def flag(name):
        """Set the query entry for `name` to 1 if it is a known feature column."""
        pos = column_position.get(name)
        if pos is not None:
            query[pos] = 1

    def is_flagged(name):
        """True when `name` is a known feature column and is set in the query."""
        pos = column_position.get(name)
        return pos is not None and query[pos] == 1

    # custom functions (see above)
    cloth_result = create_clothing_category_feature(user_query)
    subcategory_result = find_sub_category(user_query)
    gender_result = create_gender_feature(user_query)
    color_result = create_color_feature(user_query)
    location_result = create_location_feature(user_query)
    tags_result = create_tags_feature(user_query)
    material_result = create_material_feature(user_query)
    word_vector = word_vectorizer(user_query)

    # concat all results and map them onto the query vector
    query_feature = cloth_result + gender_result + color_result + location_result + tags_result \
                    + material_result + subcategory_result

    for name in query_feature:
        flag(name)

    for name in word_vector.columns:
        flag(name)

    # regroup accessory groups: any of these counts as an accessory query
    if any(is_flagged(col) for col in ('accessory', 'bag', 'intimate', 'outerwear', 'jewelry')):
        flag('accessory')

    # determine the group the query belongs to; when several are set the
    # last one in this list wins
    query_cat = ''
    for col in ['uncategorized', 'accessory', 'top', 'bottom', 'shoe', 'onepiece']:
        if is_flagged(col):
            query_cat = col

    # restrict the search to that category; without a detected category search
    # the whole catalogue (previously this failed with KeyError on product[''])
    n_products = len(final_vectors)
    if query_cat and query_cat in product.columns:
        feature_index = final_vectors.loc[product[query_cat] == 1].index
        in_category = np.isin(np.arange(n_products), feature_index)
    else:
        in_category = np.ones(n_products, dtype=bool)

    # a product is only scored when it carries at least as many features as the query
    matrix = final_vectors.to_numpy(dtype=float)
    eligible = in_category & (matrix.sum(axis=1) >= query.sum())

    # cosine similarity for the eligible rows; everything else scores 0, and
    # zero-length vectors are skipped instead of producing 0/0 = NaN
    denominators = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    usable = eligible & (denominators > 0)
    scores = np.zeros(n_products)
    scores[usable] = (matrix[usable] @ query) / denominators[usable]

    # best match in the searched category
    best_position = int(np.argmax(scores))

    return product.loc[[best_position], 'product_id'].values[0]

### Query Function Part II. Matching Outfit Extraction

In [12]:
# final custom outfit subcategory combination
# Keys and values are "<clothing_category>-<clothing_subcategory>" strings in the
# same form as df['cat_subcat']. cat_subcat_rec() looks up a product's key here
# and draws one random product for each listed value; a value that matches no
# product in df is skipped there, and a key missing from this dict yields no
# recommendations.
cat_subcat_dict = {
    'top-tank': ['bottom-short', 'shoe-sneaker', 'accessory-backpack'],
    'top-blouse': ['bottom-leg', 'shoe-boot', 'accessory-tie'],
    'top-shirt': ['bottom-leg', 'shoe-boot'],
    'top-sweater': ['bottom-leg', 'shoe-heel', 'accessory-clutch'],
    'top-tie': ['bottom-leg', 'shoe-boot', 'accessory-coat'],
    'top-cardigan': ['bottom-leg', 'shoe-heel', 'accessory-coat'],
    'top-short': ['bottom-short', 'shoe-slide', 'accessory-handbag'],
    'top-camisole': ['bottom-leg', 'shoe-boot', 'accessory-cardigan'],
    'top-tee': ['bottom-skirt', 'shoe-slide', 'accessory-handbag'],
    'top-suit': ['bottom-skirt', 'shoe-sneaker', 'accessory-shirt'],
    'top-turtleneck': ['bottom-pant', 'shoe-heel', 'accessory-blazer'],
    'top-dress': ['bottom-trench', 'shoe-slingback', 'accessory-heel'],
    'top-jumpsuit': ['shoe-boot'],
    'top-scarf': ['bottom-skirt', 'shoe-pump', 'accessory-coat'],
    'top-blazer': ['bottom-leg', 'shoe-slingback', 'accessory-briefcase'],
    'top-hoodie': ['bottom-short', 'shoe-sandal', 'accessory-tote'],
    'top-frame': ['bottom-leg', 'accessory-jacket'],
    'top-belt': ['bottom-leg', 'shoe-slingback', 'accessory-handbag'],
    'top-sweatshirt': ['bottom-pant', 'shoe-slide', 'accessory-tote'],
    'top-skirt': ['bottom-leg', 'shoe-pump', 'accessory-scarf'],
    'top-bodysuit': ['bottom-skirt', 'shoe-heel', 'accessory-tie'],
    'top-crop': ['bottom-short', 'shoe-slide', 'accessory-jean'],
    'top-bra': ['bottom-short', 'shoe-sneaker', 'accessory-backpack'],
    'bottom-skirt': ['top-shirt', 'shoe-boot', 'accessory-cardigan'],
    'bottom-pant': ['top-shirt', 'shoe-slide', 'accessory-cardigan'],
    'bottom-crop': ['top-shirt', 'shoe-boot', 'accessory-sweater'],
    'bottom-trouser': ['top-shirt', 'shoe-heel', 'accessory-jacket'],
    'bottom-leg': ['top-shirt', 'shoe-boot', 'accessory-tie'],
    'bottom-jean': ['top-shirt', 'shoe-boot', 'accessory-coat'],
    'bottom-jacket': ['shoe-boot'],
    'bottom-turtleneck': ['top-dress'],
    'bottom-dress': ['top-turtleneck', 'shoe-bootie', 'accessory-trench'],
    'bottom-jogger': ['top-tank', 'shoe-slide', 'accessory-hoodie'],
    'bottom-sweatpant': ['top-tank', 'shoe-sandal', 'accessory-hoodie'],
    'bottom-legging': ['top-shirt', 'shoe-sneaker', 'accessory-sweatshirt'],
    'bottom-short': ['top-tank', 'shoe-slide', 'accessory-sweatshirt'],
    'bottom-boot': ['top-shirt', 'shoe-pump', 'accessory-blazer'],
    'bottom-tight': ['top-tank', 'shoe-sneaker', 'accessory-sweatshirt'],
    'shoe-pump': ['top-turtleneck', 'accessory-cardigan', 'bottom-leg'],
    'shoe-flat': ['top-turtleneck', 'bottom-skirt', 'accessory-sunglass'],
    'shoe-heel': ['top-tee', 'bottom-leg', 'accessory-tote', 'onepiece-dress'],
    'shoe-boot': ['top-shirt', 'bottom-leg', 'accessory-jacket', 'onepiece-dress'],
    'shoe-slingback': ['top-belt', 'bottom-jean', 'accessory-scarf', 'onepiece-dress'],
    'shoe-frame': ['accessory-tote', 'onepiece-dress'],
    'shoe-flat sandal': ['top-blouse'],
    'shoe-bootie': ['top-cardigan', 'bottom-pant', 'accessory-cardigan', 'onepiece-dress'],
    'shoe-sandal': ['top-turtleneck', 'bottom-skirt', 'accessory-tote', 'onepiece-dress'],
    'shoe-sneaker': ['top-sweatshirt', 'bottom-leg', 'accessory-hoodie', 'onepiece-bodysuit'],
    'shoe-heel mule': ['top-shirt', 'accessory-handbag'],
    'shoe-tote': ['top-turtleneck', 'accessory-heel'],
    'shoe-slide': ['top-tee', 'bottom-pant', 'accessory-handbag', 'onepiece-dress'],
    'shoe-loafer': ['top-sweater', 'bottom-jean', 'accessory-handbag'],
    'shoe-heel pump': ['top-turtleneck', 'bottom-leg'],
    'shoe-mule': ['top-sweater', 'bottom-leg', 'accessory-coat', 'onepiece-dress'],
    'shoe-heel loafer': ['top-belt', 'bottom-skirt', 'accessory-tote', 'accessory-sunglass'],
    'shoe-heel bootie': ['top-shirt', 'bottom-leg', 'accessory-jacket', 'onepiece-dress'],
    'shoe-tie': ['accessory-scarf', 'bottom-skirt', 'top-tee'],
    'shoe-heel slide': ['accessory-coat', 'bottom-leg', 'accessory-handbag'],
    'shoe-slingback sandal': ['top-shirt', 'bottom-jean'],
    'shoe-heel flat': ['top-turtleneck', 'bottom-leg', 'accessory-handbag', 'accessory-sunglass'],
    'shoe-slide sandal': ['top-shirt', 'bottom-skirt', 'accessory-jacket', 'onepiece-dress'],
    'accessory-satchel': ['bottom-pant', 'top-tank', 'shoe-pump'],
    'accessory-clutch': ['bottom-leg', 'top-sweater', 'shoe-boot', 'onepiece-dress'],
    'accessory-tote': ['bottom-jean', 'top-sweatshirt', 'shoe-mule', 'onepiece-dress'],
    'accessory-jacket': ['bottom-pant', 'shoe-boot', 'onepiece-dress'],
    'accessory-sweater': ['bottom-jean', 'top-camisole', 'shoe-heel'],
    'accessory-handbag': ['bottom-skirt', 'top-belt', 'shoe-sneaker'],
    'accessory-trench': ['bottom-leg', 'top-turtleneck', 'shoe-sandal', 'onepiece-dress'],
    'accessory-coat': ['bottom-jean', 'top-sweater', 'shoe-boot', 'onepiece-dress'],
    'accessory-cardigan': ['bottom-leg', 'top-shirt', 'shoe-boot', 'onepiece-dress'],
    'accessory-scarf': ['bottom-leg', 'top-belt', 'shoe-boot', 'onepiece-bodysuit'],
    'accessory-wallet': ['shoe-heel', 'onepiece-dress'],
    'accessory-shirt': ['bottom-skirt', 'top-sweater', 'shoe-mule', 'onepiece-dress'],
    'accessory-blazer': ['bottom-leg', 'top-turtleneck', 'shoe-sandal', 'onepiece-dress'],
    'accessory-shawl': ['bottom-skirt', 'top-blouse', 'shoe-boot', 'onepiece-dress'],
    'accessory-tie': ['bottom-leg', 'top-blouse', 'shoe-pump'],
    'accessory-briefcase': ['bottom-skirt', 'top-tee', 'shoe-slingback'],
    'accessory-backpack': ['bottom-jean', 'top-tank', 'shoe-sneaker', 'onepiece-bodysuit'],
    'accessory-hoodie': ['bottom-pant', 'top-tank', 'shoe-sneaker'],
    'accessory-bra': ['bottom-leg', 'top-sweatshirt', 'shoe-sneaker'],
    'accessory-camisole': ['bottom-leg', 'top-turtleneck', 'shoe-pump', 'onepiece-jumpsuit'],
    'accessory-tee': ['bottom-jean', 'top-shirt', 'shoe-sneaker'],
    'accessory-vest': ['bottom-skirt', 'top-t-shirt', 'shoe-boot'],
    'accessory-tank': ['bottom-skirt', 'top-shirt', 'shoe-pump'],
    'accessory-jean': ['bottom-skirt', 'top-crop', 'shoe-flat'],
    'accessory-frame': ['bottom-skirt', 'top-sweater', 'shoe-mule'],
    'accessory-pouch': ['bottom-jean', 'top-sweater', 'shoe-boot', 'onepiece-dress'],
    'accessory-turtleneck': ['bottom-leg', 'shoe-boot', 'onepiece-dress'],
    'accessory-sunglass': ['shoe-pump', 'onepiece-dress'],
    'accessory-dress': ['bottom-leg', 'top-blouse', 'shoe-sandal'],
    'accessory-belt': ['bottom-leg', 'top-turtleneck', 'shoe-bootie', 'onepiece-dress'],
    'accessory-suit': ['bottom-skirt', 'shoe-boot'],
    'accessory-sweatshirt': ['bottom-tight', 'top-tee', 'shoe-sneaker'],
    'onepiece-dress': ['shoe-boot'],
    'onepiece-jumpsuit': ['shoe-boot', 'accessory-jacket'],
    'onepiece-shirt': ['accessory-cardigan', 'shoe-slide'],
    'onepiece-bodysuit': ['accessory-scarf', 'shoe-slide'],
    'onepiece-leg': ['accessory-jacket', 'shoe-boot'],
    'onepiece-tie': ['shoe-pump', 'accessory-turtleneck'],
    'onepiece-dress shirt': ['accessory-tote', 'shoe-flat'],
    'onepiece-shirtdress': ['accessory-tote', 'shoe-flat'],
    'onepiece-short': ['accessory-sunglass', 'shoe-pump']
}

In [13]:
# define a function to recommend outfit based on category and subcategory
def cat_subcat_rec(product_id:str, random_state=None):
    """Build a rule-based outfit around one product.

    The product's `cat_subcat` key is looked up in `cat_subcat_dict`; for each
    companion key listed there one product with that key is drawn at random
    from `df`. Companion keys with no products in `df` are skipped, and a
    product whose key is not in `cat_subcat_dict` gets no companions.

    Parameters
    ----------
    product_id : str
        Id of the product to dress; must be present in `df['product_id']`.
    random_state : optional
        Passed to `DataFrame.sample` for every draw. Leave as None (default)
        for a fresh random pick per call, or pass e.g. an int to make the
        recommendation reproducible.

    Returns
    -------
    dict
        ``{clothing_category: "<name> (<product_id>)"}``, starting with the
        queried product. Two companions of the same clothing category share a
        key, so the later one replaces the earlier one.

    Raises
    ------
    IndexError
        If `product_id` is not found in `df`.
    """
    # search for product id with other information
    match = df.loc[df['product_id'] == product_id, ['clothing_category', 'name', 'cat_subcat']]
    if match.empty:
        # same exception type the bare `.values[0]` used to raise, with a usable message
        raise IndexError('product_id ' + repr(product_id) + ' not found in df')

    pname = str(match['name'].values[0])
    mcat = str(match['clothing_category'].values[0])
    subcat = str(match['cat_subcat'].values[0])

    # add query product info
    new_dict = {mcat: pname + ' (' + product_id + ')'}

    # add recommended outfit info to dictionary
    for cat in cat_subcat_dict.get(subcat, []):
        candidates = df.loc[df['cat_subcat'] == cat, ['product_id', 'name', 'clothing_category']]
        if candidates.empty:
            continue
        pick = candidates.sample(1, random_state=random_state)
        new_dict[str(pick['clothing_category'].values[0])] = str(pick['name'].values[0]) + \
                                                              ' (' + str(pick['product_id'].values[0]) + ')'
    return new_dict

In [14]:
# define a function to determine which outfit dictionary to draw results from
def search_outfit(product_id:str):
    """Return the outfit dict for a product.

    A curated outfit from the `outfit_combo` table is used when the product
    has one; otherwise the rule-based recommendation from cat_subcat_rec()
    is returned.
    """
    curated = outfit_combo.loc[outfit_combo['product_id'] == product_id, 'outfit']
    if len(curated) > 0:
        return curated.values[0]
    return cat_subcat_rec(product_id)

### Query Function Part III. Final Outfit Extraction

In [15]:
# final search query function
def search(user_query: str):
    """
    Recommend an outfit for a free-text query.

    The query is first resolved to the best-matching product id by
    search_product(); search_outfit() then supplies the outfit for that
    product. The result is a dictionary keyed by clothing category, e.g.
    search("pleated casual skirt") -> {
    "top": "...",
    "bottom": "...",
    "shoe": "..."
    }
    """
    return search_outfit(search_product(user_query))

## Phase I. Testing

In [16]:
# Phase I sample queries. Each of these names is assigned again in its own
# cell below, right before it is searched, so the later assignment is the
# value actually used; note that test_4 is re-assigned there as
# "black high heels" rather than the wording given here.
test_1 = "Two medium cotton blue shirt with cartoon pictures"
test_2 = "7 For All Mankind slim fitting, straight leg pant with a center back zipper and slightly cropped leg"
test_3 = "white and black coat with bar pattern"
test_4 = "high heels in black color"
test_5 = "footies with opentoe, black and white strip, good ploy material"
test_6 = 'white oversized graphic-print silk cotton shirt'
test_7 = "red heels with white stripes for party wearing"
test_8 = "Slim Knit Skirt"
test_9 = "white jeans with green pattern and with blue strips red stars"

In [17]:
test_1 = "Two medium cotton blue shirt with cartoon pictures"
df.loc[df['product_id']==search_product(test_1), 'name'].values[0]

'SP21-W0928-12 Sara Henley Shirt BLUE PINSTRIPE'

In [18]:
test_2 = "7 For All Mankind slim fitting, straight leg pant with a center back zipper and slightly cropped leg"
df.loc[df['product_id']==search_product(test_2), 'name'].values[0]

'Monterey Pant'

In [19]:
test_3 = "white and black coat with bar pattern"
df.loc[df['product_id']==search_product(test_3), 'name'].values[0]

'Chevron Fur Coat'

In [20]:
test_4 = "black high heels"
df.loc[df['product_id']==search_product(test_4), 'name'].values[0]

'PATRON II Black Tweed'

In [21]:
test_5 = "footies with opentoe, black and white strip, good ploy material"
df.loc[df['product_id']==search_product(test_5), 'name'].values[0]

'TANNIS Black/White'

In [22]:
test_6 = 'white oversized graphic-print silk cotton shirt'
df.loc[df['product_id']==search_product(test_6), 'name'].values[0]

'T-Shirt M.I.L Off White'

In [23]:
test_7 = "red heels with white stripes for party wearing"
df.loc[df['product_id']==search_product(test_7), 'name'].values[0]

'Hush Heel in Multi Crystal'

In [24]:
test_8 = "Slim Knit Skirt"
df.loc[df['product_id']==search_product(test_8), 'name'].values[0]

'Aquinnah Skirt'

In [25]:
test_9 = "white jeans with green pattern and with blue strips red stars"
df.loc[df['product_id']==search_product(test_9), 'name'].values[0]

'Navy & Army Skater Jean - Size 25'

In [26]:
test_10 = "Stretch Cotton & Linen Wide Leg Trousers"
df.loc[df['product_id']==search_product(test_10), 'name'].values[0]

'Stretch Cotton & Linen Wide Leg Trousers'

## Phase II. Testing

In [27]:
# Phase II sample queries: each one is passed to search() in the cells below
# to retrieve a full outfit.
test_1 = "Two medium cotton blue shirt with cartoon pictures"
test_2 = "7 For All Mankind slim fitting, straight leg pant with a center back zipper and slightly cropped leg"
test_3 = "white and black coat with bar pattern"
test_4 = "high heels in black color"
test_5 = "footies with opentoe, black and white strip, good ploy material"
test_6 = 'white oversized graphic-print silk cotton shirt'
test_7 = "red heels with white stripes for party wearing"
test_8 = "Slim Knit Skirt"
test_9 = "white jeans with green pattern and with blue strips red stars"
test_10 = "Stretch Cotton & Linen Wide Leg Trousers"

In [28]:
search(test_1)

{'top': 'SP21-W0928-12 Sara Henley Shirt BLUE PINSTRIPE (01EWXM6E9SHSB4H481CRYPAYF9)',
 'bottom': 'Cropped Alexa with Cut Off Hem (01EMPJFBFDZ02Y4AZ9KQP14S76)',
 'shoe': 'Wellington Leather Chelsea Boot (01F3TFFMM349FGCBQWSGE332J1)'}

In [29]:
search(test_2)

{'bottom': 'Monterey Pant (01EXSSWZS7A922NNNA9Y2E272H)',
 'top': 'Natalie 1.0 Top - Ivory (01EYXY45PHY0W5T9S0J6TS5HJJ)',
 'shoe': 'Spinner Slide Sandal (01E2P3PNQE59KXHZHW179DZF67)',
 'accessory': 'Fringed wool, silk and cashmere-blend jacquard cardigan (01DV6M9BHK039WP9730M31DEPA)'}

In [30]:
search(test_3)

{'accessory': 'Chevron Fur Coat (01EPRS18BCZZBQBX9SHRXTVR50)',
 'bottom': 'Luxe Vintage Edie with Destroy and Grinded Hem (01EHAXTTPH6YBP0RZR0NTB7BSF)',
 'top': 'Saylor (01E4VCEZH1BXQA4WAKPEBAYEPX)',
 'shoe': 'SeaVees Mariners Boot - Peacock Navy (01EZG68JQF7YJ6AXF9812BNDBR)',
 'onepiece': 'Silvana Dress Black (01EPX375HER9D0DN1T4XYXK8S6)'}

In [31]:
search(test_4)

{'shoe': 'PATRON II Black Tweed (01E96ECH71F540T0CEWS3D54YN)',
 'top': 'Rincon Ringer Tee - White (01F1JWZ9ZBBZP7T6GZ7HEP9X76)',
 'bottom': 'The Rambler Ankle Fray Groovin (01EH5A8TKY83AXZKE6VQK8PR7A)',
 'accessory': 'Shoulder Strap (01EPAVVQ4SHWJZQY1SPYC81Q0C)',
 'onepiece': 'Ala Dress (01EW3HD11GMV2R5F1AYBNP2TQ2)'}

In [32]:
search(test_5)

{'shoe': 'TANNIS Black/White (01EC8NEP0CQSX3N3BDP89KESX6)',
 'top': 'BELLA TURTLENECK (01ED9MHGB1V7Q19KFHTH9BEDDK)',
 'bottom': 'Deana Pleated High/Low Maxi Skirt (01E1JKSQA44HYZ5ACGEP7QJQ0R)',
 'accessory': 'SQUARE TOTE Saddle (01EEBJ0XK0KQKT24W7P64FT97J)',
 'onepiece': 'Jade (01E4EASQMQN65QFAM0PD5QGVVZ)'}

In [33]:
search(test_6)

{'top': 'T-Shirt M.I.L Off White (01EWHDJG0DKSFE593AH9XZ26TV)',
 'bottom': 'The Trickster Ankle On Holy Ground (01EH58S7H9GJ2J69RW1W0XB1X9)',
 'shoe': 'Oceane Lace-Up Shearling-Lined Leather Boots (01EAJAYCA0JJBBWHRF0QNS19NF)'}

In [34]:
search(test_7)

{'shoe': 'Hush Heel in Multi Crystal (01EGVFXQW1Q0DFM0M77T8DBNXC)',
 'top': 'Original Fit Tee (01EPAVESFN6KWRE8JE5BD5S4ZT)',
 'bottom': 'Samar (01ET90MMP63RJHC8V06TAGNSNW)',
 'accessory': 'Mini OHare Italian Nylon Tote (01F3E1KD8VGX5F5DJBP76YZY6T)',
 'onepiece': 'Makena Mini Dress (01ED59ES47VHRDA0AP921W8K8S)'}

In [35]:
search(test_8)

{'bottom': 'Aquinnah Skirt (01EN1A83EAYY8PA8F02GYTZR99)',
 'top': 'LENNOX - CHARCOAL BLACK (01ENBM6JW2TC3RVS4FMB8HQ9SB)',
 'shoe': 'Point Toe Boot Black Calf (01EJS26A4G6GNR0ZTFFZG1YFF7)',
 'accessory': 'Paloma Duster Cardigan - Morning Star Jacquard (01EZG4K0HF0JYH9T3NWF25AF0X)'}

In [36]:
search(test_9)

{'bottom': 'Navy & Army Skater Jean - Size 25 (01F38RMNZVCNTXYJ4E8HR74NCS)',
 'top': 'Lagarde 3.0 Shirt - Rain Cloud (01EYXZH0YRJBRMPGB6DY1SS0D4)',
 'shoe': 'Catania Oat Chunky Boots (01EM02E3RF07HSBMFDYYHE7YGG)',
 'accessory': 'Blue Yaba Cotton Kente Hand-Woven Single - Breasted Coat (01EC8KD9W5VCQ8DHTGJPXKCBP2)'}

In [37]:
search(test_10)

{'bottom': 'Stretch Cotton & Linen Wide Leg Trousers (01E1JM0DZ54QBW4M51K7REKMDE)',
 'shoe': 'Erin Pointy Toe Slingback Pump (01E1JM1BGF75MDESXW4Q412Y8X)',
 'top': 'Textured Stripe Removable Tie Neck Blouse (01E2C4XTP7JXPA5SJ0VJMCE4T5)'}