# Import packages and files

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import LinearSVC
from scipy.spatial.distance import pdist, squareform, cosine
from collections import OrderedDict


import nltk
nltk.download('punkt') # A popular NLTK sentence tokenizer
nltk.download('stopwords') # library of common English stopwords
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer
nltk_stopwords=set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# display all columns
pd.set_option('display.max_columns',None)
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
!python -m spacy download en_core_web_md
import en_core_web_md
import spacy
nlp = en_core_web_md.load()

Collecting en_core_web_md==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4MB)
[K     |████████████████████████████████| 96.4MB 1.2MB/s 
Building wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.2.5-cp37-none-any.whl size=98051305 sha256=494f3c070281dae8ec2b7dc04415d3a9300e1175b08a914d0f92aa317f3f6823
  Stored in directory: /tmp/pip-ephem-wheel-cache-cotih3xn/wheels/df/94/ad/f5cf59224cea6b5686ac4fd1ad19c8a07bc026e13c36502d81
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [4]:
from google.colab import files

uploaded = files.upload()

Saving outfit_combinations USC.csv to outfit_combinations USC (1).csv
Saving product.xlsx to product.xlsx


In [5]:
# import excel file 2
df2 = pd.read_excel('product.xlsx')
# import excel file 3
df3 = pd.read_csv('outfit_combinations USC.csv')

# Drop Columns
Columns 'brand_canonical_url', 'created_at' are not useful in our analysis. Using Cosine Similarity we also found that 'brand_name' and 'name', 'brand_description' and 'description' are extremely similar thus we will drop one of the two

In [6]:
df2 = df2.drop(['brand_canonical_url','created_at','brand_description','brand_name'], axis=1)

# Predict outfit_item_type for all products
From the outfit combinations provided in file 3 we can see that a type (column 'outfit_item_type') was attributed each product. This type is not available in the product file. We will use the description of products to predict the type of the product.

In [7]:
#concatenate brand, brand_category, name, details, description
cols=['brand', 'brand_category', 'name', 'details', 'description']
df2['product_description_combined']=df2[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
df2.head()

Unnamed: 0,product_id,brand,brand_category,name,details,description,product_active,product_description_combined
0,01EX0PN4J9WRNZH5F93YEX6QAF,Two,Unknown,Khadi Stripe Shirt-our signature shirt,,"Our signature khadi shirt\navailable in black and white\nEasy to wear from beach to city. We promise this top will be your go-to warm\nweather item. Perfect under a blazer. Hand loomed woven stripe in khadi cotton.\n Slightly sheer and gets softer with every wash. Ships First week of April\nCOLOR: WHITE OR BLACK\n— Length 27"" and Width 26.5"" — One size fits most — Grid khadi cotton",True,"Two Unknown Khadi Stripe Shirt-our signature shirt nan Our signature khadi shirt\navailable in black and white\nEasy to wear from beach to city. We promise this top will be your go-to warm\nweather item. Perfect under a blazer. Hand loomed woven stripe in khadi cotton.\n Slightly sheer and gets softer with every wash. Ships First week of April\nCOLOR: WHITE OR BLACK\n— Length 27"" and Width 26.5"" — One size fits most — Grid khadi cotton"
1,01F0C4SKZV6YXS3265JMC39NXW,Collina Strada,Unknown,RUFFLE MARKET DRESS LOOPY PINK SISTINE TOMATO,,Mid-length dress with ruffles and adjustable straps. Bias cut. Side seam\ninvisible zipper\nMade in New York\nModel wears size small\n100% Rose sylk\nRose sylk is an organic cellulose fiber made from the natural waste of rose\nbushes and stems.,True,Collina Strada Unknown RUFFLE MARKET DRESS LOOPY PINK SISTINE TOMATO nan Mid-length dress with ruffles and adjustable straps. Bias cut. Side seam\ninvisible zipper\nMade in New York\nModel wears size small\n100% Rose sylk\nRose sylk is an organic cellulose fiber made from the natural waste of rose\nbushes and stems.
2,01EY4Y1BW8VZW51BWG5VZY82XW,Cariuma,Unknown,IBI Slip On Raw Red Knit Sneaker Women,,IBI Slip On Raw Red Knit Sneaker Women,False,Cariuma Unknown IBI Slip On Raw Red Knit Sneaker Women nan IBI Slip On Raw Red Knit Sneaker Women
3,01EY50E27A0P5V6KCW01XPDB43,Cariuma,Unknown,IBI Slip On Black Knit Sneaker Women,,IBI Slip On Black Knit Sneaker Women,False,Cariuma Unknown IBI Slip On Black Knit Sneaker Women nan IBI Slip On Black Knit Sneaker Women
4,01EY6DWHC2W5HPNEGXKEJ4A1CX,Cariuma,Unknown,CATIBA PRO Skate Black Suede and Canvas Contrast Thread Ivory Logo Sneaker Women Left,,,False,Cariuma Unknown CATIBA PRO Skate Black Suede and Canvas Contrast Thread Ivory Logo Sneaker Women Left nan nan


In [8]:
# extract out all product id and outfit_item_type in file 3
prod_type=df3[['product_id','outfit_item_type']]

In [9]:
# get rid of duplicated products 
print('before: ',prod_type.shape)
prod_type=prod_type.drop_duplicates()
print('after: ', prod_type.shape)

before:  (5291, 2)
after:  (844, 2)


In [10]:
# add product_description_combined column
prod_type_des=pd.merge(prod_type,df2[['product_id','product_description_combined']],on ='product_id', how = 'left')
prod_type_des=prod_type_des.dropna(subset=['product_description_combined'])

In [11]:
prod_type_des.shape

(809, 3)

In [12]:
nlp = en_core_web_md.load(disable=['ner', 'parser', "tok2vec"])

In [13]:
def clean_text(df, column):
  '''
  clearn strings in dataframe
  '''
  # lowercase 
  df[column]=df[column].str.lower()
  df[column]=df[column].str.replace('\n', ' ') 
  # replace hyphen with space
  df[column]=df[column].str.replace(r"([a-zA-Z])\-([a-zA-Z])", r"\1 \2")
  # get rid of all non-words
  df[column]=df[column].str.replace(r'[^A-Za-z\s]+', '')
  # get rid of 'nan' which were recognized as text when we concatenated columns previously
  df[column]=df[column].str.replace(r'\bnan\b', '') 
  # lemmatization
  df[column] = df[column].apply(lambda x: " ".join([y.lemma_ for y in  nlp(x)]))
  df[column]=df[column].str.replace(r'-PRON-', '')  
  return df

In [14]:
# apply function clean_text to dataframe prod_type_des
prod_type_des=clean_text(prod_type_des,'product_description_combined')

In [15]:
# since accessory are numbered only when there are multiple accesories recommended in an outfit
# we will get rid of the number at this stage
prod_type_des['outfit_item_type']=prod_type_des['outfit_item_type'].str.replace(r'accessory\d',r'accessory')

In [16]:
# see if there's class imbalance issue
prod_type_des['outfit_item_type'].value_counts(sort=True)

accessory    189
shoe         185
top          180
bottom       156
onepiece      99
Name: outfit_item_type, dtype: int64

In [17]:
def sample_label(df,class_column,label,number_of_samples):
  '''
  This function will sample a number of rows with the same label
  '''
  df_label=df[df[class_column]==label]
  df_label=df_label.sample(n=number_of_samples)
  return df_label

In [18]:
# sample 99 records from each label to deal with class imbalance
df_accessory=sample_label(prod_type_des,'outfit_item_type','accessory',99)
df_shoe=sample_label(prod_type_des,'outfit_item_type','shoe',99)
df_top=sample_label(prod_type_des,'outfit_item_type','top',99)
df_bottom=sample_label(prod_type_des,'outfit_item_type','bottom',99)
df_onepiece=sample_label(prod_type_des,'outfit_item_type','onepiece',99)
training_df=pd.concat([df_accessory,df_shoe,df_top,df_bottom,df_onepiece])

In [19]:
# train prediction model
# https://github.com/coding-maniacs/text_classification/blob/master/src/main.py

pipeline = Pipeline([('vect', TfidfVectorizer(ngram_range=(1, 2), stop_words="english", sublinear_tf=True)),
                     ('chi',  SelectKBest(chi2, k=10000)),
                     ('clf', LinearSVC(C=1.0, penalty='l1', max_iter=3000, dual=False))])
model = pipeline.fit(training_df['product_description_combined'], training_df['outfit_item_type'])

In [20]:
# evaluate model using cross validation
from sklearn.model_selection import cross_val_score, StratifiedKFold
kfolds = StratifiedKFold(n_splits = 10, shuffle = True)
model_cv = cross_val_score(model, training_df['product_description_combined'], training_df['outfit_item_type'], cv=kfolds, scoring='accuracy')
print("Mean classification error of model:", 1-model_cv.mean())

Mean classification error of model: 0.1091428571428571


In [21]:
# now we will use the trained model to predict the outfit_item_type for all products

# first we will get rid of products that are not active
false_idx=df2[df2.product_active==False].index
print('before drop: ',df2.shape)
df2.drop(false_idx, inplace=True)
print('after drop: ',df2.shape)

before drop:  (61355, 8)
after drop:  (8481, 8)


In [22]:
# then apply function clean_text to df2
# caution: this step can be slow
df2=clean_text(df2,'product_description_combined')
# use the trained model to predict the outfit_item_type for all products
df2['outfit_item_type']=model.predict(df2['product_description_combined'].values)
# take a look at df2['product_description_combined'] to get the rough idea of how accurate is the categorization
df2.head()

Unnamed: 0,product_id,brand,brand_category,name,details,description,product_active,product_description_combined,outfit_item_type
0,01EX0PN4J9WRNZH5F93YEX6QAF,Two,Unknown,Khadi Stripe Shirt-our signature shirt,,"Our signature khadi shirt\navailable in black and white\nEasy to wear from beach to city. We promise this top will be your go-to warm\nweather item. Perfect under a blazer. Hand loomed woven stripe in khadi cotton.\n Slightly sheer and gets softer with every wash. Ships First week of April\nCOLOR: WHITE OR BLACK\n— Length 27"" and Width 26.5"" — One size fits most — Grid khadi cotton",True,two unknown khadi stripe shirt signature shirt signature khadi shirt available in black and white easy to wear from beach to city promise this top will be go to warm weather item perfect under a blazer hand loomed weave stripe in khadi cotton slightly sheer and get soft with every wash ship first week of april color white or black length and width one size fit most grid khadi cotton,top
1,01F0C4SKZV6YXS3265JMC39NXW,Collina Strada,Unknown,RUFFLE MARKET DRESS LOOPY PINK SISTINE TOMATO,,Mid-length dress with ruffles and adjustable straps. Bias cut. Side seam\ninvisible zipper\nMade in New York\nModel wears size small\n100% Rose sylk\nRose sylk is an organic cellulose fiber made from the natural waste of rose\nbushes and stems.,True,collina strada unknown ruffle market dress loopy pink sistine tomato mid length dress with ruffle and adjustable strap bias cut side seam invisible zipper make in new york model wear size small rose sylk rose sylk be an organic cellulose fiber make from the natural waste of rose bush and stem,onepiece
5,01EWTH4QH6RCMS20VPKNJZDNH3,Maia Bergman,Unknown,Ada,,"Easy, our smock mini dress with gorgeous puff sleeves. Feels like the first day\nof spring. Belt included\n100% LINEN\nModel is wearing size XS",True,maia bergman unknown ada easy smock mini dress with gorgeous puff sleeve feel like the first day of spring belt include linen model be wear size xs,onepiece
6,01EWTHFH4H3GP0Q34E6JBYJZNZ,Maia Bergman,Unknown,Clara,,"Picture perfect Clara, a stunning linen cover-up, calls for sandy feet and a\nspicy cocktail. Don't disappoint her\n100% LINEN\nModel wears a size S.\nThis style is dual sizing, pick S for (XS/S) and M for (M/L)",True,maia bergman unknown clara picture perfect clara a stunning linen cover up call for sandy foot and a spicy cocktail do not disappoint linen model wear a size s this style be dual sizing pick s for xss and m for ml,accessory
7,01EWTKPP4EEKZHJXEPYD7QRFP3,Maia Bergman,Unknown,Simone,,"Our best-selling, bump-friendly Simone will be your favourite in no time. It\nwill be very difficult for you to wear anything else, don't say we didn't warn\nyou.\n100% COTTON\nModel is wearing size XS",True,maia bergman unknown simone good selling bump friendly simone will be favourite in no time will be very difficult for to wear anything else do not say do not warn cotton model be wear size xs,top


# Extract New Features
We will use column 'product_description_combined' to extract some features which will be used to match user's search query.

In [23]:
def get_feature_coloum(df,corpus_column,target_column,pattern):
  df[target_column] = df[corpus_column].str.findall(pattern, flags=re.IGNORECASE)
  df[target_column]= df[target_column].apply(lambda y: np.nan if len(y)==0 else y)
  df[target_column]= df[target_column].fillna("")
  df[target_column]= df[target_column].apply(" ".join)  
  df[target_column] = (df[target_column].str.split().apply(lambda x: OrderedDict.fromkeys(x).keys()).str.join(' '))
  return df

In [24]:
### Extract Detailed Products
product_type_pattern = r'\b(?P<Products>pant[s]?|bottom[s]?|jean[s]?|legging|jogger|sweatpant[s]?|trouser[s]?|short[s]?|skirt[s]?|culotte[s]?\
|jumpsuit[s]?|romper[s]?|dress|playsuit[s]?|onepiece|one-piece|unitard|gown[s]?|robe[s]?|cloak[s]?|bodysuit[s]?|swimsuit[s]?\
|shoe[s]?|sneaker[s]?|sandal[s]?|slipper[s]?|boot[s]?|heel[s]?|pump[s]?|flat[s]?|espadrille[s]?|slide[s]?|flip-flop[s]?\
|flip flop[s]?|flipflop[s]?|wedge[s]?|loafer[s]?|oxford[s]?|mule[s]?|clog[s]?|chukka[s]?|chelsea[s]?|combat[s]?|moccasin[s]?\
|driver[s]?|derby[s]?|blucher[s]?|accessory|accessories|bag[s]?|backpack[s]?|tote[s]?|wristle[s]?t|handbag[s]?|card case[s]?\
|card holder[s]?|satchel[s]?|cross\s?body[s]?|wallet[s]?|pouch[s]?|purse[s]?|clutch[s]?|bucket[s]?|pack[s]?|scarf|scarves|wrap\
|pashmina|shawl|stole|neckwear[s]?|kerchief|boa|bra[s]?|belt[s]?|bikini[s]?|shirt[s]?|coat[s]?|blazer[s]?|t\s?shirt[s]?|blouse[s]?\
|sweatshirt[s]?|tanks|camis|top[s]?|sweater[s]?|knit[s]?|knitwear[s]?|sleeve[s]?|hoody|hoodie[s]?|shoulder\s?bag[s]?|cardigan\
|turban|tee|saddle|jacket|tunic|caftan)\b'

df2 = get_feature_coloum(df2,'product_description_combined','product_type',product_type_pattern)

In [25]:
### Extract Weather & Season
weather_pattern = r'\b(?P<Weather_Season>cold|cool|cooler|warm|warmer|hot|spring|summer|fall|autumn|winter|snow[y]?|rain[y]?|wind[y]?\|cloud[y]?)\b'
df2 = get_feature_coloum(df2,'product_description_combined','Weather_Season',weather_pattern)

In [26]:
### Extract Occassion
occasion_pattern = r'\b(?P<Occassion>beach[e]?[s]?|city|business|formal|casual|professional|interview|wedding|dinner|date|dating\
|party|parties|baptism|funeral|cocktail|bar|night)\b'
df2 = get_feature_coloum(df2,'product_description_combined','Occasion',occasion_pattern)

In [27]:
### Extract Material
material_pattern = r'\b(?P<Material>100\s?organic|100\s?cotton|organic|nylon|elastane|cotton|silk|wool|woolen|leather|ramie|flax\
|denim|fur|polyester[s]?|spandex|suede|rose|nappa)\b'
df2 = get_feature_coloum(df2,'product_description_combined','material',material_pattern)

In [28]:
### Extract Made_in
made_in_pattern = r'(?:made|make)\s?in\s?(?P<Made_in>\w+\s?\w*)\b'
df2 = get_feature_coloum(df2,'product_description_combined','made_in',made_in_pattern)

In [29]:
### Extract the fit with context-window 2
fit_long_pattern = r'\b\w+\s(?:leg[s]?|raise|waist|inseam|hip|knee|ankle|thigh|neck|shoulder[s]?|back|arm|rear|calves)\s\w+\b'
df2 = get_feature_coloum(df2,'product_description_combined','fit_long',fit_long_pattern)

In [30]:
### Extract the fit
fit_pattern = r'\b(?P<fit>tight\s?(?:leg)?|skinny\s?(?:leg)?|slim\s?(?:leg)?|cropped\s?(?:leg)?|straight\s?(?:leg)?\
|flare\s?(?:leg)?|wide\s?(?:leg)?|(?:high|low|medium)\s?rise|(?:high|low|medium)\s?waist|boyfriend|girlfriend|crop|cropped|ripped)\b'
df2 = get_feature_coloum(df2,'product_description_combined','fit',fit_pattern)

In [31]:
### Extract the pattern
pattern = r'\b(?P<pattern>stripe[d]?|floral|polka\s?dot|herringbone|paisley|stripe[s]?|flora|ruffle[s]?|twist|stretch|patch|drawstring|zipper|opaque|v\s?neck)\b'
df2 = get_feature_coloum(df2,'product_description_combined','pattern',pattern)

In [32]:
### Extract the age
age_pattern = r'\b(?P<age>baby|babies|kid[s]?|child|children|teen[s]?|adult[s]?|boy[s]?|girl[s]?)\b'
df2 = get_feature_coloum(df2,'product_description_combined','age',age_pattern)

In [33]:
### Extract the size
size_pattern = r'\b(?P<size>small|medium|large|extra\s?small|extra\s?large|xs|s|m|l|xl|xxl|xxxL|one\s?size|uni\s?size)\b'
df2 = get_feature_coloum(df2,'product_description_combined','size',size_pattern)

In [34]:
### Identify Color
def identify_color(line):
    #print(line)
    beige_pattern=r'(\bbeige[s]?\b)'
    black_pattern=r'(\bblack[s]?\b)'
    blue_pattern=r'(\bblue[s]?\b)'
    brown_pattern=r'(\bbrown[s]?\b)'
    burgundy_pattern=r'(\bburgundy[s]?\b)'
    Gold_pattern=r'(\bgold[s]?\b)'
    Gray_pattern=r'\b(gray[s]?|grey[s]?)\b'
    Green_pattern=r'(\bgreen[s]?\b)'
    Navy_pattern=r'(\bnavy[s]?\b)'
    Neutral_pattern=r'(\bneutral[s]?\b)'
    Orange_pattern=r'(\borange[s]?\b)'
    Pinks_pattern=r'(\bpink[s]?\b)'
    Purple_pattern=r'(\bpurple[s]?\b)'
    Red_pattern=r'(\bred[s]?\b)'
    Silver_pattern=r'(\bsilver[s]?\b)'
    Teal_pattern=r'(\bteal[s]?\b)'
    White_pattern=r'(\bwhite[s]?\b)'
    Yellow_pattern=r'(\byellow[s]?\b)'
    
    colors=[""]
    match1 = re.search(beige_pattern, line, flags=re.IGNORECASE)
    match2 = re.search(black_pattern, line, flags=re.IGNORECASE)
    match3 = re.search(blue_pattern, line, flags=re.IGNORECASE)
    match4 = re.search(brown_pattern, line, flags=re.IGNORECASE)
    match5 = re.search(burgundy_pattern, line, flags=re.IGNORECASE)
    match6 = re.search(Gold_pattern, line, flags=re.IGNORECASE)
    match7 = re.search(Gray_pattern, line, flags=re.IGNORECASE)
    match8 = re.search(Green_pattern, line, flags=re.IGNORECASE)
    match9 = re.search(Navy_pattern, line, flags=re.IGNORECASE)
    match10 = re.search(Neutral_pattern, line, flags=re.IGNORECASE)
    match11 = re.search(Orange_pattern, line, flags=re.IGNORECASE)
    match12 = re.search(Pinks_pattern, line, flags=re.IGNORECASE)
    match13 = re.search(Purple_pattern, line, flags=re.IGNORECASE)
    match14 = re.search(Red_pattern, line, flags=re.IGNORECASE)
    match15 = re.search(Silver_pattern, line, flags=re.IGNORECASE)
    match16 = re.search(Teal_pattern, line, flags=re.IGNORECASE)
    match17 = re.search(White_pattern, line, flags=re.IGNORECASE)
    match18 = re.search(Yellow_pattern, line, flags=re.IGNORECASE)
    
    if match1:
        colors.append('beige')
    if match2:
        colors.append('black')
    if match3:
        colors.append('blue')
    if match4:
        colors.append('brown')
    if match5:
        colors.append('burgundy')
    if match6:
        colors.append('gold')
    if match7:
        colors.append('gray grey')
    if match8:
        colors.append('green')
    if match9:
        colors.append('navy')
    if match10:
        colors.append('neutral')
    if match11:
        colors.append('orange')
    if match12:
        colors.append('pinks')
    if match13:
        colors.append('purple')
    if match14:
        colors.append('red')
    if match15:
        colors.append('silver')
    if match16:
        colors.append('teal')
    if match17:
        colors.append('white')
    if match18:
        colors.append('yellow')
    # return the first identified colors
    length=len(colors)
    if length > 1:
        result= colors[1:]
    else:
        result=colors[0]
    return " ".join(result)

In [35]:
df2['color'] = df2['product_description_combined'].apply(identify_color)

In [36]:
### identify the gender
def identify_gender(line):
    woman_pattern = r'\b(woman|women|girl[s]?|lady|ladies|dress|dresses|onepiece|blouse[s]?|skirt[s]?|skort|camisole|boyfriend\
    |femme|female|unisex|she|her)\b'
    man_pattern = r'\b(man|men|male|unisex|boy[s]?|he|him|his)\b'
    gender=[""]
    
    match1 = re.search(woman_pattern, line, flags=re.IGNORECASE)
    match2 = re.search(man_pattern, line, flags=re.IGNORECASE)
    if match1:
        gender.append('woman')
    if match2:
        gender.append('man')

    return " ".join(gender)

In [37]:
df2['gender'] = df2['product_description_combined'].apply(identify_gender)

In [76]:
### Identify Categories
def identify_category(line):
    bottom_pattern = r'\b(pant[s]?|jeans|leg|legging|jogger|sweatpant[s]?|trouser[s]?|short[s]?|skirt[s]?|culotte[s]?)\b'
    one_piece_pattern=r'\b(jumpsuit[s]?|romper[s]?|dress|playsuit[s]?|one\s?piece|one-piece|unitard|gown[s]?|robe[s]?|cloak[s]?|bodysuit[s]?|swimsuit[s]?|bikini[s]?)\b'
    shoe_pattern=r'\b(shoe[s]?|sneaker[s]?|sandal[s]?|slipper[s]?|boot[s]?|heel[s]?|pump[s]?|flat[s]?|espadrille[s]?|slide[s]?|flip-flop[s]?|flip flop[s]?|flipflop[s]?|wedge[s]?|loafer[s]?|oxford[s]?|mule[s]?|clog[s]?|chukka[s]?|chelsea[s]?|combat[s]?|moccasin[s]?|driver[s]?|derby[s]?|blucher[s]?|saddle)\b'
    accessory_pattern=r'\b(accessory|accessories|bag[s]?|backpack[s]?|tote[s]?|wristle[s]?t|handbag[s]?|card case[s]?|card holder[s]?|satchel[s]?|cross\s?body[s]?|wallet[s]?|pouch[s]?|purse[s]?|clutch[s]?|bucket[s]?|pack[s]?|scarf|scarves|wrap|pashmina|shawl|stole|neckwear[s]?|kerchief|boa|bra[s]?|belts|tie|turban)\b'
    top_pattern=r'\b(tee|t\s?shirt[s]?|shirt[s]?|coat[s]?|blazer[s]?|tshirt[s]?|blouse[s]?|sweatshirt[s]?|tanks|camis|top[s]?|sweater[s]?|knitwear[s]?|hoody|hoodie[s]?|cardigan|jacket|tunic|caftan)\b'
    
    category=[np.nan]
    match1 = re.search(one_piece_pattern, line, flags=re.IGNORECASE)
    match2 = re.search(bottom_pattern, line, flags=re.IGNORECASE)
    match3 = re.search(top_pattern, line, flags=re.IGNORECASE)
    match4 = re.search(shoe_pattern, line, flags=re.IGNORECASE)
    match5 = re.search(accessory_pattern, line, flags=re.IGNORECASE)
    if match1:
        category.append('onepiece')
    if match2:
        category.append('bottom')
    if match3:
        category.append('top')
    if match4:
        category.append('shoe')
    if match5:
        category.append('accessory')
    
    length=len(category)
    if length > 1:
        a = category[1]
    else:
        a = category[0]
    # return the first identified category
    return a

In [77]:
df2['regex_cat'] = df2['product_description_combined'].apply(identify_category)
df2["regex_cat"]= df2["regex_cat"].fillna("")

In [78]:
### incorporate outfit_item_type and regex_cat

for i in df2.index:
  check = len(df2.loc[i,'regex_cat'])
  if check == 0:
    df2.loc[i, 'product_category'] = df2.loc[i, 'outfit_item_type']
  else:
    df2.loc[i, 'product_category'] = df2.loc[i,'regex_cat']

In [79]:
# can sample a few to see how regex classification is doing
df2[['name','product_description_combined','outfit_item_type','regex_cat','product_category']].sample(10)

Unnamed: 0,name,product_description_combined,outfit_item_type,regex_cat,product_category
25107,POPPY TOP – NAVY,st roche unknown poppy top navy with bracelet length puff sleeve this romantic s influence blouse have a high ruffle neck and front slit reveal a subtle glimpse of skin the original poppy embroidery design on the yoke be inspire by a delicate drawing by anita pallenberg and hand embroider by with the artisan work with in jaipur love this wear with high waiste denim ship fri april th handwoven gots certify organic cotton unlined color navy with white mustard hand embroider poppy on the sleeve relaxed fit top with a ruffle collar and soft gather from the yoke bracelet length puff sleeve with pintuck detail slip on over head length from shoulder to front hem measure on a size small be delicate wash cold wash with like color hang to dry do not tumble cool iron on reverse side or dry cleanable green dry clean if possible make in india model measure tall and be wear a size,top,top,top
31405,Spot Me Top,astr the label unknown spot top viscose nylon line polyester button closure dry clean only ruche detail puff sleeve styleact length size m model be wear size s model measurement height bust waist hip fit true to size spot new favorite top feature a semi sheer v neckline that descend to a shirred front velvet burnout polka dot print throughout long sheer sleeve with statement shoulder,top,top,top
33185,Lada Top,rachel comey unknown lada top the fluid lada top be cut from signature lash crepe feature elbow length sleeve and a slouchy turtleneck this style be easily style for casual or formal wear the frayed surface point of lash crepe result from the severing of thread in jacquard motif relaxed turtleneck blouse zipper closure at back viscose polyester make in usa,top,top,top
30604,Mojo Boot in Tango Red,simon miller unknown mojo boot in tango red round toe over the calf boot detail model f cow leather emboss lizard make in pt size mm heel,shoe,shoe,shoe
30525,Clematis Top,rachel comey unknown clematis top multi seasonal basic knit with airy pointelle stitch the clematis top be sleek and minimal feature a boat neck and cap sleeve pair this crop shell with any trouser in wardrobe slim fit knit semi sheer wool cotton viscose nylon make in china,top,bottom,bottom
34820,Devereaux Ruffle Midi Dress,astr the label unknown devereaux ruffle midi dress dry clean only self polyester line polyester elastane zipper closure style acdrb length size m model be wear size s fit true to size,onepiece,onepiece,onepiece
29460,The Hustler Ankle Fray Dragonfly,mother denim apparel the hustler ankle fray dragonfly expressive healing selfless classic bootcut with a crop and fray hem in bright blue make in los angeles style no dfy,shoe,,shoe
35459,Logo Leather Crossbody Pouch,alexander wang handbagshandbagscrossbodybags logo leather crossbody pouch sleek leather crossbody bag highlight with logo trim shoulder strap dual top zip closure silvertone hardware three interior card slot line leather import size w x h x d,accessory,top,top
32547,OSLO - GREY MIXED ANIMAL,rails unknown oslo grey mixed animal detail long sleeve over sized grey animal print heavy wool cardigan sweater feature large patch pocket and faux tortoiseshell button this chunky and fluffy cardigan will be go to cozy piece to curl up in or wear in place of a lightweight jacket superfine alpaca extrafine merino wool polyamide fiber elastan import care dry clean or hand wash cold lie flat to dry cool iron if need do not bleach fit recommend order true to size body length measure from small model be wear size s model measurement height bust waist hip,accessory,top,top
28853,Burnout Leopard Wrap Top,astr the label unknown burnout leopard wrap top viscose nylon line polyester button closure dry clean only leopard print wrap blouse styleatb length size m model be wear size s model measurement height bust waist hip fit true to size animal print take over a classic silhouette this ultra femme top feature a wrap front and v neckline define waist lead to a slightly flare hemline strong puff shoulder add interest and sheer long sleeve finish the look,top,top,top


In [80]:
# take a look at how many features we added
df2.columns

Index(['product_id', 'brand', 'brand_category', 'name', 'details',
       'description', 'product_active', 'product_description_combined',
       'outfit_item_type', 'product_type', 'Weather_Season', 'Occasion',
       'material', 'made_in', 'fit_long', 'fit', 'pattern', 'age', 'size',
       'color', 'gender', 'regex_cat', 'product_category', 'final_features'],
      dtype='object')

# Preparing Final Corpus
Join the new features into a string

In [81]:
df2['final_features'] = df2[['name',
       'product_category', 'product_type', 'Weather_Season', 'Occasion',
       'color', 'material', 'made_in', 'fit_long', 'fit', 'pattern', 'age',
       'gender', 'size']].astype(str).agg(' '.join, axis=1)

In [82]:
final_corpus= df2[['product_id','product_category','name','final_features']].copy()

In [83]:
def remove_stopwords(feature: str):
    tokens = nltk.word_tokenize(feature)
    filtered_tokens = list(filter(lambda token: token not in nltk_stopwords, tokens))
    return " ".join(filtered_tokens)

# https://gaurav5430.medium.com/using-nltk-for-lemmatizing-sentences-c1bfff963258
def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence) 

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None
        
def clean_text_nostop(query: str):
    query = re.sub(r'[^\w\s]+','', query)
    query = query.lower()
    query = lemmatize_sentence(query)
    return query

def clean_string(query: str):
    query = re.sub(r'[^\w\s]+','', query)
    query = query.lower()
    query = remove_stopwords(query)
    query = lemmatize_sentence(query)
    return query

### Remove duplicate words form corpus
#http://www.martinbroadhurst.com/removing-duplicates-from-a-list-while-preserving-order-in-python.html
def unique(text: str):
    tokens = nltk.word_tokenize(text)
    seen = set()
    a = [x for x in tokens if not (x in seen or seen.add(x))]
    return " ".join(a)

In [84]:
### Cleaning final_features but son't remove stopwords anymore,since it is a combination of extracted features
final_corpus=clean_text(final_corpus,'final_features')
### Removing duplicate words
final_corpus['final_features'] = final_corpus['final_features'].apply(unique)

# Calculate the Cosine Similarity

In [85]:
# https://medium.com/@armandj.olivares/building-nlp-content-based-recommender-systems-b104a709c042
docs_list = []
for line in final_corpus['final_features']:
    doc = nlp("u'"+line+"'")
    docs_list.append(doc)
final_corpus['docs'] = docs_list

In [86]:
# calculate the similarity score between user's query and every product
# https://medium.com/@armandj.olivares/building-nlp-content-based-recommender-systems-b104a709c042
def calculate_sim(df, doc_column, score_column, query):
    query = clean_string(query)
    doc2 = nlp("u'"+query+"'")
    scores = []
    for doc1 in df[doc_column]:
      score = doc1.similarity(doc2)
      scores.append(score)
    df[score_column] = scores
    result_df = df.sort_values(by=score_column, ascending=False)
    return result_df

# Search function
Based on user input, find the top 10 most similar products from our product database. If one of the products also appear in the existing outfit combination, the function will output that outfit combination. If not, then we will make a new outfit combination from the list. The outputs can only be top, bottom, shoe, (accessory) or onepiece, shoe, (accessory).

In [87]:
# search query: user input what they are looking for and this function will return an outfit combination
# The idea of this search function is:
# first, we will loop through the top 10 products sorted by score returned by the calculate_sim function
### to see whether there is a outfit recommendation from the expert.
# If not, we will look along the returning product list returned by the calculate_sim function.
### Set the first product as the target product and search from the high score to low score,
### return the first product for each category different from target category.
### Stop when dictionary has three products
# Note: we set conditions to avoid recommending "top" "bottom" with "onepiece" .

def search(query: str):
    '''
    The user passed in his/her query, the function will return a dictionary of outfit.
    For example:
                search("slim jeans") -->
                {
                "top":
                "bottom":
                "shoe":    
                }
    
    '''
    outfit_df = pd.read_csv('outfit_combinations USC.csv')
    result_df = calculate_sim(final_corpus, 'docs', 'scores', query)
    # top 10 products that are most similar to the query
    top_10 = result_df[:10]
    for j in range(len(top_10)):
      output = {}
      target_id = top_10['product_id'].iloc[j]
      target_cat = top_10['product_category'].iloc[j]
      match = (outfit_df['outfit_id']==target_id).sum()
      output[target_cat] = top_10.name.iloc[j]+ "(" + top_10.product_id.iloc[j] + ")"
      # if one of the products is present in the existing outfit combination
      if match > 0 :
        match_df = outfit_df[outfit_df['outfit_id'] == target_id]
        for i in range(len(match_df)):
          cat = match_df.outfit_item_type.iloc[i]
          if cat != target_cat:
            if cat not in output.keys():
              output[cat] = match_df.product_full_name.iloc[i] + "(" + match_df.product_id.iloc[i] + ")"
            else:
              output = output
          else:
            continue
        break
      else:
        continue
    # if none of the products is in the existing outfit combination
    check = len(output)
    if check == 1:
      output = {}
      target_id = result_df['product_id'].iloc[0]
      target_cat = result_df['product_category'].iloc[0]
      output[target_cat] = result_df.name.iloc[0]+ "(" + result_df.product_id.iloc[0] + ")"
      match_df = result_df[result_df.product_category!=target_cat]
      for m in range(len(match_df)):
        if len(output)<3:
          cat = match_df.product_category.iloc[m]
          if 'top' in output.keys():
            if cat != 'onepiece' and cat not in output.keys():
              output[cat] = match_df.name.iloc[m] + "(" + match_df.product_id.iloc[m] + ")"
            else:
              output = output
          elif 'bottom' in output.keys():
            if cat != 'onepiece' and cat not in output.keys():
              output[cat] = match_df.name.iloc[m] + "(" + match_df.product_id.iloc[m] + ")"
            else:
              output = output
          elif 'onepiece' in output.keys():
            if cat != 'top' and cat !='bottom' and cat not in output.keys():
              output[cat] = match_df.name.iloc[m] + "(" + match_df.product_id.iloc[m] + ")"
            else:
              output = output
          else:
            if cat not in output.keys():
              output[cat] = match_df.name.iloc[m] + "(" + match_df.product_id.iloc[m] + ")"
            else:
              output = output
        else:
          break
    return output

In [88]:
query = 'slim fitting, straight leg pant with a center back zipper and slightly cropped leg'
search(query)

{'bottom': 'Barrie Pant(01EF2ETR5B9DZJNXW6HGS8SB77)',
 'shoe': 'Jensen Trench - Saddle(01EYXXFTBD9GZ7D1TGTEF2QDNZ)',
 'top': 'The Iona Funnelneck(01EC0GTF7D8RMKYET3JM48C1K5)'}

In [89]:
query = 'slim jeans'
search(query)

{'bottom': 'kate jean(01ESVKXA1MVJKZPS9FS4GB07G2)',
 'shoe': 'Thais(01EZ7WGGMXY72V4P5S05NFJ0BF)',
 'top': 'Cabana Camp Shirt(01EATVH2QW4HSQNFG2MBXT71V6)'}

In [90]:
query = 'summer cotton dress floral print long red'
search(query)

{'accessory': 'Charlotte Scarf TEAL FLORAL(01EWXHR20BJ17FXN90JNAC2TES)',
 'onepiece': 'honeymoon dress(01F0M5DZ2PP4GQTDSKNVN3H8YE)',
 'shoe': 'OCA Low Stripe Burgundy Red Canvas Sneaker Women(01ET60951XE4HBG8RDBV800HPH)'}