In [1]:
import pandas as pd
import numpy as np
import spacy
import nltk
import re

ModuleNotFoundError: No module named 'spacy'

In [294]:
#read in data: the product catalogue (Excel) and the additional attribute tags (CSV)
behold_products = pd.read_excel(
    'Behold+product+data+04262021.xlsx',
    header=0,
)
attr_tags = pd.read_csv('usc_additional_tags.csv')

### Format Attribute Data

- created one row for each unique product_id
- replaced missing features with unknown

In [295]:
#pivot df: one row per product_id, one column per attribute_name
#all values of a (product_id, attribute_name) pair are de-duplicated and joined with spaces;
#sorted() makes the joined text reproducible, because the iteration order of a set of
#strings can change from one kernel start to the next (string hash randomization)
grouped_tags = (attr_tags.groupby(['product_id','attribute_name'])
                .attribute_value.apply(lambda x:" ".join(sorted(set(x))))
                .reset_index())

#attributes a product does not have become the literal 'unknown'
#(chained fillna instead of inplace=True so re-running the cell always starts from grouped_tags)
pivoted_tags = (grouped_tags
                .pivot(index='product_id',columns=['attribute_name'],values='attribute_value')
                .reset_index()
                .fillna('unknown'))

In [296]:
#spot-check five random rows of the pivoted attribute table
pivoted_tags.sample(5)

attribute_name,product_id,additionalcolor,beltbucklematerial,beltbuckleshape,beltclosure,beltmaterial,beltwidth,calfwidth,category,classbelts,...,subcategorysweater,subcategorysweatshirtandhoodie,subcategorytop,sunglassframematerial,sweatshirtandhoodieclosure,toeexposure,toestyle,trend,uppermaterial,wash
3068,01E4RV4R1FT9Y4825X72Q5YV8Z,whites,unknown,unknown,unknown,unknown,unknown,unknown,top,unknown,...,unknown,unknown,buttondown,unknown,unknown,unknown,unknown,unknown,unknown,unknown
2823,01E4ED60R8KHJ6178M3XSBVT27,whites,unknown,unknown,unknown,unknown,unknown,unknown,top,unknown,...,unknown,unknown,knit,unknown,unknown,unknown,unknown,unknown,unknown,unknown
2354,01E2P1C1DE194MSMHHA199YX7F,unknown,unknown,unknown,unknown,unknown,unknown,unknown,bottom,unknown,...,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown
151,01DPH126VRWE7ENM47QZTPC43V,unknown,unknown,unknown,unknown,unknown,unknown,unknown,accessory,unknown,...,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown
1338,01E2KW8EV2W4J8SXK6J3VZNPE1,unknown,unknown,unknown,unknown,unknown,unknown,unknown,blazerscoatsjackets,unknown,...,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown


### Tokenize Current Features

- using spacy find the lemma of each word in the description, brand_name, and name columns
- adjust custom stopwords for later feature engineering
- filter stopwords using regex and spacy predefined stopwords
- filter out punctuation, special characters, etc. with regex
- rebuilt new tokenized columns

In [297]:
#function to tokenize columns, removing stopwords and returning lemmas
nlp = spacy.load("en_core_web_md", disable=['ner', 'parser', "tok2vec"])

#keep these words out of the stopword list so they survive tokenization
for kept_word in ("made", "in", "one", "two"):
    nlp.vocab[kept_word].is_stop = False

def spacy_tokens(line):
    """Return the lower-cased lemmas of ``line`` as one space-separated string.

    ``line`` is converted with ``str()``, then only runs of two or more ASCII
    letters are kept (digits, punctuation and single letters are dropped).
    The surviving words are passed through the global ``nlp`` pipeline and
    every token flagged as a stopword is removed.
    """
    words = re.findall('(?u)\\b[a-zA-Z][a-zA-Z]+\\b',str(line))
    lemmas = []
    for token in nlp(" ".join(words)):
        if token.is_stop:
            continue
        lemmas.append(token.lemma_.lower())
    return " ".join(lemmas)

spacy_tokens('IBI Slip On Raw Red Knit Sneaker Women')

'ibi slip raw red knit sneaker women'

In [298]:
#build token columns
df_cols = ['description','brand_name','name']

#runs a little slow
def token_builder(df,df_cols):
    """Add a ``token_<column>`` column to ``df`` for every name in ``df_cols``.

    Each new column holds ``spacy_tokens`` applied to the source column.
    ``df`` is modified in place and nothing is returned; a progress line is
    printed before each column is processed.
    """
    for source_col in df_cols:
        target_col = 'token_'+source_col
        print('parsing '+target_col)
        df[target_col] = df[source_col].apply(spacy_tokens)
        


In [299]:
%%time
#adds token_description, token_brand_name and token_name to behold_products in place
token_builder(behold_products,df_cols)

parsing token_description
parsing token_brand_name
parsing token_name
Wall time: 5min 31s


In [300]:
#one searchable string per product: name tokens followed by description tokens (brand tokens are not included)
behold_products['tokens'] = behold_products['token_name'] + ' ' +behold_products['token_description']

### Build new Features

- stem_join to create regex pattern to search for
- use the top attributes from the attribute data to create new search patterns
- add custom stopwords to colors, products
- build new features based on each pattern

In [435]:
#identify colors and products
#from hw 3
def stem_join(list):
    r"""Build one regex alternation that matches any of the given terms.

    The terms are joined with spaces and run through the global ``nlp``
    pipeline; every resulting token contributes its lower-cased lemma as
    ``\b<lemma>s?\b`` (whole word, optional trailing "s"). The pieces are
    joined with ``|``.

    Parameters
    ----------
    list : iterable of str
        Terms to search for. The name shadows the builtin inside this
        function only; it is kept so existing callers are unaffected.

    Returns
    -------
    str
        The combined pattern. An empty input produces an empty string.
    """
    doc = nlp(" ".join(list))
    #dict.fromkeys drops repeated lemmas while keeping first-seen order,
    #so the pattern is shorter but matches exactly the same text
    lemmas = dict.fromkeys(token.lemma_.lower() for token in doc)
    #re.escape keeps tokens containing regex metacharacters (e.g. "(" or "+")
    #from corrupting the pattern; purely alphabetic lemmas are unchanged
    return '|'.join(r"\b{}s?\b".format(re.escape(lemma)) for lemma in lemmas) #from stackoverflow

In [436]:
#number of products with a known (non-'unknown') value per attribute
#product_id is dropped by name: slicing it off positionally with [1:15] only works if it
#happens to sort into position 0, which is not guaranteed when an attribute ties with it
known_counts = (pivoted_tags.drop(columns='product_id') != 'unknown').sum()

#keep the 14 best populated attributes; mergesort is stable, so ties keep column order on every run
top_attributes = known_counts.sort_values(ascending=False, kind='mergesort').head(14).index
top_attributes

Index(['category', 'primarycolor', 'style', 'occasion', 'gender', 'fit',
       'sizing', 'materialclothing', 'drycleanonly', 'sleevelength',
       'neckline', 'additionalcolor', 'lengthtop', 'pattern'],
      dtype='object', name='attribute_name')

In [437]:
#map 'lemma_<attribute>' -> list of distinct lemmas seen in that attribute's values
top_attr_dict = {}
for attr in top_attributes:
    #lemmatize every distinct value of the attribute in a single spacy pass
    doc = nlp(" ".join(list(pivoted_tags[attr].unique())))
    lemmas = {token.lemma_ for token in doc}
    #discard() instead of list.remove(): an attribute without missing values has no
    #'unknown' placeholder, and remove() would raise ValueError for it
    lemmas.discard('unknown')

    #sorted so the term list (and any regex built from it) is identical on every run
    top_attr_dict['lemma_'+attr] = sorted(lemmas)

In [438]:
#add custom occasions (appended in place to the list built from the attribute data)
top_attr_dict['lemma_occasion'] += [
    'wedding',
    'formal',
    'holiday',
    'beach',
    'hiking',
]

#create seasons
top_attr_dict['lemma_seasons'] = [
    'summer',
    'winter',
    'spring',
    'fall',
    'autumn',
]

#dry clean pattern: "dry" then "clean", each optionally surrounded by one whitespace character
dry_clean = r'(\s?dry\s?clean\s?)'

In [459]:
#custom terms
products = [
    'Bottom','tank','backpack','capri','Shoe','Handbag','Scarf','skirt',
    'hoodie','jacket','coat','pajamas','blazer','shirt','cardigan','bootie',
    'sandal','glasses','dress','top','blouse','bag','purse','earring',
    'lingerie','bra','jewelry','pants','jewelry','accessories','romper','tee',
    'shorts','sweatpant','glasses','sunglasses','shades','jeans','jean','belt',
    'raincoat','outerwear','hat','sneaker','heels','pumps','camisole','outerwear',
    'sweater','cap','sweatshirt','boot','chino','short','trouser','swimsuit',
]

colors = [
    'Beige','Black','Blue','Brown','Burgundy','Gold','Gray','Green',
    'Multi','Navy','Neutral','Orange','Pink','Purple','Red','Silver',
    'Teal','White','Yellow','leopard','striped','plaid','floral',
]

#add custom colors,products from hw3 (appended in place to the attribute-derived lists)
top_attr_dict['lemma_category'] += products
top_attr_dict['lemma_primarycolor'] += colors


In [460]:
#create features from attr data: one new column per (column name, lemma list) pair
lemma = ['lemma_primarycolor','lemma_occasion','lemma_style','lemma_pattern','lemma_seasons']
col = ['colors','occasion','style','patterns','seasons']  

#each new column holds the list of every pattern match found in the product's tokens
for feature_name, lemma_key in zip(col,lemma):
    behold_products[feature_name] = behold_products['tokens'].str.findall(
        stem_join(top_attr_dict[lemma_key])
    )

In [461]:
#grab products: category terms plus "one piece" / "two piece" written with or without the space
products = "".join([
    stem_join(top_attr_dict['lemma_category']),
    '|\\bone ?piece\\b',
    '|\\btwo ?piece\\b',
])

#'product type' searches the name only, 'product_mentions' searches name + description
#duplicates are kept on purpose: the later selection step picks the most frequently
#occurring product to identify the category
for target_col, source_col in (('product type','token_name'), ('product_mentions','tokens')):
    behold_products[target_col] = behold_products[source_col].str.findall(products)

#create dry clean only column
behold_products['dry_clean_only'] = behold_products['tokens'].str.findall(dry_clean)

In [462]:
from collections import Counter
def my_mode(x):
    """Return the most frequent item of ``x`` wrapped in a one-element list.

    Ties go to the item that appears first in ``x``. An empty input returns
    an empty list.
    """
    counts = Counter(x)
    if not counts:
        return []
    #Counter iterates in first-seen order and max() returns the first maximal key,
    #so one pass reproduces the tie-break without recomputing the top count per key
    return [max(counts, key=counts.get)]


In [463]:
#capture product and add complimentary column
def _non_mode_items(mentions):
    """Return the items of ``mentions`` that differ from its most common item."""
    #the mode is computed once per row; the previous lambda recomputed it for every element
    top = my_mode(mentions)
    return [item for item in mentions if item not in top]

#name matches + all matches, so the most common term can be picked per product
behold_products['product_category'] = behold_products['product type'] + behold_products['product_mentions']
#everything that is not the most common term is kept as a complimentary product
behold_products['complimentary_product'] = behold_products['product_category'].apply(_non_mode_items)
#finally reduce product_category itself to the single most common term (or an empty list)
behold_products['product_category'] = behold_products['product_category'].apply(my_mode)

In [464]:
#explicit .copy(): model_data is an independent frame, so the column assignments below are
#plain writes to it and the SettingWithCopyWarning no longer has to be silenced globally
model_data = behold_products[['brand','token_description','token_name','product_category','style','occasion','seasons','patterns','dry_clean_only','colors','complimentary_product']].copy()

#collapse every list-valued feature (product_category through the last column) into one
#space-separated string of its distinct values; empty lists become 'unknown'
#sorted() keeps the word order identical between runs (set order is not stable for strings)
#the loop variable is not called col, which would overwrite the col list used by the feature loop
for feature in model_data.loc[:,'product_category':].columns:
    model_data[feature] = model_data[feature].apply(lambda x:" ".join(sorted(set(x))) if len(x)>0 else 'unknown')

model_data.sample(5)

Unnamed: 0,brand,token_description,token_name,product_category,style,occasion,seasons,patterns,dry_clean_only,colors,complimentary_product
46376,J.Crew,made in italy orsay pumps shoe closet staple,lucie suede pumps,pumps,unknown,unknown,unknown,unknown,unknown,unknown,shoe
57678,ASTR the Label,,bubble hem short sleeve,short,unknown,unknown,unknown,unknown,unknown,unknown,unknown
42588,7 For All Mankind,,skinny paxtyn,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown
31075,Cynthia Rowley,floral print cotton smocked blouse puff sleeve...,alexa smocked cotton blouse,blouse,unknown,unknown,unknown,floral,dry clean,floral,unknown
16263,Studio 189,product details cotton grown in ghana slim fit...,yellow brown cotton mechanic jumpsuit,unknown,classic,unknown,unknown,unknown,unknown,brown yellow,unknown


### Results from Feature Engineering

- captures approx 40k product categories
- decent captures on colors
- around 8k-10k for styles, patterns, and dry cleaning

In [465]:
#function to calculate number of new known features
def captures(df,col):
    """Count the rows of ``df`` whose entry in column ``col`` has a length above zero."""
    lengths = df[col].apply(len)
    non_empty = lengths > 0
    return int(non_empty.sum())

#report, per engineered feature, how many products received at least one match
new_features = [
    'product_category',
    'style',
    'occasion',
    'seasons',
    'patterns',
    'dry_clean_only',
    'colors',
    'complimentary_product',
]
for feature in new_features:
    print(feature,' captures: ',captures(behold_products,feature))

product_category  captures:  41125
style  captures:  10413
occasion  captures:  4188
seasons  captures:  4859
patterns  captures:  9494
dry_clean_only  captures:  8451
colors  captures:  23154
complimentary_product  captures:  14816
