## Customized Functions

#### clean_product_table function

In [1]:
def clean_product_table(product): # input: pd.DataFrame  output -> pd.DataFrame
    
    # remove irrelevant fields
    product = product.drop(columns=['created_at', 'brand_canonical_url', 'product_active'])
    
    # concat information in different groups
    product['product_info'] = product['name'].fillna('').astype('str') + ' ' + \
                            product['details'].fillna('').astype('str') + ' ' + \
                            product['description'].fillna('').astype('str')

    product['raw_product_info'] = product['brand_name'].fillna('').astype('str') + ' ' + \
                                    product['brand_category'].fillna('').astype('str') + ' ' + \
                                    product['brand_description'].fillna('').astype('str')

    product['all_info'] = product['product_info'] + ' ' + product['raw_product_info']
    
    product['all_info'] = product['product_info'] + ' ' + product['raw_product_info']
    
    # create a function to remove punctuations
    def remove_punctuations(text):
        ''' remove unnecessary punctuations in all lines '''
        punctuations = ['\n',',','.','!','"','*','(',')','-','—','\\','@',
                        '#','/','\xa0',':','_','>','<',';','|','&','?','^']
        for p in punctuations:
            text = text.replace(p,' ')    
        return text

    # remove punctuations
    col_list = ['product_info', 'raw_product_info', 'all_info']
    
    for col in col_list:
        product[col] = product[col].apply(remove_punctuations)
    
    return product

#### spacy_lemma_tokenize function

In [2]:
def spacy_lemma_tokenize(product): # input: pd.DataFrame  output -> pd.DataFrame
    
    # load spacy library
    import spacy
    
    nlp = spacy.load('en_core_web_md')
    # clean the product_info column
    product['clean_product_info'] = product['product_info']\
            .apply(lambda doc: " ".join([token.lemma_ for token in nlp(doc) if not token.is_stop]))
    
    # clean the raw_product_info column
    product['clean_raw_product_info'] = product['raw_product_info']\
            .apply(lambda doc: " ".join([token.lemma_ for token in nlp(doc) if not token.is_stop]))
    
    # clean the all_info column
    product['clean_all_info'] = product['all_info']\
            .apply(lambda doc: " ".join([token.lemma_ for token in nlp(doc) if not token.is_stop]))

    return product

#### create_clothing_category_feature function

In [3]:
def create_clothing_category_feature(product, outfit): 
# input:pd.DataFrame  output -> pd.Series, pd.DataFrame, pd.Series, pd.DataFrame
    
    # load libraries
    import re
    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer
    from collections import Counter
    
    def unfold_regex(category_find):
        new = []
        for i in category_find:
            for j in i:
                if j != '':
                    new.append(j)
        return new
    
    def find_mode_category(product_info):

        top_regex = r'(?i)\b(t(?:-)?shirt)s?\b|\b(shirt)s?\b|\b(blouse)s?\b|\b(tank)s?\b|\b(top)s?\b|'\
                        r'\b(suit)s?\b|\b(sweatshirt)s?\b|\b(knitwear)s?\b|\b(vest)s?\b|\b(suit)s?\b|\b(sweater)s?\b|'\
                        r'\b(cardigan)\b|\b(tee)s?\b|\b(hoodie)s?\b|\b(camisole)s?\b|\b(turtleneck)s?\b'
        shoe_regex = r'(?i)\b(shoe)s?\b|\b(sneaker)s?\b|\b(footie)s?\b|\b(footwear)s?\b|\b(pump)s?\b|'\
                        r'\b(flat)s?\b|\b(heel)s?\b|\b(boot)s?\b|\b(bootie)s?\b|\b(loafer)s?\b'\
                        r'|\b(mule)s?\b|\b(sandal)s?\b|\b(slipper)s?\b|\b(wedge)s?\b|\b(slide)s?\b|\b(slingback)s?\b'
        bottom_regex = r'(?i)\b(jean)s?\b|\b(short)s?\b|\b(pant)s?\b|\b(skirt)s?\b|\b(skort)s?\b|\b(sweatpant)s?\b|'\
                        r'\b(legging)s?\b|\b(trouser)s?\b|\b(bottom)s?\b|\b(jogger)s?\b|\b(tight)s?\b|\b(crop)s?\b|\b(leg)s?\b'
        bag_regex = r'(?i)\b(bag)s?\b|\b(handbag)s?\b|\b(shoulderbag)s?\b|\b(tote)s?\b|\b(clutch)(?:es)?\b|'\
                        r'\b(luggage)s?\b|\b(belt\s?bag)s?\b|\b(beach\s?bag)s?\b|\b(backpack)s?\b|\b(satchel)s?\b|'\
                        r'\b(briefcase)s?\b|\b(pouch)(?:es)?\b'
        accessory_regex = r'(?i)\b(scarf|scarves)\b|\b(hat)s?\b|\b(belt)s?\b|\b(sunglass)es?\b|\b(glove)s?\b|'\
                            r'\b(keychain)s?\b|\b(keyring)s?\b|\b(tie)s?\b|\b(phone\s?case)s?\b|'\
                            r'\b(glass)(?:es)?\b|\b(umbrella)s?\b|\b(frame)s?\b|\b(wallet)s?\b|\b(face\s?mask)s?\b|'\
                            r'\b(helmet)s?\b|\b(shawl)s?\b'
        onepiece_regex = r'(?i)\b(dress)(?:es)?\b|\b(jumpsuit)s?\b|\b(gown)s?\b|\b(robe)s?\b|\b(shirtdress)(?:es)?\b|\b(bodysuit)s?\b'
        outerwear_regex= r'(?i)\b(?:top)?(coat)s?\b|\b(jacket)s?\b|\b(parka)s?\b|\b(trench)(?:es)?\b|\b(raincoat)s?\b|\b(overcoat)s?\b|'\
                            r'\b(blazer)s?\b'
        jewelry_regex = r'(?i)\b(bracelet)s?\b|\b(brooch)(?:es)?\b|\b(pin)s?\b|\b(cufflink)s?\b|\b(earring)s?\b|'\
                            r'\b(necklace)s?\b|\b(ring)s?\b'
        intimate_regex = r'(?i)\b(underwear)s?\b|\b(bra)s?\b|\b(sock)s?\b|\b(sleepwear)s?\b|\b(loungewear)s?\b|'\
                            r'\b(boxer)s?\b|\b(brief)s?\b|\b(linger)(?:ies)?\b|\b(pantie)s?\b'

        top_find = re.findall(top_regex, product_info)
        shoe_find = re.findall(shoe_regex, product_info)
        bottom_find = re.findall(bottom_regex, product_info)
        bag_find = re.findall(bag_regex, product_info)
        accessory_find = re.findall(accessory_regex, product_info)
        onepiece_find = re.findall(onepiece_regex, product_info)
        outerwear_find = re.findall(outerwear_regex, product_info)
        jewelry_find = re.findall(jewelry_regex, product_info)
        intimate_find = re.findall(intimate_regex, product_info)

        top_find = unfold_regex(top_find)
        shoe_find = unfold_regex(shoe_find)
        bottom_find = unfold_regex(bottom_find)
        bag_find = unfold_regex(bag_find)
        accessory_find = unfold_regex(accessory_find)
        onepiece_find = unfold_regex(onepiece_find)
        outerwear_find = unfold_regex(outerwear_find)
        jewelry_find = unfold_regex(jewelry_find)
        intimate_find = unfold_regex(intimate_find)

        category_word_count_dict = {'top':len(top_find), 'bottom':len(bottom_find), 'shoe':len(shoe_find),\
                                            'bag':len(bag_find),'accessory':len(accessory_find),\
                                            'onepiece':len(onepiece_find),'outerwear':len(outerwear_find),\
                                            'jewelry':len(jewelry_find),'intimate':len(intimate_find)}
        mode_category_name = ''
        if sum(category_word_count_dict.values()) != 0:
            mode_category_name = max(category_word_count_dict, key=category_word_count_dict.get)


        return mode_category_name

    def find_sub_category(product_info):

        all_category_regex = r'(?i)\b(t(?:-)?shirt)s?\b|\b(shirt)s?\b|\b(blouse)s?\b|\b(tank)s?\b|'\
                            r'\b(suit)s?\b|\b(sweatshirt)s?\b|\b(knitwear)s?\b|\b(vest)s?\b|\b(suit)s?\b|'\
                            r'\b(sneaker)s?\b|\b(footie)s?\b|\b(footwear)s?\b|\b(pump)s?\b|'\
                            r'\b(flat)s?\b|\b(heel)s?\b|\b(boot)s?\b|\b(bootie)s?\b|\b(loafer)s?\b|'\
                            r'\b(mule)s?\b|\b(sandal)s?\b|\b(slipper)s?\b|\b(wedge)s?\b|'\
                            r'\b(jean)s?\b|\b(short)s?\b|\b(pant)s?\b|\b(skirt)s?\b|\b(skort)s?\b|'\
                            r'\b(legging)s?\b|\b(trouser)s?\b|\b(jogger)s?\b|\b(sweater)s?\b'\
                            r'\b(bag)s?\b|\b(handbag)s?\b|\b(shoulderbag)s?\b|\b(tote)s?\b|\b(clutch)(?:es)?\b|'\
                            r'\b(luggage)s?\b|\b(belt\s?bag)s?\b|\b(beach\s?bag)s?\b|\b(backpack)s?\b|'\
                            r'\b(scarf|scarves)\b|\b(hat)s?\b|\b(belt)s?\b|\b(sunglass)(?:es)?\b|\b(glove)s?\b|'\
                            r'\b(keychain)s?\b|\b(keyring)s?\b|\b(tie)s?\b|\b(phone\s?case)s?\b|'\
                            r'\b(glass)(?:es)?\b|\b(umbrella)s?\b|\b(frame)s?\b|\b(wallet)s?\b|\b(face\s?mask)s?\b|'\
                            r'\b(helmet)s?\b|\b(dress)(?:es)?\b|\b(jumpsuit)s?\b|\b(gown)s?\b|\b(robe)s?\b|'\
                            r'\b(coat)s?\b|\b(jacket)s?\b|\b(parka)s?\b|\b(trench)(?:es)?\b|\b(raincoat)s?\b|'\
                            r'\b(overcoat)s?\b|\b(bracelet)s?\b|\b(brooch)(?:es)?\b|\b(pin)s?\b|\b(cufflink)s?\b|'\
                            r'\b(earring)s?\b|\b(necklace)s?\b|\b(ring)s?\b|\b(underwear)s?\b|\b(bra)s?\b|'\
                            r'\b(sock)s?\b|\b(sleepwear)s?\b|\b(loungewear)s?\b|\b(boxer)s?\b|\b(brief)s?\b|'\
                            r'\b(linger)(?:ies)?\b|\b(pantie)s?\b|\b(satchel)s?\b|\b(sweater)s?\b|\b(tee)s?\b|'\
                            r'\b(tight)s?\b|\b(cardigan)s?\b|\b(hoodie)s?\b|\b(sweatpant)s?\b|\b(slide)s?\b|'\
                            r'\b(shirtdress)(?:es)?\b|\b(blazer)s?\b|\b(crop)s?\b|\b(leg)s?\b|\b(briefcase)s?\b|'\
                            r'\b(shawl)s?\b|\b(camisole)s?\b|\b(bodysuit)s?\b|\b(turtleneck)s?\b|\b(pouch)(?:es)?\b|' \
                            r'\b(slingback)s?\b'

        all_category_find = re.findall(all_category_regex, product_info)
        all_category_find = unfold_regex(all_category_find)
        word_counter = Counter(all_category_find)
        sub_category = sorted(word_counter, key = word_counter.get, reverse = True)

        if len(word_counter) == 0:
            return ''
        else:
            return sub_category[0]
        
    df = product.copy()
    
    df['mode_category']=df['all_info'].apply(find_mode_category)
    df['sub_category'] = df['all_info'].apply(find_sub_category)
    
    temp = outfit[['product_id','outfit_item_type']].drop_duplicates()
    temp['outfit_item_type'] = temp['outfit_item_type'].str.replace('accessory1','accessory')
    temp['outfit_item_type'] = temp['outfit_item_type'].str.replace('accessory2','accessory')
    temp['outfit_item_type'] = temp['outfit_item_type'].str.replace('accessory3','accessory')
    temp1 = temp.groupby('product_id').agg({'outfit_item_type':'count'}).reset_index()
    productid_only_1 = temp1.loc[temp1['outfit_item_type']==1,['product_id']]
    new_outfit = productid_only_1.merge(temp, how='inner',on='product_id')

    # merge product df with new_outfit df
    merged = df.merge(new_outfit, how='left', on='product_id')

    ind = merged.loc[merged['outfit_item_type'].notnull()==True,].index

    for i in ind:
        merged.loc[i,'mode_category'] = merged.loc[i,'outfit_item_type']

    vectorizer = CountVectorizer(binary=True)
    X_mode = vectorizer.fit_transform(merged['mode_category'])
    vectorized_df_mode = pd.DataFrame(X_mode.toarray(), columns=vectorizer.get_feature_names())
    vectorizer = CountVectorizer(binary=True)
    X_sub = vectorizer.fit_transform(merged['sub_category'])
    vectorized_df_sub = pd.DataFrame(X_sub.toarray(), columns=vectorizer.get_feature_names())
    
    return merged['mode_category'], vectorized_df_mode, merged['sub_category'], vectorized_df_sub

#### create_gender_feature function

In [4]:
def create_gender_feature(product): # input:pd.DataFrame  output -> pd.Series, pd.DataFrame
    
    # load libraries
    import re
    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer
    
    # gender regex patterns
    female_regex = r'\b(women|ladies|lady|woman|female|dress(?:es)?|skorts?|skirts?|blouses?|girls?|jewelr(?:ies)?|heels?)\b'
    male_regex = r'\b(man|men|gentleman|gentlemen)\b'
    kid_regex = r'\b(kids?|child(?:ren)?|girls?|boys?|babies|baby|infants?|toddlers?|teenagers?|youths?)\b'
    
    def find_all_gender_words(product_info):
    
        # initiate an empty dictionary to store words count in each category
        # initiate empty list to store results
        female_find = re.findall(female_regex, product_info)
        male_find = re.findall(male_regex, product_info)
        kid_find = re.findall(kid_regex, product_info)

        gender_list = female_find+male_find+kid_find
        gender_list = [i for i in gender_list if i != ""]
        clean_gender_results = " ".join([i for i in gender_list])

        return clean_gender_results

    def find_mode_gender(product_info):
    
        gender_word_count_dict = {}
        female_find = re.findall(female_regex, product_info)
        female_find = [i for i in female_find if i != ""]
        male_find = re.findall(male_regex, product_info)
        male_find = [i for i in male_find if i != ""]
        kid_find = re.findall(kid_regex, product_info)
        kid_find = [i for i in kid_find if i != ""]
        gender_list = female_find+male_find+kid_find
        gender_list = [i for i in gender_list if i != ""]

        gender_word_count_dict = {'women':len(female_find),'men':len(male_find),'kid':len(kid_find)}

        if sum(gender_word_count_dict.values()) == 0:
            return ''
        else:
            mode_gender = max(gender_word_count_dict, key=gender_word_count_dict.get)
            return mode_gender
    
    df = product.copy()
    df['all_gender_words'] = df['raw_product_info'].apply(find_all_gender_words)
    df['mode_gender'] = df['raw_product_info'].apply(find_mode_gender)
    
    vectorizer = CountVectorizer(binary=True)
    X = vectorizer.fit_transform(df['mode_gender'])
    vectorized_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
    
    return df['mode_gender'], vectorized_df

#### create_material_feature function

In [5]:
def create_material_feature(df): # input:pd.DataFrame  output -> pd.Series, pd.DataFrame
    # load libraries
    import re
    import pandas as pd
    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer
    
    # define color pattern keywords
    materials = ['acetate', 'acrylic', 'alpaca', 'calf', 'cashmere','chiffon', 'cotton','kidskin', 
            'lamb', 'lambskin', 'leather','linen', 'lyocell','mercerized', 'merino','nylon',
            'organic','peruvian', 'pima','poly', 'polyamide','polyester', 'polyurethane', 
            'ramie', 'rayon','rubber', 'silk','supima', 'tencel', 'triacetate', 'uv', 'uva',
            'velvet', 'virgin', 'viscose', 'wood', 'wool','rose sylk']
    
    material_pattern = r"\b("+"|".join(materials)+r")\b"
    
    # find all material words from given info
    df['material'] = df['all_info'].str.lower().str.findall(material_pattern)
    
    # remove all duplicates
    df['material'] = df['material'].map(lambda x: list(set(word.lower() for word in x)))
    
    # convert material finding to vectors
    vectorizer = CountVectorizer()
    temp = df['material'].apply(lambda doc: " ".join([word for word in doc]))
    X = vectorizer.fit_transform(temp)
    material_result = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
    
    return df['material'],material_result

#### create_percent_material_feature function

In [6]:
def create_percent_material_feature(df): # input:pd.DataFrame  output -> pd.Series, pd.DataFrame
    
    # load libraries
    import re
    import pandas as pd
    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer
    
    # package up codes for extracting p_material features from product description and details
    def func2(row):
        """
        this function is for the case where one of the "p_material_desc" and "p_material_details"
        is null.
        """
        if type(row['p_material_desc']) == list:
            return row['p_material_desc']
        else:
            return row['p_material_details']
        
    # defined specific regex to extract percent_material from details and description
    # examined by looking into some special cases of text to make regex more precise

    p_material_regex = r'([\d]{1,3}%\s(?!of|for|Deadstock|dead|Hand)[\w\s]+?)(?=\n|\d|\,|\/|$|:|\.|Care|\bis\b|Ma|Hand|Pr|grown|for|from|with|into|Category|\bin\b|\band\b|Dry|\bto\b|\bAnd\b|\bCollection\b)'

    # look through both deescription and details columns
    df['p_material_desc'] = df['description'].str.findall(p_material_regex)
    df['p_material_details'] = df['details'].str.findall(p_material_regex)
    
    # combine features extracted from description and details using "func2" to avoid null value distraction
    df['new'] = df[['p_material_desc','p_material_details']].apply(lambda x: x['p_material_desc']+x['p_material_details'] if x.notnull().all() else func2(x), axis=1)
    
    # replace empty list with na
    df['new_1'] = df['new'].apply(lambda x: np.nan if type(x)==list and len(x)==0 else x)
    # lower case, strip white space and remove duplicates
    df['new_2'] = df['new_1'].apply(lambda x: list(set([a.lower().strip() for a in x])) if type(x)==list else x)

    df.rename(columns={'new_2': 'p_material'}, inplace=True)
    
    # change list type value to string type
    df['p_material'] = df['p_material'].apply(lambda x: ' '.join(x) if type(x)==list else x)
    # fill null value with empty string
    df['p_material'] = df['p_material'].fillna('')
    
    # replace remaining abnormal symbols 
    df['p_material'] = df['p_material'].str.replace(r'\n|\xa0',' ')
    
    # only count the first word after percentage symbol as p_material 
    df['p_material'] = df['p_material'].str.findall(r'[\d]{1,3}%\s[\w]+').apply(lambda x: ' '.join(x))
    
    # using regex to keep over 60% percent_material as our final feature
    df['p_material'] = df['p_material'].str.findall(r'[1]?[0|6-9][0-9]%\s[\w]+').apply(lambda x: ' '.join(x) if type(x)==list else x)
    
    # finally fill null value with empty string to avoid future process
    df['p_material'] = df['p_material'].fillna('')
    
    # vectorize result
    docs = df['p_material'].str.replace(r'[\d]+%\s','').str.split().apply(lambda x: list(set(x))).apply(lambda x: ' '.join(x)).tolist()

    vectorizer = CountVectorizer(min_df=20)
    X = vectorizer.fit_transform(docs)
    data = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
    
    return df['p_material'], data

#### create_sizes_features function

In [7]:
def create_sizes_features(data): # input:pd.DataFrame  output -> pd.Series, pd.DataFrame
    
    # load libraries
    import re
    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer
    
    #read data

    #lowercase 2 columns
    lower_description = []
    for des in data['description']:
        lower_description.append(str(des).lower())
    lower_details = []
    for det in data['details']:
        lower_details.append(str(det).lower())

    #finding size in every sentense
    sizes_des = []
    sizes_det = []
    for i in lower_description:
        sizes_des.append(str(set(re.findall(r'(?<=\bsize )(?:\d+|big|large|middle|medium|small|l|m|s|x{1,}s|x{1,}l|\b)',i))))
    for i in lower_details:
        sizes_det.append(str(set(re.findall(r'(?<=\bsize )(?:\d+|big|large|middle|medium|small|l|m|s|x{1,}s|x{1,}l|\b)',i))))

    #Getting rid of puncuations 
    exp = r'(\'|[\{\}]|,)|set\(\)'
    new_1 = []
    for i in sizes_des:
        new_1.append(re.sub(exp,'',i))
    new_2 = []
    for i in sizes_det:
        new_2.append(re.sub(exp,'',i))

    l = []
    for i in range(len(new_1)):
        l.append(new_1[i] + " " + new_2[i])

    #Transfer into dataframe
    size = pd.DataFrame()
    size['sizes'] = l

    #Function of turning none value in to string
    def changenull(x):
        if len(x) == 0:
            return "None"
        else:
            return x
    size['sizes'] = size['sizes'].apply(changenull)

    #Vectorizing with one-hot encoding
    vectorizer = CountVectorizer(binary=True,min_df=10)
    X = vectorizer.fit_transform(size['sizes'])
    vectorized_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
    return size, vectorized_df

#### create_color_feature function

In [8]:
def create_color_feature(df): # input:pd.DataFrame  output -> pd.Series, pd.DataFrame
    
    # load libraries
    import re
    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer
    
    # define color pattern keywords
    color_pattern = r'(?i)\bRed|Orange|Yellow|Green|Blue|Purple|White|Black|Brown|Magenta|Tan|Olive|Navy|Turquoise|Silver|Lime|Teal|Indigo|Violet|Pink|Gray|Navy|Beige|Burgundy|Golden|Magenta|Cyan|Aquamarine\b'
    
    # find all color words from given info
    data = pd.DataFrame([i for i in range(len(df))], columns=['Document'])
    data['color'] = df['all_info'].str.findall(color_pattern)
    
    # fill null value with `[]`
    data.loc[data['color'].isnull(), 'color'] = data.loc[data['color'].isnull(), 'color'].apply(lambda x: [])
    
    # remove all duplicates
    data['color'] = data['color'].map(lambda x: list(set(word.lower() for word in x)))
    
    # convert color finding to vectors
    vectorizer = CountVectorizer()
    temp = data['color'].apply(lambda doc: " ".join([word for word in doc]))
    X = vectorizer.fit_transform(temp)
    color_result = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
    
    return data['color'],color_result

#### create_location_feature function

In [9]:
def create_location_feature(df): # input:pd.DataFrame  output -> pd.Series, pd.DataFrame
    
    # load libraries
    import re
    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer
    
    # define location keywords
    # check for false postives: us, la, case insensitive
    # define location keywords
    location_regex_1 = r'(?i)\b(usa|italy|ethiopia|china|peru|los angeles|nyc|spain|new york|portugal|india|america|kenya|turkey|brazil|ghana|italian|morocco|france|vietnam|germany|lima|mexico|argentina|japan|brooklyn|madagascar|bali|prc|poland)\b'
    location_regex_2 = r'\b(LA|US|JP)\b'
    
    # find all location words from given info
    data = pd.DataFrame([i for i in range(len(df))], columns=['Document'])
    data['location_1'] = df['all_info'].str.findall(location_regex_1)
    data['location_2'] = df['all_info'].str.findall(location_regex_2)
    
    # fill null value with `[]`
    data.loc[data['location_1'].isnull(), 'location_1'] = data.loc[data['location_1'].isnull(), 'location_1'].apply(lambda x: [])
    data.loc[data['location_2'].isnull(), 'location_2'] = data.loc[data['location_2'].isnull(), 'location_2'].apply(lambda x: [])
    
    # concat two findings
    data['location'] = data['location_1'] + data['location_2']
    
    # remove all duplicates
    data['location'] = data['location'].map(lambda x: list(set(word.lower() for word in x)))
    
    # create fields for each color
    vectorizer = CountVectorizer()
    temp = data['location'].apply(lambda doc: " ".join([word for word in doc]))
    X = vectorizer.fit_transform(temp)
    location_result = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
    
    return data['location'], location_result

#### create_tags_feature_function

In [10]:
def create_tags_feature(df): # input:pd.DataFrame  output -> pd.Series, pd.DataFrame
    
    # load libraries
    import re
    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer
    
    tags_words = ['stripe vertical', 'shoulder bags', 'highneck', 'romantic', 'crossbody', 'sweatshirt', 
                  'denim', 'cotton blend', 'straightregular', 'buttonedback', 'backzipper', 'multiprint', 
                  'sportsbra', 'joggerssweatpants', 'henley', 'sheath', 'dropwaist', 'mohair', 'skinny', 
                  'jerseyknit', 'short at waistline', 'cold weather', 'corduroy', 'tank', 'splitneck', 'vneck',
                  'camisole', 'tiedye', 'shawlcollar', 'straight regular', 'shell', 'satincharmeuse', 
                  'lightbrowns', 'waist', 'roundtoe', 'bandcollar', 'sandals', 'buttondown', 'above waistline', 
                  'plus', 'purewool', 'sleeve', 'flap', 'kitten', 'businesscasualdress', 'down', 'bucketbags', 
                  'classic', 'strap', 'belted', '5pocketpantnondenim', 'boot', 'floral', 'mid length at hips', 
                  'squaretoe', 'edgy', 'tote', 'booties', 'fauxshearling', 'bucket', 'faux shearling', 
                  'lightbrown', 'straight', 'cone', 'oneshoulder', 'squareneck', 'culotte', 'beachbags', 
                  'hoodie', 'pinstripe', 'cases', 'faux leather', 'regular', 'blacks', 'zipup', 'hook', 
                  'twisted', 'fauxfur', 'crepedechine', 'sidezip', 'halfwayzipper', 'stripevertical', 
                  'suede', 'boots', 'tall', 'buttonfront', 'open', 'purecotton', 'trackpants', 'shortsleeve', 
                  'platform', 'sleeveless', 'cut', 'sweatshirthoodie', 'halfwaybuttoned', 'swraps', 'logo', 
                  'zipflywithbutton', 'pumpsheels', 'burgundies', 'twill', 'shawl collar', 'tweed', 
                  'midlengthathips', 'joggers', 'crepe de chine', 'aline', 'buckle', 'satchels', 'wedge', 
                  'capri', 'business casual', 'blouse', 'glam', 'bodycon', 'knit', 'jewel', 'purelinen', 
                  'mandarincollar', 'color block', 'tiefront', 'heels', 'fleece', 'crossbodybags', 'poncho', 
                  'wrap', 'wedges', 'duster', 'opentoe', 'vest', 'retro', 'patent leather', 'turtleneck', 
                  'short sleeve', 'light brown', 'button', 'zipfly', 'grays', 'satin charmeuse', 'multi print',
                  'funnel', 'pantsleggings', 'wideleg', 'halter', 'pinks', 'clasp', 'women', 'silk blend', 
                  'mandarin', 'casualdress', 'velcro', 'scarveswraps', 'stripehorizontal', 'dark brown', 
                  'cropped', 'daytonight', 'canvas', 'block', 'stiletto', 'surplice', 'golds', 'geometric', 
                  'faux fur', 'empirewaist', 'coldweather', 'nightout', 'maternity', 'boyfriend', 'elastane', 
                  'narrow', 'draped', 'laceup', 'silvers', 'baggy', 'sneakersathletic', 'pussybow', 
                  'casual dress', 'animal', 'drawstring', 'casual', 'cowlneck', 'lock', 'inwardcurve', 
                  'under8', 'oranges', 'shirtdress', 'tieback', 'tropical', 'sateen', 'walletscardcases', 
                  'linen blend', 'buttoned', 'belowhips', 'multi', 'day to night', 'highover9', 'plungeneck', 
                  'tee', 'platformflatform', 'tailored', 'polo', 'fauxleather', 'bags', 'linenblend', 
                  'sweatpants', 'slingback', 'androgynous', 'purples', 'sweetheart', 'tie', 'blazerdress', 
                  'purecashmere', 'slit', 'blazerscoatsjackets', 'anklestrap', 'handbags', 'closedtoe', 
                  'strapless', 'pants', 'mules', 'skirts', 'semi fitted', 'low', 'totebags', 'snap', 
                  'fittedtailored', 'sneakers athletic', 'openfront', 'puffsleeve', 'crewneck', 'synthetic',
                  'raisedsole', 'magnetic', 'zip', 'darkbrowns', 'hobobags', 'long', 'cardigan', 'calfhair', 
                  'beach bags', 'pointedtoe', 'maxi', 'peeptoe', 'laces', 'silkblend', 'wide', 'paisley',
                  'shortatwaistline', 'bustier', 'shorts', 'flats', 'colorblock', 'stripe', 
                  'blazers coats jackets', 'laptopsbriefcases', 'peplum', 'beltbagsfannypack', 'patentleather',
                  'cold', 'slipdress', 'shoulderbags', 'round toe', 'croptop', 'cargo', 'shearling', 'shoulder',
                  'midcalf', 'snaps', 'back', 'collar', 'blues', 'leggings', 'hookandloop', 'relaxed', 'greens',
                  'sweater', 'chambray', 'hookloop', 'tshirtdress', 'zipper', 'tieneck', 'longsleeve', 
                  'cashmereblend', 'longbelowhips', 'slim', 'weekend', 'halfway buttoned', 'oversized', 
                  'reds', 'slippers', 'sunglasses', 'modal', 'tiered', 'whites', 'belts', 'sports bra', 
                  'cottonblend', 'high', 'boho', 'stripe horizontal', 'sundress', 'backpacks', 'beiges', 
                  'scarve', 'graphic', 'fannypack', 'sweaterdress', 'flare', 'dots', 'boatneck', 'nondenim', 
                  'businesscasual', 'empire', 'woolblend', 'slides', 'laptops briefcases', 'houndstooth', 
                  'capsleeve', 'pure linen', 'flatform', 'athleisure', 'ponyhair', 'scoopneck', 'calf hair', 
                  'denimjeans', 'none', 'clutchespouches', 'front', 'coldshoulder', 'bodysuit', 
                  'zipflywithhook', 'buttonedfront', 'spandex', 'collared', 'puff', 'vacation', 'abstract', 
                  'side', 'snakeskin', 'yellows', 'monogram', 'keyhole', 'closed toe', 'designer', 
                  'long sleeve', 'backzip', 'chenille', 'asymmetrical', 'fitted', 'tunic', 
                  'croppedabovewaistline', 'active', 'cap', 'pointed', 'camouflage', 'toe', 'frontzip', 
                  'bootcut', 'backless', 'halfway zipper', 'tie dye', 'mockneck', 'offshoulder', 'shift', 
                  'trousers', 'puresilk', 'walletscard', 'workout', 'pumps', 'ankle', 'mulesslides', 
                  'semifitted', '5pocketpant', 'trapezeswing', 'mid89', 'modern', 'work', 'flat', 'plaid', 
                  'belt']
    
    tags_words = list(set(tags_words))
    
    # create tags regex
    super_regex = r"\b" + "|".join([words for words in tags_words]) + r"\b"
    
    # find all tags words from given info
    data = pd.DataFrame([i for i in range(len(df))], columns=['Document'])
    data['tags'] = df['all_info'].str.findall(super_regex)
    
    # fill null value with `[]`
    data.loc[data['tags'].isnull(), 'tags'] = data.loc[data['tags'].isnull(), 'tags'].apply(lambda x: [])
    
    # remove all duplicates
    data['tags'] = data['tags'].map(lambda x: list(set(word.lower() for word in x)))
    
    # convert tags finding to vectors
    vectorizer = CountVectorizer()
    temp = data['tags'].apply(lambda doc: " ".join([word for word in doc]))
    X = vectorizer.fit_transform(temp)
    tags_result = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
    
    return data['tags'],tags_result

## Cleaning Process

In [11]:
# load data
import pandas as pd
product = pd.read_excel('data/Behold+product+data+04262021.xlsx')
brand = pd.read_csv('data/behold_brands USC.csv')
combination = pd.read_csv('data/outfit_combinations USC.csv')
tags = pd.read_csv('data/usc_additional_tags USC.csv')

In [12]:
# initial cleaning
clean_product = clean_product_table(product)

In [13]:
# spacy lemmatizaiton & tokenization
# clean_product = spacy_lemma_tokenize(clean_product)
# clean_product.to_csv('cleaned_product_data.csv')

# use above function to obtain this cleaned data (about 30min to run)
clean_product = pd.read_csv('data/cleaned_product_data.csv', index_col=0)

##  Feature Engineering

In [14]:
# clothing_category feature creation
features = pd.DataFrame()
features['clothing_category'], clothing_vector, features['clothing_subcategory'], subcategory_vector \
                                = create_clothing_category_feature(clean_product, combination)

In [15]:
# gender feature creation
features['gender'], gender_vector = create_gender_feature(clean_product)

In [16]:
# material feature creation
features['material'], material_vector = create_material_feature(clean_product)

In [17]:
# material feature creation
features['percent_material'], percent_material_vector = create_percent_material_feature(clean_product)

In [18]:
# size feature creation 
features['size'], size_vector = create_sizes_features(clean_product)

In [19]:
# color feature creation
features['color'], color_vector = create_color_feature(clean_product)

In [20]:
# location feature creation
features['location'], location_vector = create_location_feature(clean_product)

In [21]:
# tags feature creation
features['tags'], tags_vector = create_tags_feature(clean_product)

In [22]:
# concat all features
feature_vectors = pd.concat([clothing_vector, subcategory_vector, gender_vector, material_vector, 
                             percent_material_vector, size_vector, color_vector, location_vector, 
                             tags_vector], axis=1)

# create a final feature for all product that does not belong to any category
index = features.loc[features['clothing_category']=='',].index
feature_vectors.loc[index, 'uncategorized'] = 1
feature_vectors['uncategorized'] =  feature_vectors['uncategorized'].fillna(int(0))

# save results
features['product_id'] = product['product_id']
features[['product_id', 'clothing_category', 'clothing_subcategory']].to_csv('feature_columns.csv')
feature_vectors.to_csv('data/feature_vectors.csv')
feature_vectors.shape

(61355, 540)

## Classification

### Load Libraries

In [23]:
# load functions
# basic functional packages
import pandas as pd
import re
import numpy as np
from random import sample
from numpy.random import seed
from collections import Counter

# modeling packages
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# nltk package
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Tensorflow Keras Toolkit
from random import randint
from numpy import array, argmax, asarray, zeros
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.preprocessing.sequence import pad_sequences

from keras.layers.recurrent import SimpleRNN, LSTM
from keras.layers import Flatten, Masking

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# set random seed
import tensorflow
tensorflow.random.set_seed(32)

### Custom Functions

In [24]:
# logistic regression with manual cross validation
def make_logreg_function(X, y, cv=10, random_seed=42, max_iter=100, test_size=0.2):
    
    from sklearn.linear_model import LogisticRegression
    accuracy_result = []
    np.random.seed(random_seed)
    i=1
    for iters in np.random.choice(range(0,200,1), size=cv, replace=False):
        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                            test_size=test_size,
                                                            random_state=iters,
                                                            stratify=y)
        # transform train test
        X_train_std = vectorizer.fit_transform(X_train)
        X_test_std = vectorizer.transform(X_test)

        # logistic regression model
        lr = LogisticRegression(max_iter=max_iter, multi_class='auto')
        lr.fit(X_train_std, y_train)

        # obtain accuracy
        accuracy_result.append(lr.score(X_test_std, y_test))
        print(f'Iteration {i} Accuracy: {lr.score(X_test_std, y_test)}');i+=1

    print(f'Average Accuracy over 10-Fold Cross Validation: {np.mean(accuracy_result)}')
    
    return accuracy_result

# logistic regression with custom X and manual cross validation
def make_custom_logreg_function(X, y, cv=10, random_seed=42, max_iter=100, test_size=0.2):
    accuracy_result = []
    np.random.seed(random_seed)
    i=1
    for iters in np.random.choice(range(0,200,1), size=cv, replace=False):
        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                            test_size=test_size,
                                                            random_state=iters,
                                                            stratify=y)
        # save index
        train_index = X_train.index.to_list()
        test_index = X_test.index.to_list()

        # transform train test
        X_train_std = vectorizer.fit_transform(X_train)
        X_test_std = vectorizer.transform(X_test)

        # create train test features
        feature_train = feature_vectors.loc[train_index]
        feature_test = feature_vectors.loc[test_index]

        # restore train test table
        X_train_table = pd.DataFrame(X_train_std.toarray(), columns=vectorizer.get_feature_names())
        X_test_table = pd.DataFrame(X_test_std.toarray(), columns=vectorizer.get_feature_names())

        # add back index & correct index
        X_train_table['index'] = train_index
        X_test_table['index'] = test_index
        X_train_table.set_index('index', inplace=True)
        X_test_table.set_index('index', inplace=True)

        X_train_final = pd.concat([X_train_table, feature_train], axis=1)
        X_test_final = pd.concat([X_test_table, feature_test], axis=1)

        # logistic regression model
        lr = LogisticRegression(max_iter=max_iter, multi_class='auto')
        lr.fit(X_train_final, y_train)

        # obtain accuracy
        accuracy_result.append(lr.score(X_test_final, y_test))
        print(f'Iteration {i} Accuracy: {lr.score(X_test_final, y_test)}');i+=1

    print(f'Average Accuracy over 10-Fold Cross Validation: {np.mean(accuracy_result)}')

    return accuracy_result

In [25]:
# LSTM multi classificaiton model
def make_lstm_classification_model(embedding_matrix, plot=False,class_value=2, weight_shape=100):
    model = Sequential()
    model.add(Embedding(VOCAB_SIZE, weight_shape, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
    model.add(Masking(mask_value=0.0)) # masking layer, masks any words that don't have an embedding as 0s.
    model.add(LSTM(units=32, input_shape=(1, MAX_SEQUENCE_LENGTH)))
    model.add(Dense(64))
    model.add(Dense(class_value, activation='softmax'))
    
    # Compile the model
    model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    # summarize the model
    model.summary()
    
    if plot:
        plot_model(model, to_file='model.png', show_shapes=True)
    return model

# Simple RNN classsificaiton model
def make_multi_classification_rnn_model(embedding_matrix, plot=False,class_value=2, weight_shape=100):
    model = Sequential()
    model.add(Embedding(VOCAB_SIZE, weight_shape, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
    model.add(Masking(mask_value=0.0)) # masking layer, masks any words that don't have an embedding as 0s.
    model.add(SimpleRNN(units=64, input_shape=(1, MAX_SEQUENCE_LENGTH)))
    model.add(Dense(64))
    model.add(Dense(class_value, activation='softmax'))
    
    # Compile the model
    model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    # summarize the model
    model.summary()
    
    if plot:
        plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [26]:
#  Glove vectors
def load_glove_vectors():
    embeddings_index = {}
    with open('data/glove.6B.100d.txt') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Loaded %s word vectors.' % len(embeddings_index))
    return embeddings_index

### Logistic Regression ( average 93.8% accuracy on 20% validaiton data)

In [27]:
# load data
df = pd.read_csv('data/cleaned_product_data.csv', index_col=0)
feature_vectors = pd.read_csv('data/feature_vectors.csv', index_col=0)
df.drop(columns=['brand_category', 'name', 'details', 'description','product_id','product_info',
                    'brand_description', 'brand_name', 'all_info', 'raw_product_info'], inplace=True)

In [28]:
# encode top 50 brand labels
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

top50_col = list(df['brand'].value_counts()[:50].index)
df['brand'] = df['brand'].map(lambda x: x if x in top50_col else 'other')


encoder = LabelEncoder()
labels = to_categorical(encoder.fit_transform(df['brand'].values))

In [29]:
# count vectorizer
stopword_list = list(stopwords.words('English'))
vectorizer = CountVectorizer(ngram_range=(1, 2), 
                             stop_words=stopword_list, 
                             max_features=1000,
                             token_pattern='(?u)\\b[a-zA-Z][a-zA-Z]+\\b')

In [30]:
# use 10 fold cv to validate - without additional features
X = df['clean_product_info']
y = df["brand"].values

result = make_logreg_function(X, y, cv=10, random_seed=42, max_iter=500, test_size=0.2)
print(result)

Iteration 1 Accuracy: 0.9290196398011572
Iteration 2 Accuracy: 0.9260044006193464
Iteration 3 Accuracy: 0.9277972455382609
Iteration 4 Accuracy: 0.9237225979952735
Iteration 5 Accuracy: 0.932360850786407
Iteration 6 Accuracy: 0.9258414147176269
Iteration 7 Accuracy: 0.9278787384891207
Iteration 8 Accuracy: 0.9268193301279439
Iteration 9 Accuracy: 0.9244560345530112
Iteration 10 Accuracy: 0.9242930486512917
Average Accuracy over 10-Fold Cross Validation: 0.9268193301279439
[0.9290196398011572, 0.9260044006193464, 0.9277972455382609, 0.9237225979952735, 0.932360850786407, 0.9258414147176269, 0.9278787384891207, 0.9268193301279439, 0.9244560345530112, 0.9242930486512917]


In [31]:
# use 10 fold cv to validate - with additional features
result = make_custom_logreg_function(X, y, cv=10, random_seed=42, max_iter=500, test_size=0.2)
print(result)

Iteration 1 Accuracy: 0.939858202265504
Iteration 2 Accuracy: 0.938554315051748
Iteration 3 Accuracy: 0.9390432727569066
Iteration 4 Accuracy: 0.9358650476733763
Iteration 5 Accuracy: 0.9426289625947356
Iteration 6 Accuracy: 0.9379838643957298
Iteration 7 Accuracy: 0.9387987939043273
Iteration 8 Accuracy: 0.9401841740689431
Iteration 9 Accuracy: 0.9360280335750958
Iteration 10 Accuracy: 0.9353760899682177
Average Accuracy over 10-Fold Cross Validation: 0.9384320756254583
[0.939858202265504, 0.938554315051748, 0.9390432727569066, 0.9358650476733763, 0.9426289625947356, 0.9379838643957298, 0.9387987939043273, 0.9401841740689431, 0.9360280335750958, 0.9353760899682177]


### LSTM Deep Learning Model (over 93.0% accuracy on 20% validation data)

In [32]:
# tokenize text
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=5000, oov_token="UNKNOWN_TOKEN")
tokenizer.fit_on_texts(df['clean_product_info'])

VOCAB_SIZE = int(len(tokenizer.word_index) * 1.1)

In [33]:
# integer encode documents
def integer_encode_documents(docs, tokenizer):
    return tokenizer.texts_to_sequences(docs)

# integer encode the documents
MAX_SEQUENCE_LENGTH = 300
encoded_docs = integer_encode_documents(df['clean_product_info'], tokenizer)
padded_docs = pad_sequences(encoded_docs, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# train test split
X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.2)

In [34]:
# create custom embeddings
embeddings_index = load_glove_vectors()

# Glove Vectors
# create a weight matrix for words in training docs
glove_embedding_matrix = zeros((VOCAB_SIZE, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        glove_embedding_matrix[i] = embedding_vector
        
# Spacy Vectors
import spacy
nlp = spacy.load('en_core_web_md')
spacy_embedding_matrix = zeros((VOCAB_SIZE, 300))
for word, i in tokenizer.word_index.items():
    embedding_vector = nlp(word).vector
    if embedding_vector is not None:
        spacy_embedding_matrix[i] = embedding_vector

Loaded 400000 word vectors.


In [35]:
# Fit LSTM model
lstm_model = make_lstm_classification_model(class_value=51, weight_shape=300, embedding_matrix=spacy_embedding_matrix)
lstm_model.fit(X_train, y_train,validation_split = 0.1, epochs=5, verbose=1)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 300)          10678800  
_________________________________________________________________
masking (Masking)            (None, 300, 300)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 32)                42624     
_________________________________________________________________
dense (Dense)                (None, 64)                2112      
_________________________________________________________________
dense_1 (Dense)              (None, 51)                3315      
Total params: 10,726,851
Trainable params: 48,051
Non-trainable params: 10,678,800
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f98df6f90a0>

In [36]:
# check result
loss, accuracy = lstm_model.evaluate(X_test, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 93.024206
