# Group Project (Team Purple)

In [1]:
# Importing relevant libraries
import pandas as pd
import numpy as np
import re
from collections import Counter
import nltk
import spacy
import functools
from sklearn.feature_extraction.text import TfidfVectorizer
nlp = spacy.load("en_core_web_sm")

# Data Exploration

In [2]:
# Reading data into pandas
full_data = pd.read_csv("Full Data.csv")
tagged_data = pd.read_csv("Tagged Product Attributes.csv")

### Preparing Full Data File

In [3]:
full_data.head(2)

Unnamed: 0,product_id,brand,mpn,product_full_name,description,brand_category,created_at,updated_at,deleted_at,brand_canonical_url,details,labels,bc_product_id
0,01DSE9TC2DQXDG6GWKW9NMJ416,Banana Republic,514683,Ankle-Strap Pump,"A modern pump, in a rounded silhouette with an...",Unknown,2019-11-11 22:37:15.719107+00,2019-12-19 20:40:30.786144+00,,https://bananarepublic.gap.com/browse/product....,"A modern pump, in a rounded silhouette with an...","{""Needs Review""}",
1,01DSE9SKM19XNA6SJP36JZC065,Banana Republic,526676,Petite Tie-Neck Top,Dress it down with jeans and sneakers or dress...,Unknown,2019-11-11 22:36:50.682513+00,2019-12-19 20:40:30.786144+00,,https://bananarepublic.gap.com/browse/product....,Dress it down with jeans and sneakers or dress...,"{""Needs Review""}",


In [4]:
# Deleting irrelevant columns in full data
full_data.drop(['mpn', 'created_at', 'updated_at', 'deleted_at',
                'brand_canonical_url', 'bc_product_id'],axis = 1,inplace = True)

In [5]:
# Checking NA values
full_data.isnull().sum()

product_id              0
brand                   0
product_full_name       0
description          7974
brand_category        238
details              9866
labels                  0
dtype: int64

In [6]:
# Replacing NA values with Unknown
full_data=full_data.fillna("Unknown")

In [7]:
# Dropping duplicate rows based on product id (This makes product_id unique)
full_data = full_data.drop_duplicates(subset="product_id")
full_data.head()

Unnamed: 0,product_id,brand,product_full_name,description,brand_category,details,labels
0,01DSE9TC2DQXDG6GWKW9NMJ416,Banana Republic,Ankle-Strap Pump,"A modern pump, in a rounded silhouette with an...",Unknown,"A modern pump, in a rounded silhouette with an...","{""Needs Review""}"
1,01DSE9SKM19XNA6SJP36JZC065,Banana Republic,Petite Tie-Neck Top,Dress it down with jeans and sneakers or dress...,Unknown,Dress it down with jeans and sneakers or dress...,"{""Needs Review""}"
2,01DSJX8GD4DSAP76SPR85HRCMN,Loewe,52MM Padded Leather Round Sunglasses,Padded leather covers classic round sunglasses.,JewelryAccessories/SunglassesReaders/RoundOval...,100% UV protection\nCase and cleaning cloth in...,"{""Needs Review""}"
3,01DSJVKJNS6F4KQ1QM6YYK9AW2,Converse,Baby's & Little Kid's All-Star Two-Tone Mid-To...,The iconic mid-top design gets an added dose o...,"JustKids/Shoes/Baby024Months/BabyGirl,JustKids...",Canvas upper\nRound toe\nLace-up vamp\nSmartFO...,"{""Needs Review""}"
4,01DSK15ZD4D5A0QXA8NSD25YXE,Alexander McQueen,64MM Rimless Sunglasses,Hexagonal shades offer a rimless view with int...,JewelryAccessories/SunglassesReaders/RoundOval,100% UV protection\nGradient lenses\nAdjustabl...,"{""Needs Review""}"


### Preparing Tagged Products File

In [8]:
# Retaining only the relevant labels
tagged_data = tagged_data[tagged_data['attribute_name'].isin(["style", "occasion","fit","Primary Color"])]

In [9]:
# Converting misspelled labels to a standard format
tagged_data['attribute_value'] = tagged_data['attribute_value'].replace({"semifitted": "Semi-Fitted",
                                                                   "straightregular": "Straight / Regular",
                                                                   "fittedtailored": "Fitted / Tailored",
                                                                   "daytonight": "Day to Night",
                                                                   "nightout": "Night Out",
                                                                   "businesscasual": "Business Casual",
                                                                       })
tagged_data['attribute_name'] = tagged_data['attribute_name'].replace({"Primary Color": "color",
                                                                       })
tagged_data.head()

Unnamed: 0,product_id,product_color_id,attribute_name,attribute_value,file
1,01DVA7QRXM928ZM0WWR7HFNTC1,01DVA7QRXXR9F0TWVE1HMC5ZQ3,color,Blacks,initial_tags
2,01DPGV4YRP3Z8J85DASGZ1Y99W,01DPGVGBK6YGNYGNF2S6FSH02T,style,Casual,initial_tags
3,01E1JM43NQ3H17PB22EV3074NX,01E1JM5WFWWCCCH3JTTTCYQCEQ,style,Modern,initial_tags
6,01E2C3YN4KQ36A0REWZJ89ZN73,01E2C3YN56ZCJ8TN45V3EC8CPS,color,Blacks,initial_tags
8,01E223GDRKR84THXZ54GJEW60Y,01E223GKFAFZ5HTVBQJ82TAEZH,fit,Semi-Fitted,initial_tags


In [10]:
# Grouping attribute value based on product id

tagged_data['attribute_value'] = tagged_data['attribute_value'].str.lower()
tagged_data['attribute_name'] = tagged_data['attribute_name'].str.lower()

tagged_data1 = tagged_data.groupby(['product_id'])['attribute_value'].apply(set).reset_index()
tagged_data2 = tagged_data.groupby(['product_id'])['attribute_name'].apply(set).reset_index()

tagged_data3 = pd.merge(tagged_data2,tagged_data1,on='product_id', how='left')
tagged_data3.head()

Unnamed: 0,product_id,attribute_name,attribute_value
0,01DPC9GSTT72KHNN0MNDNKH7RD,"{occasion, style}","{business casual, work, classic, day to night}"
1,01DPCB2KEAVXXKFVM7FXBNE4VY,"{color, occasion, style}","{day to night, browns, blacks, work, modern, w..."
2,01DPCDEF6SYX2E1NT5X7HJBFGY,"{color, style}","{burgundies, classic, beiges, blacks, pinks, g..."
3,01DPCG1C1P0MQAV9NMS3N1TDAA,"{color, occasion, style, fit}","{glam, semi-fitted, weekend, romantic, night o..."
4,01DPCHNEW5F2RHJQ3NJMVPK6SE,"{color, occasion, style, fit}","{burgundies, classic, day to night, casual, an..."


In [11]:
# Merge the attribute value as label in the full_data
full_data = pd.merge(full_data,tagged_data3,on='product_id', how='left')
full_data.drop("labels",axis = 1,inplace = True)
full_data.rename(columns = {'attribute_value':'labels','attribute_name':'category'}, inplace = True)
full_data.head()

Unnamed: 0,product_id,brand,product_full_name,description,brand_category,details,category,labels
0,01DSE9TC2DQXDG6GWKW9NMJ416,Banana Republic,Ankle-Strap Pump,"A modern pump, in a rounded silhouette with an...",Unknown,"A modern pump, in a rounded silhouette with an...",,
1,01DSE9SKM19XNA6SJP36JZC065,Banana Republic,Petite Tie-Neck Top,Dress it down with jeans and sneakers or dress...,Unknown,Dress it down with jeans and sneakers or dress...,,
2,01DSJX8GD4DSAP76SPR85HRCMN,Loewe,52MM Padded Leather Round Sunglasses,Padded leather covers classic round sunglasses.,JewelryAccessories/SunglassesReaders/RoundOval...,100% UV protection\nCase and cleaning cloth in...,,
3,01DSJVKJNS6F4KQ1QM6YYK9AW2,Converse,Baby's & Little Kid's All-Star Two-Tone Mid-To...,The iconic mid-top design gets an added dose o...,"JustKids/Shoes/Baby024Months/BabyGirl,JustKids...",Canvas upper\nRound toe\nLace-up vamp\nSmartFO...,,
4,01DSK15ZD4D5A0QXA8NSD25YXE,Alexander McQueen,64MM Rimless Sunglasses,Hexagonal shades offer a rimless view with int...,JewelryAccessories/SunglassesReaders/RoundOval,100% UV protection\nGradient lenses\nAdjustabl...,,


In [12]:
# Creating a new df with labels
df = full_data[full_data['labels'].notnull()]
df.head()

Unnamed: 0,product_id,brand,product_full_name,description,brand_category,details,category,labels
15,01E5ZXP5H0BTEZT9QD2HRZJ47A,A.L.C.,Lennox High Waist Cotton & Linen Pants,High-rise trousers tailored from a cool Italia...,Unknown,"True to size. High rise.\n31"" inseam; 14"" leg ...","{occasion, fit, style}","{classic, work, semi-fitted, modern, business ..."
33,01DSECZPAGJJC1EDC79JRBF4WK,Banana Republic,Mock-Neck Sweater Top,"Designed to be worn with high-waisted bottoms,...",Unknown,"Designed to be worn with high-waisted bottoms,...","{color, occasion, style, fit}","{classic, day to night, blacks, whites, work, ..."
43,01E607BHRQAJDZ76MJFN7RPRK1,Simon Miller,Rost Belted Shorts,Cinched at the natural waist and pleated for f...,Unknown,"True to size. XS=0-2, S=4-6, M=6-8, L=8-10, XL...","{occasion, fit, style}","{oversized, casual, androgynous, modern, weeke..."
44,01E5ZXJ6G03R7177X723CT04W0,A.L.C.,Minelli Silk Sleeveless Top,Painterly brushes of color that convey the flu...,Unknown,"True to size.\n25 1/2"" length (size Medium)\nF...","{occasion, fit, style}","{day to night, casual, modern, boho, relaxed, ..."
70,01E6074PQA697JZ1SBM6NM8TBG,Simon Miller,Nepa Mismatched Button Rib Cardigan,The West Coast–based label channels beachy vib...,Unknown,"True to size. XS=0-2, S=4-6, M=6-8, L=8-10, XL...","{occasion, fit, style}","{fitted / tailored, day to night, casual, mode..."


# Data Cleaning

### 1. Stopword  removal

In [13]:
# Stopwords including custom stopwords
from nltk.corpus import stopwords

custom_stopwords =  [ 'ever','always','every','even','though','here','was'
                      'there','ve','re', "'m","'ve", "n't",'not','yourself',
                      'yup','yours','you','yet','yes','yep','or','yeah','yea',
                      'nor','no',"weren't", "mustn't","needn't","shouldn't", 
                      "won't","wouldn't","weren't","wasn't","shan't","mightn't",
                      "isn't", "haven't","hasn't","doesn't","aren't","couldn't",
                      "don't","didn't","hadn't","mustn't",'on','your','yet','why','whose','we']


stopwords = stopwords.words('english') + custom_stopwords
stopwords[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [14]:
# Removing stopwords from details and description columns
df['details'] = df["details"].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stopwords)]))
df['description'] = df["description"].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stopwords)]))
df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,product_id,brand,product_full_name,description,brand_category,details,category,labels
15,01E5ZXP5H0BTEZT9QD2HRZJ47A,A.L.C.,Lennox High Waist Cotton & Linen Pants,High-rise trousers tailored cool Italian cotto...,Unknown,"True size. High rise. 31"" inseam; 14"" leg open...","{occasion, fit, style}","{classic, work, semi-fitted, modern, business ..."
33,01DSECZPAGJJC1EDC79JRBF4WK,Banana Republic,Mock-Neck Sweater Top,"Designed worn high-waisted bottoms, oh-so-now ...",Unknown,"Designed worn high-waisted bottoms, oh-so-now ...","{color, occasion, style, fit}","{classic, day to night, blacks, whites, work, ..."


### 2. Replacing numbers using regex

In [15]:
df['details'] = df['details'].apply(lambda x: re.sub(r'$\d+\W+|\b\d+\b|\W+\d+$','Number', x, flags=re.IGNORECASE))
df['description'] = df['description'].apply(lambda x: re.sub(r'$\d+\W+|\b\d+\b|\W+\d+$','Number', x, flags=re.IGNORECASE))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### 3. Punctuation Removal ( Using Regex)

A lot of punctuations can make it difficult during word token creation and lemmatization process. Thus, we remove common punctuations like !,* etc. as they do not add much value. However, hyphen and hash can lead to some interesting word combinations and special word meanings so we retain these 2 punctuations

In [16]:
# Removing punctuations except for hyphens and hashtag
import string
remove = string.punctuation
remove = remove.replace("-", "") # don't remove hyphens
pattern = r'[{}]'.format(remove) # create the pattern
df['details'] = df['details'].apply(lambda x: re.sub(pattern,'',x))
df['description'] = df['description'].apply(lambda x: re.sub(pattern,'',x))
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,product_id,brand,product_full_name,description,brand_category,details,category,labels
15,01E5ZXP5H0BTEZT9QD2HRZJ47A,A.L.C.,Lennox High Waist Cotton & Linen Pants,High-rise trousers tailored cool Italian cotto...,Unknown,True size High rise Number inseam Number leg o...,"{occasion, fit, style}","{classic, work, semi-fitted, modern, business ..."
33,01DSECZPAGJJC1EDC79JRBF4WK,Banana Republic,Mock-Neck Sweater Top,Designed worn high-waisted bottoms oh-so-now m...,Unknown,Designed worn high-waisted bottoms oh-so-now m...,"{color, occasion, style, fit}","{classic, day to night, blacks, whites, work, ..."
43,01E607BHRQAJDZ76MJFN7RPRK1,Simon Miller,Rost Belted Shorts,Cinched natural waist pleated fullness long wo...,Unknown,True size XSNumber-Number SNumber-Number MNumb...,"{occasion, fit, style}","{oversized, casual, androgynous, modern, weeke..."
44,01E5ZXJ6G03R7177X723CT04W0,A.L.C.,Minelli Silk Sleeveless Top,Painterly brushes color convey flutter butterf...,Unknown,True size Number NumberNumber length size Medi...,"{occasion, fit, style}","{day to night, casual, modern, boho, relaxed, ..."
70,01E6074PQA697JZ1SBM6NM8TBG,Simon Miller,Nepa Mismatched Button Rib Cardigan,West Coast–based label channels beachy vibes c...,Unknown,True size XSNumber-Number SNumber-Number MNumb...,"{occasion, fit, style}","{fitted / tailored, day to night, casual, mode..."


In [17]:
# Replacing , and \ by " " for brand category

df['brand_category'] = df['brand_category'].apply(lambda x: re.sub(r'(\,|\/)',' ',x))
df = df.reset_index(drop = True)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,product_id,brand,product_full_name,description,brand_category,details,category,labels
0,01E5ZXP5H0BTEZT9QD2HRZJ47A,A.L.C.,Lennox High Waist Cotton & Linen Pants,High-rise trousers tailored cool Italian cotto...,Unknown,True size High rise Number inseam Number leg o...,"{occasion, fit, style}","{classic, work, semi-fitted, modern, business ..."
1,01DSECZPAGJJC1EDC79JRBF4WK,Banana Republic,Mock-Neck Sweater Top,Designed worn high-waisted bottoms oh-so-now m...,Unknown,Designed worn high-waisted bottoms oh-so-now m...,"{color, occasion, style, fit}","{classic, day to night, blacks, whites, work, ..."
2,01E607BHRQAJDZ76MJFN7RPRK1,Simon Miller,Rost Belted Shorts,Cinched natural waist pleated fullness long wo...,Unknown,True size XSNumber-Number SNumber-Number MNumb...,"{occasion, fit, style}","{oversized, casual, androgynous, modern, weeke..."
3,01E5ZXJ6G03R7177X723CT04W0,A.L.C.,Minelli Silk Sleeveless Top,Painterly brushes color convey flutter butterf...,Unknown,True size Number NumberNumber length size Medi...,"{occasion, fit, style}","{day to night, casual, modern, boho, relaxed, ..."
4,01E6074PQA697JZ1SBM6NM8TBG,Simon Miller,Nepa Mismatched Button Rib Cardigan,West Coast–based label channels beachy vibes c...,Unknown,True size XSNumber-Number SNumber-Number MNumb...,"{occasion, fit, style}","{fitted / tailored, day to night, casual, mode..."


### 4. Tokenization and Lemmatization 

Creating word tokens on which we run lemmatization to bring different words to their base form. 
The reason for choosing lemmatization here over stemming is that lemmatization using a dictionary based on lemma and hence can 
assign the words correctly to their root forms unlike stemming where words may be cut-down and do not have a real meaning.

In [18]:
# Lemmatizing details and description

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatization_sentences(sentence):
    tokens = sentence.split()
    lemma = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemma)

df['description'] = df['description'].apply(lambda x: lemmatization_sentences(x))
df['details'] = df['details'].apply(lambda x: lemmatization_sentences(x))
df.head(2)

Unnamed: 0,product_id,brand,product_full_name,description,brand_category,details,category,labels
0,01E5ZXP5H0BTEZT9QD2HRZJ47A,A.L.C.,Lennox High Waist Cotton & Linen Pants,High-rise trouser tailored cool Italian cotton...,Unknown,True size High rise Number inseam Number leg o...,"{occasion, fit, style}","{classic, work, semi-fitted, modern, business ..."
1,01DSECZPAGJJC1EDC79JRBF4WK,Banana Republic,Mock-Neck Sweater Top,Designed worn high-waisted bottom oh-so-now mo...,Unknown,Designed worn high-waisted bottom oh-so-now mo...,"{color, occasion, style, fit}","{classic, day to night, blacks, whites, work, ..."


## Create Dataframe for each category

In [19]:
# Creating 4 dataframes
for var in ['fit','occasion','color','style']:
    df['{}'.format(var)] = [var in i for i in df['category']]
    

df_fit = df[df['fit']==True].drop(['category','fit','occasion','color','style'],axis = 1)
df_occasion = df[df['occasion']==True].drop(['category','fit','occasion','color','style'],axis = 1)
df_color = df[df['color']==True].drop(['category','fit','occasion','color','style'],axis = 1)
df_style = df[df['style']==True].drop(['category','fit','occasion','color','style'],axis = 1)


# Creating fit dataframe

df_fit = df_fit.reset_index(drop = True)
df_fit1 = pd.DataFrame(df_fit['labels'].values.tolist()) \
         .rename(columns = lambda x: 'labels{}'.format(x+1)) \
          .fillna('Unknown')
df_fit = df_fit.merge(df_fit1,left_on = df_fit.index,right_on = df_fit1.index,how = 'left')
df_fit = df_fit.drop("key_0",axis = 1)


# Creating occasion dataframe

df_occasion = df_occasion.reset_index(drop = True)
df_occasion1 = pd.DataFrame(df_occasion['labels'].values.tolist()) \
         .rename(columns = lambda x: 'labels{}'.format(x+1)) \
          .fillna('Unknown')
df_occasion = df_occasion.merge(df_occasion1,left_on = df_occasion.index,right_on = df_occasion1.index,how = 'left')
df_occasion = df_occasion.drop("key_0",axis = 1)


# Creating color dataframe

df_color = df_color.reset_index(drop = True)
df_color1 = pd.DataFrame(df_color['labels'].values.tolist()) \
         .rename(columns = lambda x: 'labels{}'.format(x+1)) \
          .fillna('Unknown')
df_color = df_color.merge(df_color1,left_on = df_color.index,right_on = df_color1.index,how = 'left')
df_color = df_color.drop("key_0",axis = 1)

# Creating style dataframe
df_style = df_style.reset_index(drop = True)
df_style1 = pd.DataFrame(df_style['labels'].values.tolist()) \
         .rename(columns = lambda x: 'labels{}'.format(x+1)) \
          .fillna('Unknown')
df_style = df_style.merge(df_style1,left_on = df_style.index,right_on = df_style1.index,how = 'left')
df_style = df_style.drop("key_0",axis = 1)


In [20]:
df_color.head()

Unnamed: 0,product_id,brand,product_full_name,description,brand_category,details,labels,labels1,labels2,labels3,...,labels12,labels13,labels14,labels15,labels16,labels17,labels18,labels19,labels20,labels21
0,01DSECZPAGJJC1EDC79JRBF4WK,Banana Republic,Mock-Neck Sweater Top,Designed worn high-waisted bottom oh-so-now mo...,Unknown,Designed worn high-waisted bottom oh-so-now mo...,"{classic, day to night, blacks, whites, work, ...",classic,day to night,blacks,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
1,01DVA59VHYAPT4PVX32NXW91G5,Tibi,Juan Embossed Mules,Tibis Juan embossed mule made shiny black leat...,women:SHOES:MULES,seen Pre-Fall ‘Number runway Heel measure appr...,"{classic, day to night, blacks, androgynous, w...",classic,day to night,blacks,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
2,01DVA4XY7A0QMMSK3V3SBR52J9,Alexandre Birman,Clarita Bow-Embellished Suede Sandals,Alexandre Birmans Clarita sandal quickly risen...,women:SHOES:SANDALS,Heel height measure approximately 50mm Number ...,"{classic, day to night, casual, neutrals, week...",classic,day to night,casual,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
3,01DVBP9AHVQTZXJSBNJ0N2NYJP,Khaite,Leather ankle boots,Heel measure approximately 50mm Number inch Bl...,Shoes Boots Ankle,Fits true size take normal size Italian sizing,"{classic, day to night, androgynous, blacks, w...",classic,day to night,androgynous,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
4,01DVBR93Y7KANZE3C09YCTVXDF,Lauren Manoogian,Alpaca-blend scarf,Brown alpaca-blend Number alpaca Number polyam...,Accessories Scarves Scarves,item measurement are Length 136cm Width 32cm,"{day to night, oversized, browns, casual, andr...",day to night,oversized,browns,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown


In [21]:
# Finding relevant labels in each dataframe
fit = ['semi-fitted','relaxed','straight / regular','fitted / tailored','oversized']
occasion = ['day to night','work','weekend','night out','vacation','coldweather','workout']
color =['blacks','pinks','whites','reds','greens','blues','silvers','neutrals','beiges','grays','golds','navy','yellows','burgundies','purples',
         'browns','multi','oranges','teal']
style = ['business casual','classic','modern','boho','glam','romantic','casual','androgynous','edgy','retro','athleisure']


# Creating a new column that tells that each document had the relevant label (So it will assign yes and no)
for var in fit:
    df_fit['{}'.format(var)] = functools.reduce(np.logical_or, [df_fit['labels{}'.format(i)].str.contains(var) for i in range(1,22)])
    
for var in occasion:
    df_occasion['{}'.format(var)] = functools.reduce(np.logical_or, [df_occasion['labels{}'.format(i)].str.contains(var) for i in range(1,22)])

for var in color:
    df_color['{}'.format(var)] = functools.reduce(np.logical_or, [df_color['labels{}'.format(i)].str.contains(var) for i in range(1,22)])

for var in style:
    df_style['{}'.format(var)] = functools.reduce(np.logical_or, [df_style['labels{}'.format(i)].str.contains(var) for i in range(1,22)])

#drop original labels columns
df_fit.drop(['labels{}'.format(i) for i in range(1,22)] + ['labels'],axis = 1,inplace = True)
df_occasion.drop(['labels{}'.format(i) for i in range(1,22)] + ['labels'],axis = 1,inplace = True)
df_color.drop(['labels{}'.format(i) for i in range(1,22)] + ['labels'],axis = 1,inplace = True)
df_style.drop(['labels{}'.format(i) for i in range(1,22)] + ['labels'],axis = 1,inplace = True)

In [22]:
# Exporting files
df_fit.to_csv("style.csv")
df_occasion.to_csv("occasion.csv")
df_color.to_csv("color.csv")
df_style.to_csv("fit.csv")