1. Import libraries

In [3]:
import pandas as pd
import numpy as np

2. Review the dataset

* Import the data

In [4]:
styles_link = './styles.csv'

styles_df = pd.read_csv(styles_link, names=["id", "gender", "masterCategory", "subCategory", "acticleType", "baseColour", "season", "year", "usage", "productDisplayName"], skiprows=1) # Some data has 11 fields instead of 10

styles_df.head()

Unnamed: 0,id,gender,masterCategory,subCategory,acticleType,baseColour,season,year,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt


In [35]:
styles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44446 entries, 0 to 44445
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  44446 non-null  int64  
 1   gender              44446 non-null  object 
 2   masterCategory      44446 non-null  object 
 3   subCategory         44446 non-null  object 
 4   acticleType         44446 non-null  object 
 5   baseColour          44431 non-null  object 
 6   season              44425 non-null  object 
 7   year                44445 non-null  float64
 8   usage               44129 non-null  object 
 9   productDisplayName  44439 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 3.4+ MB


In [5]:
images_link = './images.csv'

images_df = pd.read_csv(images_link)

images_df.head()

Unnamed: 0,filename,link
0,15970.jpg,http://assets.myntassets.com/v1/images/style/p...
1,39386.jpg,http://assets.myntassets.com/v1/images/style/p...
2,59263.jpg,http://assets.myntassets.com/v1/images/style/p...
3,21379.jpg,http://assets.myntassets.com/v1/images/style/p...
4,53759.jpg,http://assets.myntassets.com/v1/images/style/p...


> Can easily see that the *filename* field of image.csv file is exactly the id with the prefix

* So make the *filename* field into id for further searching mechanism

In [6]:
def nameToId(name):
    if name.endswith('.jpg'):
        return name[:-4]
    
    return int(name)
    
nameToId('123.jpg')

'123'

In [7]:
images_df['id'] = images_df['filename'].apply(nameToId)


# drop the filename field
images_df.drop(['filename'], axis=1, inplace=True)

In [8]:
# change the type of id to int64
images_df = images_df.astype({'id':'int64'})

In [40]:
images_df.head()

Unnamed: 0,link,id
0,http://assets.myntassets.com/v1/images/style/p...,15970
1,http://assets.myntassets.com/v1/images/style/p...,39386
2,http://assets.myntassets.com/v1/images/style/p...,59263
3,http://assets.myntassets.com/v1/images/style/p...,21379
4,http://assets.myntassets.com/v1/images/style/p...,53759


In [41]:
images_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44446 entries, 0 to 44445
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   link    44446 non-null  object
 1   id      44446 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 694.6+ KB


3. Merge the link into the data set of styles

In [9]:
product_df = pd.merge(styles_df, images_df, on='id', how='inner')

product_df

Unnamed: 0,id,gender,masterCategory,subCategory,acticleType,baseColour,season,year,usage,productDisplayName,link
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt,http://assets.myntassets.com/v1/images/style/p...
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans,http://assets.myntassets.com/v1/images/style/p...
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch,http://assets.myntassets.com/v1/images/style/p...
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants,http://assets.myntassets.com/v1/images/style/p...
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt,http://assets.myntassets.com/v1/images/style/p...
...,...,...,...,...,...,...,...,...,...,...,...
44441,17036,Men,Footwear,Shoes,Casual Shoes,White,Summer,2013.0,Casual,Gas Men Caddy Casual Shoe,http://assets.myntassets.com/v1/images/style/p...
44442,6461,Men,Footwear,Flip Flops,Flip Flops,Red,Summer,2011.0,Casual,Lotto Men's Soccer Track Flip Flop,http://assets.myntassets.com/v1/images/style/p...
44443,18842,Men,Apparel,Topwear,Tshirts,Blue,Fall,2011.0,Casual,Puma Men Graphic Stellar Blue Tshirt,http://assets.myntassets.com/v1/images/style/p...
44444,46694,Women,Personal Care,Fragrance,Perfume and Body Mist,Blue,Spring,2017.0,Casual,Rasasi Women Blue Lady Perfume,http://assets.myntassets.com/v1/images/style/p...


4. Save the result as pickle file

In [10]:
product_df.to_pickle('./preprocessed-data/product.pkl')