# Import libraries

In [1]:
import glob
import os

import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold

# Define variables

In [2]:
root = "./datasets/hm-images/images/"

# Image counts

In [3]:
total = 0
for base, dirs, files in os.walk(root):
    count = len(files)
    total += count
    print(f"{base}: {count}")
print(f"Total: {total}")

./datasets/hm-images/images/: 0
./datasets/hm-images/images/large: 0
./datasets/hm-images/images/large/environmental: 940
./datasets/hm-images/images/large/descriptiveDetail: 25627
./datasets/hm-images/images/large/descriptiveStillLife: 29274
./datasets/hm-images/images/large/lookbook: 16519
./datasets/hm-images/images/extraSmall: 0
./datasets/hm-images/images/extraSmall/environmental: 940
./datasets/hm-images/images/extraSmall/descriptiveDetail: 25627
./datasets/hm-images/images/extraSmall/descriptiveStillLife: 29275
./datasets/hm-images/images/extraSmall/lookbook: 16508
./datasets/hm-images/images/medium: 0
./datasets/hm-images/images/medium/environmental: 940
./datasets/hm-images/images/medium/descriptiveDetail: 25627
./datasets/hm-images/images/medium/descriptiveStillLife: 29275
./datasets/hm-images/images/medium/lookbook: 16518
./datasets/hm-images/images/extraLarge: 0
./datasets/hm-images/images/extraLarge/environmental: 938
./datasets/hm-images/images/extraLarge/descriptiveDetai

# Create data

In [4]:
num = len(root)

file_list = glob.glob(root + "**/*.jpg", recursive=True)
castors = [int(os.path.basename(path)[:-4]) for path in file_list]
path_list = [path[num:] for path in file_list]
print(len(file_list))

289484


In [5]:
df = pd.DataFrame(data=
    {
        "path": path_list,
        "castor": castors
    }
)
print(df.shape)
df.head()

(289484, 2)


Unnamed: 0,path,castors
0,large/environmental/1108541001.jpg,1108541001
1,large/environmental/1011014001.jpg,1011014001
2,large/environmental/1116552001.jpg,1116552001
3,large/environmental/1025923001.jpg,1025923001
4,large/environmental/1094597001.jpg,1094597001


In [7]:
df.to_csv("castors_path_df.csv", index=False)

# Pim table

In [None]:
#%%bash
#gsutil cp -r gs://hdl_tables/dim/dim_pim ./
#gsutil cp -r gs://hdl-tables/dma/product_article_datamart ./

In [2]:
padma = pd.read_parquet("product_article_datamart", columns=["product_code", "article_code", "castor"])
print(padma.shape)
padma.head()

(3695062, 3)


Unnamed: 0,product_code,article_code,castor
0,686043,131,686043131
1,686043,132,686043132
2,686043,133,686043133
3,686043,134,686043134
4,686043,135,686043135


In [3]:
padma = padma.drop_duplicates()
padma.shape

(1989227, 3)

In [4]:
padma.isna().any()

product_code    False
article_code    False
castor          False
dtype: bool

In [5]:
padma.dtypes

product_code    object
article_code    object
castor          object
dtype: object

In [6]:
padma.castor = padma.castor.astype(int)

In [7]:
padma.dtypes

product_code    object
article_code    object
castor           int64
dtype: object

In [8]:
pim = pd.read_parquet("dim_pim")
print(pim.shape)
pim.head()

(274925, 109)


Unnamed: 0,product_number,product_article_code,product_code,article_code,product_name_short,product_name_long,product_short_description,product_long_description,product_title_name,product_collection,...,article_fragrance_family,article_coverage,article_skin_tone,article_finish,article_key_ingredients,article_spf,article_hair_type,article_external_colour_id,article_sustainability_details,hdl_load_id
0,109599,109599007,109599,7,Linen fitted sheet,Linen fitted sheet,,Fitted sheet in washed linen. Tumble-drying wi...,,,...,,,,,,,,,,220426035422042603
1,109599,109599006,109599,6,Linen fitted sheet,Linen fitted sheet,,Fitted sheet in washed linen. Tumble-drying wi...,,,...,,,,,,,,,,220426035422042603
2,109599,109599001,109599,1,Linen fitted sheet,Linen fitted sheet,,Fitted sheet in washed linen. Tumble-drying wi...,,,...,,,,,,,,,,220426035422042603
3,118152,118152005,118152,5,Valance,Cotton valance,,Valance in sturdy cotton canvas with a cotton/...,,,...,,,,,,,,,,220426035422042603
4,118152,118152001,118152,1,Valance,Cotton valance,,Valance in sturdy cotton canvas with a cotton/...,,,...,,,,,,,,,,220426035422042603


In [9]:
pim["product_fit"].value_counts()

fitted                            8509
regularfit                        5324
slimfit                           4541
oversized                         4170
skinnyfit                         2824
relaxedfit                        2471
loosefit                          1973
superskinnyfit                     980
musclefit                          334
["regularfit","skinnyfit"]           6
["loosefit","oversized"]             4
["relaxedfit","skinnyfit"]           2
["loosefit","slimfit"]               2
["fitted","oversized"]               2
["skinnyfit","superskinnyfit"]       1
["loosefit","regularfit"]            1
["regularfit","relaxedfit"]          1
Name: product_fit, dtype: int64

In [10]:
pim = pim[["product_code", "article_code", "product_fit"]].drop_duplicates()
pim.shape

(274924, 3)

In [11]:
pim.isna().any()

product_code    False
article_code     True
product_fit      True
dtype: bool

In [12]:
pim = pim.dropna(axis=0, subset=["article_code", "product_fit"])
print(pim.shape)
pim.head()

(31099, 3)


Unnamed: 0,product_code,article_code,product_fit
93,225618,19,fitted
94,225618,17,fitted
122,238230,1,fitted
130,251087,1,slimfit
131,251087,2,slimfit


In [13]:
pim.isna().any()

product_code    False
article_code    False
product_fit     False
dtype: bool

In [14]:
data = pim.merge(padma, on=["product_code", "article_code"], how="left")
print(data.shape)
data.head()

(31099, 4)


Unnamed: 0,product_code,article_code,product_fit,castor
0,225618,19,fitted,225618019
1,225618,17,fitted,225618017
2,238230,1,fitted,238230001
3,251087,1,slimfit,251087001
4,251087,2,slimfit,251087002


In [15]:
data.dtypes

product_code    object
article_code    object
product_fit     object
castor           int64
dtype: object

In [16]:
data = data[~data["product_fit"].str.contains("[", regex=False)]
print(data.shape)
data.head()

(31080, 4)


Unnamed: 0,product_code,article_code,product_fit,castor
0,225618,19,fitted,225618019
1,225618,17,fitted,225618017
2,238230,1,fitted,238230001
3,251087,1,slimfit,251087001
4,251087,2,slimfit,251087002


In [17]:
data.isna().any()

product_code    False
article_code    False
product_fit     False
castor          False
dtype: bool

In [18]:
data = data.drop(axis=1, labels=["product_code", "article_code"])
data.shape

(31080, 2)

In [19]:
castors = pd.read_csv("castors.csv")
print(castors.shape)
castors.head()

(289484, 2)


Unnamed: 0,path,castor
0,large/environmental/1108541001.jpg,1108541001
1,large/environmental/1011014001.jpg,1011014001
2,large/environmental/1116552001.jpg,1116552001
3,large/environmental/1025923001.jpg,1025923001
4,large/environmental/1094597001.jpg,1094597001


In [20]:
castors.dtypes

path      object
castor     int64
dtype: object

In [21]:
out = castors.merge(data, on="castor", how="inner")
print(out.shape)
out.head()

(71315, 3)


Unnamed: 0,path,castor,product_fit
0,large/environmental/1025399011.jpg,1025399011,regularfit
1,large/descriptiveDetail/1025399011.jpg,1025399011,regularfit
2,large/descriptiveStillLife/1025399011.jpg,1025399011,regularfit
3,large/lookbook/1025399011.jpg,1025399011,regularfit
4,extraSmall/environmental/1025399011.jpg,1025399011,regularfit


In [22]:
out.isna().any()

path           False
castor         False
product_fit    False
dtype: bool

In [23]:
out.castor.unique().shape

(6223,)

In [24]:
out.dtypes

path           object
castor          int64
product_fit    object
dtype: object

In [25]:
out["product_fit"].value_counts()

oversized         14580
regularfit        13766
relaxedfit        13445
fitted            13158
loosefit           9281
slimfit            4901
skinnyfit          1672
musclefit           340
superskinnyfit      172
Name: product_fit, dtype: int64

In [26]:
out["labels"] = out["product_fit"].astype("category").cat.codes
print(out.shape)
out.head()

(71315, 4)


Unnamed: 0,path,castor,product_fit,labels
0,large/environmental/1025399011.jpg,1025399011,regularfit,4
1,large/descriptiveDetail/1025399011.jpg,1025399011,regularfit,4
2,large/descriptiveStillLife/1025399011.jpg,1025399011,regularfit,4
3,large/lookbook/1025399011.jpg,1025399011,regularfit,4
4,extraSmall/environmental/1025399011.jpg,1025399011,regularfit,4


In [27]:
out.labels.value_counts()

3    14580
4    13766
5    13445
0    13158
1     9281
7     4901
6     1672
2      340
8      172
Name: labels, dtype: int64

# Split data

In [29]:
cv = StratifiedGroupKFold(n_splits=2)

In [74]:
train_idxs, test_idxs = next(cv.split(out.path, out.labels, out.castor))
print("TRAIN:", out.castor[train_idxs], out.labels[train_idxs])
print(" TEST:", out.castor[test_idxs], out.labels[test_idxs])

TRAIN: 0        1025399011
1        1025399011
2        1025399011
3        1025399011
4        1025399011
            ...    
71302    1003931005
71307    1062013006
71308    1062013006
71309    1062013006
71310    1062013006
Name: castor, Length: 35656, dtype: int64 0        4
1        4
2        4
3        4
4        4
        ..
71302    5
71307    5
71308    5
71309    5
71310    5
Name: labels, Length: 35656, dtype: int8
 TEST: 16       1032693001
17       1032693001
18       1032693001
19       1032693001
20       1032693001
            ...    
71306    1065327002
71311    1067474002
71312    1067474002
71313    1067474003
71314    1067474003
Name: castor, Length: 35659, dtype: int64 16       1
17       1
18       1
19       1
20       1
        ..
71306    4
71311    5
71312    5
71313    5
71314    5
Name: labels, Length: 35659, dtype: int8


In [75]:
len(train_idxs)

35656

In [76]:
len(test_idxs)

35659

# Write data

In [44]:
train_fit = out.iloc[train_idxs, :]
test_fit = out.iloc[test_idxs, :]

In [47]:
out.to_csv("full_fit.csv", index=False)

train_fit.to_csv("./train.csv", index=False)
test_fit.to_csv("./test.csv", index=False)

In [68]:
out_gcp = out[["path", "product_fit"]].copy()
out_gcp["path"] = "gs://hm-images-bucket/images/" + out_gcp["path"]

In [69]:
out_gcp["mode"] = "VALIDATION"
out_gcp.loc[train_idxs, "mode"] = "TRAINING"

In [70]:
out_gcp = out_gcp[["mode", "path", "product_fit"]]
out_gcp

Unnamed: 0,mode,path,product_fit
0,VALIDATION,gs://hm-images-bucket/images/large/environment...,regularfit
1,VALIDATION,gs://hm-images-bucket/images/large/descriptive...,regularfit
2,VALIDATION,gs://hm-images-bucket/images/large/descriptive...,regularfit
3,VALIDATION,gs://hm-images-bucket/images/large/lookbook/10...,regularfit
4,VALIDATION,gs://hm-images-bucket/images/extraSmall/enviro...,regularfit
...,...,...,...
71310,VALIDATION,gs://hm-images-bucket/images/extraLarge/lookbo...,relaxedfit
71311,TRAINING,gs://hm-images-bucket/images/extraLarge/descri...,relaxedfit
71312,TRAINING,gs://hm-images-bucket/images/extraLarge/descri...,relaxedfit
71313,TRAINING,gs://hm-images-bucket/images/extraLarge/descri...,relaxedfit


In [71]:
out_gcp.shape

(71315, 3)

In [72]:
out_gcp.to_csv("full_fit_gcai.csv", index=False, header=False)

In [73]:
pd.read_csv("full_fit_gcai.csv")

Unnamed: 0,VALIDATION,gs://hm-images-bucket/images/large/environmental/1025399011.jpg,regularfit
0,VALIDATION,gs://hm-images-bucket/images/large/descriptive...,regularfit
1,VALIDATION,gs://hm-images-bucket/images/large/descriptive...,regularfit
2,VALIDATION,gs://hm-images-bucket/images/large/lookbook/10...,regularfit
3,VALIDATION,gs://hm-images-bucket/images/extraSmall/enviro...,regularfit
4,VALIDATION,gs://hm-images-bucket/images/extraSmall/descri...,regularfit
...,...,...,...
71309,VALIDATION,gs://hm-images-bucket/images/extraLarge/lookbo...,relaxedfit
71310,TRAINING,gs://hm-images-bucket/images/extraLarge/descri...,relaxedfit
71311,TRAINING,gs://hm-images-bucket/images/extraLarge/descri...,relaxedfit
71312,TRAINING,gs://hm-images-bucket/images/extraLarge/descri...,relaxedfit
