# Import libraries

In [1]:
import os

import gcsfs
import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold

In [2]:
# Filesystem handle for Google Cloud Storage, bound to the GCP project below;
# the listing cells further down call `gfs.walk` and `gfs.glob` on it.
gfs = gcsfs.GCSFileSystem(project="smle-attribution-d237")

# Define variables

In [3]:
# Bucket root URL (note the trailing slash).
root = "gs://hm-images-bucket/"
# Sub-folder of `root` that holds the image files.
img_path = "images"
# Sub-folder of `root` targeted by the (currently commented-out) CSV exports.
annotation_path = "annotations"

# Image counts

In [4]:
# Tally the files found directly inside every directory under the image folder,
# print one line per directory, then the grand total.
counts_by_dir = {
    base: len(files)
    for base, _, files in gfs.walk(os.path.join(root, img_path))
}
for base, count in counts_by_dir.items():
    print(f"{base}: {count}")
total = sum(counts_by_dir.values())
print(f"Total: {total}")

hm-images-bucket/images: 0
hm-images-bucket/images/extraLarge: 0
hm-images-bucket/images/extraLarge/descriptiveDetail: 25633
hm-images-bucket/images/extraLarge/descriptiveStillLife: 29279
hm-images-bucket/images/extraLarge/environmental: 938
hm-images-bucket/images/extraLarge/lookbook: 16564
hm-images-bucket/images/extraSmall: 0
hm-images-bucket/images/extraSmall/descriptiveDetail: 25627
hm-images-bucket/images/extraSmall/descriptiveStillLife: 29275
hm-images-bucket/images/extraSmall/environmental: 940
hm-images-bucket/images/extraSmall/lookbook: 16508
hm-images-bucket/images/large: 0
hm-images-bucket/images/large/descriptiveDetail: 25627
hm-images-bucket/images/large/descriptiveStillLife: 29274
hm-images-bucket/images/large/environmental: 940
hm-images-bucket/images/large/lookbook: 16519
hm-images-bucket/images/medium: 0
hm-images-bucket/images/medium/descriptiveDetail: 25627
hm-images-bucket/images/medium/descriptiveStillLife: 29275
hm-images-bucket/images/medium/environmental: 940
h

# Create data

In [5]:
# gcsfs reports object paths without the "gs://" scheme (see the walk output
# above), so drop the scheme before measuring the prefix that has to be cut off
# to obtain paths relative to the image folder.
image_dir = os.path.join(root, img_path)
image_prefix = image_dir.split("://", 1)[-1].rstrip("/") + "/"
num = len(image_prefix)

file_list = gfs.glob(image_dir + "/**/*.jpg", recursive=True)
# Fail loudly if the listing does not have the expected layout; slicing by
# length alone would silently produce wrong relative paths.
assert all(path.startswith(image_prefix) for path in file_list)

# The file stem is the numeric castor id (e.g. ".../1000007001.jpg").
castor_list = [
    int(os.path.splitext(os.path.basename(path))[0]) for path in file_list
]
path_list = [path[num:] for path in file_list]
print(len(file_list))

289484


In [6]:
# One row per image file: its path relative to the image folder and the
# numeric id parsed from the file name.
df = pd.DataFrame({"path": path_list, "castor": castor_list})
print(df.shape)
df.head()

(289484, 2)


Unnamed: 0,path,castor
0,extraLarge/descriptiveDetail/1000007001.jpg,1000007001
1,extraLarge/descriptiveDetail/1000035001.jpg,1000035001
2,extraLarge/descriptiveDetail/1000037001.jpg,1000037001
3,extraLarge/descriptiveDetail/1000037002.jpg,1000037002
4,extraLarge/descriptiveDetail/1000037003.jpg,1000037003


In [7]:
# df.to_csv(os.path.join(root, annotation_path, "castors.csv"), index=False)

# Padma table

In [8]:
# Load only the key columns needed to map (product_code, article_code) to castor.
padma_columns = ["product_code", "article_code", "castor"]
padma = pd.read_parquet(
    "gs://hdl-tables/dma/product_article_datamart",
    columns=padma_columns,
)
print(padma.shape)
padma.head()

(3695062, 3)


Unnamed: 0,product_code,article_code,castor
0,686043,131,686043131
1,686043,132,686043132
2,686043,133,686043133
3,686043,134,686043134
4,686043,135,686043135


In [9]:
padma.isna().any()

product_code    False
article_code    False
castor          False
dtype: bool

In [10]:
# The three-column projection repeats many rows (3,695,062 -> 1,989,227 in the
# recorded run), so collapse exact duplicates before joining.
padma = padma.drop_duplicates()
padma.shape

(1989227, 3)

In [11]:
padma.isna().any()

product_code    False
article_code    False
castor          False
dtype: bool

In [12]:
padma.dtypes

product_code    object
article_code    object
castor          object
dtype: object

In [13]:
# `castor` is loaded as object dtype; convert it to integers so it can be
# joined against the integer ids parsed from the image file names.
padma["castor"] = padma["castor"].astype(int)
padma.dtypes

product_code    object
article_code    object
castor           int64
dtype: object

# PIM table

In [14]:
# Load the article keys together with the `product_fit` attribute used as label.
pim_columns = ["product_code", "article_code", "product_fit"]
pim = pd.read_parquet("gs://hdl-tables/dim/dim_pim", columns=pim_columns)
print(pim.shape)
pim.head()

(274925, 3)


Unnamed: 0,product_code,article_code,product_fit
0,109599,7,
1,109599,6,
2,109599,1,
3,118152,5,
4,118152,1,


In [15]:
pim.isna().any()

product_code    False
article_code     True
product_fit      True
dtype: bool

In [16]:
pim.isna().sum()

product_code         0
article_code       776
product_fit     243780
dtype: int64

In [17]:
# Keep only rows that have both an article code and a fit value.
required_columns = ["article_code", "product_fit"]
pim = pim.dropna(subset=required_columns)
print(pim.shape)
pim.head()

(31100, 3)


Unnamed: 0,product_code,article_code,product_fit
93,225618,19,fitted
94,225618,17,fitted
122,238230,1,fitted
130,251087,1,slimfit
131,251087,2,slimfit


In [18]:
pim["product_fit"].value_counts()

fitted                            8499
regularfit                        5320
slimfit                           4536
oversized                         4154
skinnyfit                         2819
relaxedfit                        2470
loosefit                          1971
superskinnyfit                     978
musclefit                          334
["regularfit","skinnyfit"]           6
["loosefit","oversized"]             4
["relaxedfit","skinnyfit"]           2
["loosefit","slimfit"]               2
["fitted","oversized"]               2
["skinnyfit","superskinnyfit"]       1
["loosefit","regularfit"]            1
["regularfit","relaxedfit"]          1
Name: product_fit, dtype: int64

In [19]:
# Remove exact duplicate rows (31,100 -> 31,099 in the recorded run).
pim = pim.drop_duplicates()
pim.shape

(31099, 3)

In [20]:
pim.isna().any()

product_code    False
article_code    False
product_fit     False
dtype: bool

# Merge pim, padma

In [21]:
# Attach the castor id to every labelled article; a left join keeps all PIM rows.
join_keys = ["product_code", "article_code"]
data = pd.merge(pim, padma, how="left", on=join_keys)
print(data.shape)
data.head()

(31099, 4)


Unnamed: 0,product_code,article_code,product_fit,castor
0,225618,19,fitted,225618019
1,225618,17,fitted,225618017
2,238230,1,fitted,238230001
3,251087,1,slimfit,251087001
4,251087,2,slimfit,251087002


In [22]:
data.dtypes

product_code    object
article_code    object
product_fit     object
castor           int64
dtype: object

In [23]:
# Values such as ["loosefit","oversized"] carry more than one fit; they are
# recognisable by the opening bracket. Keep only the single-fit rows.
has_multiple_fits = data["product_fit"].str.contains("[", regex=False)
data = data.loc[~has_multiple_fits]
print(data.shape)
data.head()

(31080, 4)


Unnamed: 0,product_code,article_code,product_fit,castor
0,225618,19,fitted,225618019
1,225618,17,fitted,225618017
2,238230,1,fitted,238230001
3,251087,1,slimfit,251087001
4,251087,2,slimfit,251087002


In [24]:
data.isna().any()

product_code    False
article_code    False
product_fit     False
castor          False
dtype: bool

In [25]:
# The join keys are no longer needed once `castor` is attached.
data = data.drop(columns=["product_code", "article_code"])
data.shape

(31080, 2)

# Merge with castors

In [26]:
df.dtypes

path      object
castor     int64
dtype: object

In [27]:
# Keep only images whose castor has a fit label (inner join on the id).
out = pd.merge(df, data, how="inner", on="castor")
print(out.shape)
out.head()

(71315, 3)


Unnamed: 0,path,castor,product_fit
0,extraLarge/descriptiveDetail/1000037001.jpg,1000037001,fitted
1,extraLarge/descriptiveStillLife/1000037001.jpg,1000037001,fitted
2,extraLarge/lookbook/1000037001.jpg,1000037001,fitted
3,extraSmall/descriptiveDetail/1000037001.jpg,1000037001,fitted
4,extraSmall/descriptiveStillLife/1000037001.jpg,1000037001,fitted


In [28]:
out.isna().any()

path           False
castor         False
product_fit    False
dtype: bool

In [29]:
out.dtypes

path           object
castor          int64
product_fit    object
dtype: object

In [30]:
out.castor.unique().shape

(6223,)

In [31]:
out["product_fit"].value_counts()

oversized         14580
regularfit        13766
relaxedfit        13445
fitted            13158
loosefit           9281
slimfit            4901
skinnyfit          1672
musclefit           340
superskinnyfit      172
Name: product_fit, dtype: int64

In [32]:
# Encode each fit name as an integer code; categories are ordered
# alphabetically, so "fitted" becomes 0 (see the displayed rows).
out["labels"] = pd.Categorical(out["product_fit"]).codes
print(out.shape)
out.head()

(71315, 4)


Unnamed: 0,path,castor,product_fit,labels
0,extraLarge/descriptiveDetail/1000037001.jpg,1000037001,fitted,0
1,extraLarge/descriptiveStillLife/1000037001.jpg,1000037001,fitted,0
2,extraLarge/lookbook/1000037001.jpg,1000037001,fitted,0
3,extraSmall/descriptiveDetail/1000037001.jpg,1000037001,fitted,0
4,extraSmall/descriptiveStillLife/1000037001.jpg,1000037001,fitted,0


In [33]:
out.labels.value_counts()

3    14580
4    13766
5    13445
0    13158
1     9281
7     4901
6     1672
2      340
8      172
Name: labels, dtype: int64

# Split data 1: two-fold StratifiedGroupKFold (grouped by castor)

In [34]:
# Two folds give a roughly 50/50 split (35,656 vs 35,659 rows in the recorded
# run). The castor ids are passed as groups at split time.
cv = StratifiedGroupKFold(n_splits=2)

In [35]:
# Take the first of the two folds. `split` yields *positional* row numbers, so
# select with `.iloc` instead of relying on `out` having a default RangeIndex
# (plain `Series[...]` with integers is a label lookup).
train_idxs, test_idxs = next(
    cv.split(X=out["path"], y=out["labels"], groups=out["castor"])
)
print("TRAIN:", out["castor"].iloc[train_idxs], out["labels"].iloc[train_idxs])
print(" TEST:", out["castor"].iloc[test_idxs], out["labels"].iloc[test_idxs])

TRAIN: 0        1000037001
1        1000037001
2        1000037001
3        1000037001
4        1000037001
            ...    
71302    1039409002
71307    1062013006
71308    1062013006
71309    1062013006
71310    1062013006
Name: castor, Length: 35656, dtype: int64 0        0
1        0
2        0
3        0
4        0
        ..
71302    3
71307    5
71308    5
71309    5
71310    5
Name: labels, Length: 35656, dtype: int8
 TEST: 12       1000037002
13       1000037002
14       1000037002
15       1000037002
16       1000037002
            ...    
71306    1059115002
71311    1065327002
71312    1065327002
71313    1065327002
71314    1065327002
Name: castor, Length: 35659, dtype: int64 12       0
13       0
14       0
15       0
16       0
        ..
71306    0
71311    4
71312    4
71313    4
71314    4
Name: labels, Length: 35659, dtype: int8


In [36]:
len(train_idxs)

35656

In [37]:
len(test_idxs)

35659

# Split data 2: 80/20 sample of castors within each product_fit

In [39]:
# One row per (fit, castor) pair, so the sampling that follows operates on
# castors rather than on individual image files.
tmp = out.loc[:, ["product_fit", "castor"]].drop_duplicates()
print(tmp.shape)
tmp.head()

(6223, 2)

In [54]:
# Draw 80% of the castors within each fit class for training. The fixed seed
# makes the draw repeatable, so re-running the notebook reproduces the same
# train/test assignment instead of a new random one each time.
sub_train = (
    tmp.groupby("product_fit")
    .sample(frac=0.8, random_state=42)
    .assign(is_train=True)
)
print(sub_train.shape)
sub_train.head()

(4977, 3)


Unnamed: 0,product_fit,castor,is_train
5608,fitted,1006409003,True
61568,fitted,1066900001,True
60528,fitted,1065197004,True
68110,fitted,1081266001,True
8328,fitted,1009762004,True


In [55]:
# Propagate the per-castor training flag to every image row; castors that were
# not sampled get a missing value here.
train_flags = sub_train.loc[:, ["castor", "is_train"]]
final = pd.merge(out, train_flags, how="left", on="castor")
print(final.shape)
final.head()

(71315, 5)


Unnamed: 0,path,castor,product_fit,labels,is_train
0,extraLarge/descriptiveDetail/1000037001.jpg,1000037001,fitted,0,True
1,extraLarge/descriptiveStillLife/1000037001.jpg,1000037001,fitted,0,True
2,extraLarge/lookbook/1000037001.jpg,1000037001,fitted,0,True
3,extraSmall/descriptiveDetail/1000037001.jpg,1000037001,fitted,0,True
4,extraSmall/descriptiveStillLife/1000037001.jpg,1000037001,fitted,0,True


In [56]:
final.isna().any()

path           False
castor         False
product_fit    False
labels         False
is_train        True
dtype: bool

In [57]:
# Castors that were not sampled for training have a missing flag after the left
# join; mark them as non-training. Only `is_train` is touched so that an
# unexpected NaN in any other column is not silently turned into False, and the
# column is cast to a real boolean dtype for the masks used below.
final["is_train"] = final["is_train"].fillna(False).astype(bool)

In [58]:
final.isna().any()

path           False
castor         False
product_fit    False
labels         False
is_train       False
dtype: bool

In [59]:
final.dtypes

path           object
castor          int64
product_fit    object
labels           int8
is_train         bool
dtype: object

In [60]:
final.is_train.sum()

57041

In [61]:
(~final.is_train).sum()

14274

In [78]:
# Training rows, without the helper flag column.
kept_columns = ["path", "castor", "product_fit", "labels"]
train = final.loc[final["is_train"], kept_columns]
train.shape

(57041, 4)

In [79]:
# Held-out rows, without the helper flag column.
kept_columns = ["path", "castor", "product_fit", "labels"]
test = final.loc[~final["is_train"], kept_columns]
test.shape

(14274, 4)

In [69]:
train.product_fit.value_counts()

oversized         11660
regularfit        11035
relaxedfit        10766
fitted            10542
loosefit           7385
slimfit            3909
skinnyfit          1340
musclefit           268
superskinnyfit      136
Name: product_fit, dtype: int64

In [70]:
test.product_fit.value_counts()

oversized         2920
regularfit        2731
relaxedfit        2679
fitted            2616
loosefit          1896
slimfit            992
skinnyfit          332
musclefit           72
superskinnyfit      36
Name: product_fit, dtype: int64

In [76]:
# No castor may appear on both sides of the split.
assert set(train.castor).isdisjoint(set(test.castor))

# Write data

In [43]:
# Materialise the two halves by row position.
# NOTE(review): `train_idxs` / `test_idxs` come from the two-fold
# StratifiedGroupKFold split, not from the 80/20 `is_train` split built in the
# section above -- confirm which split is meant to be exported.
train_fit = out.iloc[train_idxs]
test_fit = out.iloc[test_idxs]

In [51]:
print(train_fit.shape)
print(test_fit.shape)

(35656, 4)
(35659, 4)


In [77]:
# out.to_csv(os.path.join(root, annotation_path, "full_fit.csv"), index=False)

# train_fit.to_csv(os.path.join(root, annotation_path, "train.csv"), index=False)
# test_fit.to_csv(os.path.join(root, annotation_path, "test.csv"), index=False)

In [45]:
# Build absolute gs:// URIs from the configured bucket and image folder instead
# of repeating the bucket name as a literal, so a change to `root` / `img_path`
# cannot leave the exported paths pointing at the old location.
out_gcp = out[["path", "product_fit"]].copy()
out_gcp["path"] = os.path.join(root, img_path) + "/" + out_gcp["path"]

In [46]:
# Default every row to VALIDATION, then flag the training rows.
# `train_idxs` holds row *positions* (as returned by the splitter), so assign
# through `.iloc`; `.loc` would treat them as index labels and only works while
# the frame happens to carry a default 0..n-1 index.
out_gcp["mode"] = "VALIDATION"
out_gcp.iloc[train_idxs, out_gcp.columns.get_loc("mode")] = "TRAINING"

In [47]:
# Put the columns in export order: split mode, image URI, label.
export_columns = ["mode", "path", "product_fit"]
out_gcp = out_gcp.loc[:, export_columns]
out_gcp

Unnamed: 0,mode,path,product_fit
0,TRAINING,gs://hm-images-bucket/images/extraLarge/descri...,fitted
1,TRAINING,gs://hm-images-bucket/images/extraLarge/descri...,fitted
2,TRAINING,gs://hm-images-bucket/images/extraLarge/lookbo...,fitted
3,TRAINING,gs://hm-images-bucket/images/extraSmall/descri...,fitted
4,TRAINING,gs://hm-images-bucket/images/extraSmall/descri...,fitted
...,...,...,...
71310,TRAINING,gs://hm-images-bucket/images/medium/lookbook/1...,relaxedfit
71311,VALIDATION,gs://hm-images-bucket/images/extraLarge/lookbo...,regularfit
71312,VALIDATION,gs://hm-images-bucket/images/extraSmall/lookbo...,regularfit
71313,VALIDATION,gs://hm-images-bucket/images/large/lookbook/10...,regularfit


In [48]:
out_gcp.shape

(71315, 3)

In [86]:
# out_gcp.to_csv(os.path.join(root, annotation_path, "full_fit_gcai.csv"), index=False, header=False)