# Import libraries

In [None]:
import json
import os

import gcsfs
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit

In [None]:
# File-system handle for Google Cloud Storage, scoped to the project below.
gcp_project = "airesearch-1409"
gfs = gcsfs.GCSFileSystem(project=gcp_project)

# Define variables

In [None]:
# Bucket URL of the dataset. The trailing slash matters: the glob cell
# concatenates onto this string directly.
root = "gs://hm_images/"
# Sub-folder of `root` that is walked and globbed for image files.
img_path = "images"
# Sub-folder of `root` referenced by the (commented-out) CSV export cells.
annotation_path = "annotations"

# Image counts

In [None]:
# Count the files in every directory below the images folder, print one line
# per directory and finish with the grand total.
image_root = os.path.join(root, img_path)
per_directory = [(base, len(files)) for base, _, files in gfs.walk(image_root)]
for base, count in per_directory:
    print(f"{base}: {count}")
total = sum(count for _, count in per_directory)
print(f"Total: {total}")

# Create data

In [None]:
# Directory that holds the images, e.g. "gs://hm_images/images".
image_dir = os.path.join(root, img_path)

# The slicing below assumes gcsfs returns paths without the URL scheme
# ("bucket/key/..."); the previous hard-coded `- 4` offset relied on the same
# thing. `prefix` is everything up to and including the images directory, so
# cutting it off leaves each path relative to that directory.
prefix = image_dir.split("://", 1)[-1].rstrip("/") + "/"
num = len(prefix)  # original name kept; same value as the old offset for gs:// URLs

# Every JPEG below the images directory, at any depth.
file_list = gfs.glob(f"{image_dir}/**/*.jpg", recursive=True)

# The file stem is parsed as the integer castor id; int() raises ValueError
# for any file whose name is not purely numeric.
castor_list = [
    int(os.path.splitext(os.path.basename(path))[0]) for path in file_list
]
path_list = [path[num:] for path in file_list]
print(len(file_list))

In [None]:
# One row per image file: its path and the castor id parsed from its name.
image_columns = {"path": path_list, "castor": castor_list}
df = pd.DataFrame(image_columns)
print(df.shape)
df.head()

In [None]:
# Export of the path/castor table to the annotations folder (currently disabled).
# df.to_csv(os.path.join(root, annotation_path, "castors.csv"), index=False)

# Padma table

In [None]:
# Load only the columns needed to map (product_code, article_code) to castor.
padma_uri = "gs://hdl_tables/dma/product_article_datamart.parquet"
padma_columns = ["product_code", "article_code", "castor"]
padma = pd.read_parquet(padma_uri, columns=padma_columns)
print(padma.shape)
padma.head()

In [None]:
# Per column: True if the column contains at least one missing value.
padma.isna().any()

In [None]:
# Remove fully duplicated rows, then show the remaining (rows, columns).
padma = padma.drop_duplicates()
padma.shape

In [None]:
# Re-check for missing values now that duplicates are gone; the integer cast
# of `castor` further down would fail on NaN.
padma.isna().any()

In [None]:
# Inspect column dtypes before casting `castor`.
padma.dtypes

In [None]:
# Cast castor to int so it has the same type as the ids parsed from the
# image file names.
padma["castor"] = padma["castor"].astype(int)
padma.dtypes

In [None]:
# Preview the cleaned table.
padma.head()

# PIM table

In [None]:
# Columns identifying an article; used as join keys against the Padma table.
key_cols = ["product_code", "article_code"]

# Product attributes that later cells turn into one-hot columns.
attribute_cols = [
    "product_age_group",
    "product_waist_rise",
    "product_sleeve_length",
    "product_garment_length",
    "product_fit",
    "product_sleeve_style",
    "product_neck_line_style",
    "product_collar_style",
]

# Keys first, attributes after: later cells rely on this order through
# cols[:2] and cols[2:].
cols = key_cols + attribute_cols

In [None]:
# Load the PIM table, restricted to the key and attribute columns in `cols`.
pim_uri = "gs://hdl_tables/dim/dim_pim.parquet"
pim = pd.read_parquet(pim_uri, columns=cols)
print(pim.shape)
pim.head()

In [None]:
# Per column: True if the column contains at least one missing value.
pim.isna().any()

In [None]:
# Per column: number of missing values.
pim.isna().sum()

In [None]:
# Rows without an article code cannot be joined on the article key, so
# discard them. Missing values in other columns are kept.
required_columns = ["article_code"]
pim = pim.dropna(subset=required_columns)
print(pim.shape)
pim.head()

In [None]:
# Remove fully duplicated rows, then show the remaining (rows, columns).
pim = pim.drop_duplicates()
pim.shape

In [None]:
def _parse_json_list(value):
    """Decode *value* with json.loads when it is a string containing "[".

    Every other value (None, NaN, numbers, strings without a bracket) is
    returned unchanged. The isinstance guard matters: `"[" in value` raises
    TypeError for truthy non-string values such as float NaN.
    """
    if isinstance(value, str) and "[" in value:
        return json.loads(value)
    return value


# One parsed Series per column in `cols`, in the same order.
out = [pim[c].apply(_parse_json_list) for c in cols]

In [None]:
# Put the parsed Series back side by side as one frame.
tmp = pd.concat(out, axis="columns")
print(tmp.shape)
tmp.head()

In [None]:
def _multi_hot(series):
    """Return one indicator column per distinct value found in *series*.

    List entries are exploded to one row per element, one-hot encoded, and
    then collapsed back to one row per original index label by taking the
    column-wise maximum.
    """
    exploded = series.explode()
    indicators = pd.get_dummies(exploded)
    return indicators.reset_index().groupby("index").max()


# NOTE(review): indicator columns are named after the raw values, with no
# attribute prefix. If two attributes share a value, the concatenated frame
# will contain duplicate column names. Confirm this cannot happen.
out = [_multi_hot(tmp[c]) for c in cols[2:]]

In [None]:
# Key columns on the left, all attribute indicator columns on the right,
# aligned on the row index.
article_keys = pim[cols[:2]]
attribute_indicators = pd.concat(out, axis=1)
res = pd.concat([article_keys, attribute_indicators], axis=1)
print(res.shape)
res.head()

In [None]:
# Number of columns in `res` that contain at least one missing value.
res.isna().any().sum()

# Merge PIM and Padma tables

In [None]:
# Attach the castor id to every PIM article. This is a left join, so articles
# with no Padma match keep their row with a missing castor.
join_keys = ["product_code", "article_code"]
data = pd.merge(res, padma, how="left", on=join_keys)
print(data.shape)
data.head()

In [None]:
# Inspect column dtypes of the merged frame.
data.dtypes

In [None]:
# Which columns contain missing values after the merge, and how many there are.
columns_with_missing = data.isna().any()
print(columns_with_missing.sum())
columns_with_missing

In [None]:
# Discard every row that still has a missing value, then confirm none remain.
data = data.dropna()
print(data.shape)
print(data.isna().any().sum())

In [None]:
# The join keys are not needed past this point; later merging uses `castor`.
data = data.drop(columns=["product_code", "article_code"])
data.shape

# Merge with castors

In [None]:
# Check the dtype of `castor` in the image table before merging on it.
df.dtypes

In [None]:
# Keep only images whose castor id has attribute data (inner join).
out = pd.merge(df, data, how="inner", on="castor")
print(out.shape)
out.head()

In [None]:
# Which columns of the final table contain missing values, and how many there are.
columns_with_missing = out.isna().any()
print(columns_with_missing.sum())
columns_with_missing

In [None]:
# Inspect column dtypes of the final table.
out.dtypes

In [None]:
# Number of distinct castor ids, i.e. the number of groups for the split below.
out.castor.unique().shape

# Split data

In [None]:
# Single group-aware shuffle split with a fixed seed for reproducibility.
train_fraction = 0.9
split_seed = 42
gss = GroupShuffleSplit(
    n_splits=1,
    train_size=train_fraction,
    random_state=split_seed,
)

In [None]:
# Split by castor so all images of one castor land on the same side.
# split() yields positional row indices, so select with .iloc. Plain
# Series[...] is label-based and only matches while `out` has a default
# RangeIndex.
train_idxs, test_idxs = next(gss.split(X=out.path, groups=out.castor))
print("TRAIN:", out.castor.iloc[train_idxs], out.path.iloc[train_idxs])
print(" TEST:", out.castor.iloc[test_idxs], out.path.iloc[test_idxs])

In [None]:
# Number of rows assigned to the training split.
len(train_idxs)

In [None]:
# Number of rows assigned to the test split.
len(test_idxs)

In [None]:
# Sanity check: no castor id may appear in both splits. The indices are
# positional, hence .iloc rather than label-based Series[...].
train_castors = set(out.castor.iloc[train_idxs])
test_castors = set(out.castor.iloc[test_idxs])
assert train_castors.isdisjoint(test_castors)

# Write data

In [None]:
# castor was only needed for grouping; remove it before materialising the
# two splits. Running this cell a second time raises KeyError because the
# column is already gone.
out = out.drop(columns=["castor"])

train = out.iloc[train_idxs]
test = out.iloc[test_idxs]

for split in (train, test):
    print(split.shape)

In [None]:
# Preview the training split.
train.head()

In [None]:
# Preview the test split.
test.head()

In [None]:
# Export of both splits to the annotations folder (currently disabled).
# train.to_csv(os.path.join(root, annotation_path, "train.csv"), index=False)
# test.to_csv(os.path.join(root, annotation_path, "test.csv"), index=False)