# Import libraries

In [None]:
import os

import gcsfs
import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold

In [None]:
# Filesystem handle used by the cells below to walk and glob the image bucket.
gfs = gcsfs.GCSFileSystem(project="smle-attribution-d237")

# Define variables

In [None]:
# Bucket root URI; the sub-folders below are joined onto it.
root = "gs://hm-images-bucket/"
# Folder under root that is walked/globbed for image files.
img_path = "images"
# Folder under root targeted by the (currently commented-out) CSV exports.
annotation_path = "annotations"

# Image counts

In [None]:
# Report how many files sit directly in each folder below the image root,
# followed by the grand total.
image_root = os.path.join(root, img_path)
total = 0
for base, _dirs, files in gfs.walk(image_root):
    count = len(files)
    print(f"{base}: {count}")
    total = total + count
print(f"Total: {total}")

# Create data

In [None]:
# Image folder URI. Built with a plain "/" because GCS URIs are not OS paths.
image_root = f"{root.rstrip('/')}/{img_path}"

# Prefix to strip from every globbed path so that only the part below the
# image folder remains. The glob results are expected to come back without the
# URI scheme ("bucket/images/..."), so the scheme is removed from the prefix
# as well -- NOTE(review): verify against the printed df.head() if gcsfs
# changes its path format.
prefix = image_root.split("://", 1)[-1].rstrip("/") + "/"
num = len(prefix)

# Every .jpg at any depth below the image folder.
file_list = gfs.glob(image_root + "/**/*.jpg", recursive=True)
# The file stem (name without extension) is parsed as the integer castor id.
castor_list = [
    int(os.path.splitext(os.path.basename(path))[0]) for path in file_list
]
# Path relative to the image folder.
path_list = [path[num:] for path in file_list]
print(len(file_list))

In [None]:
# One row per image: its relative path and the castor id taken from its name.
df = pd.DataFrame({"path": path_list, "castor": castor_list})
print(df.shape)
df.head()

In [None]:
# df.to_csv(os.path.join(root, annotation_path, "castors.csv"), index=False)

# Padma table

In [None]:
# Load only the columns needed to map (product_code, article_code) to a castor.
padma = pd.read_parquet("gs://hdl-tables/dma/product_article_datamart",
                        columns=["product_code", "article_code", "castor"])
print(padma.shape)
padma.head()

In [None]:
# Which columns contain missing values before de-duplication?
padma.isna().any()

In [None]:
# Keep one row per distinct (product_code, article_code, castor) combination.
padma = padma.drop_duplicates()
padma.shape

In [None]:
# Re-check for missing values after de-duplication.
padma.isna().any()

In [None]:
padma.dtypes

In [None]:
# Cast castor to int so it can be joined with the integer castor ids of the
# image table.
# NOTE(review): astype(int) fails on missing values -- confirm the isna()
# check above reports False for castor.
padma.castor = padma.castor.astype(int)
padma.dtypes

# PIM table

In [None]:
# Load the product_fit attribute per (product_code, article_code).
pim = pd.read_parquet("gs://hdl-tables/dim/dim_pim",
                      columns=["product_code", "article_code", "product_fit"])
print(pim.shape)
pim.head()

In [None]:
# Which columns contain missing values?
pim.isna().any()

In [None]:
# Missing-value count per column.
pim.isna().sum()

In [None]:
# product_fit is the attribute turned into class labels later on and
# article_code is a join key, so rows lacking either are dropped.
pim = pim.dropna(axis=0, subset=["article_code", "product_fit"])
print(pim.shape)
pim.head()

In [None]:
# Distribution of the raw product_fit values.
pim["product_fit"].value_counts()

In [None]:
# Keep one row per distinct (product_code, article_code, product_fit).
pim = pim.drop_duplicates()
pim.shape

In [None]:
# Re-check for missing values after cleaning.
pim.isna().any()

# Merge pim, padma

In [None]:
data = pim.merge(padma, on=["product_code", "article_code"], how="left")
print(data.shape)
data.head()

In [None]:
data.dtypes

In [None]:
data = data[~data["product_fit"].str.contains("[", regex=False)]
print(data.shape)
data.head()

In [None]:
data.isna().any()

In [None]:
data = data.drop(axis=1, labels=["product_code", "article_code"])
data.shape

# Merge with castors

In [None]:
# Check the castor dtype of the image table before joining on it.
df.dtypes

In [None]:
# Inner join: keep only images whose castor has a product_fit entry.
out = df.merge(data, on="castor", how="inner")
print(out.shape)
out.head()

In [None]:
out.isna().any()

In [None]:
out.dtypes

In [None]:
# Number of distinct castors that survived the join.
out.castor.unique().shape

In [None]:
# Class balance by product_fit.
out["product_fit"].value_counts()

In [None]:
# Encode each distinct product_fit value as an integer class label.
out["labels"] = out["product_fit"].astype("category").cat.codes
print(out.shape)
out.head()

In [None]:
# Class balance by encoded label.
out.labels.value_counts()

# Split data

In [None]:
cv = StratifiedGroupKFold(n_splits=2)

In [None]:
train_idxs, test_idxs = next(cv.split(out.path, out.labels, out.castor))
print("TRAIN:", out.castor[train_idxs], out.labels[train_idxs])
print(" TEST:", out.castor[test_idxs], out.labels[test_idxs])

In [None]:
len(train_idxs)

In [None]:
len(test_idxs)

# Split data 2

In [None]:
tmp = out[["product_fit", "castor"]].drop_duplicates()
print(tmp.shape)
tmp.head()

In [None]:
sub_train = tmp.groupby("product_fit").sample(frac=0.8)
sub_train["is_train"] = True
print(sub_train.shape)
sub_train.head()

In [None]:
final = out.merge(sub_train[["castor", "is_train"]], on="castor", how="left")
print(final.shape)
final.head()

In [None]:
final.isna().any()

In [None]:
final.fillna(False, inplace=True)

In [None]:
final.isna().any()

In [None]:
final.dtypes

In [None]:
final.is_train.sum()

In [None]:
(~final.is_train).sum()

In [None]:
train = final.loc[final.is_train, ["path", "castor", "product_fit", "labels"]]
train.shape

In [None]:
test = final.loc[~final.is_train, ["path", "castor", "product_fit", "labels"]]
test.shape

In [None]:
train.product_fit.value_counts()

In [None]:
test.product_fit.value_counts()

In [None]:
assert not set(train.castor) & set(test.castor)

# Write data

In [None]:
train_fit = out.iloc[train_idxs, :]
test_fit = out.iloc[test_idxs, :]

In [None]:
print(train_fit.shape)
print(test_fit.shape)

In [None]:
# out.to_csv(os.path.join(root, annotation_path, "full_fit.csv"), index=False)

# train_fit.to_csv(os.path.join(root, annotation_path, "train.csv"), index=False)
# test_fit.to_csv(os.path.join(root, annotation_path, "test.csv"), index=False)

In [None]:
out_gcp = out[["path", "product_fit"]].copy()
out_gcp["path"] = "gs://hm-images-bucket/images/" + out_gcp["path"]

In [None]:
out_gcp["mode"] = "VALIDATION"
out_gcp.loc[train_idxs, "mode"] = "TRAINING"

In [None]:
out_gcp = out_gcp[["mode", "path", "product_fit"]]
out_gcp

In [None]:
out_gcp.shape

In [None]:
# out_gcp.to_csv(os.path.join(root, annotation_path, "full_fit_gcai.csv"), index=False, header=False)