# Install packages

In [0]:
!pip install openpyxl

# Import packages

In [0]:
import io
import os
import requests

import pandas as pd
from PIL import Image, ImageFile
from tqdm import tqdm

# Tell Pillow to load image files whose data is cut short instead of
# raising an error on them.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Set paths

In [0]:
# Root directory; every other path below is joined onto it.
root = "/dbfs/sketches"
# Sub-directory of `root` that holds the downloaded images.
img_path = "images"
# Excel workbook with the raw labels and image URLs.
label_path = "Training_data_Jan2022.xlsx"
# Output CSV for the one-hot encoded labels.
label_file1 = "sketches_mlmc.csv"
# Output CSV for the combined single-label codes.
label_file2 = "sketches_mc.csv"

# Placeholder string substituted for missing label values.
fill_str = "None"

# Read data

In [0]:
# Load the label workbook that lives under the root directory.
excel_file = os.path.join(root, label_path)
sketch_df = pd.read_excel(excel_file)
sketch_df.head()

In [0]:
# Count the distinct values of each identifying column in the raw sheet.
n_urls = sketch_df["Image URL"].nunique()
n_ids = sketch_df["Image Id"].nunique()
n_products = sketch_df["Product Number"].nunique()
n_names = sketch_df["Image Name"].nunique()

print(f"Unique URL's {n_urls}")
print(f"Unique ID's {n_ids}")
print(f"Unique Product number's {n_products}")
print(f"Unique Name's {n_names}")

# Download data

In [0]:
def fetch_images(
    data: pd.DataFrame,
    dir_path: str,
    img_path: str,
    img_ext: str = "jpg",
    timeout: float = 30.0,
) -> None:
    """Download every image listed in ``data`` and save it as RGB.

    Rows whose target file already exists are skipped, so the function can
    be re-run to resume an interrupted download. A summary of downloaded,
    failed (non-200 response) and already-present images is printed at the
    end.

    Args:
        data: Input dataframe with an "Image URL" and an "Image Id" column.
        dir_path: Root directory.
        img_path: Directory (relative to ``dir_path``) to save images to.
        img_ext: Image extension to save as.
        timeout: Seconds to wait on the server before abandoning a request.

    Raises:
        SystemExit: If a request fails at the transport level (connection
            error, timeout, malformed URL, ...).
        OSError: If a response cannot be decoded as an image, or the image
            still cannot be written after one retry.
    """
    cnt_downloaded, cnt_exists, cnt_resp_err = 0, 0, 0
    for _, row in tqdm(data.iterrows(), unit="rows", total=len(data)):
        url = row["Image URL"]
        id_ = row["Image Id"]
        file_name = os.path.join(dir_path, img_path, f"{id_}.{img_ext}")

        # An existing file means this image was fetched by an earlier run.
        if os.path.isfile(file_name):
            cnt_exists += 1
            continue

        # Only the HTTP call can raise RequestException, so only it is
        # guarded. The timeout keeps one stalled server from hanging the
        # whole download forever.
        try:
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as e:
            raise SystemExit(e) from e

        if response.status_code != 200:
            cnt_resp_err += 1
            continue

        with Image.open(io.BytesIO(response.content)) as img:
            try:
                img.convert("RGB").save(file_name)
            except OSError:
                # Same single retry as before: a second attempt at
                # decoding/saving is made before giving up.
                try:
                    img.convert("RGB").save(file_name)
                except OSError:
                    # Make sure no partial file is left behind, otherwise
                    # the existence check above would silently skip this
                    # image on the next run.
                    if os.path.isfile(file_name):
                        os.remove(file_name)
                    raise
        cnt_downloaded += 1

    print(f"Images downloaded: {cnt_downloaded}!")
    print(f"Non existent urls: {cnt_resp_err}!")
    print(f"Images already exist: {cnt_exists}!")

## Fix duplicate image IDs

In [0]:
# Collapse the sheet to one row per distinct URL. Note that
# `GroupBy.first()` takes the first non-null value of each remaining column
# within the group, which is not necessarily a single original row.
fetch_data = sketch_df.groupby("Image URL").first().reset_index()
print(f"Unique URL's: {fetch_data.shape}")
# Drop the placeholder rows that carry no real URL. `.copy()` makes the
# result an independent frame, so later column assignments on `fetch_data`
# do not write into a slice of the grouped frame (SettingWithCopyWarning).
fetch_data = fetch_data[fetch_data["Image URL"] != "Image not found"].copy()
print(f"Valid unique URL's: {fetch_data.shape}")

In [0]:
# Compare the number of distinct URLs with the number of distinct IDs;
# a smaller ID count means several URLs share an ID.
url_count = fetch_data["Image URL"].nunique()
id_count = fetch_data["Image Id"].nunique()
print(f"Unique URL's {url_count}")
print(f"Unique ID's {id_count}")

In [0]:
# IDs become file names, so compare them as strings: 12 and "12" collide.
fetch_data["Image Id"] = fetch_data["Image Id"].astype("str")
# Position of each row among the rows sharing its ID (0 for the first one).
dup_rank = fetch_data.groupby("Image Id").cumcount()
mask = dup_rank > 0
# Suffix every repeat with its own rank ("_1", "_2", ...). A fixed "_1"
# would leave IDs that occur three or more times still duplicated, and the
# later images would overwrite each other on disk.
fetch_data.loc[mask, "Image Id"] += "_" + dup_rank[mask].astype("str")

In [0]:
# Re-check: both counts are expected to match once the IDs have been fixed.
distinct_urls = fetch_data["Image URL"].nunique()
distinct_ids = fetch_data["Image Id"].nunique()
print(f"Unique URL's {distinct_urls}")
print(f"Unique ID's {distinct_ids}")

## Download

In [0]:
# Make sure the image directory exists, then download everything as PNG.
img_dir = os.path.join(root, img_path)
os.makedirs(img_dir, exist_ok=True)
fetch_images(data=fetch_data, dir_path=root, img_path=img_path, img_ext="png")

In [0]:
# List whatever is now present in the image directory.
downloaded_dir = os.path.join(root, img_path)
file_list = os.listdir(downloaded_dir)
n_images = len(file_list)
print(f"Number of images: {n_images}")

# Create labels

In [0]:
# Columns that are not used as labels and are removed from the label frame.
del_cols = [
    "Image URL",
    "Image Name",
    "Product Number",
    "Garment group",
    "Department Name",
    "Seasonold",
    "UniquieVal",
]
data = fetch_data.drop(columns=del_cols)
print(data.shape)
data.head()

In [0]:
# Print the full per-column summary of the label frame; used here to spot
# columns with missing values before they are filled.
data.info(verbose=True)

## Fix nulls

In [0]:
# Replace every missing value with the placeholder string, then print the
# summary again to confirm no nulls remain.
data_clean = data.fillna(fill_str)
data_clean.info(verbose=True)

In [0]:
# Report how many distinct values each label column has after null filling.
# (The messages previously read "Numberof ..." - the missing space is fixed.)
print(f"Number of unique types: {data_clean['Type'].nunique()}")
print(f"Number of unique category: {data_clean['Category'].nunique()}")
print(f"Number of unique subcategory: {data_clean['SubCategory'].nunique()}")
print(f"Number of unique customer group: {data_clean['Customer Group'].nunique()}")

## Filter data

In [0]:
# Recover the image ID from each file name by stripping its extension.
# `os.path.splitext` works for any extension length, whereas slicing off
# the last four characters is only right for three-letter extensions.
id_list = [os.path.splitext(f)[0] for f in file_list]
# Keep only the label rows whose image is actually present on disk.
data_filt = data_clean[data_clean["Image Id"].isin(id_list)]
data_filt.shape

## Check labels

In [0]:
# Bar chart of how often each "Type" value occurs.
type_col = data_clean["Type"]
type_vc = type_col.value_counts()
type_vc.plot(kind="bar", figsize=(15, 8))

In [0]:
# Bar chart of how often each "Category" value occurs.
cat_col = data_clean["Category"]
cat_vc = cat_col.value_counts()
cat_vc.plot(kind="bar", figsize=(15, 8))

In [0]:
# Bar chart of how often each "SubCategory" value occurs.
scat_col = data_clean["SubCategory"]
scat_vc = scat_col.value_counts()
scat_vc.plot(kind="bar", figsize=(15, 8))

In [0]:
# Bar chart of how often each "Customer Group" value occurs.
cg_col = data_clean["Customer Group"]
cg_vc = cg_col.value_counts()
cg_vc.plot(kind="bar", figsize=(15, 8))

## Multi-label, multi-class

In [0]:
# One-hot encode the four label columns; all other columns pass through.
onehot_cols = ["Type", "Category", "SubCategory", "Customer Group"]
out = pd.get_dummies(data_filt, columns=onehot_cols)
print(out.shape)
out.head()

In [0]:
# Sanity checks on the one-hot frame, aggregated per image ID.
tmp = out.groupby("Image Id").sum()
assert not (tmp > 1).any().any(), "an indicator is set more than once for one image"
assert not (tmp < 0).any().any(), "negative indicator value found"
assert not (tmp == 0).all().any(), "an indicator column is never set"
assert not (tmp == 0).all(axis=1).any(), "an image has no label set at all"
assert (tmp.sum(axis=1) > 1).all(), "an image has at most one label set"
# The grouped frame has as many rows as `out` only when every ID is unique.
assert tmp.reset_index().shape == out.shape, "image IDs are not unique"

### Write labels

In [0]:
# Persist the one-hot labels next to the images, without the index column.
mlmc_csv = os.path.join(root, label_file1)
out.to_csv(mlmc_csv, index=False)

## Hierarchical

In [0]:
# Start from the ID column, then add one combined label per image made of
# the four label values joined with "-".
labels = data_filt[["Image Id"]].copy()
hier_cols = ["Type", "Category", "SubCategory", "Customer Group"]
labels["labels"] = data_filt[hier_cols].agg("-".join, axis=1)
print(labels.shape)
labels.head()

In [0]:
# Bar chart of how often each combined label occurs.
combined = labels["labels"]
l_vc = combined.value_counts()
l_vc.plot(kind="bar", figsize=(15, 8))

In [0]:
# Map each distinct combined label to an integer code.
label_categories = pd.Categorical(labels["labels"])
labels["code"] = label_categories.codes
labels

In [0]:
# Show the range spanned by the integer codes.
code_min = labels["code"].min()
code_max = labels["code"].max()
print(f"Min code value: {code_min}")
print(f"Max code value: {code_max}")

### Write labels

In [0]:
# Persist the combined labels and their codes, without the index column.
mc_csv = os.path.join(root, label_file2)
labels.to_csv(mc_csv, index=False)