In [None]:
%load_ext autoreload
import gzip
import io
import os
import os.path as osp
import random
import sys
import time
from collections import Counter
import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import torch
from torch.utils import data
from torchvision import transforms
from tqdm import tqdm_notebook as tqdm
from PIL import Image
from io import BytesIO

# Report the compute visible to this kernel: CPU core count first, then CUDA availability.
for resource_status in (os.cpu_count(), torch.cuda.is_available()):
    print(resource_status)


In [None]:
%load_ext autoreload
%autoreload 2

os.chdir("/home/kbibas/fbsource/fbcode/fblearner/flow/projects/experimental/cactus")
print(os.getcwd())
print(os.listdir())
sys.path.append(".")

from src.dataset_utils import get_datasets, TransformBatch, create_on_box_dataloader
from src.download_data import filter_meta_df
from src.manifold_utils import read_data_from_manifold, save_data_to_manifold
from src.lit_utils import LitModel


# Load dataset to RAM

In [None]:
# Parameters shared by the cells below.

# --- Data locations ---
category = "Clothing_Shoes_and_Jewelry"  # alternatives noted so far: "Beauty", "Electronics"
data_dir = "product_clustering_fluent2_bucket/tree/cactus/amazon_dataset"
local_dir_path = "/home/kbibas/cactus_outputs/amazon_review_local_dir"

# --- CF vector settings ---
item_cf_vector_path = "product_clustering_fluent2_bucket/tree/cactus/outputs/train_bpr/train_bpr_20210824_023831/embed_items.pt"
# Example of a non-None value kept from an earlier run:
# "product_clustering_fluent2_bucket/tree/cactus/outputs/train_bpr/train_bpr_20210826_061843"
cf_vector_base_dir = None
is_use_cf_bias = True

# --- Loading ---
batch_size = 128
num_workers = 0
dpp_server_num_worker_threads = 1
train_set_repeat = 1
train_set_ratio = 1.0
is_shuffle_train = False

# --- Batch transforms ---
train_batch_transform = None
test_batch_transform = None


In [None]:
class TransformBatchRaw(TransformBatch):
    """Batch transform that decodes raw image bytes into RGB numpy arrays.

    Samples whose image cannot be decoded are reported with ``print`` and
    dropped, so the returned lists may be shorter than the incoming batch.
    """

    def __call__(self, batch):
        """Interprets bytes and replaces them with its interpretation.

        Args:
            batch: mapping indexed with the keys "image", "label" and
                "cf_vector" (and "image_path" when a failure is reported).
                NOTE(review): presumably all entries are aligned per sample;
                confirm against the dataloader that produces the batch.

        Returns:
            Tuple ``(batch_imgs, batch_labels, batch_cf_vectors)`` of three
            equally long lists covering only the samples that decoded
            successfully; images are numpy arrays of the RGB-converted picture.
        """
        batch_imgs, batch_labels, batch_cf_vectors = [], [], []

        for i in range(len(batch["image"])):
            raw_img = batch["image"][i]
            label = batch["label"][0][i]  # [0] since it is a tuple of [label, True]
            cf_vector = batch["cf_vector"][i][0]

            # Bind `byte` before the try block: the failure message below formats
            # it, and without this an error raised while building the buffer
            # would hit an unbound name on the first sample (or report the
            # previous sample's buffer on later ones).
            byte = None
            try:
                byte = BytesIO(raw_img.numpy().view())  # pyre-ignore[6]
                byte.seek(0)
                img = Image.open(byte, mode="r").convert("RGB")

            except Exception as e:
                # Best effort: report the broken sample and keep going.
                print(
                    f'Fail load image. {batch["image_path"][i]=} {type(byte)=} {e=} {byte=}'
                )
                continue

            img = np.array(img)
            batch_imgs.append(img)
            batch_labels.append(label)
            batch_cf_vectors.append(cf_vector)
        return batch_imgs, batch_labels, batch_cf_vectors


In [None]:
# Build the datasets from the parameter cell; each split gets its own
# TransformBatchRaw instance as batch transform.
dataset_kwargs = dict(
    category=category,
    data_dir=data_dir,
    cf_vector_base_dir=cf_vector_base_dir,
    is_use_cf_bias=is_use_cf_bias,
    batch_size=batch_size,
    train_set_repeat=train_set_repeat,
    num_workers=num_workers,
    train_set_ratio=train_set_ratio,
    local_dir_path=local_dir_path,
    is_shuffle_train=is_shuffle_train,
    train_batch_transform=TransformBatchRaw(),
    test_batch_transform=TransformBatchRaw(),
)
# Only the first returned dataset is used below; the second one is discarded.
dataset_h, _, dataset_meta = get_datasets(**dataset_kwargs)

dataloader = create_on_box_dataloader(
    dataset=dataset_h,
    dpp_server_num_worker_threads=dpp_server_num_worker_threads,
    num_workers=num_workers,
)
train_size = dataset_meta["train_set_size"]
test_size = dataset_meta["test_set_size"]
print(train_size, test_size)


In [None]:
# Drain the dataloader once, keeping every decoded image and its label in memory.
t0 = time.time()
# Batch-count estimate, used only for the progress bar length.
total = round(dataset_meta["train_set_size"] / batch_size)
img_list, label_list = [], []
for batch_imgs, batch_labels, _batch_cf_vectors in tqdm(dataloader, total=total):
    img_list.extend(batch_imgs)
    label_list.extend(batch_labels)
print(
    f"Iterated on dataloader in {time.time() -t0 :.2f}. {len(img_list)=}  total images = {dataset_meta['train_set_size']}"
)


# Visualize dataset

In [None]:
# Show a 5x5 grid of images drawn at random (with replacement) from img_list,
# each annotated with its index, class name and numeric label.
fig, axs = plt.subplots(nrows=5, ncols=5, figsize=(20, 20), facecolor="white")
axs = axs.flatten()
idxs = np.random.randint(0, len(img_list), len(axs))
t0 = time.time()
for ax_pos, ax in enumerate(axs):
    idx = idxs[ax_pos]
    img = img_list[idx]
    label = label_list[idx].item()
    label_str = dataset_meta['classes'][label]
    ax.imshow(img)
    ax.set_xlabel(f"{idx=} \n {label_str=} \n {label=}")
print(f"loaded {len(axs)} images in {time.time() - t0:.2f} sec")
plt.tight_layout()
plt.show()


# Get distribution of classes

In [None]:
# Number of examples per label, ordered from the most to the least frequent label.
labels_np = torch.stack(label_list).numpy()
count_np = np.sort(np.bincount(labels_np))[::-1]

In [None]:
# Rank/frequency curve of the label counts on a log-scaled y axis.
fig, ax = plt.subplots(1, 1, facecolor="white", figsize=(4, 4))

ax.plot(count_np)
ax.set_yscale("log")
# count_np is sorted from the largest to the smallest count before plotting, so
# the x position is the label's rank, not its id.
ax.set_xlabel("Label rank (sorted by count)")
ax.set_ylabel("Count")
ax.set_title("Labels")

# Number of labels that exceed the example-count threshold.
count_threshold = 100
print(
    f"Labels that have more than {count_threshold} examples: {len(count_np[count_np>count_threshold])}. {count_np.min()=}"
)

plt.tight_layout()
plt.show()


# Get the distribution of image size

In [None]:
# Record the first two axes (rows, columns) of every decoded image array.
heights, widths = [], []
for img in tqdm(img_list):
    n_rows, n_cols, _ = img.shape
    heights.append(n_rows)
    widths.append(n_cols)
print(f"mean [h w]=[{np.mean(heights):.2f} {np.mean(widths):.2f}]")

In [None]:
# Two stacked histograms (log-scaled counts): image heights on top, widths below.
fig, axs = plt.subplots(2, 1, facecolor="white")

for ax, sizes, size_label in zip(axs, (heights, widths), ("Heights", "Widths")):
    ax.hist(sizes, bins=100)
    ax.set_xlabel(size_label)
    ax.set_ylabel("Counts")
    ax.set_yscale("log")

# The figure is titled with the dataset category on the top panel.
axs[0].set_title(category)

plt.tight_layout()
plt.show()


# Get distribution of color

In [None]:
# Per-image mean over the two spatial axes, then the mean of those per-image values.
mean_color_list = [np.mean(img, axis=(0, 1)) for img in tqdm(img_list)]

mean_colors = np.asarray(mean_color_list).mean(axis=0)
print(f"{mean_colors=}")

## Reviewer distribution

In [None]:
from src.manifold_utils import read_data_from_manifold


# Read the Electronics pickle, print how long the read took, and show its keys.
# NOTE(review): the path hard-codes "Electronics" while the parameter cell selects
# `category`/`data_dir` — confirm this section is meant to use a fixed category.
t0 = time.time()
electronics_pkl_path = (
    "product_clustering_fluent2_bucket/tree/cactus/amazon_dataset/Electronics.pkl"
)
pkl_dict = read_data_from_manifold(electronics_pkl_path, is_from_pkl=True)
print(time.time() - t0)
pkl_dict.keys()


In [None]:
# Histogram of the "overall" column of the reviews table.
reviews_df = pkl_dict["reviews_df"]
fig, ax = plt.subplots(facecolor="white")
reviews_df.hist(column="overall", ax=ax)
ax.set_title("User rating")
plt.show()


In [None]:
# Distribution of the number of review rows per reviewerID (log-scaled counts).
fig, ax = plt.subplots(facecolor="white")
review_size_df = reviews_df.groupby("reviewerID").size()
review_size_df.hist(ax=ax, bins=100)
ax.set(xlabel="Num reviews", ylabel="Count", yscale="log")
plt.tight_layout()
plt.show()


# Debug

In [None]:
# Read the pickle of the selected category and print how long the read took.
t0 = time.time()
pkl_path = f"{data_dir}/{category}.pkl"
pkl_data = read_data_from_manifold(pkl_path, is_from_pkl=True)
print(f"In {time.time()-t0:.2f}")


In [None]:
# Pull the two interaction splits out of the loaded pickle.
train_set = pkl_data["train_set"]
test_set = pkl_data["test_set"]


In [None]:
# Occurrence counts per user id (column 0) and per item id (column 1) in the train split.
user_id, item_id = train_set[:, 0], train_set[:, 1]

# Counts are kept in ascending order; each curve is drawn from the largest count down.
user_hist = np.sort(np.bincount(user_id))
item_hist = np.sort(np.bincount(item_id))

for hist, title in ((user_hist, "Train user hist"), (item_hist, "Train item hist")):
    plt.plot(hist[::-1])
    plt.yscale("symlog")
    plt.title(title)
    plt.show()

# The previous slice `[:00]` is `[:0]`, which always printed an empty array.
# NOTE(review): printing the 10 smallest per-user counts instead — confirm the
# number of entries that was intended.
print(user_hist[:10])


In [None]:
# Occurrence counts per user id (column 0) and per item id (column 1) in the test split.
user_id, item_id = test_set[:, 0], test_set[:, 1]

# Counts are kept in ascending order; each curve is drawn from the largest count down.
user_hist = np.sort(np.bincount(user_id))
item_hist = np.sort(np.bincount(item_id))

for hist, title in ((user_hist, 'Test user hist'), (item_hist, 'Test item hist')):
    plt.plot(hist[::-1])
    plt.yscale("symlog")
    plt.title(title)
    plt.show()


In [None]:
# Unpack the review table and the summary counts stored in the pickle.
# (reviews_df used to be assigned twice from the same key; once is enough.)
reviews_df = pkl_data["reviews_df"]
user_count = pkl_data["user_count"]
item_count = pkl_data["item_count"]
example_count = pkl_data["example_count"]


In [None]:
# Show all review rows whose reviewerID equals 2.
reviews_df.loc[reviews_df["reviewerID"] == 2]

In [None]:
# Show the train rows whose second column (the item id column used above) is 13104.
item_mask = train_set[:, 1] == 13104
train_set[item_mask]

In [None]:
# Metadata table stored in the loaded pickle.
meta_df = pkl_data["meta_df"]

In [None]:
# Display the meta_df table as this cell's output.
meta_df

In [None]:
# Read the "meta_<category>.pkl" file of the selected category and print how long it took.
t0 = time.time()
meta_pkl_path = f"{data_dir}/meta_{category}.pkl"
meta_raw = read_data_from_manifold(meta_pkl_path, is_from_pkl=True)
print(f"In {time.time()-t0:.2f}")


In [None]:
# Unpickle the raw metadata into `df`.
# NOTE(review): meta_raw was itself read with is_from_pkl=True; presumably it is a
# path or buffer that still needs unpickling — confirm. pd.read_pickle can execute
# arbitrary code, so only use it on trusted files.
df = pd.read_pickle(meta_raw)

In [None]:
# Display the raw values of the "categories" column.
df['categories'].values

In [None]:
# For every row keep its longest category path (the first one on ties), joined
# into a single space-separated string, then print the distinct results.
categories = [
    ' '.join(max(df_categories, key=len)) for df_categories in df["categories"]
]
print(pd.unique(categories))

In [None]:
# Collect every category path that starts with 'Clothing, Shoes & Jewelry' and
# has more than two levels: the joined path text and its number of levels.
category_list, lengths = [], []
for item_paths in df["categories"]:
    for path in item_paths:
        if path[0] == 'Clothing, Shoes & Jewelry' and len(path) > 2:
            category_list.append(" ".join(path))
            lengths.append(len(path))


In [None]:
# Compare the number of collected category paths with the number of rows in df.
len(category_list), len(df)

In [212]:
# Inspect the "categories" value stored under key 0 as an example of the nested
# list-of-paths layout.
df["categories"][0]

[['Clothing, Shoes & Jewelry', 'Girls'],
 ['Clothing, Shoes & Jewelry',
  'Novelty, Costumes & More',
  'Costumes & Accessories',
  'More Accessories',
  'Kids & Baby']]

In [None]:
# Number of rows in the train and test splits.
len(train_set), len(test_set)

In [None]:
# Scratch check: conditionally appending an element via list concatenation.
# The condition is False, so nothing is appended.
optional_tail = [2] if False else []
a = [1, 2] + list(optional_tail)
print(a)

In [None]:
# Scratch check: list() over a one-element list gives a new list [2].
list([2])