# Dataset

In [1]:
import pathlib
import sys

import polars as pl
from tqdm.notebook import tqdm

# Make the repository root (one level above this notebook) importable so that
# the project-local `data` package can be imported below.
parent_dir = pathlib.Path("../").resolve()
_parent_dir_entry = str(parent_dir)

# Guard against duplicate entries when the cell is re-run in the same kernel.
if _parent_dir_entry not in sys.path:
    sys.path.append(_parent_dir_entry)

from data.dataset import AmazonReviewsDataModule

In [2]:
# Directory handed to the data module as its first argument
# (presumably where it stores its files — confirm in data/dataset.py).
save_dir = pathlib.Path("../data/data")
# `exist_ok=True` makes this a no-op when the directory is already present,
# replacing the separate exists() check (and the race between check and create).
# It still raises FileExistsError if the path exists but is not a directory.
save_dir.mkdir(parents=True, exist_ok=True)

# Batch size, worker count and maximum sequence length are fixed here for the
# whole notebook; the cells below all read from this single data module.
datamodule = AmazonReviewsDataModule(save_dir, batch_size=32, num_workers=4, max_seq_len=50)

In [3]:
# Run the data module's preparation step (implemented in data/dataset.py;
# presumably fetches/preprocesses the raw reviews into save_dir — TODO confirm).
datamodule.prepare_data()

In [4]:
# Set up the data module for the "fit" stage; the cells below read
# `train_dataset` and `val_dataset` from it after this call.
datamodule.setup(stage="fit")

In [5]:
# Keep a handle on the training dataset, plus the frames (`.df`) backing the
# training and validation datasets for the statistics below.
train_dataset = datamodule.train_dataset
train_df, val_df = train_dataset.df, datamodule.val_dataset.df

In [6]:
# dataset sizes: len() of the train and validation datasets, shown as a tuple

len(datamodule.train_dataset), len(datamodule.val_dataset)

(715329, 329719)

In [7]:
# unique users, items and categories per split
def n_unique(df: pl.DataFrame):
    """Print the number of distinct users, items and categories in ``df``.

    Args:
        df: Frame containing the ``user_index``, ``item_index`` and
            ``category_index`` columns.

    Returns:
        None. The counts are written to stdout on a single line.
    """
    # Each single-column selection is counted on its own, in the same order
    # the values appear in the printed summary.
    users, items, categories = (
        df.select(pl.col(column)).n_unique()
        for column in ("user_index", "item_index", "category_index")
    )
    print(f"Users: {users}, Items: {items}, Categories: {categories}")

# Print the same summary for each split, preceded by a header line.
for header, split_df in (("=== train ===", train_df), ("=== val ===", val_df)):
    print(header)
    n_unique(split_df)

=== train ===
Users: 182215, Items: 71227, Categories: 489
=== val ===
Users: 182216, Items: 41230, Categories: 475


In [8]:
# Pull the first batch from a fresh training dataloader and display it
# to inspect what a single batch looks like.
batch = next(iter(datamodule.train_dataloader()))
batch

[tensor([168603,  95594,  41115, 145837,   4063,  23281,  95346,  33739, 145244,
         121128,  97979,  76836, 108346, 177842, 149325,  27740,  52481,  93631,
          95124,  74745,  89975, 152833, 143212, 102232,  16757,  81967, 114408,
          43056,  63656, 138438,  69835,   6150]),
 tensor([[    0,     0,     0,  ..., 64321, 66186, 78069],
         [    0,     0,     0,  ...,     0,     0, 26116],
         [    0,     0,     0,  ..., 40706, 67083, 47777],
         ...,
         [    0,     0,     0,  ...,     0,     0, 19611],
         [    0,     0,     0,  ..., 44626, 41041, 37816],
         [    0,     0,     0,  ...,     0, 27471, 22159]]),
 tensor([[  0,   0,   0,  ..., 347, 351, 345],
         [  0,   0,   0,  ...,   0,   0, 361],
         [  0,   0,   0,  ..., 474, 401, 401],
         ...,
         [  0,   0,   0,  ...,   0,   0, 192],
         [  0,   0,   0,  ..., 325, 159, 147],
         [  0,   0,   0,  ...,   0, 375, 375]]),
 tensor([[76543],
         [41611],
  

In [9]:
%%time

# check dataloader speed: iterate over every training batch without doing any
# work on it, so the %%time figures reflect data loading only.
# NOTE: the loop variable rebinds `batch`, replacing the batch fetched in the
# previous cell with the last batch of this pass.
for batch in tqdm(datamodule.train_dataloader()):
    pass


  0%|          | 0/22355 [00:00<?, ?it/s]

CPU times: user 18.7 s, sys: 13 s, total: 31.7 s
Wall time: 3min 28s
