In [1]:
from pathlib import Path
import pandas as pd

In [2]:
path = Path.home() / 'OneDrive - Seagroup/ai/kaggle_dataset/instacart'

df_order = pd.read_csv(path / 'order_products__train.csv')

# filter orders have 1 product
order_1_product = df_order.groupby('order_id')[['product_id']].nunique()
order_1_product = order_1_product.query('product_id == 1').index.tolist()
df_order.query(f'order_id != {order_1_product}', inplace=True)

print(df_order.shape, df_order['order_id'].nunique())
df_order.head(10)

(1377772, 4) 124364


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1
5,1,13176,6,0
6,1,47209,7,0
7,1,22035,8,1
8,36,39612,1,0
9,36,19660,2,1


In [3]:
df_product = pd.read_csv(path / 'products.csv', usecols=["product_id", "product_name"])
print(df_product.shape, df_product['product_id'].nunique())

df_product.head()

(49688, 2) 49688


Unnamed: 0,product_id,product_name
0,1,Chocolate Sandwich Cookies
1,2,All-Seasons Salt
2,3,Robust Golden Unsweetened Oolong Tea
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...
4,5,Green Chile Anytime Sauce


In [11]:
orders = df_order.sort_values(by=["order_id", "add_to_cart_order"])
orders = df_order.groupby("order_id")["product_id"].apply(list).tolist()

product_name_by_id = df_product.set_index("product_id").to_dict()["product_name"]

# set index
ordered_products = set([product for order in orders for product in order])
product_mapping = {
    'index_by_id': dict(),
    'name_by_index': dict(),
}
ind = 0
for ind, product_id in enumerate(ordered_products):
    product_name = product_name_by_id[product_id]
    product_mapping["index_by_id"][product_id] = ind
    product_mapping["name_by_index"][ind] = product_name

indexed_orders = [
    [product_mapping["index_by_id"][product_id] for product_id in order]
    for order in orders
]

In [10]:
context_window = 5

# total number of context products, including positive and negative products
all_targets = []
all_positive_contexts = []
for order in indexed_orders:
    for i, product in enumerate(order):
        all_targets.append(product)
        positive_context = [
            order[j]
            for j in range(
                max(0, i - context_window), min(len(order), i + context_window + 1)
            )
            if j != i
        ]
        all_positive_contexts.append(positive_context)

print("Sample order:", indexed_orders[0])
for i in range(3):
    print(f"Target product: {all_targets[i]}", end = ", ")
    print(f"Positive context products: {all_positive_contexts[i]}")

Sample order: [38764, 8736, 8063, 39057, 34290, 10349, 37106, 17371]
Target product: 38764, Positive context products: [8736, 8063, 39057, 34290, 10349]
Target product: 8736, Positive context products: [38764, 8063, 39057, 34290, 10349, 37106]
Target product: 8063, Positive context products: [38764, 8736, 39057, 34290, 10349, 37106, 17371]


In [12]:
from collections import Counter
import random



def get_sampling_weights(orders):
    product_freq = Counter([product for order in orders for product in order])
    sampling_weights = [0 for _ in product_freq]
    for product_index, count in product_freq.items():
        sampling_weights[product_index] = count**0.5
    return sampling_weights

sampling_weights = get_sampling_weights(indexed_orders)

In [13]:
class ProductSampler:
    def __init__(self, products, weights, pre_drawn=10_000_000):
        self.products = products
        self.weights = weights
        self.pre_drawn = pre_drawn
        self.pre_drawn_products = []

    def refill(self):
        self.pre_drawn_products = random.choices(
            population=self.products, weights=self.weights, k=self.pre_drawn
        )

    def draw(self):
        if not self.pre_drawn_products:
            self.refill()
        return self.pre_drawn_products.pop()


num_products = len(ordered_products)
product_sampler = ProductSampler(
    products=range(num_products),
    weights=sampling_weights,
    pre_drawn=10_000_000,
)

print("Sampling samples:", [product_sampler.draw() for _ in range(10)])

Sampling samples: [36810, 19075, 24340, 23282, 9008, 13766, 31624, 11700, 28271, 36479]


In [15]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


class TargetContextDataset(Dataset):
    def __init__(self,
                 all_targets,
                 all_positive_contexts,
                 product_sampler,
                 num_context_products: int = 10):

        self.all_targets = all_targets,
        self.all_positive_contexts = all_positive_contexts
        self.product_sampler = product_sampler
        self.num_context_products = num_context_products

    def __len__(self):
        return len(self.all_targets)

    def __getitem__(self, index):
        target = torch.IntTensor([self.all_targets[index]])
        positive_contexts = self.all_positive_contexts[index].copy()
        num_pos = len(positive_contexts)
        num_neg = self.num_context_products - len(positive_contexts)
        mask = [1] * num_pos + [0] * num_neg
        while len(positive_contexts) < self.num_context_products:
            product = self.product_sampler.draw()
            if product not in positive_contexts:
                positive_contexts.append(product)

        contexts = torch.IntTensor(positive_contexts)
        mask = torch.FloatTensor(mask)
        return target, contexts, mask


training_data = TargetContextDataset(all_targets, all_positive_contexts, product_sampler)
train_dataloader = DataLoader(training_data, batch_size=8192, shuffle=True)

for target, context_products, labels in train_dataloader:
    print("Target:", target[0])
    print("Context products:", context_products[0])
    print("Labels:", labels[0])
    break

Target: tensor([[38764,  8736,  8063,  ..., 28275, 13323,  3731]], dtype=torch.int32)
Context products: tensor([ 8736,  8063, 39057, 34290, 10349,  6978, 31808,  4344, 33812,  8230],
       dtype=torch.int32)
Labels: tensor([1., 1., 1., 1., 1., 0., 0., 0., 0., 0.])


In [16]:
class SigmoidBCELoss(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, inputs, label):
        inputs = torch.reshape(inputs, (inputs.shape[0], -1))
        out = nn.functional.binary_cross_entropy_with_logits(inputs, label, reduction="none")
        return torch.mean(out)


loss_fn = SigmoidBCELoss()
sample_logits = torch.Tensor([[100, -100], [1, 1]])
sample_labels = torch.Tensor([[1, 0], [1, 0]])
loss_fn(sample_logits, sample_labels)

tensor(0.4066)

In [59]:
from lightning import LightningModule, Trainer


class Prod2VecModel(LightningModule):
    def __init__(self, num_products, embed_size: int = 50):
        super().__init__()
        self.embed_size = embed_size
        self.embed_t = nn.Embedding(num_products, self.embed_size)
        self.embed_c = nn.Embedding(num_products, self.embed_size)

    def forward(self, targets, contexts):
        v = self.embed_t(targets.squeeze(1))
        u = self.embed_c(contexts)
        pred = torch.bmm(v, u.permute(0, 2, 1))
        return pred

    def training_step(self, batch, batch_idx):
        targets, contexts, labels = batch
        output = self.forward(targets, contexts)
        loss = loss_fn(output, labels)
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=5e-3)
        return optimizer

In [64]:
embed_t = nn.Embedding(num_products, 50)
v = embed_t(target.squeeze(1))
u = embed_t(context_products)
output = torch.bmm(v, u.permute(0, 2, 1))

print(v.shape, u.shape, output.shape)

torch.Size([1, 1377772, 50]) torch.Size([1, 10, 50]) torch.Size([1, 1377772, 10])


In [65]:
labels.shape

torch.Size([1, 10])

In [68]:
output

tensor([[[  1.4234,  -6.9554,   6.0896,  ...,  -0.1159,  -8.3908,   5.6874],
         [ 39.1924,   8.0971,  -1.2898,  ...,  -0.9312,   3.6597,   1.3886],
         [  8.0971,  50.9569,   2.5032,  ...,   4.0706,   2.5474,  -2.9239],
         ...,
         [  1.5698,   2.1034,   8.1602,  ...,  -8.6373,   0.6249,  -5.6843],
         [ -6.6285, -14.4251,  -1.7551,  ...,  -0.8922,  -3.5124, -20.5016],
         [  0.5823,  -3.7209,   6.1809,  ...,  -7.9312,  11.8111,  -2.1895]]],
       grad_fn=<BmmBackward0>)

In [79]:
torch.reshape(output, (-1,)).shape

torch.Size([13777720])

In [72]:
labels

tensor([[1., 1., 1., 1., 1., 0., 0., 0., 0., 0.]])

In [None]:
inputs = torch.reshape(inputs, (inputs.shape[0], -1))
out = nn.functional.binary_cross_entropy_with_logits(inputs, label, reduction="none")

In [None]:
loss_fn(output, labels)

In [28]:
target, context_products, labels = next(iter(train_dataloader))

In [82]:
target

tensor([[[38764,  8736,  8063,  ..., 28275, 13323,  3731]]], dtype=torch.int32)

In [83]:
labels

tensor([[1., 1., 1., 1., 1., 0., 0., 0., 0., 0.]])

In [81]:
context_products

tensor([[ 8736,  8063, 39057, 34290, 10349,  5585,  8533, 31179, 30265, 31970]],
       dtype=torch.int32)

In [60]:
embed_size = 100
model = Prod2VecModel(num_products, embed_size)
trainer = Trainer(max_epochs=50)
trainer.fit(model, train_dataloader, train_dataloader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")

  | Name    | Type      | Params
--------------------------------------
0 | embed_t | Embedding | 3.9 M 
1 | embed_c | Embedding | 3.9 M 
--------------------------------------
7.8 M     Trainable params
0         Non-trainable params
7.8 M     Total params
31.249    Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

ValueError: Target size (torch.Size([1, 10])) must be the same as input size (torch.Size([1, 13777720]))

In [25]:
next(iter(train_dataloader))

[tensor([[[38764,  8736,  8063,  ..., 28275, 13323,  3731]]], dtype=torch.int32),
 tensor([[ 8736,  8063, 39057, 34290, 10349, 18580,  3320, 26843,  4678,  9309]],
        dtype=torch.int32),
 tensor([[1., 1., 1., 1., 1., 0., 0., 0., 0., 0.]])]