In [4]:
from pathlib import Path
import pandas as pd
from collections import Counter
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from lightning import Trainer, seed_everything, LightningModule
from lightning.pytorch.callbacks import ModelCheckpoint

seed_everything(42)

Global seed set to 42


42

In [5]:
path = Path.home() / 'OneDrive - Seagroup/ai/kaggle_dataset/instacart'

df_order = pd.read_csv(path / 'order_products__train.csv')

# filter orders have 1 product
order_1_product = df_order.groupby('order_id')[['product_id']].nunique()
order_1_product = order_1_product.query('product_id == 1').index.tolist()
df_order.query(f'order_id != {order_1_product}', inplace=True)

print(df_order.shape, df_order['order_id'].nunique())
df_order.head(10)

(1377772, 4) 124364


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1
5,1,13176,6,0
6,1,47209,7,0
7,1,22035,8,1
8,36,39612,1,0
9,36,19660,2,1


In [6]:
df_product = pd.read_csv(path / 'products.csv', usecols=["product_id", "product_name"])
print(df_product.shape, df_product['product_id'].nunique())

df_product.head()

(49688, 2) 49688


Unnamed: 0,product_id,product_name
0,1,Chocolate Sandwich Cookies
1,2,All-Seasons Salt
2,3,Robust Golden Unsweetened Oolong Tea
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...
4,5,Green Chile Anytime Sauce


In [7]:
orders = df_order.sort_values(by=["order_id", "add_to_cart_order"])
orders = orders.groupby("order_id")["product_id"].apply(list).tolist()

product_name_by_id = df_product.set_index("product_id").to_dict()["product_name"]

# set index
ordered_products = set([product for order in orders for product in order])
product_mapping = {
    'index_by_id': dict(),
    'name_by_index': dict(),
}
ind = 0
for ind, product_id in enumerate(ordered_products):
    product_name = product_name_by_id[product_id]
    product_mapping["index_by_id"][product_id] = ind
    product_mapping["name_by_index"][ind] = product_name

indexed_orders = [
    [product_mapping["index_by_id"][product_id] for product_id in order]
    for order in orders
]

In [8]:
context_window = 5

# total number of context products, including positive and negative products
all_targets = []
all_positive_contexts = []
for order in indexed_orders:
    for i, product in enumerate(order):
        all_targets.append(product)
        positive_context = [
            order[j]
            for j in range(
                max(0, i - context_window), min(len(order), i + context_window + 1)
            )
            if j != i
        ]
        all_positive_contexts.append(positive_context)

print("Sample order:", indexed_orders[0])
for i in range(3):
    print(f"Target product: {all_targets[i]}", end = ", ")
    print(f"Positive context products: {all_positive_contexts[i]}")

Sample order: [38764, 8736, 8063, 39057, 34290, 10349, 37106, 17371]
Target product: 38764, Positive context products: [8736, 8063, 39057, 34290, 10349]
Target product: 8736, Positive context products: [38764, 8063, 39057, 34290, 10349, 37106]
Target product: 8063, Positive context products: [38764, 8736, 39057, 34290, 10349, 37106, 17371]


In [9]:
def get_sampling_weights(orders):
    product_freq = Counter([product for order in orders for product in order])
    sampling_weights = [0 for _ in product_freq]
    for product_index, count in product_freq.items():
        sampling_weights[product_index] = count**0.5
    return sampling_weights

sampling_weights = get_sampling_weights(indexed_orders)

In [10]:
class ProductSampler:
    def __init__(self, products, weights, pre_drawn=10_000_000):
        self.products = products
        self.weights = weights
        self.pre_drawn = pre_drawn
        self.pre_drawn_products = []

    def refill(self):
        self.pre_drawn_products = random.choices(
            population=self.products, weights=self.weights, k=self.pre_drawn
        )

    def draw(self):
        if not self.pre_drawn_products:
            self.refill()
        return self.pre_drawn_products.pop()


num_products = len(ordered_products)
product_sampler = ProductSampler(
    products=range(num_products),
    weights=sampling_weights,
    pre_drawn=10_000_000,
)

print("Sampling samples:", [product_sampler.draw() for _ in range(10)])

Sampling samples: [18129, 15355, 20263, 23956, 38661, 29016, 34084, 18226, 24946, 17815]


In [11]:
class TargetContextDataset(Dataset):
    def __init__(self,
                 all_targets,
                 all_positive_contexts,
                 product_sampler,
                 num_context_products: int = 10):

        self.all_targets = all_targets
        self.all_positive_contexts = all_positive_contexts
        self.product_sampler = product_sampler
        self.num_context_products = num_context_products

    def __len__(self):
        return len(self.all_targets)

    def __getitem__(self, index):
        target = torch.tensor([self.all_targets[index]], dtype=torch.int32)
        positive_contexts = self.all_positive_contexts[index].copy()
        num_pos = len(positive_contexts)
        num_neg = self.num_context_products - len(positive_contexts)
        mask = [1] * num_pos + [0] * num_neg
        while len(positive_contexts) < self.num_context_products:
            product = self.product_sampler.draw()
            if product not in positive_contexts:
                positive_contexts.append(product)

        contexts = torch.IntTensor(positive_contexts)
        mask = torch.FloatTensor(mask)
        return target, contexts, mask


training_data = TargetContextDataset(all_targets, all_positive_contexts, product_sampler)
train_dataloader = DataLoader(training_data, batch_size=2**17, shuffle=True)
val_dataloader = DataLoader(training_data, batch_size=2**17, shuffle=False)

# for target in train_dataloader:
# for target, context_products, labels in train_dataloader:
#     print("Target:", target, target.shape)
#     print("Context products:", context_products, context_products.shape)
#     print("Labels:", labels, labels.shape)
#     break

In [12]:
class SigmoidBCELoss(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, inputs, label):
        inputs = torch.reshape(inputs, (inputs.shape[0], -1))
        out = nn.BCEWithLogitsLoss()(inputs, label)
        return out


loss_fn = SigmoidBCELoss()
sample_logits = torch.Tensor([[100, -100], [1, 1]])
sample_labels = torch.Tensor([[1, 0], [1, 0]])
loss_fn(sample_logits, sample_labels)

tensor(0.4066)

In [13]:
class Prod2VecModel(LightningModule):
    def __init__(self, num_products, embed_size: int = 50):
        super().__init__()
        self.embed_size = embed_size
        self.embed_t = nn.Embedding(num_products, self.embed_size)
        self.embed_c = nn.Embedding(num_products, self.embed_size)
        self.loss_fn = nn.BCEWithLogitsLoss()

    def forward(self, targets, contexts):
        v = self.embed_t(targets)
        u = self.embed_c(contexts)
        pred = torch.bmm(v, u.permute(0, 2, 1))
        return pred.squeeze(1)

    def training_step(self, batch, batch_idx):
        targets, contexts, labels = batch
        output = self.forward(targets, contexts)
        # print(output.shape, labels.shape)
        loss = self.loss_fn(output, labels)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        targets, contexts, labels = batch
        output = self.forward(targets, contexts)
        loss = self.loss_fn(output, labels)
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=5e-3)
        return optimizer

In [14]:
embed_size = 100
model = Prod2VecModel(num_products, embed_size)

model_checkpoint = ModelCheckpoint(dirpath='product2vec/',
                                   save_top_k=1,
                                   monitor="val_loss",
                                   mode="min",)
trainer = Trainer(max_epochs=50,
                  accelerator='gpu',
                  callbacks=[model_checkpoint],
                  log_every_n_steps=5,)

trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type              | Params
----------------------------------------------
0 | embed_t | Embedding         | 3.9 M 
1 | embed_c | Embedding         | 3.9 M 
2 | loss_fn | BCEWithLogitsLoss | 0     
----------------------------------------------
7.8 M     Trainable params
0         Non-trainable params
7.8 M     Total params
31.249    Total estima

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=50` reached.


In [23]:
print(f'model path: {model_checkpoint.best_model_path}')
print(f'best loss: {model_checkpoint.best_model_score.cpu().item():,.2f}')


model path: C:\Users\Kevin\PycharmProjects\ML-learning-journey\analytic_pipeline\product2vec\epoch=49-step=550.ckpt
best loss: 0.50


In [27]:
model.load_from_checkpoint(model_checkpoint.best_model_path, num_products=num_products, embed_size=embed_size)

Prod2VecModel(
  (embed_t): Embedding(39061, 100)
  (embed_c): Embedding(39061, 100)
)

In [33]:
embs_arr = model.state_dict()['embed_t.weight'].detach().numpy()

In [38]:
embs_arr[6030]

array([ 0.43720758, -0.5138851 ,  0.62963295, -0.4596874 ,  0.93724656,
       -0.12639831,  0.90524673,  0.15311292,  0.71666056, -0.6315555 ,
       -0.48874924,  1.0050014 ,  2.7187738 ,  0.35522315, -0.8772384 ,
       -2.0101988 ,  1.5837808 , -1.4180889 ,  1.5076984 ,  0.2034378 ,
       -0.11139997, -1.694714  ,  0.03630535, -0.57917625, -0.5729373 ,
        1.3544452 ,  2.0840118 , -0.67371505, -0.01930216,  0.8293551 ,
        0.07352173,  0.58840185,  0.4468925 , -1.0258918 , -1.3451661 ,
       -0.14209984,  0.2720714 ,  0.8545576 ,  0.03307387, -0.8443877 ,
       -0.8201797 , -0.7926045 ,  0.30645132,  0.7059188 , -1.1148585 ,
       -0.0383687 , -0.70483285, -0.6794859 , -1.094641  , -1.7687249 ,
        0.0982416 ,  0.46045965, -0.6180603 , -1.7129457 , -1.7511535 ,
       -1.1507181 ,  0.9388377 ,  1.0120523 , -0.5998491 ,  1.1275885 ,
        0.5033181 ,  0.31725222, -0.35094416,  2.3266103 ,  0.46643716,
       -1.7193557 ,  0.29801866,  1.554903  ,  0.6995537 ,  0.35

In [32]:
embs_arr = model.state_dict()['embed_t.weight'].detach().numpy()
names = [product_mapping["name_by_index"][i] for i in range(num_products)]
sub_name = "Organic Yogurt"
ids = [ind for ind, name in enumerate(names) if sub_name in name]
for ind in ids[:5]:
    print('==========')
    print(f'Similar items of "{names[ind]}":')
    nearest_ids = emb_nn.find_nearest_neighbors(embs_arr[ind, :], k=2)
    print([names[i] for i in nearest_ids])

[6030, 15007, 15656, 15702, 18870, 21973, 24238, 25469, 34826, 36151]

In [39]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt


X2 = PCA(n_components=2).fit_transform(embs_arr)
plt.figure(figsize=(20, 20))
colors = ['b'] * num_products
s = [1] * num_products
for i, product in product_mapping["name_by_index"].items():
    if "Organic" in product:
        colors[i] = 'r'
        s[i] = 30
plt.scatter(X2[:,0], X2[:,1], c=colors, s=s)

<matplotlib.collections.PathCollection at 0x28808d16470>

In [None]:
# embs_arr = model.state_dict()['embed_t.weight'].detach().numpy()
#
# emb_nn = embedding.NearestNeighbor(embs_arr, measure="cosine")
# names = [product_mapping["name_by_index"][i] for i in range(num_products)]
#
# sub_name = "Organic Yogurt"
# ids = [ind for ind, name in enumerate(names) if sub_name in name]
# for ind in ids[:5]:
#     print('==========')
#     print(f'Similar items of "{names[ind]}":')
#     nearest_ids = emb_nn.find_nearest_neighbors(embs_arr[ind, :], k=2)
#     print([names[i] for i in nearest_ids])