In [None]:
!pip install pytorch_lightning==1.5.3

You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m


In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

# Seed both RNGs the notebook draws from: NumPy drives the negative sampling,
# PyTorch drives the initial embedding / linear-layer weights.
np.random.seed(123)
torch.manual_seed(123)

In [None]:
# Load the raw review dump; the file holds one JSON record per line
ratings = pd.read_json('datasets/Appliances.json', lines=True)

In [None]:
# The leading number of each reviewTime string is the month, so it must be read
# with %m. %M is the *minute* directive: it parses without error but leaves the
# month at its default, which breaks the per-user "latest review" ordering.
# NOTE(review): assumes raw values look like "11 27, 2013" - confirm against the file.
ratings['reviewTime'] = pd.to_datetime(ratings['reviewTime'], format='%m %d, %Y')

In [None]:
# Draw reviewer ids without replacement. The sample size equals the number of
# distinct reviewers, so this is a shuffle of all ids and the filter keeps every row.
unique_reviewers = ratings['reviewerID'].unique()
rand_userIds = np.random.choice(unique_reviewers,
                                size=int(len(unique_reviewers)),
                                replace=False)

sampled_rows = ratings['reviewerID'].isin(rand_userIds)
ratings = ratings.loc[sampled_rows]

print('There are {} rows of data from {} users'.format(len(ratings), len(rand_userIds)))

There are 602777 rows of data from 515650 users


In [None]:
# ^ The row count above is only slightly larger than the user count, so barely any users review more than one product

In [None]:
# Switch to the generic column names used by the rest of the notebook
ratings.rename(
    columns={'reviewerID': 'userId', 'asin': 'productId', 'overall': 'rating'},
    inplace=True,
)

# Give every raw id a dense integer code, numbered in order of first appearance
all_productIds = ratings['productId'].unique()
all_userIds = ratings['userId'].unique()
product_mapper = {raw_id: code for code, raw_id in enumerate(all_productIds)}
user_mapper = {raw_id: code for code, raw_id in enumerate(all_userIds)}

# Replace the raw ids with their integer codes
ratings['userId'] = ratings['userId'].map(user_mapper)
ratings['productId'] = ratings['productId'].map(product_mapper)


In [None]:
# Rank each user's reviews from newest (rank 1) to oldest; ties are broken by row order
reviews_by_user = ratings.groupby(['userId'])['reviewTime']
ratings['rank_latest'] = reviews_by_user.rank(method='first', ascending=False)

# Leave-one-out split: the newest review per user is held out for testing
is_latest = ratings['rank_latest'] == 1
kept_columns = ['userId', 'productId', 'rating']

# keep only the columns needed from here on
train_ratings = ratings[~is_latest][kept_columns]
test_ratings = ratings[is_latest][kept_columns]

In [None]:
# Collapse the rating of every training row to the constant 1, i.e. mark each
# remaining row as an observed interaction. `assign` returns a new frame instead
# of writing through `.loc` into a frame that was sliced out of `ratings`
# (a chained-assignment hazard whose outcome depends on the pandas version).
train_ratings = train_ratings.assign(rating=1)

train_ratings.head()

Unnamed: 0,userId,productId,rating
0,0,0,1
14,14,0,1
21,21,0,1
24,24,0,1
29,29,0,1


In [None]:
# Distinct product ids (already integer-encoded), in order of first appearance
all_productIds = pd.unique(ratings['productId'])
len(all_productIds)


30252

In [None]:
# Placeholders that will hold the training data
users, items, labels = [], [], []

# Set of (userId, productId) pairs observed in the training split
user_item_set = set(zip(train_ratings['userId'], train_ratings['productId']))

# Show only the size: echoing the set itself prints every pair into the output
len(user_item_set)


{(309073, 5765),
 (89207, 5843),
 (191629, 2520),
 (59489, 503),
 (124900, 2436),
 (240279, 3208),
 (213428, 3681),
 (36979, 4273),
 (72758, 2788),
 (2139, 3946),
 (54873, 449),
 (58185, 1713),
 (165375, 1858),
 (42673, 827),
 (1125, 194),
 (204384, 5954),
 (232787, 3176),
 (41660, 2194),
 (180717, 4354),
 (290820, 6353),
 (60118, 2512),
 (64611, 3285),
 (118882, 4141),
 (55325, 453),
 (315026, 5780),
 (42616, 1662),
 (99977, 1901),
 (265472, 6235),
 (47723, 3653),
 (117782, 1211),
 (276569, 15961),
 (234557, 3179),
 (410964, 14353),
 (4954, 4690),
 (57357, 502),
 (339023, 5662),
 (59477, 503),
 (5916, 40),
 (86400, 849),
 (479571, 4933),
 (18421, 843),
 (64037, 566),
 (148844, 4138),
 (36078, 5102),
 (482853, 21463),
 (182097, 2195),
 (188283, 2324),
 (150697, 2804),
 (188233, 2320),
 (163816, 1831),
 (134503, 1736),
 (69699, 2914),
 (53056, 430),
 (62969, 537),
 (56777, 476),
 (160490, 1721),
 (267079, 3897),
 (46445, 359),
 (418838, 22482),
 (26652, 174),
 (72506, 625),
 (39715, 279

In [None]:
# # 4:1 ratio of negative to positive samples
# num_negatives = 4

# for (u, i) in tqdm(user_item_set):
#     users.append(u)
#     items.append(i)
#     labels.append(1) # items that the user has interacted with are positive
#     for _ in range(num_negatives):
#         # randomly select an item
#         negative_item = np.random.choice(all_movieIds) 
#         # check that the user has not interacted with this item
#         while (u, negative_item) in user_item_set:
#             negative_item = np.random.choice(all_movieIds)
#         users.append(u)
#         items.append(negative_item)
#         labels.append(0) # items not interacted with are negative

In [None]:
# Number of sampled negatives per observed (positive) interaction
num_negatives = 4

# Start from empty lists so re-running this cell does not append duplicates
users, items, labels = [], [], []

# Iterate over the set without mutating it. Popping pairs while sampling shrinks
# the set, so already-removed positives (including the current pair itself)
# would pass the membership check below and could be labelled as negatives.
for (u, i) in user_item_set:
    users.append(u)
    items.append(i)
    labels.append(1)  # observed interaction -> positive
    for _ in range(num_negatives):
        negative_item = np.random.choice(all_productIds)
        # resample until the pair is not a known interaction
        while (u, negative_item) in user_item_set:
            negative_item = np.random.choice(all_productIds)
        users.append(u)
        items.append(negative_item)
        labels.append(0)  # sampled non-interaction -> negative

In [None]:
class ProductTrainDataset(Dataset):
    """Product PyTorch Dataset for Training

    Every distinct (userId, productId) pair in ``ratings`` becomes one positive
    example (label 1), immediately followed by ``num_negatives`` randomly drawn
    products that do not form a pair with that user in ``ratings`` (label 0).

    Args:
        ratings (pd.DataFrame): Dataframe containing the product ratings; only
            the ``userId`` and ``productId`` columns are read
        all_productIds (array-like): Pool of productIds negatives are drawn from
        num_negatives (int): Negatives sampled per positive pair. Defaults to 4.

    Note:
        Negatives are found by rejection sampling with ``np.random.choice``, so
        the pool must contain at least one product each user has not
        interacted with, otherwise sampling never terminates.
    """

    def __init__(self, ratings, all_productIds, num_negatives=4):
        self.users, self.items, self.labels = self.get_dataset(
            ratings, all_productIds, num_negatives)

    def __len__(self):
        """Number of examples (positives plus sampled negatives)."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the (user, item, label) tensors at position ``idx``."""
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_productIds, num_negatives=4):
        """Build the user, item and label tensors.

        Args:
            ratings (pd.DataFrame): Frame with ``userId`` and ``productId`` columns
            all_productIds (array-like): Pool of productIds to sample negatives from
            num_negatives (int): Negatives sampled per positive pair

        Returns:
            tuple: ``(users, items, labels)`` as three equally long tensors
        """
        users, items, labels = [], [], []
        # Known interactions; also used to reject sampled negatives
        user_item_set = set(zip(ratings['userId'], ratings['productId']))

        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                negative_item = np.random.choice(all_productIds)
                # resample until the pair is not a known interaction
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_productIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

In [None]:
class NCF(pl.LightningModule):
    """Neural Collaborative Filtering (NCF) model.

    A user id and an item id are each looked up in an 8-dimensional embedding
    table. The two embeddings are concatenated, passed through two ReLU dense
    layers (16 -> 64 -> 32) and mapped by a sigmoid unit to a single score.

    Args:
        num_users (int): Number of unique users
        num_items (int): Number of unique items
        ratings (pd.DataFrame): Dataframe containing the product ratings for training
        all_productIds (list): List containing all productIds (train + test)
    """

    def __init__(self, num_users, num_items, ratings, all_productIds):
        super().__init__()
        # Embedding tables
        self.user_embedding = nn.Embedding(num_users, 8)
        self.item_embedding = nn.Embedding(num_items, 8)
        # Dense tower on top of the concatenated embeddings
        self.fc1 = nn.Linear(16, 64)
        self.fc2 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)
        # Kept so train_dataloader can build the training dataset
        self.ratings = ratings
        self.all_productIds = all_productIds

    def forward(self, user_input, item_input):
        """Score each (user, item) id pair; the output lies in (0, 1)."""
        user_vec = self.user_embedding(user_input)
        item_vec = self.item_embedding(item_input)

        # Join the two embeddings along the feature axis
        hidden = torch.cat((user_vec, item_vec), dim=-1)

        # Two dense layers, each followed by ReLU
        for dense in (self.fc1, self.fc2):
            hidden = torch.relu(dense(hidden))

        return torch.sigmoid(self.output(hidden))

    def training_step(self, batch, batch_idx):
        """Binary cross-entropy between predicted scores and the 0/1 labels."""
        user_input, item_input, labels = batch
        targets = labels.view(-1, 1).float()
        scores = self(user_input, item_input)
        return nn.functional.binary_cross_entropy(scores, targets)

    def configure_optimizers(self):
        """Adam with its default hyper-parameters."""
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        """Build a new ProductTrainDataset and wrap it in a DataLoader."""
        dataset = ProductTrainDataset(self.ratings, self.all_productIds)
        return DataLoader(dataset, batch_size=512, num_workers=4)

In [None]:
# Ids are dense zero-based codes, so the largest id plus one is the table size
num_users = 1 + ratings['userId'].max()
num_items = 1 + ratings['productId'].max()

# Pool of product ids (train + test) that negatives are sampled from
all_productIds = pd.unique(ratings['productId'])

model = NCF(num_users=num_users,
            num_items=num_items,
            ratings=train_ratings,
            all_productIds=all_productIds)

In [None]:
# Sizes of the user and item embedding tables
print('{} {}'.format(num_users, num_items))

515650 30252


In [None]:
# The train dataloader is rebuilt every epoch; because NCF.train_dataloader
# constructs a new ProductTrainDataset, this re-samples the negatives each epoch.
# The deprecated Trainer arguments are replaced by their current equivalents:
#   reload_dataloaders_every_epoch=True -> reload_dataloaders_every_n_epochs=1
#   progress_bar_refresh_rate=50        -> TQDMProgressBar(refresh_rate=50)
#   checkpoint_callback=False           -> enable_checkpointing=False
trainer = pl.Trainer(max_epochs=10, gpus=0, reload_dataloaders_every_n_epochs=1,
                     callbacks=[pl.callbacks.TQDMProgressBar(refresh_rate=50)],
                     logger=False, enable_checkpointing=False)

trainer.fit(model)

  f"Setting `Trainer(checkpoint_callback={checkpoint_callback})` is deprecated in v1.5 and will "
  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
  "`reload_dataloaders_every_epoch` is deprecated in v1.4 and will be removed in v1.6."
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 4.1 M 
1 | item_embedding | Embedding | 242 K 
2 | fc1            | Linear    | 1.1 K 
3 | fc2            | Linear    | 2.1 K 
4 | output         | Linear    | 33    
---------------------------------------------
4.4 M     Trainable params
0         Non-trainable params
4.4 M     Total params
17.482    Total estimated model params size (MB)
  cpuset_checked))
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [None]:
# User-item pairs for testing
test_user_item_set = set(zip(test_ratings['userId'], test_ratings['productId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('userId')['productId'].apply(list).to_dict()

# Loop-invariant: build the full product set once instead of once per user
all_product_set = set(all_productIds)
num_sampled_negatives = 99  # negatives ranked against each held-out item
top_k = 10                  # a hit means the held-out item is in the top 10

hits = []
with torch.no_grad():  # inference only, no autograd graph is needed
    for (u, i) in test_user_item_set:
        not_interacted_items = list(all_product_set - set(user_interacted_items[u]))

        # Draw *distinct* negatives (replace=False); the size is capped so the
        # draw cannot fail when fewer candidates than requested are available.
        sample_size = min(num_sampled_negatives, len(not_interacted_items))
        selected_not_interacted = np.random.choice(
            not_interacted_items, sample_size, replace=False).tolist()
        test_items = selected_not_interacted + [i]

        # One score per candidate item, flattened to 1-D
        predicted_labels = model(torch.tensor([u] * len(test_items)),
                                 torch.tensor(test_items)).numpy().reshape(-1)

        # Positions of the highest-scoring candidates, best first
        best_positions = np.argsort(predicted_labels)[::-1][:top_k].tolist()
        top10_items = [test_items[pos] for pos in best_positions]

        hits.append(1 if i in top10_items else 0)

print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))

The Hit Ratio @ 10 is 0.12


In [None]:
# max_epoch = 5, hit ratio = .7
# max_epoch = 7, hit ratio = .72
# max_epoch = 20, hit ratio = .68

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f7184e04-b640-4b25-92e7-fbfbadff1c0e' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>