In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory,
# one path per line, in the order os.walk visits them.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movielens-20m-dataset/rating.csv
/kaggle/input/movielens-20m-dataset/link.csv
/kaggle/input/movielens-20m-dataset/genome_tags.csv
/kaggle/input/movielens-20m-dataset/genome_scores.csv
/kaggle/input/movielens-20m-dataset/tag.csv
/kaggle/input/movielens-20m-dataset/movie.csv


In [3]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import torch 
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl

# Seed every random number generator this notebook relies on so that a
# Restart & Run All reproduces the same sample, negatives and initial weights.
SEED = 123
np.random.seed(SEED)     # NumPy global RNG: np.random.choice calls below
torch.manual_seed(SEED)  # torch RNG: nn.Embedding / nn.Linear weight initialisation



In [4]:
# Load the MovieLens 20M ratings file; the 'timestamp' column is parsed into datetimes.
RATINGS_CSV_PATH = '/kaggle/input/movielens-20m-dataset/rating.csv'
ratings = pd.read_csv(RATINGS_CSV_PATH, parse_dates=['timestamp'])

In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [6]:
ratings.shape

(20000263, 4)

In [7]:
len(ratings['userId'].unique())*0.3

41547.9

In [8]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
 #   Column     Dtype         
---  ------     -----         
 0   userId     int64         
 1   movieId    int64         
 2   rating     float64       
 3   timestamp  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 610.4 MB


In [9]:
# Randomly keep 30% of the users (sampled without replacement) together with
# all of their ratings; `ratings` is rebound to the reduced frame.
unique_userIds = ratings['userId'].unique()
sample_size = int(len(unique_userIds)*0.3)
rand_userIds = np.random.choice(unique_userIds, size=sample_size, replace=False)

is_sampled_user = ratings['userId'].isin(rand_userIds)
ratings = ratings.loc[is_sampled_user]
print('There are {} rows of data from {} users'.format(len(ratings),len(rand_userIds)))

There are 6027314 rows of data from 41547 users


In [10]:
ratings.shape

(6027314, 4)

In [11]:
ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [12]:
ratings.sample(10)

Unnamed: 0,userId,movieId,rating,timestamp
3840312,26182,3704,4.0,2007-01-31 21:56:52
7608731,52439,3365,4.0,2004-03-21 08:02:56
19363634,134060,1027,3.0,2003-07-15 22:43:45
17181947,118860,2629,1.0,2007-11-29 21:27:08
9344779,64638,4723,2.0,2001-09-10 20:11:41
10356404,71637,3882,0.5,2004-02-20 01:48:53
962965,6460,11,4.0,1996-10-22 12:53:32
12014375,82949,54001,4.5,2010-08-06 16:39:25
10532053,72855,7158,2.5,2011-07-25 01:07:58
18710288,129551,1228,4.5,2007-06-28 00:17:42


In [13]:
ratings[(ratings['userId']==3)]

Unnamed: 0,userId,movieId,rating,timestamp
236,3,1,4.0,1999-12-11 13:36:47
237,3,24,3.0,1999-12-14 12:54:08
238,3,32,4.0,1999-12-11 13:14:07
239,3,50,5.0,1999-12-11 13:13:38
240,3,160,3.0,1999-12-14 12:54:08
...,...,...,...,...
418,3,3070,4.0,1999-12-11 13:42:09
419,3,3072,4.0,1999-12-11 13:22:47
420,3,3098,4.0,1999-12-11 13:21:08
421,3,3142,4.0,1999-12-11 13:02:28


In [14]:
# Rank each user's ratings by timestamp, newest first: rank 1 is the user's most
# recent rating. method='first' gives tied timestamps distinct ranks.
timestamps_by_user = ratings.groupby(['userId'])['timestamp']
ratings['rank_latest'] = timestamps_by_user.rank(method='first', ascending=False)

In [15]:
# Leave-one-out split: each user's most recent rating (rank_latest == 1) is held
# out for testing, everything else is used for training.
is_latest = ratings['rank_latest'] == 1
train_ratings = ratings[~is_latest]
test_ratings = ratings[is_latest]

# drop columns that we no longer need
kept_columns = ['userId', 'movieId', 'rating']
train_ratings = train_ratings[kept_columns]
test_ratings = test_ratings[kept_columns]

In [16]:
# Overwrite every training rating with the constant 1: the explicit score is
# discarded and each remaining row only records that the user rated the movie.
train_ratings.loc[:, 'rating'] = 1

# Display five random rows of the updated frame.
train_ratings.sample(5)

Unnamed: 0,userId,movieId,rating
3411906,23263,5481,1.0
1815983,12245,5464,1.0
16198592,112109,6,1.0
13914487,96124,1247,1.0
1445807,9790,1690,1.0


In [17]:
# Build (user, item, label) triples at notebook level: every observed training
# pair is a positive (label 1) followed by `num_negatives` sampled negatives
# (label 0) for the same user.
num_negatives=4
all_movieIds = ratings['movieId'].unique()
user_item_set = set(zip(train_ratings['userId'],train_ratings['movieId']))
users,items,labels = [],[],[]

for u, i in tqdm(user_item_set):
    # the observed interaction itself
    users.append(u)
    items.append(i)
    labels.append(1)

    negatives_drawn = 0
    while negatives_drawn < num_negatives:
        negative_item = np.random.choice(all_movieIds)
        if (u, negative_item) in user_item_set:
            continue  # the user has a training interaction with this movie: redraw
        users.append(u)
        items.append(negative_item)
        labels.append(0) # items not interacted with are negative
        negatives_drawn += 1

  0%|          | 0/5985767 [00:00<?, ?it/s]

In [18]:
class MovieLensDataset(Dataset):
    """PyTorch Dataset of (user, item, label) triples with sampled negatives.

    Every distinct (userId, movieId) pair found in `ratings` becomes a positive
    example (label 1) and is followed by `num_negatives` negative examples
    (label 0) for the same user, drawn at random from `all_movieIds`.

    Args:
        ratings (pd.DataFrame): Frame with 'userId' and 'movieId' columns; each
            row is treated as an observed interaction.
        all_movieIds (sequence): Candidate movie ids that negatives are drawn from.
        num_negatives (int): Number of negatives sampled per positive (default 4).
    """

    def __init__(self, ratings, all_movieIds, num_negatives=4):
        self.users, self.items, self.labels = self.get_dataset(
            ratings, all_movieIds, num_negatives)

    def __len__(self):
        """Return the total number of examples (positives plus negatives)."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the (user, item, label) tensors stored at position `idx`."""
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_movieIds, num_negatives=4):
        """Materialise the positive and negative examples as three tensors.

        Negatives are found by rejection sampling with `np.random.choice`: a
        candidate is redrawn while the (user, candidate) pair is an observed
        interaction. The loop therefore does not terminate for a user who has
        interacted with every id in `all_movieIds`.

        Returns:
            tuple: (users, items, labels) tensors of equal length.
        """
        users, items, labels = [], [], []
        # Built from the `ratings` argument so the dataset depends only on what
        # the caller passes in, not on notebook-level variables.
        user_item_set = set(zip(ratings['userId'], ratings['movieId']))
        for (u, i) in tqdm(user_item_set):
            users.append(u)
            items.append(i)
            labels.append(1)  # observed interaction -> positive
            for _ in range(num_negatives):
                negative_item = np.random.choice(all_movieIds)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_movieIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)  # sampled, not interacted with -> negative
        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

In [19]:
class NCF(pl.LightningModule):
    """Neural collaborative filtering model trained on implicit feedback.

    A user id and an item id are each looked up in an 8-dimensional embedding
    table; the two embeddings are concatenated and passed through a
    16 -> 64 -> 32 -> 1 MLP with ReLU activations and a sigmoid output.

    Args:
        num_users (int): Number of rows in the user embedding table.
        num_items (int): Number of rows in the item embedding table.
        ratings (pd.DataFrame): Frame handed to MovieLensDataset for training.
        all_movieIds (sequence): Movie ids handed to MovieLensDataset.
    """

    def __init__(self,num_users,num_items,ratings,all_movieIds):
        super().__init__()

        # Inputs kept so train_dataloader can build the dataset on demand.
        self.ratings = ratings
        self.all_movieIds = all_movieIds

        # Layers are created in a fixed order: embeddings first, then the MLP.
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)

    def forward(self,user_input,item_input):
        """Return the sigmoid interaction score for each (user, item) pair."""
        features = torch.cat(
            (self.user_embedding(user_input), self.item_embedding(item_input)),
            dim=-1,
        )
        hidden = torch.relu(self.fc1(features))
        hidden = torch.relu(self.fc2(hidden))
        return torch.sigmoid(self.output(hidden))

    def training_step(self,batch,batch_idx):
        """Compute the binary cross-entropy loss for one batch."""
        batch_users, batch_items, batch_labels = batch
        scores = self(batch_users, batch_items)
        # Targets are reshaped to a column and cast to float to match the scores.
        targets = batch_labels.view(-1,1).float()
        return nn.functional.binary_cross_entropy(scores, targets)

    def configure_optimizers(self):
        """Optimise all parameters with Adam at its default settings."""
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        """Build the training DataLoader (batches of 512, 4 worker processes)."""
        train_dataset = MovieLensDataset(self.ratings, self.all_movieIds)
        return DataLoader(train_dataset, batch_size=512, num_workers=4)

In [20]:
# The embedding tables are indexed by the raw ids, so each one needs
# (largest id + 1) rows.
user_id_column = ratings['userId']
movie_id_column = ratings['movieId']
num_users = user_id_column.max()+1
num_items = movie_id_column.max()+1
all_movieIds = movie_id_column.unique()

model = NCF(num_users=num_users,
            num_items=num_items,
            ratings=train_ratings,
            all_movieIds=all_movieIds)

In [21]:
# Train for 5 epochs on a single GPU, with logging, checkpointing and
# per-epoch dataloader reloading all switched off.
trainer_options = dict(
    max_epochs=5,
    accelerator="gpu",
    devices=1,
    reload_dataloaders_every_n_epochs=False,
    logger=False,
    enable_checkpointing=False,
)
trainer = pl.Trainer(**trainer_options)

trainer.fit(model)

  0%|          | 0/5985767 [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

In [22]:
# User-item pairs for testing
test_user_item_set = set(zip(test_ratings['userId'], test_ratings['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()

hits = []
for (u,i) in tqdm(test_user_item_set):
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(all_movieIds) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]
    
    predicted_labels = np.squeeze(model(torch.tensor([u]*100), 
                                        torch.tensor(test_items)).detach().numpy())
    
    top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    
    if i in top10_items:
        hits.append(1)
    else:
        hits.append(0)
        
print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))

  0%|          | 0/41547 [00:00<?, ?it/s]

The Hit Ratio @ 10 is 0.86


In [28]:
# len(hits)
# test_ratings['userId'].head()
1 in hits

True

In [25]:
predicted_labels[0]

1.0

In [31]:
# Spot-check a single held-out pair taken from test_user_item_set:
# user 98955 with held-out movie 47.
sample_user, sample_item = 98955, 47

interacted_items = user_interacted_items[sample_user]
not_interacted_items = set(all_movieIds) - set(interacted_items)
selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
test_items = selected_not_interacted + [sample_item]
predicted_labels = np.squeeze(model(torch.tensor([sample_user]*len(test_items)),
                                    torch.tensor(test_items)).detach().numpy())
top10_items = [test_items[idx] for idx in np.argsort(predicted_labels)[::-1][0:10].tolist()]

# Test this cell's own target item, not a loop variable left over from another cell.
tmp_hits = [1 if sample_item in top10_items else 0]

In [33]:
# np.average(tmp_hits)
top10_items

[47, 903, 806, 1228, 1057, 2971, 2246, 545, 3609, 62925]

In [30]:
predicted_labels

array([9.38959577e-09, 3.34162231e-09, 1.46210068e-05, 8.30846503e-12,
       7.11498491e-04, 3.21978798e-11, 6.58003148e-04, 1.61951754e-11,
       2.83099827e-04, 3.35523259e-06, 6.08864248e-01, 1.04154809e-04,
       8.49854987e-05, 1.41974757e-04, 2.64702074e-04, 1.13151699e-09,
       1.63983032e-06, 1.34268485e-09, 3.06650905e-09, 5.07726043e-07,
       6.53094457e-06, 3.23390935e-07, 4.33634763e-04, 1.20917548e-07,
       4.23508277e-12, 5.50746462e-11, 1.67590065e-06, 2.04662474e-08,
       5.31545595e-07, 3.14185381e-05, 4.08313717e-05, 4.16505682e-05,
       2.81955477e-06, 2.18869711e-04, 3.65909000e-05, 2.28452041e-06,
       1.91024796e-07, 3.18106563e-09, 3.25199676e-06, 8.27643376e-09,
       2.61677724e-05, 4.63186929e-08, 3.09639677e-06, 2.72980833e-06,
       9.31312456e-07, 3.72679838e-12, 7.92313731e-05, 9.54229745e-06,
       5.86095774e-14, 2.65322146e-08, 2.68855347e-05, 9.45594195e-07,
       1.44642104e-06, 1.60281015e-05, 2.86491997e-10, 1.16094341e-07,
      

In [23]:
test_user_item_set

{(122444, 1500),
 (11928, 104),
 (77010, 6016),
 (78703, 1213),
 (43434, 858),
 (18706, 77240),
 (95349, 2396),
 (133568, 31431),
 (134458, 95510),
 (26714, 665),
 (134580, 3821),
 (22990, 6711),
 (136965, 63082),
 (20798, 1967),
 (100889, 78499),
 (109222, 21),
 (80016, 49530),
 (116575, 1278),
 (131033, 1623),
 (7633, 2335),
 (113555, 95),
 (105790, 25),
 (92608, 224),
 (26176, 208),
 (77478, 2455),
 (15302, 1384),
 (117992, 97304),
 (30623, 1350),
 (8699, 36),
 (23766, 7090),
 (59442, 1353),
 (101639, 1281),
 (98955, 47),
 (105018, 1616),
 (96933, 1376),
 (102375, 383),
 (110145, 3271),
 (66573, 1285),
 (19484, 8644),
 (77122, 109374),
 (119272, 5349),
 (55724, 8907),
 (108673, 314),
 (50838, 26562),
 (109044, 2701),
 (56888, 3760),
 (12612, 2183),
 (25440, 2395),
 (62897, 102033),
 (1100, 39),
 (37822, 4772),
 (50761, 1617),
 (70017, 77455),
 (64590, 448),
 (73865, 7150),
 (12162, 1573),
 (47165, 6882),
 (4006, 5065),
 (134902, 34),
 (130896, 48061),
 (79332, 7153),
 (104013, 99387