In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from torchvision import datasets
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import ndcg_score
import json

import numpy as np
import pandas as pd
import torch
from torch import nn
from tqdm import tqdm


## Data preprocessing

In [2]:
print("Downloading movielens data...")
from urllib.request import urlretrieve
import zipfile

# Download the data
urlretrieve("http://files.grouplens.org/datasets/movielens/ml-20m.zip", "movielens.zip")
zip_ref = zipfile.ZipFile('movielens.zip', "r")
zip_ref.extractall()


Downloading movielens data...


In [3]:
# load in the data
df = pd.read_csv('ml-20m/ratings.csv')

In [4]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [5]:
bs = 128
reg = 0. # regularization penalty


In [6]:
# Customize dataset
N = df.userId.max() + 1 # number of users
M = df.movieId.max() + 1 # number of movies

# split into train and test
df = shuffle(df)
df = df.iloc[:1000000]
cutoff = int(0.8*len(df))
cutoffeval = int(0.9*len(df))
df_train = df.iloc[:cutoff]
df_eval = df.iloc[cutoff:cutoffeval]
df_test = df.iloc[cutoffeval:]

# initialize variables
K = 10 # latent dimensionality
mu = df_train.rating.mean()


In [7]:
N,M,K,len(df_train),len(df), mu

(138494, 131263, 10, 800000, 1000000, 3.52558625)

In [8]:
class ratingdataset(Dataset):
  def __init__(self, df):
    self.df = df
    self.u = df.userId.values
    self.m = df.movieId.values
    self.r = df.rating.values
    self.N = self.u.max() + 1
    self.M = self.m.max() + 1

  def __len__(self):
    return len(self.u)
  
  def __getitem__(self, idx):
    return [self.u[idx], self.m[idx]], self.r[idx]


In [9]:
ratingData_train = ratingdataset(df_train)
ratingData_eval = ratingdataset(df_eval)
ratingData_test = ratingdataset(df_test)

In [10]:
train_dataloader = DataLoader(ratingData_train, batch_size=bs, shuffle=True)
eval_dataloader = DataLoader(ratingData_eval, batch_size=bs, shuffle=True)
test_dataloader = DataLoader(ratingData_test, batch_size=bs, shuffle=True)

In [11]:
train_dataloader, eval_dataloader, test_dataloader

(<torch.utils.data.dataloader.DataLoader at 0x7efde694c310>,
 <torch.utils.data.dataloader.DataLoader at 0x7efde694c250>,
 <torch.utils.data.dataloader.DataLoader at 0x7efde694c510>)

In [12]:
# # direct data
# users = torch.tensor(df_train.userId.values)
# items = torch.tensor(df_train.movieId.values)
# ratings = torch.tensor(df_train.rating.values)

## Model

In [13]:
, models
class UserItemEmbeddingNNNet(torch.nn.Module):
    def __init__(self, n_users, n_items, k_factors):

        super(UserItemEmbeddingNNNet, self).__init__()
        self.u = torch.nn.Embedding(n_users, k_factors)
        self.m = torch.nn.Embedding(n_items, k_factors)
        self.u_bias = torch.nn.Embedding(n_users, 1)
        self.m_bias = torch.nn.Embedding(n_items, 1)
        self.lout = torch.nn.Linear(2*k_factors, 400)
        self.act = torch.nn.ReLU()
        self.out = torch.nn.Linear(400,1)


    def forward(self, users, items):
        uembed = self.u(users)
        membed = self.m(items)
        ubias = self.u_bias(users)
        mbias = self.m_bias(items)
        umdot = torch.mul(uembed,membed)
        umdot = torch.sum(umdot,1)
        umdot = torch.reshape(umdot, (umdot.shape[0], 1))
        # umcat = torch.cat((uembed.clone().detach(),membed.clone().detach()), 1)
        umcat = torch.cat((uembed,membed), 1)
        umcat = self.lout(umcat)
        umcat = self.act(umcat)
        umcat = self.out(umcat)
        output = torch.add(umdot, umcat)
        # output = umdot
        output = torch.add(output, ubias)
        output = torch.add(output, mbias)
        # or use: result = torch.sum(torch.stack([x, y, ...]), dim=0)
        output = torch.flatten(output)
        return output

model = UserItemEmbeddingNNNet(N, M, K)
print(model)

UserItemEmbeddingNNNet(
  (u): Embedding(138494, 10)
  (m): Embedding(131263, 10)
  (u_bias): Embedding(138494, 1)
  (m_bias): Embedding(131263, 1)
  (lout): Linear(in_features=20, out_features=400, bias=True)
  (act): ReLU()
  (out): Linear(in_features=400, out_features=1, bias=True)
)


In [14]:
epochs = 30
lr = 0.05
min_valid_loss = np.inf
loss_func = torch.nn.MSELoss()

def train_loop(epoch, min_valid_loss, loss_func, model):
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    train_loss = 0.0
    model.train()
    
    for i, data in tqdm(enumerate(train_dataloader)):
        inputs, labels = data
        if torch.cuda.is_available():
            inputs, labels = inputs.cuda(), labels.cuda()
        users, items = inputs[0], inputs[1]

        optimizer.zero_grad()
        target = model(users, items)
        loss = loss_func(target.float(), labels.float())
        #loss = loss_func(target, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    valid_loss = 0.0
    model.eval()     # Optional when not using model Specific layer
    for data in eval_dataloader:
        inputs, labels = data
        if torch.cuda.is_available():
            inputs, labels =inputs.cuda(), labels.cuda()
        users, items = inputs[0], inputs[1]
        target = model(users, items)
        loss = loss_func(target.float(), labels.float())
        valid_loss += loss.item() * len(inputs)

    if epoch%1==0:
        print(f'Epoch {epoch+1} \t\t Training Loss: {train_loss, train_loss / len(train_dataloader)} \t\t Validation Loss: {valid_loss, valid_loss / len(eval_dataloader)}')
    if min_valid_loss > valid_loss:
        print(f"Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss:.6f})")
        min_valid_loss = valid_loss
        # Saving State Dict
        torch.save(model.state_dict(), 'saved_model.pth')
    return min_valid_loss
 

## Training

In [15]:
for epoch in range(epochs):
    min_valid_loss = train_loop(epoch, min_valid_loss, loss_func, model)

6250it [00:59, 104.92it/s]


Epoch 1 		 Training Loss: (19104.721152305603, 3.0567553843688966) 		 Validation Loss: (3711.959711790085, 4.746751549603689)
Validation Loss Decreased(inf--->3711.959712)


6250it [00:58, 106.48it/s]


Epoch 2 		 Training Loss: (13618.610536575317, 2.178977685852051) 		 Validation Loss: (3129.3096187114716, 4.001674704234619)
Validation Loss Decreased(3711.959712--->3129.309619)


6250it [01:07, 92.54it/s] 


Epoch 3 		 Training Loss: (11582.888159513474, 1.8532621055221559) 		 Validation Loss: (2941.721385240555, 3.7617920527372823)
Validation Loss Decreased(3129.309619--->2941.721385)


6250it [01:00, 104.06it/s]


Epoch 4 		 Training Loss: (10242.486230134964, 1.6387977968215943) 		 Validation Loss: (2739.531768321991, 3.5032375553989654)
Validation Loss Decreased(2941.721385--->2739.531768)


6250it [01:00, 103.54it/s]


Epoch 5 		 Training Loss: (9318.935917913914, 1.4910297468662261) 		 Validation Loss: (2439.926154613495, 3.120110172140019)
Validation Loss Decreased(2739.531768--->2439.926155)


6250it [00:59, 105.10it/s]


Epoch 6 		 Training Loss: (8598.378878116608, 1.3757406204986573) 		 Validation Loss: (2304.460342168808, 2.9468802329524397)
Validation Loss Decreased(2439.926155--->2304.460342)


6250it [01:02, 99.85it/s] 


Epoch 7 		 Training Loss: (8066.823098361492, 1.2906916957378387) 		 Validation Loss: (2293.6280349493027, 2.9330281776845304)
Validation Loss Decreased(2304.460342--->2293.628035)


6250it [01:00, 102.50it/s]


Epoch 8 		 Training Loss: (7604.491394281387, 1.216718623085022) 		 Validation Loss: (2179.7109602689743, 2.787354169141911)
Validation Loss Decreased(2293.628035--->2179.710960)


6250it [01:01, 101.42it/s]


Epoch 9 		 Training Loss: (7246.4050016999245, 1.159424800271988) 		 Validation Loss: (2079.468311905861, 2.659166639265807)
Validation Loss Decreased(2179.710960--->2079.468312)


6250it [01:01, 101.83it/s]


Epoch 10 		 Training Loss: (6913.38003629446, 1.1061408058071136) 		 Validation Loss: (2076.1534378528595, 2.654927669888567)
Validation Loss Decreased(2079.468312--->2076.153438)


6250it [01:01, 102.20it/s]


Epoch 11 		 Training Loss: (6656.185852944851, 1.0649897364711762) 		 Validation Loss: (1973.7055970430374, 2.5239202008222983)
Validation Loss Decreased(2076.153438--->1973.705597)


6250it [01:00, 103.38it/s]


Epoch 12 		 Training Loss: (6412.94257992506, 1.0260708127880096) 		 Validation Loss: (1917.1438838243484, 2.4515906442766604)
Validation Loss Decreased(1973.705597--->1917.143884)


6250it [01:00, 103.37it/s]


Epoch 13 		 Training Loss: (6196.944331109524, 0.9915110929775238) 		 Validation Loss: (1909.4216879606247, 2.4417157135046352)
Validation Loss Decreased(1917.143884--->1909.421688)


6250it [01:01, 102.34it/s]


Epoch 14 		 Training Loss: (6016.4717400074005, 0.9626354784011841) 		 Validation Loss: (1915.928909420967, 2.4500369685690115)


6250it [00:59, 104.21it/s]


Epoch 15 		 Training Loss: (5851.631177783012, 0.9362609884452819) 		 Validation Loss: (1844.4686324596405, 2.3586555402297193)
Validation Loss Decreased(1909.421688--->1844.468632)


6250it [01:00, 103.10it/s]


Epoch 16 		 Training Loss: (5700.662489384413, 0.9121059983015061) 		 Validation Loss: (1872.5964572429657, 2.394624625630391)


6250it [01:00, 103.43it/s]


Epoch 17 		 Training Loss: (5565.370525598526, 0.8904592840957641) 		 Validation Loss: (1830.5692011117935, 2.3408813313450043)
Validation Loss Decreased(1844.468632--->1830.569201)


6250it [01:00, 102.89it/s]


Epoch 18 		 Training Loss: (5443.453827142715, 0.8709526123428345) 		 Validation Loss: (1776.1281238794327, 2.271263585523571)
Validation Loss Decreased(1830.569201--->1776.128124)


6250it [01:00, 103.23it/s]


Epoch 19 		 Training Loss: (5319.191139161587, 0.8510705822658539) 		 Validation Loss: (1784.5261439085007, 2.282002741570973)


6250it [01:00, 102.75it/s]


Epoch 20 		 Training Loss: (5218.888403058052, 0.8350221444892884) 		 Validation Loss: (1766.8184914588928, 2.2593586847300418)
Validation Loss Decreased(1776.128124--->1766.818491)


6250it [01:00, 103.47it/s]


Epoch 21 		 Training Loss: (5116.717197060585, 0.8186747515296936) 		 Validation Loss: (1742.8624116182327, 2.228724311532267)
Validation Loss Decreased(1766.818491--->1742.862412)


6250it [01:01, 101.78it/s]


Epoch 22 		 Training Loss: (5019.731088101864, 0.8031569740962983) 		 Validation Loss: (1743.4509890079498, 2.2294769680408564)


6250it [01:00, 103.71it/s]


Epoch 23 		 Training Loss: (4936.521913647652, 0.7898435061836243) 		 Validation Loss: (1723.3388208150864, 2.203758082883742)
Validation Loss Decreased(1742.862412--->1723.338821)


6250it [01:00, 102.82it/s]


Epoch 24 		 Training Loss: (4856.360728472471, 0.7770177165555954) 		 Validation Loss: (1723.86923122406, 2.2044363570640155)


6250it [01:00, 103.22it/s]


Epoch 25 		 Training Loss: (4777.567641675472, 0.7644108226680756) 		 Validation Loss: (1690.3916329145432, 2.1616261290467302)
Validation Loss Decreased(1723.338821--->1690.391633)


6250it [01:01, 102.28it/s]


Epoch 26 		 Training Loss: (4708.327015191317, 0.7533323224306107) 		 Validation Loss: (1696.6924525499344, 2.1696834431584837)


6250it [01:00, 102.72it/s]


Epoch 27 		 Training Loss: (4634.938100129366, 0.7415900960206986) 		 Validation Loss: (1687.0881880521774, 2.1574017750027843)
Validation Loss Decreased(1690.391633--->1687.088188)


6250it [01:00, 102.60it/s]


Epoch 28 		 Training Loss: (4576.343933850527, 0.7322150294160843) 		 Validation Loss: (1673.5922901630402, 2.140143593558875)
Validation Loss Decreased(1687.088188--->1673.592290)


6250it [01:00, 103.11it/s]


Epoch 29 		 Training Loss: (4511.880024522543, 0.7219008039236069) 		 Validation Loss: (1697.8949502706528, 2.171221164028968)


6250it [01:01, 102.05it/s]


Epoch 30 		 Training Loss: (4451.84169331193, 0.7122946709299087) 		 Validation Loss: (1680.4866058826447, 2.148959854069878)


## Evaluation

### Will evaluate the performance of the model using nDCG@k metric.

In [16]:
tusers = torch.tensor(df_test.userId.values)
titems = torch.tensor(df_test.movieId.values)

In [17]:
# bestmodel = UserItemEmbeddingNNNet(N, M, K)
# bestmodel.load_state_dict(torch.load('saved_model.pth'))
# bestmodel.eval()
# tpredsbest = bestmodel(tusers, titems)

In [19]:
model.eval()
tpreds = model(tusers, titems)
tpreds = tpreds.detach().numpy()

In [20]:
tusersnp = tusers.detach().numpy()
titemsnp = titems.detach().numpy()
tratings = df_test.rating.values
tratings = (tratings*2).astype(int)

def getUmapImap(tusersnp, titemsnp, tratings, tpreds):
  usermap = {}
  itemmap = {}

  for umrp in zip(tusersnp, titemsnp, tratings, tpreds):
    u,m,r,p = umrp[0], umrp[1], umrp[2], umrp[3]
    if u not in usermap:
        usermap[u] = [(m,r,p)]
    else:
      usermap[u] += [(m,r,p)]

    if m not in itemmap:
        itemmap[m] = [(u,r,p)]
    else:
      itemmap[m] += [(u,r,p)]

  return usermap, itemmap

usermap, itemmap = getUmapImap(tusersnp, titemsnp, tratings, tpreds)

usercount = []
for u,v in usermap.items():
  usercount.append((u, len(v)))

itemcount = []
for u,v in itemmap.items():
  itemcount.append((u, len(v)))
usercount = sorted(usercount, key=lambda k: k[1])
itemcount = sorted(itemcount, key=lambda k: k[1])
print(usercount[51700], usercount[-1], len(usercount))
print(itemcount[6500], itemcount[-1], len(itemcount))


(92637, 10) (118205, 44) 52065
(3261, 11) (296, 340) 8448


#### In the test set, only ~500 user have rated more than 10 movies.
#### For the item part, nearly 2000 movies have been rated by more than 10 users.
#### We will do an nDCG@k metric evaluation for different thresholds.


> nDCG@k

In [21]:
def nDCGtk(usermap, threshold):

  nDCGatk = 0
  n = 0
  for u,v in usermap.items():
    if len(v) > threshold:
      labels = np.array([[m[1] for m in v]])
      preds = np.array([[m[2] for m in v]])
      nDCGatk += ndcg_score(labels, preds)
      n += 1
  return nDCGatk/n


print("for users who rated 5 movies or more: ", nDCGtk(usermap, 5))
print("for users who rated 10 movies or more: ", nDCGtk(usermap, 10))
print("for users who rated 20 movies or more: ", nDCGtk(usermap, 20))

for users who rated 5 movies or more:  0.9473262273576568
for users who rated 10 movies or more:  0.9377138965134775
for users who rated 20 movies or more:  0.9273918161562738


The score above is high because we didn't really do the nearest neighbour search in the full movie category, but just ranked on the movies that the users rated.

Test on an untrained model:

In [22]:
randommodel = UserItemEmbeddingNNNet(N, M, K)
tpredsrandom = randommodel(tusers, titems)
tpredsrandom = tpredsrandom.detach().numpy()

In [24]:
usermap, itemmap = getUmapImap(tusersnp, titemsnp, tratings, tpredsrandom)

print("(Untrained) for users who rated 5 movies or more: ", nDCGtk(usermap, 5))
print("(Untrained) for users who rated 10 movies or more: ", nDCGtk(usermap, 10))
print("(Untrained) for users who rated 50 movies or more: ", nDCGtk(usermap, 20))

(Untrained) for users who rated 5 movies or more:  0.9203125822466507
(Untrained) for users who rated 10 movies or more:  0.9136617249919924
(Untrained) for users who rated 50 movies or more:  0.9177193769121238
