In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from torchvision import datasets
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
import json

import numpy as np
import pandas as pd
import torch
from torch import nn
from tqdm import tqdm


## Data preprocessing

In [2]:
print("Downloading movielens data...")
from urllib.request import urlretrieve
import zipfile

# Download the data
urlretrieve("http://files.grouplens.org/datasets/movielens/ml-20m.zip", "movielens.zip")
zip_ref = zipfile.ZipFile('movielens.zip', "r")
zip_ref.extractall()


Downloading movielens data...


In [3]:
# load in the data
df = pd.read_csv('ml-20m/ratings.csv')

In [4]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [5]:
bs = 128
reg = 0. # regularization penalty


In [6]:
# Customize dataset
N = df.userId.max() + 1 # number of users
M = df.movieId.max() + 1 # number of movies

# split into train and test
df = shuffle(df)
df = df.iloc[:1000000]
cutoff = int(0.8*len(df))
cutoffeval = int(0.9*len(df))
df_train = df.iloc[:cutoff]
df_eval = df.iloc[cutoff:cutoffeval]
df_test = df.iloc[cutoffeval:]

# initialize variables
K = 10 # latent dimensionality
mu = df_train.rating.mean()


In [7]:
N,M,K,len(df_train),len(df), mu

(138494, 131263, 10, 800000, 1000000, 3.5261075)

In [8]:
class ratingdataset(Dataset):
  def __init__(self, df):
    self.df = df
    self.u = df.userId.values
    self.m = df.movieId.values
    self.r = df.rating.values
    self.N = self.u.max() + 1
    self.M = self.m.max() + 1

  def __len__(self):
    return len(self.u)
  
  def __getitem__(self, idx):
    return [self.u[idx], self.m[idx]], self.r[idx]


In [9]:
ratingData_train = ratingdataset(df_train)
ratingData_eval = ratingdataset(df_eval)
ratingData_test = ratingdataset(df_test)

In [10]:
train_dataloader = DataLoader(ratingData_train, batch_size=bs, shuffle=True)
eval_dataloader = DataLoader(ratingData_eval, batch_size=bs, shuffle=True)
test_dataloader = DataLoader(ratingData_test, batch_size=bs, shuffle=True)

In [11]:
train_dataloader, eval_dataloader, test_dataloader

(<torch.utils.data.dataloader.DataLoader at 0x7f85b2e24b90>,
 <torch.utils.data.dataloader.DataLoader at 0x7f85b2e8dc10>,
 <torch.utils.data.dataloader.DataLoader at 0x7f85b2e8d750>)

In [12]:
# # direct data
# users = torch.tensor(df_train.userId.values)
# items = torch.tensor(df_train.movieId.values)
# ratings = torch.tensor(df_train.rating.values)

In [13]:
class UserItemEmbeddingNNNet(torch.nn.Module):
    def __init__(self, n_users, n_items, k_factors, embedding_dropout = 0.02, hidden = 10, dropouts = 0.2):

        super(UserItemEmbeddingNNNet, self).__init__()
        self.u = torch.nn.Embedding(n_users, k_factors)
        self.m = torch.nn.Embedding(n_items, k_factors)
        self.u_bias = torch.nn.Embedding(n_users, 1)
        self.m_bias = torch.nn.Embedding(n_items, 1)
        self.lout = torch.nn.Linear(2*k_factors, 400)
        self.act = torch.nn.ReLU()
        self.out = torch.nn.Linear(400,1)


    def forward(self, users, items):
        uembed = self.u(users)
        membed = self.m(items)
        ubias = self.u_bias(users)
        mbias = self.m_bias(items)
        umdot = torch.mul(uembed,membed)
        umdot = torch.sum(umdot,1)
        umdot = torch.reshape(umdot, (umdot.shape[0], 1))
        #umcat = torch.cat((uembed.clone().detach(),membed.clone().detach()), 1)
        umcat = torch.cat((uembed,membed), 1)
        umcat = self.lout(umcat)
        umcat = self.act(umcat)
        umcat = self.out(umcat)
        output = torch.add(umdot, umcat)
        # print(output[0], umdot[0], umcat[0])
        output = torch.add(output, ubias)
        output = torch.add(output, mbias)
        # or use: result = torch.sum(torch.stack([x, y, ...]), dim=0)
        output = torch.flatten(output)
        return output

UserItemEmbeddingNNNet = UserItemEmbeddingNNNet(N, M, K)
print(UserItemEmbeddingNNNet)

UserItemEmbeddingNNNet(
  (u): Embedding(138494, 10)
  (m): Embedding(131263, 10)
  (u_bias): Embedding(138494, 1)
  (m_bias): Embedding(131263, 1)
  (lout): Linear(in_features=20, out_features=400, bias=True)
  (act): ReLU()
  (out): Linear(in_features=400, out_features=1, bias=True)
)


In [14]:
epochs = 10
lr = 0.05
min_valid_loss = np.inf
loss_func = torch.nn.MSELoss()

def train_loop(epoch, min_valid_loss, loss_func, model):
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    train_loss = 0.0
    model.train()
    
    for i, data in tqdm(enumerate(train_dataloader)):
        inputs, labels = data
        if torch.cuda.is_available():
            inputs, labels = inputs.cuda(), labels.cuda()
        users, items = inputs[0], inputs[1]

        optimizer.zero_grad()
        target = model(users, items)
        loss = loss_func(target.float(), labels.float())
        #loss = loss_func(target, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    valid_loss = 0.0
    model.eval()     # Optional when not using model Specific layer
    for data in eval_dataloader:
        inputs, labels = data
        if torch.cuda.is_available():
            inputs, labels =inputs.cuda(), labels.cuda()
        users, items = inputs[0], inputs[1]
        target = model(users, items)
        loss = loss_func(target.float(), labels.float())
        valid_loss += loss.item() * len(inputs)

    if epoch%1==0:
        print(f'Epoch {epoch+1} \t\t Training Loss: {train_loss, train_loss / len(train_dataloader)} \t\t Validation Loss: {valid_loss, valid_loss / len(eval_dataloader)}')
    if min_valid_loss > valid_loss:
        print(f"Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss:.6f})")
        min_valid_loss = valid_loss
        # Saving State Dict
        torch.save(model.state_dict(), 'saved_model.pth')
    return min_valid_loss
 

In [15]:
for epoch in range(epochs):
    min_valid_loss = train_loop(epoch, min_valid_loss, loss_func, UserItemEmbeddingNNNet)

6250it [03:48, 27.31it/s]


Epoch 1 		 Training Loss: (19435.097393989563, 3.1096155830383303) 		 Validation Loss: (3723.798581838608, 4.761890769614588)
Validation Loss Decreased(inf--->3723.798582)


6250it [03:51, 27.02it/s]


Epoch 2 		 Training Loss: (13724.028740286827, 2.1958445984458925) 		 Validation Loss: (3211.930772781372, 4.107328353940374)
Validation Loss Decreased(3723.798582--->3211.930773)


6250it [03:48, 27.39it/s]


Epoch 3 		 Training Loss: (11633.094459533691, 1.8612951135253906) 		 Validation Loss: (3036.2382473945618, 3.8826576053638897)
Validation Loss Decreased(3211.930773--->3036.238247)


6250it [03:37, 28.74it/s]


Epoch 4 		 Training Loss: (10302.01851093769, 1.6483229617500306) 		 Validation Loss: (2608.7273111343384, 3.335968428560535)
Validation Loss Decreased(3036.238247--->2608.727311)


6250it [03:26, 30.28it/s]


Epoch 5 		 Training Loss: (9355.773501038551, 1.4969237601661682) 		 Validation Loss: (2553.3668506145477, 3.2651750007858666)
Validation Loss Decreased(2608.727311--->2553.366851)


6250it [03:37, 28.68it/s]


Epoch 6 		 Training Loss: (8662.600610435009, 1.3860160976696014) 		 Validation Loss: (2359.6794426441193, 3.017492893406802)
Validation Loss Decreased(2553.366851--->2359.679443)


6250it [03:04, 33.84it/s]


Epoch 7 		 Training Loss: (8097.742522656918, 1.2956388036251067) 		 Validation Loss: (2219.2110756635666, 2.837865825656735)
Validation Loss Decreased(2359.679443--->2219.211076)


6250it [02:29, 41.77it/s]


Epoch 8 		 Training Loss: (7648.609627246857, 1.223777540359497) 		 Validation Loss: (2150.5205614566803, 2.7500262934228648)
Validation Loss Decreased(2219.211076--->2150.520561)


6250it [02:28, 41.95it/s]


Epoch 9 		 Training Loss: (7234.131665945053, 1.1574610665512084) 		 Validation Loss: (2086.32681453228, 2.6679371029824552)
Validation Loss Decreased(2150.520561--->2086.326815)


6250it [02:29, 41.89it/s]


Epoch 10 		 Training Loss: (6953.18362724781, 1.1125093803596497) 		 Validation Loss: (2044.7947500944138, 2.6148270461565395)
Validation Loss Decreased(2086.326815--->2044.794750)


In [16]:
tusers = torch.tensor(df_test.userId.values)
titems = torch.tensor(df_test.movieId.values)

In [17]:
UserItemEmbeddingNNNet.eval()
tpreds = UserItemEmbeddingNNNet(tusers, titems)

In [18]:
len(tpreds)

100000

In [19]:
tpredsfinal = tpreds.detach().numpy()
tpredsfinal = (tpredsfinal>3).astype(int)
tratings = df_test.rating.values
tratings = (tratings>3).astype(int)
tratings, tpredsfinal

(array([0, 0, 1, ..., 0, 1, 1]), array([0, 1, 1, ..., 1, 1, 1]))

In [20]:
f1_score(tpredsfinal, tratings)

0.7463350535518967

In [21]:
tratings = df_test.rating.values
tpredsfinal = tpreds.detach().numpy()
tpredsfinal = (tpredsfinal>2.5).astype(int)
tratings = (tratings>2.5).astype(int)
f1_score(tpredsfinal, tratings)

0.881347013943699