In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from torchvision import datasets
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
import json

import numpy as np
import pandas as pd
import torch
from torch import nn
from tqdm import tqdm


## Data preprocessing

In [2]:
print("Downloading movielens data...")
from urllib.request import urlretrieve
import zipfile

# Download the data
urlretrieve("http://files.grouplens.org/datasets/movielens/ml-20m.zip", "movielens.zip")
zip_ref = zipfile.ZipFile('movielens.zip', "r")
zip_ref.extractall()


Downloading movielens data...


In [3]:
# load in the data
df = pd.read_csv('ml-20m/ratings.csv')

In [4]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [5]:
bs = 128
reg = 0. # regularization penalty


In [6]:
# Customize dataset
N = df.userId.max() + 1 # number of users
M = df.movieId.max() + 1 # number of movies

# split into train and test
df = shuffle(df)
df = df.iloc[:1000000]
cutoff = int(0.8*len(df))
cutoffeval = int(0.9*len(df))
df_train = df.iloc[:cutoff]
df_eval = df.iloc[cutoff:cutoffeval]
df_test = df.iloc[cutoffeval:]

# initialize variables
K = 10 # latent dimensionality
mu = df_train.rating.mean()


In [7]:
N,M,K,len(df_train),len(df), mu

(138494, 131263, 10, 800000, 1000000, 3.52350625)

In [8]:
class ratingdataset(Dataset):
  def __init__(self, df):
    self.df = df
    self.u = df.userId.values
    self.m = df.movieId.values
    self.r = df.rating.values
    self.N = self.u.max() + 1
    self.M = self.m.max() + 1

  def __len__(self):
    return len(self.u)
  
  def __getitem__(self, idx):
    return [self.u[idx], self.m[idx]], self.r[idx]


In [9]:
ratingData_train = ratingdataset(df_train)
ratingData_eval = ratingdataset(df_eval)
ratingData_test = ratingdataset(df_test)

In [10]:
train_dataloader = DataLoader(ratingData_train, batch_size=bs, shuffle=True)
eval_dataloader = DataLoader(ratingData_eval, batch_size=bs, shuffle=True)
test_dataloader = DataLoader(ratingData_test, batch_size=bs, shuffle=True)

In [11]:
train_dataloader, eval_dataloader, test_dataloader

(<torch.utils.data.dataloader.DataLoader at 0x7fb53f015e10>,
 <torch.utils.data.dataloader.DataLoader at 0x7fb53f015d50>,
 <torch.utils.data.dataloader.DataLoader at 0x7fb53f018090>)

In [12]:
# # direct data
# users = torch.tensor(df_train.userId.values)
# items = torch.tensor(df_train.movieId.values)
# ratings = torch.tensor(df_train.rating.values)

In [13]:
class UserItemEmbeddingNNNet(torch.nn.Module):
    def __init__(self, n_users, n_items, k_factors, embedding_dropout = 0.02, hidden = 10, dropouts = 0.2):

        super(UserItemEmbeddingNNNet, self).__init__()
        self.u = torch.nn.Embedding(n_users, k_factors)
        self.m = torch.nn.Embedding(n_items, k_factors)
        self.u_bias = torch.nn.Embedding(n_users, 1)
        self.m_bias = torch.nn.Embedding(n_items, 1)
        self.lout = torch.nn.Linear(2*k_factors, 400)
        self.act = torch.nn.ReLU()
        self.out = torch.nn.Linear(400,1)


    def forward(self, users, items):
        uembed = self.u(users)
        membed = self.m(items)
        ubias = self.u_bias(users)
        mbias = self.m_bias(items)
        umdot = torch.mul(uembed,membed)
        umdot = torch.sum(umdot,1)
        umdot = torch.reshape(umdot, (umdot.shape[0], 1))
        #umcat = torch.cat((uembed.clone().detach(),membed.clone().detach()), 1)
        umcat = torch.cat((uembed,membed), 1)
        umcat = self.lout(umcat)
        umcat = self.act(umcat)
        umcat = self.out(umcat)
        output = torch.add(umdot, umcat)
        # print(output[0], umdot[0], umcat[0])
        output = torch.add(output, ubias)
        output = torch.add(output, mbias)
        # or use: result = torch.sum(torch.stack([x, y, ...]), dim=0)
        output = torch.flatten(output)
        return output

UserItemEmbeddingNNNet = UserItemEmbeddingNNNet(N, M, K)
print(UserItemEmbeddingNNNet)

UserItemEmbeddingNNNet(
  (u): Embedding(138494, 10)
  (m): Embedding(131263, 10)
  (u_bias): Embedding(138494, 1)
  (m_bias): Embedding(131263, 1)
  (lout): Linear(in_features=20, out_features=400, bias=True)
  (act): ReLU()
  (out): Linear(in_features=400, out_features=1, bias=True)
)


In [14]:
epochs = 30
lr = 0.05
min_valid_loss = np.inf
loss_func = torch.nn.MSELoss()

def train_loop(epoch, min_valid_loss, loss_func, model):
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    train_loss = 0.0
    model.train()
    
    for i, data in tqdm(enumerate(train_dataloader)):
        inputs, labels = data
        if torch.cuda.is_available():
            inputs, labels = inputs.cuda(), labels.cuda()
        users, items = inputs[0], inputs[1]

        optimizer.zero_grad()
        target = model(users, items)
        loss = loss_func(target.float(), labels.float())
        #loss = loss_func(target, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    valid_loss = 0.0
    model.eval()     # Optional when not using model Specific layer
    for data in eval_dataloader:
        inputs, labels = data
        if torch.cuda.is_available():
            inputs, labels =inputs.cuda(), labels.cuda()
        users, items = inputs[0], inputs[1]
        target = model(users, items)
        loss = loss_func(target.float(), labels.float())
        valid_loss += loss.item() * len(inputs)

    if epoch%1==0:
        print(f'Epoch {epoch+1} \t\t Training Loss: {train_loss, train_loss / len(train_dataloader)} \t\t Validation Loss: {valid_loss, valid_loss / len(eval_dataloader)}')
    if min_valid_loss > valid_loss:
        print(f"Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss:.6f})")
        min_valid_loss = valid_loss
        # Saving State Dict
        torch.save(model.state_dict(), 'saved_model.pth')
    return min_valid_loss
 

In [15]:
for epoch in range(epochs):
    min_valid_loss = train_loop(epoch, min_valid_loss, loss_func, UserItemEmbeddingNNNet)

6250it [02:44, 37.91it/s]


Epoch 1 		 Training Loss: (19336.989125609398, 3.0939182600975035) 		 Validation Loss: (3961.6147418022156, 5.066003506140941)
Validation Loss Decreased(inf--->3961.614742)


6250it [02:44, 38.09it/s]


Epoch 2 		 Training Loss: (13764.112625956535, 2.2022580201530455) 		 Validation Loss: (3184.8644323349, 4.072716665389898)
Validation Loss Decreased(3961.614742--->3184.864432)


6250it [02:44, 38.08it/s]


Epoch 3 		 Training Loss: (11648.72125697136, 1.8637954011154174) 		 Validation Loss: (2930.5989661216736, 3.7475690103857717)
Validation Loss Decreased(3184.864432--->2930.598966)


6250it [02:42, 38.53it/s]


Epoch 4 		 Training Loss: (10317.361651062965, 1.6507778641700745) 		 Validation Loss: (2590.4914541244507, 3.312648918317712)
Validation Loss Decreased(2930.598966--->2590.491454)


6250it [02:41, 38.67it/s]


Epoch 5 		 Training Loss: (9348.777659118176, 1.495804425458908) 		 Validation Loss: (2548.781842827797, 3.2593118194728863)
Validation Loss Decreased(2590.491454--->2548.781843)


6250it [02:25, 42.86it/s]


Epoch 6 		 Training Loss: (8648.98660427332, 1.383837856683731) 		 Validation Loss: (2302.0849286317825, 2.9438426197337373)
Validation Loss Decreased(2548.781843--->2302.084929)


6250it [02:25, 43.07it/s]


Epoch 7 		 Training Loss: (8095.819298684597, 1.2953310877895354) 		 Validation Loss: (2184.367105960846, 2.7933083196430255)
Validation Loss Decreased(2302.084929--->2184.367106)


6250it [02:22, 43.95it/s]


Epoch 8 		 Training Loss: (7635.4493471980095, 1.2216718955516814) 		 Validation Loss: (2225.6325138807297, 2.846077383479194)


6250it [02:25, 43.04it/s]


Epoch 9 		 Training Loss: (7261.5197612047195, 1.1618431617927552) 		 Validation Loss: (2210.636840224266, 2.8269013302100587)


6250it [02:27, 42.46it/s]


Epoch 10 		 Training Loss: (6958.720790028572, 1.1133953264045715) 		 Validation Loss: (2055.2727403640747, 2.6282260106957476)
Validation Loss Decreased(2184.367106--->2055.272740)


6250it [02:27, 42.38it/s]


Epoch 11 		 Training Loss: (6673.684870362282, 1.067789579257965) 		 Validation Loss: (2015.5212477445602, 2.577392899929105)
Validation Loss Decreased(2055.272740--->2015.521248)


6250it [02:28, 42.17it/s]


Epoch 12 		 Training Loss: (6428.431854784489, 1.0285490967655182) 		 Validation Loss: (1945.7019659280777, 2.488109930854319)
Validation Loss Decreased(2015.521248--->1945.701966)


6250it [02:13, 46.87it/s]


Epoch 13 		 Training Loss: (6224.646181106567, 0.9959433889770508) 		 Validation Loss: (1917.7209169864655, 2.452328538345864)
Validation Loss Decreased(1945.701966--->1917.720917)


6250it [02:05, 49.65it/s]


Epoch 14 		 Training Loss: (6039.714427828789, 0.9663543084526062) 		 Validation Loss: (1860.1247453689575, 2.3786761449730913)
Validation Loss Decreased(1917.720917--->1860.124745)


6250it [02:07, 49.20it/s]


Epoch 15 		 Training Loss: (5863.624195098877, 0.9381798712158204) 		 Validation Loss: (1893.307063817978, 2.4211087772608413)


6250it [02:07, 49.13it/s]


Epoch 16 		 Training Loss: (5712.220959246159, 0.9139553534793854) 		 Validation Loss: (1877.084687113762, 2.4003640500175982)


6250it [02:06, 49.58it/s]


Epoch 17 		 Training Loss: (5580.952288925648, 0.8929523662281036) 		 Validation Loss: (1788.4591158628464, 2.2870321174716706)
Validation Loss Decreased(1860.124745--->1788.459116)


6250it [02:08, 48.72it/s]


Epoch 18 		 Training Loss: (5451.687248647213, 0.872269959783554) 		 Validation Loss: (1802.4682412147522, 2.3049466000188645)


6250it [02:14, 46.41it/s]


Epoch 19 		 Training Loss: (5327.100653588772, 0.8523361045742035) 		 Validation Loss: (1776.6769856214523, 2.2719654547588903)
Validation Loss Decreased(1788.459116--->1776.676986)


6250it [02:14, 46.62it/s]


Epoch 20 		 Training Loss: (5225.041429400444, 0.836006628704071) 		 Validation Loss: (1753.2467070817947, 2.2420034617414255)
Validation Loss Decreased(1776.676986--->1753.246707)


6250it [02:11, 47.52it/s]


Epoch 21 		 Training Loss: (5120.318281590939, 0.8192509250545502) 		 Validation Loss: (1796.3456431627274, 2.2971171907451757)


6250it [02:22, 43.74it/s]


Epoch 22 		 Training Loss: (5030.4137127399445, 0.8048661940383911) 		 Validation Loss: (1774.6297190189362, 2.269347466776133)


6250it [02:27, 42.49it/s]


Epoch 23 		 Training Loss: (4935.496313005686, 0.7896794100809097) 		 Validation Loss: (1734.5515322685242, 2.218096588578675)
Validation Loss Decreased(1753.246707--->1734.551532)


6250it [02:16, 45.68it/s]


Epoch 24 		 Training Loss: (4862.370447695255, 0.7779792716312408) 		 Validation Loss: (1734.491734623909, 2.218020121002441)
Validation Loss Decreased(1734.551532--->1734.491735)


6250it [02:11, 47.60it/s]


Epoch 25 		 Training Loss: (4776.985863417387, 0.7643177381467819) 		 Validation Loss: (1739.4465942382812, 2.224356258616728)


6250it [02:19, 44.69it/s]


Epoch 26 		 Training Loss: (4711.558499991894, 0.753849359998703) 		 Validation Loss: (1724.7381066083908, 2.2055474509058706)
Validation Loss Decreased(1734.491735--->1724.738107)


6250it [02:23, 43.70it/s]


Epoch 27 		 Training Loss: (4637.1162377893925, 0.7419385980463028) 		 Validation Loss: (1722.0324491262436, 2.202087530851974)
Validation Loss Decreased(1724.738107--->1722.032449)


6250it [02:23, 43.53it/s]


Epoch 28 		 Training Loss: (4580.096899241209, 0.7328155038785934) 		 Validation Loss: (1706.7287948131561, 2.1825176404260307)
Validation Loss Decreased(1722.032449--->1706.728795)


6250it [02:24, 43.39it/s]


Epoch 29 		 Training Loss: (4515.409800291061, 0.7224655680465698) 		 Validation Loss: (1695.210100531578, 2.1677878523421716)
Validation Loss Decreased(1706.728795--->1695.210101)


6250it [02:24, 43.36it/s]


Epoch 30 		 Training Loss: (4450.352580696344, 0.7120564129114151) 		 Validation Loss: (1681.8223263025284, 2.150667936448246)
Validation Loss Decreased(1695.210101--->1681.822326)


In [16]:
tusers = torch.tensor(df_test.userId.values)
titems = torch.tensor(df_test.movieId.values)

In [18]:
# bestmodel = UserItemEmbeddingNNNet(N, M, K)
# bestmodel.load_state_dict(torch.load('saved_model.pth'))
# bestmodel.eval()
# tpredsbest = bestmodel(tusers, titems)

In [19]:
UserItemEmbeddingNNNet.eval()
tpreds = UserItemEmbeddingNNNet(tusers, titems)

In [20]:
len(tpreds)

100000

In [21]:
tpredsfinal = tpreds.detach().numpy()
tpredsfinal = (tpredsfinal>3).astype(int)
tratings = df_test.rating.values
tratings = (tratings>3).astype(int)
tratings, tpredsfinal

(array([0, 1, 1, ..., 1, 0, 0]), array([0, 1, 0, ..., 0, 1, 1]))

In [22]:
f1_score(tpredsfinal, tratings)

0.7660234498474657

In [23]:
tratings = df_test.rating.values
tpredsfinal = tpreds.detach().numpy()
tpredsfinal = (tpredsfinal>2.5).astype(int)
tratings = (tratings>2.5).astype(int)
f1_score(tpredsfinal, tratings)

0.8936955180801415