In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import datasets
import json

import numpy as np
import pandas as pd
import torch
from torch import nn
from tqdm import tqdm

## Data preprocessing

In [2]:
print("Downloading movielens data...")
from urllib.request import urlretrieve
import zipfile

# Download the data
urlretrieve("http://files.grouplens.org/datasets/movielens/ml-20m.zip", "movielens.zip")
zip_ref = zipfile.ZipFile('movielens.zip', "r")
zip_ref.extractall()


Downloading movielens data...


In [3]:
# load in the data
df = pd.read_csv('ml-20m/ratings.csv')

ml-100k  ml-20m  movielens.zip	sample_data


In [4]:
df.head()

In [6]:
# Customize dataset
N = df.userId.max() + 1 # number of users
M = df.movieId.max() + 1 # number of movies

# split into train and test
df = shuffle(df)
cutoff = int(0.008*len(df))
cutoffe = int(0.01*len(df))
df_train = df.iloc[:cutoff]
df_test = df.iloc[cutoff:cutoffe]

# initialize variables
K = 10 # latent dimensionality
mu = df_train.rating.mean()


In [None]:
# with open("rating.json", "w") as outfile:
#     for u, m in zip(df.userId.values, df_train.movieId.values):
#       j = {'user':s, 'movie':l}
#       s = json.dumps(j)
#       outfile.write(f"{s}\n")

In [7]:
N,M,K,len(df_train),len(df), mu

(138494, 131263, 10, 160002, 20000263, 3.524280946488169)

In [8]:
df_train.iloc[0:5]

Unnamed: 0,userId,movieId,rating,timestamp
11979571,82721,2100,3.5,1258418120
854706,5725,1485,2.5,1090589513
7532431,51978,1287,4.0,942404793
12204326,84316,1281,4.0,879418702
311431,2126,608,4.0,941309835


In [9]:
users = torch.tensor(df_train.userId.values)
items = torch.tensor(df_train.movieId.values)
ratings = torch.tensor(df_train.rating.values)

In [10]:
class UserItemEmbeddingNet(torch.nn.Module):
    def __init__(self, n_users, n_items, k_factors, embedding_dropout = 0.02, hidden = 10, dropouts = 0.2):

        super(UserItemEmbeddingNet, self).__init__()
        self.u = torch.nn.Embedding(n_users, k_factors)
        self.m = torch.nn.Embedding(n_items, k_factors)
        self.u_bias = torch.nn.Embedding(n_users, 1)
        self.m_bias = torch.nn.Embedding(n_items, 1)

    def forward(self, users, items):
        uembed = self.u(users)
        membed = self.m(items)
        ubias = self.u_bias(users)
        mbias = self.m_bias(items)
        umdot = torch.mul(uembed,membed)
        umdot = torch.sum(umdot,1)
        umdot = torch.reshape(umdot, (umdot.shape[0], 1))
        output = torch.add(umdot, ubias)
        output = torch.add(output, mbias)
        output = output - mu
        output = torch.flatten(output)
        return output

UserItemEmbeddingNet = UserItemEmbeddingNet(N, M, K)
print(UserItemEmbeddingNet)

UserItemEmbeddingNet(
  (u): Embedding(138494, 10)
  (m): Embedding(131263, 10)
  (u_bias): Embedding(138494, 1)
  (m_bias): Embedding(131263, 1)
)


In [78]:
epochs = 1000
bs = 128
lr = 5.0
reg = 0. # regularization penalty

def fit():
    for epoch in range(epochs):
        pred = UserItemEmbeddingNet(users, items)
        loss_func = torch.nn.MSELoss()
        loss = loss_func(pred.float(), ratings.float())
        optimizer = torch.optim.SGD(UserItemEmbeddingNet.parameters(), lr=lr, momentum=0.9)

        if epoch%100==0:
          print("Training epoch: ", epoch, loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
 

In [80]:
fit()

Training epoch:  0 0.22769078612327576
Training epoch:  100 0.22522272169589996
Training epoch:  200 0.22279657423496246
Training epoch:  300 0.22041136026382446
Training epoch:  400 0.21806606650352478
Training epoch:  500 0.21575985848903656
Training epoch:  600 0.21349172294139862
Training epoch:  700 0.2112608104944229
Training epoch:  800 0.20906628668308258
Training epoch:  900 0.20690733194351196


In [69]:
tusers = torch.tensor(df_test.userId.values)
titems = torch.tensor(df_test.movieId.values)
tratings = df_test.rating.values

In [37]:
with torch.no_grad():
    tpreds = UserItemEmbeddingNet(tusers, titems)

In [38]:
tpreds

tensor([2.9075, 3.6170, 3.2106,  ..., 4.5088, 2.8524, 4.3815],
       grad_fn=<ReshapeAliasBackward0>)

In [70]:
tpredsfinal = tpreds.detach().numpy()
#tpredsfinal = (tpredsfinal/0.5).astype(int)
tpredsfinal = (tpredsfinal>3).astype(int)

In [71]:
#tratings = (tratings*2).astype(int)
tratings = (tratings>3).astype(int)
tratings, tpredsfinal

(array([1, 0, 1, ..., 0, 0, 1]), array([0, 1, 1, ..., 1, 0, 1]))

In [72]:
from sklearn.metrics import f1_score
f1_score(tpredsfinal, tratings)

0.6145366093146774