In [3]:
import numpy as np
import pandas as pd
from time import time

In [4]:
print('reading rating data...')
tic = time()
data = np.loadtxt('./ml-1m/ratings.dat', skiprows=0,  delimiter='::').astype('int32')
print("reading user data...")
datContent = [i.strip().split('::') for i in open("./ml-1m/users.dat",encoding="ISO-8859-1").readlines()]
user_data = pd.DataFrame(datContent,columns=['userId','gender','occupation','age','zip'])
print('data read in', time() - tic, 'seconds')

reading rating data...
reading user data...
data read in 5.906542062759399 seconds


In [5]:
df =pd.DataFrame(data)
df.columns=['userId','movieId','rating','timestampe']
df.head()

Unnamed: 0,userId,movieId,rating,timestampe
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
users = df.userId.unique()
movies = df.movieId.unique()

userid2idx = {o:i for i,o in enumerate(users)}
movieid2idx = {o:i for i,o in enumerate(movies)}


df['userId'] = df['userId'].apply(lambda x: userid2idx[x])
df['movieId'] = df['movieId'].apply(lambda x: movieid2idx[x])

In [7]:
import torch
from sklearn.utils import shuffle

In [8]:
class Loader():
    current = 0
    def __init__(self, x, y, batchsize=1024, do_shuffle=True):
        self.shuffle = shuffle
        self.x = x
        self.y = y
        self.batchsize = batchsize
        self.batches = range(0, len(self.y), batchsize)
        if do_shuffle:
            self.x, self.y = shuffle(self.x, self.y)


    def __len__(self):
        # Return the number of batches
        return int(len(self.x) / self.batchsize)

    def __next__(self):
        # Update iterator and stop iteration until the batch size is out of range
        n = self.batchsize
        if self.current + n >= len(self.y):
            raise StopIteration
        i = self.current

        # Transform NumPy arrays to PyTorch tensors
        xs = torch.from_numpy(self.x[i:i + n])
        ys = torch.from_numpy(self.y[i:i + n])
        self.current += n
        return xs, ys


In [9]:
split = np.random.rand(len(df)) < 0.8
train = df[split]
valid = df[~split]


In [10]:
import torch
from torch import nn
import torch.nn.functional as F


In [33]:
def l2_regularize(array):
    """
    Function to do L2 regularization
    """
    loss = torch.sum(array ** 2.0)
    return loss

class MF(nn.Module):
    itr = 0
    def __init__(self,n_user, n_item, k=10, c_vector=1.0, writer=None):
        super(MF , self).__init__()
        self.n_user = n_user
        self.n_item = n_item
        self.c_vector = c_vector
        
        self.userEmbedding = nn.Embedding(num_embeddings=self.n_user , embedding_dim=k)
        self.itemEmbedding = nn.Embedding(num_embeddings=self.n_item , embedding_dim=k)        
    def forward(self,user,item):
        vector_user = self.userEmbedding(torch.from_numpy(user))
        vector_item = self.itemEmbedding(torch.from_numpy(item))
        ui_interaction = torch.sum(vector_user * vector_item, dim=1)
        return ui_interaction
    
    def loss(prediction, target):
        loss_mse = F.mse_loss(prediction, target.squeeze())

        prior_user = l2_regularize(self.userEmbedding.weight)*self.c_vector
        prior_item = l2_regularize(self.itemEmbedding.weight)*self.c_vector        

        loss_total = loss_mse+prior_user+prior_item
        return loss



In [34]:
users = df.userId.unique()
movies = df.movieId.unique()

n_user=len(users)
n_item = len(movies)

In [35]:
# Define the Hyper-parameters
lr = 1e-2  # Learning Rate
k = 10  # Number of dimensions per user, item
c_vector = 1e-6  # regularization constant

In [36]:
n_user,n_item

(6040, 3706)

In [37]:
model = MF(n_user, n_item, k=k, c_vector=c_vector)

In [38]:
optimizer = torch.optim.Adam(model.parameters(),lr)

In [39]:
n_epoch =10
loss_func = torch.nn.MSELoss()


In [43]:
import numpy as np
from scipy.sparse import rand as sprand
import torch

In [52]:
train.values

array([[        0,         0,         5, 978300760],
       [        0,         1,         3, 978302109],
       [        0,         2,         3, 978301968],
       ...,
       [     6039,      1106,         5, 956704887],
       [     6039,       365,         5, 956704746],
       [     6039,       152,         4, 956715648]])

In [29]:
task3loss=[]
optimizer = torch.optim.SGD(model.parameters(), lr=0.000001)
for epoch in range(n_epoch):
    Y_ = model()
    loss = loss_fn(Y_, torch.Tensor(train.rating.astype(np.float).values))
    model.zero_grad() # need to clear the old gradients
    loss.backward()
    optimizer.step()
#    print(loss)
    task3loss.append(loss)
    print(loss)


TypeError: forward() missing 2 required positional arguments: 'user' and 'item'

In [33]:
loss

tensor(18986216., grad_fn=<MseLossBackward>)