### 0. Data Preprocessing

In [None]:
import pandas as pd

In [None]:
udata = pd.read_csv('./data/ml-100k/u.data', sep='\t', header=None, names=['userId', 'itemId', 'rating', 'timestamp'])
ugenre = pd.read_csv('./data/ml-100k/u.genre', sep='|', header=None)
uitem = pd.read_csv('./data/ml-100k/u.item', sep='|', header=None, encoding='latin-1', names=['id', 'title', 'date', 'NaN', 'link', *[str(x) for x in list(range(19))]])
uuser = pd.read_csv('./data/ml-100k/u.user', sep='|', header=None, names=['userId', 'age', 'gender', 'occupation', 'zipcode'], encoding='latin-1')

In [293]:
uitem.head()

Unnamed: 0,id,title,date,NaN,link,0,1,2,3,4,...,9,10,11,12,13,14,15,16,17,18
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
udata.head()

Unnamed: 0,userId,itemId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
uuser.head()

Unnamed: 0,userId,age,gender,occupation,zipcode
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


### 0. Import, Hyperparameters

In [255]:
import torch.nn as nn
from torch.autograd import Variable
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook as tqdm
from random import shuffle

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [256]:
lst_uid = list(sorted(udata.userId.unique().tolist()))
lst_fid = list(sorted(udata.itemId.unique().tolist()))

In [258]:
class MovieLenDataset(Dataset):
    
    def __init__(self, udata):
        
        self.user = udata.userId.tolist()
        self.item = udata.itemId.tolist()
        self.rating = udata.rating.tolist()
        
    def __len__(self):
        return len(self.user)
    
    def __getitem__(self, idx):
        return self.user[idx], self.item[idx], self.rating[idx]
    
    @staticmethod
    def collate(batch):
        user = torch.LongTensor([x[0] for x in batch])
        item = torch.LongTensor([x[1] for x in batch])
        rating = torch.FloatTensor([x[2] for x in batch])
        return user.to(device), item.to(device), rating.to(device)

In [344]:
epochs = 100
lr = 1e-3
n_factors = 64
n_user = 1 + lst_uid[-1]
n_movie = 1 + lst_fid[-1]
out_range = [min(udata.rating.tolist()) - 0.1, max(udata.rating.tolist()) + 0.1]
criteration = nn.MSELoss(reduction='sum')
lr=1e-2
wd=0
print_step = epochs // 10
batch_size = 256


In [345]:
dataset = MovieLenDataset(udata=udata)
loader = DataLoader(dataset=dataset, collate_fn=MovieLenDataset.collate, batch_size = batch_size)

### 1. Embedding: UserId, Film $\rightarrow$ Rating (17%)

In [363]:
class MovieLenModel(nn.Module):
    
    def __init__(self, n_user, n_title, n_factors=32, out_range=[0, 1]):
        super(MovieLenModel, self).__init__()
        # self.bias = torch.nn.Parameter(data=torch.Tensor(1, n_factors), requires_grad=True)
        # self.bias.data.uniform_(-1, 1)
        self.extra_w = torch.nn.Parameter(data=torch.Tensor(n_factors, n_factors), requires_grad=True)
        self.extra_w.data.uniform_(-1, 1)
        
    
        self.movie_emb = nn.Embedding(n_title, n_factors)
        self.user_emb = nn.Embedding(n_user, n_factors)
        nn.init.normal_(self.movie_emb.weight)
        nn.init.normal_(self.user_emb.weight)
        self.out = lambda x: (x + 1) *(out_range[1] - out_range[0]) / 2 + out_range[0]
        
        self.sigmoid = nn.Tanh()
        
    def forward(self, user, movie):
        movie = self.movie_emb(movie)
        user = self.user_emb(user)
        out = movie*user #+ self.bias
        out = torch.matmul(movie, self.extra_w)*user #+ self.bias
        out = torch.sum(out , dim=1)
        out = self.out(self.sigmoid(out))
        return out

In [364]:
model = MovieLenModel(n_user=n_user, n_title=n_movie, n_factors=n_factors, out_range=out_range)
model = model.to(device)

In [365]:
sgd = torch.optim.SGD(filter(lambda x: x.requires_grad, model.parameters()), lr=0.03)

In [366]:


for i in range(epochs):
    predicted = model(user, item)
    loss = criteration(predicted, rating)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if i % print_step == 0:
        print('Epoch {}/{} Loss {}'.format(i + 1, epochs, loss.item()))
    
    

Epoch 1/100 Loss 968.7225341796875
Epoch 11/100 Loss 968.7225341796875
Epoch 21/100 Loss 968.7225341796875
Epoch 31/100 Loss 968.7225341796875
Epoch 41/100 Loss 968.7225341796875
Epoch 51/100 Loss 968.7225341796875
Epoch 61/100 Loss 968.7225341796875
Epoch 71/100 Loss 968.7225341796875
Epoch 81/100 Loss 968.7225341796875
Epoch 91/100 Loss 968.7225341796875


In [367]:
predicted = model(user, item).round().long()
lr = rating.long()
accuracy = (torch.sum(lp == lr).float()/predicted.shape[0]).item(); accuracy

0.17499999701976776

In [368]:
predicted

tensor([1, 5, 5, 5, 5, 5, 5, 1, 1, 5, 5, 1, 1, 1, 1, 1, 1, 2, 1, 5, 1, 5, 5, 5,
        5, 5, 2, 1, 5, 5, 1, 1, 1, 5, 5, 1, 5, 1, 1, 1, 5, 1, 5, 5, 5, 1, 1, 1,
        5, 5, 1, 1, 1, 1, 1, 5, 5, 1, 1, 1, 1, 5, 5, 5, 1, 1, 5, 1, 1, 1, 1, 1,
        1, 5, 1, 5, 5, 1, 1, 5, 1, 1, 1, 1, 5, 5, 1, 1, 5, 1, 5, 4, 5, 5, 1, 1,
        1, 5, 1, 1, 1, 1, 5, 5, 1, 5, 1, 1, 5, 1, 5, 1, 5, 1, 5, 5, 4, 5, 1, 1,
        5, 5, 4, 1, 1, 5, 4, 5, 5, 1, 3, 5, 5, 5, 5, 5, 1, 1, 5, 5, 5, 1, 5, 5,
        1, 1, 5, 5, 5, 1, 5, 1, 1, 1, 5, 5, 1, 5, 1, 1], device='cuda:0')

### 2. Embedding: UserId, Film $\rightarrow$ Rating (17%)

In [376]:
class MovieLenModel(nn.Module):
    
    def __init__(self, n_user, n_title, n_factors=32, out_range=[0, 1]):
        super(MovieLenModel, self).__init__()
        self.bias = torch.nn.Parameter(data=torch.Tensor(1, n_factors), requires_grad=True)
        self.bias.data.uniform_(-1, 1)
        self.movie_emb = nn.Embedding(n_title, n_factors)
        self.user_emb = nn.Embedding(n_user, n_factors)
        nn.init.normal_(self.movie_emb.weight)
        nn.init.normal_(self.user_emb.weight)
        self.linear = nn.Linear(n_factors*2, 10)
        self.out = lambda x: x#(x + 1) *(out_range[1] - out_range[0]) / 2 + out_range[0]
        self.sigmoid = nn.Softmax(dim=1)
        
    def forward(self, user, movie):
        movie = self.movie_emb(movie)
        user = self.user_emb(user)
        out = torch.cat([movie, user], dim=1)
        out = self.linear(out)
        out = self.out(self.sigmoid(out))
        return out

In [377]:
model = MovieLenModel(n_user=n_user, n_title=n_movie, n_factors=n_factors, out_range=out_range)
model = model.to(device)

In [378]:
sgd = torch.optim.SGD(model.parameters(), lr=1e-4, weight_decay=wd)
for i in tqdm(range(epochs)):
    total = 0
    for user , item, rating in loader:
        predicted = model(user, item)
        loss = criteration(predicted, rating)
        total += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if i % print_step == 0:
        print('Epoch {}/{} Loss {}'.format(i + 1, epochs, loss.item()))
    

HBox(children=(IntProgress(value=0), HTML(value='')))




  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: The size of tensor a (10) must match the size of tensor b (256) at non-singleton dimension 1

In [341]:
predicted = model(user, item).round().long()
lr = rating.long()
accuracy = (torch.sum(lp == lr).float()/predicted.shape[0]).item(); accuracy

0.17499999701976776

### 3. 

In [343]:
len(udata), len(uuser), len(uitem)

(100000, 943, 1682)

In [None]:
len()