### 0. Data Preprocessing

In [1]:
import pandas as pd

In [2]:
udata = pd.read_csv('./data/ml-100k/u.data', sep='\t', header=None, names=['userId', 'itemId', 'rating', 'timestamp'])
ugenre = pd.read_csv('./data/ml-100k/u.genre', sep='|', header=None)
uitem = pd.read_csv('./data/ml-100k/u.item', sep='|', header=None, encoding='latin-1', names=['id', 'title', 'date', 'NaN', 'link', *[str(x) for x in list(range(19))]])
uuser = pd.read_csv('./data/ml-100k/u.user', sep='|', header=None, names=['userId', 'age', 'gender', 'occupation', 'zipcode'], encoding='latin-1')

In [3]:
uitem.head()

Unnamed: 0,id,title,date,NaN,link,0,1,2,3,4,...,9,10,11,12,13,14,15,16,17,18
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
udata.head()

Unnamed: 0,userId,itemId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
uuser.head()

Unnamed: 0,userId,age,gender,occupation,zipcode
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


### 0. Import, Hyperparameters

In [6]:
import torch.nn as nn
from torch.autograd import Variable
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook as tqdm
from random import shuffle

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
lst_uid = list(sorted(udata.userId.unique().tolist()))
lst_fid = list(sorted(udata.itemId.unique().tolist()))

In [8]:
class MovieLenDataset(Dataset):
    
    def __init__(self, udata):
        
        self.user = udata.userId.tolist()
        self.item = udata.itemId.tolist()
        self.rating = udata.rating.tolist()
        
    def __len__(self):
        return len(self.user)
    
    def __getitem__(self, idx):
        return self.user[idx], self.item[idx], self.rating[idx]
    
    @staticmethod
    def collate(batch):
        user = torch.LongTensor([x[0] for x in batch])
        item = torch.LongTensor([x[1] for x in batch])
        rating = torch.FloatTensor([x[2] for x in batch])
        return user.to(device), item.to(device), rating.to(device)

In [26]:
epochs = 100000
lr = 1e-3
n_factors = 32
n_user = 1 + lst_uid[-1]
n_movie = 1 + lst_fid[-1]
out_range = [min(udata.rating.tolist()) - 0.1, max(udata.rating.tolist()) + 0.1]
criteration = nn.MSELoss(reduction='mean')
lr=1e-2
wd=0
print_step = epochs // 20
batch_size = 256


In [10]:
dataset = MovieLenDataset(udata=udata)
loader = DataLoader(dataset=dataset, collate_fn=MovieLenDataset.collate, batch_size = batch_size)

### 1. Embedding: UserId, Film $\rightarrow$ Rating (17%)

In [11]:
user = torch.LongTensor(udata.userId.tolist()).to(device)
item = torch.LongTensor(udata.itemId.tolist()).to(device)
rating = torch.FloatTensor(udata.rating.tolist()).to(device)

In [12]:
class MovieLenModel(nn.Module):
    
    def __init__(self, n_user, n_title, n_factors=32, out_range=[0, 1]):
        super(MovieLenModel, self).__init__()
        # self.bias = torch.nn.Parameter(data=torch.Tensor(1, n_factors), requires_grad=True)
        # self.bias.data.uniform_(-1, 1)
#         self.extra_w = torch.nn.Parameter(data=torch.Tensor(n_factors, n_factors), requires_grad=True)
#         self.extra_w.data.uniform_(-1, 1)

        self.movie_bias = nn.Embedding(n_title, 1)
        self.user_bias = nn.Embedding(n_user, 1)
        
    
        self.movie_emb = nn.Embedding(n_title, n_factors)
        self.user_emb = nn.Embedding(n_user, n_factors)
        
        nn.init.normal_(self.movie_emb.weight)
        nn.init.normal_(self.user_emb.weight)
        self.out = lambda x: (x + 1) *(out_range[1] - out_range[0]) / 2 + out_range[0]
        
        self.sigmoid = nn.Tanh()
        
    def forward(self, user, movie):
        movie_w = self.movie_emb(movie)
        user_w = self.user_emb(user)
        movie_bias = self.movie_bias(movie)
        user_bias = self.user_bias(user)
        out = movie_w*user_w + movie_bias*user_bias
        out = torch.sum(out , dim=1)
        out = self.out(self.sigmoid(out))
        return out

In [13]:
model = MovieLenModel(n_user=n_user, n_title=n_movie, n_factors=n_factors, out_range=out_range)
model = model.to(device)

In [19]:
optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad, model.parameters()), lr=0.5, momentum=0.7)

In [None]:


for i in tqdm(range(epochs)):
    predicted = model(user, item)
    loss = criteration(predicted, rating)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if i % print_step == 0:
        print('Epoch {}/{} Loss {}'.format(i + 1, epochs, loss.item()))
    
    

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))

Epoch 1/100000 Loss 1.2942843437194824
Epoch 5001/100000 Loss 1.1428219079971313
Epoch 10001/100000 Loss 1.016632080078125
Epoch 15001/100000 Loss 0.9164386987686157
Epoch 20001/100000 Loss 0.8391782641410828
Epoch 25001/100000 Loss 0.7755985260009766
Epoch 30001/100000 Loss 0.7232488393783569
Epoch 35001/100000 Loss 0.6857067942619324
Epoch 40001/100000 Loss 0.6533280611038208
Epoch 45001/100000 Loss 0.6230920553207397
Epoch 50001/100000 Loss 0.594969630241394
Epoch 55001/100000 Loss 0.5725213289260864
Epoch 60001/100000 Loss 0.5529027581214905
Epoch 65001/100000 Loss 0.5333041548728943
Epoch 70001/100000 Loss 0.5170705318450928
Epoch 75001/100000 Loss 0.5017793774604797
Epoch 80001/100000 Loss 0.4898618459701538
Epoch 85001/100000 Loss 0.47970059514045715


In [25]:
predicted = model(user, item).round().long()
lr = rating.long()
accuracy = (torch.sum(predicted == lr).float()/predicted.shape[0]).item(); accuracy

0.5987899899482727

In [22]:
predicted

tensor([5, 1, 1,  ..., 5, 1, 5])

### 2. Embedding: UserId, Film $\rightarrow$ Rating (17%)

In [None]:
class MovieLenModel(nn.Module):
    
    def __init__(self, n_user, n_title, n_factors=32, out_range=[0, 1]):
        super(MovieLenModel, self).__init__()
        self.bias = torch.nn.Parameter(data=torch.Tensor(1, n_factors), requires_grad=True)
        self.bias.data.uniform_(-1, 1)
        self.movie_emb = nn.Embedding(n_title, n_factors)
        self.user_emb = nn.Embedding(n_user, n_factors)
        nn.init.normal_(self.movie_emb.weight)
        nn.init.normal_(self.user_emb.weight)
        self.linear = nn.Linear(n_factors*2, 10)
        self.out = lambda x: x#(x + 1) *(out_range[1] - out_range[0]) / 2 + out_range[0]
        self.sigmoid = nn.Softmax(dim=1)
        
    def forward(self, user, movie):
        movie = self.movie_emb(movie)
        user = self.user_emb(user)
        out = torch.cat([movie, user], dim=1)
        out = self.linear(out)
        out = self.out(self.sigmoid(out))
        return out

In [None]:
model = MovieLenModel(n_user=n_user, n_title=n_movie, n_factors=n_factors, out_range=out_range)
model = model.to(device)

In [None]:
sgd = torch.optim.SGD(model.parameters(), lr=1e-4, weight_decay=wd)
for i in tqdm(range(epochs)):
    total = 0
    for user , item, rating in loader:
        predicted = model(user, item)
        loss = criteration(predicted, rating)
        total += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if i % print_step == 0:
        print('Epoch {}/{} Loss {}'.format(i + 1, epochs, loss.item()))
    

In [None]:
predicted = model(user, item).round().long()
lr = rating.long()
accuracy = (torch.sum(lp == lr).float()/predicted.shape[0]).item(); accuracy

### 3. 

In [None]:
len(udata), len(uuser), len(uitem)

In [None]:
len()