In [1]:
import torch
from torch.utils.data.dataloader import DataLoader, Dataset
from tqdm.auto import tqdm
import polars as pl
import numpy as np
import random
import os
from sklearn.metrics import mean_squared_error

# Seed every random number generator used below (Python, NumPy, PyTorch) with
# the same fixed value so that weight initialisation, training noise and batch
# shuffling start from a repeatable state.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)


In [2]:
class EmbeddedFeatures(torch.nn.Module):
    """Embed several categorical inputs and average the resulting vectors.

    One ``torch.nn.Embedding`` table is created per entry of ``sizes``; every
    table produces vectors of width ``dims``.
    """

    def __init__(self, sizes, dims=32):
        """Create one embedding table per categorical feature.

        Args:
            sizes: Iterable with the number of rows of each embedding table.
            dims: Width of every embedding vector.
        """
        super().__init__()
        tables = [torch.nn.Embedding(vocab_size, dims) for vocab_size in sizes]
        self.embeddings = torch.nn.ModuleList(tables)

    def forward(self, cats):
        """Return the sum of the looked-up embeddings divided by ``len(cats)``.

        Args:
            cats: Sequence of index tensors, paired positionally with the
                embedding tables (pairing stops at the shorter of the two).
        """
        total = None
        for indices, table in zip(cats, self.embeddings):
            looked_up = table(indices)
            total = looked_up if total is None else total + looked_up
        return total / len(cats)

class DeepFeatures(torch.nn.Module):
    """Stack of linear layers over embedded categorical plus numeric inputs.

    ``forward`` returns the output of every linear layer (taken before the
    leaky-ReLU that feeds the next layer), not just the last one.
    """

    def __init__(self, num_size, embs, depth=3, dims=32):
        """Build the layer stack.

        Args:
            num_size: Number of numeric feature columns concatenated to the
                embedding output.
            embs: Module called as ``embs(cats)`` to embed the categorical
                inputs; its output width is expected to be ``dims``.
            depth: Total number of linear layers (at least one is always built).
            dims: Width of the embedding output and of every hidden layer.
        """
        super().__init__()
        self.embs = embs
        # The first layer consumes embedding + numeric columns; the rest are square.
        layers = [torch.nn.Linear(dims + num_size, dims)]
        layers.extend(torch.nn.Linear(dims, dims) for _ in range(1, depth))
        self.deep = torch.nn.ModuleList(layers)

    def forward(self, cats, nums, std=0.5):
        """Return the list of per-layer outputs.

        Args:
            cats: Categorical inputs, passed straight to ``self.embs``.
            nums: Numeric feature tensor concatenated along ``dim=1``, or
                ``None`` to use the embedding output alone.
            std: Scale of the multiplicative Gaussian noise applied in
                training mode.
        """
        hidden = self.embs(cats)
        if nums is not None:
            hidden = torch.cat((hidden, nums), dim=1)
        # In training mode both the input and every layer output are perturbed
        # multiplicatively by (1 + std * N(0, 1)).
        if self.training:
            hidden = hidden * (1 + std * torch.randn_like(hidden))
        layer_outputs = []
        for layer in self.deep:
            projected = layer(hidden)
            if self.training:
                projected = projected * (1 + std * torch.randn_like(projected))
            layer_outputs.append(projected)
            hidden = torch.nn.functional.leaky_relu(projected)
        return layer_outputs
    

class DeepMF(torch.nn.Module):
    """Two ``DeepFeatures`` towers over one shared embedding module.

    For every layer the outputs of the two towers are combined with a
    row-wise dot product; the per-layer scores are scaled, weighted, summed
    and passed through a sigmoid.
    """

    def __init__(self, cat_sizes, num_size, depth=3, dims=32):
        """Build both towers.

        Args:
            cat_sizes: Number of rows of each categorical embedding table.
            num_size: Number of numeric feature columns.
            depth: Number of linear layers per tower (and of per-layer weights).
            dims: Width of the embeddings and of every hidden layer.
        """
        super().__init__()
        embds = EmbeddedFeatures(cat_sizes, dims)
        # `dims` must be forwarded to both towers: their first Linear layer
        # takes `dims + num_size` inputs, so a hard-coded width would only
        # match the embedding output when dims happens to be 32.
        self.base = DeepFeatures(num_size, embds, depth=depth, dims=dims)
        self.click = DeepFeatures(num_size, embds, depth=depth, dims=dims)
        # Global scale applied to every per-layer dot product.
        self.multi = torch.nn.parameter.Parameter(torch.randn((1, 1)))
        # One learnable weight per layer.
        self.att = torch.nn.parameter.Parameter(torch.randn((depth, 1)))

    def forward(self, cats, nums):
        """Return the sigmoid of the weighted sum of per-layer dot products.

        Args:
            cats: Categorical inputs forwarded to both towers.
            nums: Numeric feature tensor forwarded to both towers.
        """
        base = self.base(cats, nums)
        click = self.click(cats, nums)
        click_out = None
        for e, (b, c) in enumerate(zip(base, click)):
            # Row-wise dot product of the two towers at layer `e`.
            c_v = torch.sum(b * c, dim=1, keepdim=True) * self.multi
            weighted = c_v * self.att[e, 0]
            click_out = weighted if click_out is None else click_out + weighted
        return torch.nn.functional.sigmoid(click_out)

In [3]:
def epoch(model, loss_f, optimizer, dl_train, device):
    """Run one optimisation pass over ``dl_train`` and return the mean batch loss.

    Args:
        model: Module called as ``model(cats, nums)``; column 0 of its output
            is compared against the targets.
        loss_f: Callable ``loss_f(prediction, target)`` returning a scalar tensor.
        optimizer: Optimizer whose gradients are reset and which is stepped
            once per batch.
        dl_train: Iterable of ``(x, y)`` batches; every element of ``x`` but
            the last is a categorical input, the last holds the numeric features.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch loss values divided by ``len(dl_train)``.
    """
    model.train()
    running_loss = 0
    for features, target in tqdm(dl_train):
        optimizer.zero_grad()
        cat_inputs = [column.to(device) for column in features[:-1]]
        num_inputs = features[-1].to(device)
        target = target.float().to(device)
        prediction = model(cat_inputs, num_inputs)
        batch_loss = loss_f(prediction[:, 0], target)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.cpu().item()
    return running_loss / len(dl_train)


def predict(model, dl_test, device):
    """Run ``model`` in eval mode over ``dl_test`` and return column 0 of its outputs.

    Args:
        model: Module called as ``model(cats, nums)``.
        dl_test: Iterable of ``(x, y)`` batches laid out like the training
            batches; the targets are ignored.
        device: Device the batch tensors are moved to.

    Returns:
        NumPy array with the per-batch predictions concatenated along axis 0.
    """
    model.eval()
    collected = []
    with torch.no_grad():
        for features, _ in tqdm(dl_test):
            cat_inputs = [column.to(device) for column in features[:-1]]
            num_inputs = features[-1].to(device)
            scores = model(cat_inputs, num_inputs).cpu().numpy()
            collected.append(scores[:, 0])
    return np.concatenate(collected, axis=0)

In [4]:
# Every file is read with has_header=False, so the column names are assigned
# explicitly right after each read.
users = pl.read_csv('ml-100k/u.user', has_header=False, separator='|')
users.columns = ['user', 'age', 'gender', 'occupation', 'zip_code']

movies = pl.read_csv(
    'ml-100k/u.item',
    has_header=False,
    separator='|',
    encoding='utf8-lossy',
)
movies.columns = [
    'movie', 'title', 'release_date', 'video release date',
    'IMDb', 'unknown', 'Action', 'Adventure', 'Animation',
    'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
    'Thriller', 'War', 'Western',
]

# The u1 base/test split files are tab separated and share one layout.
train = pl.read_csv('ml-100k/u1.base', has_header=False, separator='\t')
train.columns = ['user', 'item', 'rating', 'timestamp']
test = pl.read_csv('ml-100k/u1.test', has_header=False, separator='\t')
test.columns = ['user', 'item', 'rating', 'timestamp']

In [5]:
# Attach the movie columns (ratings 'item' matches movies 'movie') and then the
# user columns to every rating row of both splits.
train = (
    train
    .join(movies, left_on='item', right_on='movie')
    .join(users, on='user')
)
test = (
    test
    .join(movies, left_on='item', right_on='movie')
    .join(users, on='user')
)

In [6]:
def build_map(ds, column):
    """Map each distinct value of ``ds[column]`` to its rank in sorted order.

    Args:
        ds: Object indexable by column name whose columns offer
            ``.unique().sort()``.
        column: Name of the column to index.

    Returns:
        Dict from value to a zero-based consecutive integer position.
    """
    ordered_values = ds[column].unique().sort()
    mapping = {}
    for position, value in enumerate(ordered_values):
        mapping[value] = position
    return mapping

In [7]:
# Value -> index lookups built from the training split only.
occ_map, age_map = (
    build_map(train, column_name) for column_name in ('occupation', 'age')
)

In [8]:
# Per-item and per-user rating statistics, computed on the training split:
#   *_mean: mean rating mapped through (mean - 1) / 4
#   *_std:  standard deviation of the ratings divided by their mean
movie_stats = (
    train[['item', 'rating']]
    .groupby('item')
    .agg(
        ((pl.mean('rating') - 1) / 4).alias('item_mean'),
        (pl.std('rating') / pl.mean('rating')).alias('item_std'),
    )
)
users_stats = (
    train[['user', 'rating']]
    .groupby('user')
    .agg(
        ((pl.mean('rating') - 1) / 4).alias('user_mean'),
        (pl.std('rating') / pl.mean('rating')).alias('user_std'),
    )
)

In [9]:
# Add the training-derived item and user statistics to both splits.
train = train.join(movie_stats, on='item').join(users_stats, on='user')
test = test.join(movie_stats, on='item').join(users_stats, on='user')

In [10]:
# Replace occupation and age by their integer indices and expand gender into
# two 0/1 indicator columns ('M' and 'F'); both splits get the same steps.
train = (
    train
    .with_columns(pl.col('occupation').apply(lambda occ: occ_map[occ]))
    .with_columns(pl.col('gender').apply(lambda g: 1 if g == 'M' else 0).alias('M'))
    .with_columns(pl.col('gender').apply(lambda g: 1 if g == 'F' else 0).alias('F'))
    .with_columns(pl.col('age').apply(lambda age: age_map[age]))
)
test = (
    test
    .with_columns(pl.col('occupation').apply(lambda occ: occ_map[occ]))
    .with_columns(pl.col('gender').apply(lambda g: 1 if g == 'M' else 0).alias('M'))
    .with_columns(pl.col('gender').apply(lambda g: 1 if g == 'F' else 0).alias('F'))
    .with_columns(pl.col('age').apply(lambda age: age_map[age]))
)

In [11]:
class ML100K(Dataset):
    """Dataset wrapper exposing rating rows as ``(inputs, target)`` pairs.

    ``inputs`` is the tuple ``(user, occupation, age, item, features)`` where
    ``features`` is the float32 row of genre flags, gender indicators and
    rating statistics; ``target`` is the rating mapped through
    ``(rating - 1) / 4``.
    """

    def __init__(self, rankings):
        """Copy the needed columns of ``rankings`` into NumPy arrays.

        Args:
            rankings: Frame-like object supporting ``rankings[name]`` and
                ``rankings[name_a, name_b, ...]`` with a ``.to_numpy()`` result.
        """
        def column(name):
            return rankings[name].to_numpy()

        feature_columns = (
            'unknown', 'Action', 'Adventure', 'Animation',
            'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
            'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
            'Thriller', 'War', 'Western', 'M', 'F',
            'item_mean', 'item_std', 'user_mean', 'user_std',
        )
        self.users = column('user')
        self.occupation = column('occupation')
        self.age = column('age')
        self.items = column('item')
        self.categories = rankings[feature_columns].to_numpy().astype(np.float32)
        self.rankings = (column('rating').astype(np.float32) - 1) / 4

    def __getitem__(self, idx):
        """Return ``((user, occupation, age, item, features), target)`` for row ``idx``."""
        inputs = (
            self.users[idx],
            self.occupation[idx],
            self.age[idx],
            self.items[idx],
            self.categories[idx, :],
        )
        return inputs, self.rankings[idx]

    def __len__(self):
        """Number of rating rows."""
        return len(self.users)

In [12]:
# Training batches are small and reshuffled each epoch; the test split is
# served as a single unshuffled batch covering every row.
dl_train = DataLoader(
    ML100K(train),
    shuffle=True,
    batch_size=32,
)
dl_test = DataLoader(
    ML100K(test),
    shuffle=False,
    batch_size=len(test),
)

In [13]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs end to end on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# One embedding table per categorical input, in the order ML100K yields them
# (user, occupation, age, item); each table gets max index + 1 rows.
# 25 is the number of numeric feature columns ML100K packs into `categories`
# (19 genre flags, 'M', 'F', and the four rating statistics).
model = DeepMF(
    [
        train['user'].max() + 1,
        train['occupation'].max() + 1,
        train['age'].max() + 1,
        train['item'].max() + 1,
    ],
    25,
).to(device)
loss_f = torch.nn.MSELoss() #torch.nn.BCELoss()
optimizer = torch.optim.RAdam(model.parameters())

In [14]:
# Single source of truth for the checkpoint location used below.
checkpoint_path = 'ml100k-exp-full+mean+std-batch.pt'


def rating_rmse(scaled_predictions):
    """Return the test RMSE on the original rating scale.

    ``scaled_predictions * 4 + 1`` undoes the ``(rating - 1) / 4`` target
    scaling applied in ``ML100K``. The root is taken explicitly with
    ``np.sqrt`` instead of ``mean_squared_error(..., squared=False)`` because
    the ``squared`` argument is deprecated in newer scikit-learn releases.
    """
    return np.sqrt(mean_squared_error(test["rating"].to_numpy(), scaled_predictions * 4 + 1))


# Train only when no checkpoint exists yet; otherwise reuse the saved weights
# so that a full re-run of the notebook stays cheap.
if not os.path.exists(checkpoint_path):
    for epoch_idx in range(40):
        train_loss = epoch(model, loss_f, optimizer, dl_train, device)
        print(f'{epoch_idx}: Current loss in training {train_loss}')
        y_pred = predict(model, dl_test, device)
        print(f'RMSE Score: {rating_rmse(y_pred)}')
    torch.save(model.state_dict(), checkpoint_path)
else:
    # map_location lets a checkpoint saved on one device load on another.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))


y_pred = predict(model, dl_test, device)

print(f'RMSE Score: {rating_rmse(y_pred)}')

  0%|          | 0/2500 [00:00<?, ?it/s]

0: Current loss in training 0.07308472376838326


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 1.0296543691594942


  0%|          | 0/2500 [00:00<?, ?it/s]

1: Current loss in training 0.06565033820345997


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 1.0138349906138295


  0%|          | 0/2500 [00:00<?, ?it/s]

2: Current loss in training 0.06210368131995201


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9873554659120859


  0%|          | 0/2500 [00:00<?, ?it/s]

3: Current loss in training 0.059382349062711


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.975561815235332


  0%|          | 0/2500 [00:00<?, ?it/s]

4: Current loss in training 0.057646494037657973


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.970141206956907


  0%|          | 0/2500 [00:00<?, ?it/s]

5: Current loss in training 0.05621504941433668


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9624070989986886


  0%|          | 0/2500 [00:00<?, ?it/s]

6: Current loss in training 0.055179738858342174


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9576045397897742


  0%|          | 0/2500 [00:00<?, ?it/s]

7: Current loss in training 0.054476661321520806


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9536626340366184


  0%|          | 0/2500 [00:00<?, ?it/s]

8: Current loss in training 0.05391501266248524


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9505889059021181


  0%|          | 0/2500 [00:00<?, ?it/s]

9: Current loss in training 0.05323119440265


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9480948941388085


  0%|          | 0/2500 [00:00<?, ?it/s]

10: Current loss in training 0.05275805822983384


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9449269836495434


  0%|          | 0/2500 [00:00<?, ?it/s]

11: Current loss in training 0.05231439746543765


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9459230680178936


  0%|          | 0/2500 [00:00<?, ?it/s]

12: Current loss in training 0.05179123833626509


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9411016129859411


  0%|          | 0/2500 [00:00<?, ?it/s]

13: Current loss in training 0.051448387787491084


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9410927332786371


  0%|          | 0/2500 [00:00<?, ?it/s]

14: Current loss in training 0.05074470082297921


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9393503515715279


  0%|          | 0/2500 [00:00<?, ?it/s]

15: Current loss in training 0.05031380203403533


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9365702193833074


  0%|          | 0/2500 [00:00<?, ?it/s]

16: Current loss in training 0.04977239542454481


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9370018450735208


  0%|          | 0/2500 [00:00<?, ?it/s]

17: Current loss in training 0.04966412891894579


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9356243315490682


  0%|          | 0/2500 [00:00<?, ?it/s]

18: Current loss in training 0.049212784990295765


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9344305361239126


  0%|          | 0/2500 [00:00<?, ?it/s]

19: Current loss in training 0.048836793613433835


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9314010606287636


  0%|          | 0/2500 [00:00<?, ?it/s]

20: Current loss in training 0.04843518213108182


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.931949128448128


  0%|          | 0/2500 [00:00<?, ?it/s]

21: Current loss in training 0.048114689841866495


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9317156888489457


  0%|          | 0/2500 [00:00<?, ?it/s]

22: Current loss in training 0.04797147209122777


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9313551420352678


  0%|          | 0/2500 [00:00<?, ?it/s]

23: Current loss in training 0.04743527876883745


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9296305739258838


  0%|          | 0/2500 [00:00<?, ?it/s]

24: Current loss in training 0.04702040317952633


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9308537749277576


  0%|          | 0/2500 [00:00<?, ?it/s]

25: Current loss in training 0.04679119113460183


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9290866253622209


  0%|          | 0/2500 [00:00<?, ?it/s]

26: Current loss in training 0.04646369729712606


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9324422239256511


  0%|          | 0/2500 [00:00<?, ?it/s]

27: Current loss in training 0.046345261772722


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9273785036876308


  0%|          | 0/2500 [00:00<?, ?it/s]

28: Current loss in training 0.0459206248216331


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9286818793646057


  0%|          | 0/2500 [00:00<?, ?it/s]

29: Current loss in training 0.045788531848043205


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9269202681003669


  0%|          | 0/2500 [00:00<?, ?it/s]

30: Current loss in training 0.04548644805140793


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9258647604984396


  0%|          | 0/2500 [00:00<?, ?it/s]

31: Current loss in training 0.04512299887835979


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9270452427913786


  0%|          | 0/2500 [00:00<?, ?it/s]

32: Current loss in training 0.04483475595936179


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9264659863310728


  0%|          | 0/2500 [00:00<?, ?it/s]

33: Current loss in training 0.04453373244330287


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9273257802350566


  0%|          | 0/2500 [00:00<?, ?it/s]

34: Current loss in training 0.044242083839699624


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.929156001157824


  0%|          | 0/2500 [00:00<?, ?it/s]

35: Current loss in training 0.04426339259147644


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9283086146848297


  0%|          | 0/2500 [00:00<?, ?it/s]

36: Current loss in training 0.04392811592891813


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9339532096123675


  0%|          | 0/2500 [00:00<?, ?it/s]

37: Current loss in training 0.04374562946520746


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9279208335248306


  0%|          | 0/2500 [00:00<?, ?it/s]

38: Current loss in training 0.04327906314097345


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9282049145991023


  0%|          | 0/2500 [00:00<?, ?it/s]

39: Current loss in training 0.043280678112804886


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9282656260259431


  0%|          | 0/1 [00:00<?, ?it/s]

RMSE Score: 0.9282656260259431
