In [None]:
!pip install lightfm

In [None]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from lightfm import LightFM

In [None]:
from lightfm.datasets import fetch_movielens

In [None]:
from tqdm.notebook import tqdm

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import ndcg_score

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
import torchvision.datasets
import torchvision.transforms as transforms
from torch.autograd import Variable
import torch.nn.functional as F

In [None]:
from itertools import chain

In [None]:
!wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# data

В качестве датасета возьмем movielens100k. Этот датасет используется во многих работах по collaborative filltering (в частности в статье про NCF его и используют). Однако movielens датасет с эксплисит данными, поэтому в этой работе мы будем считать за взаимодействие любую оценку (аналогично работе по NCF).

In [None]:
ratings = pd.read_csv('ml-1m/ratings.dat', delimiter='::', header=None, 
            names=['user_id', 'movie_id', 'rating', 'timestamp'], 
            usecols=['user_id', 'movie_id', 'rating'], engine='python')

In [None]:
movie_info = pd.read_csv('ml-1m/movies.dat', delimiter='::', header=None, 
        names=['movie_id', 'name', 'category'], engine='python')

In [None]:
uid2userid = np.unique(ratings["user_id"])
userid2uid = {i: j for (j, i) in enumerate(uid2userid)}

In [None]:
mid2movieid = np.unique(ratings["movie_id"])
movieid2mid = {i: j for (j, i) in enumerate(mid2movieid)}

In [None]:
new_users = [userid2uid[i] for i in ratings['user_id']]
new_movies = [movieid2mid[i] for i in ratings['movie_id']]

In [None]:
explicit = np.vstack((new_users, new_movies, ratings["rating"].to_numpy())).T
implicit = explicit[:, :2]

In [None]:
user_count, movie_count = len(uid2userid), len(mid2movieid)

Тестировать модели будем по заранее отложенной выборке фильмов. Выборка получается выкидыванием у каждого из юзеров по последнему фильму (LOO).

In [None]:
test_data = np.empty((0,2), int)
train_data = np.empty((0,2), int)
neg_dict = {}
pos_dict = []

In [None]:
for i in tqdm(range(user_count)):
    seen_data = implicit[implicit[:,0] == i]
    pos_dict.append(seen_data[:-1,1])
    seen = set(seen_data[:-1,1])
    if len(seen) < 2:
        continue
    unseen = set(np.arange(movie_count)) - seen
    unseen = np.array(list(unseen), dtype=int)
    neg_dict[i] = unseen

    test_data = np.vstack((test_data, seen_data[-1]))
    train_data = np.vstack((train_data, seen_data[:-1]))

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))




In [None]:
train_data_coo = sp.coo_matrix((np.ones_like(train_data[:,0]), (train_data[:,0], train_data[:,1])))
train_data_coo = train_data_coo.tocsr()

In [None]:
get_similars = lambda item_id, model : [movie_info[movie_info["movie_id"] == mid2movieid[i]]["name"].to_string()
                                        for i in model.similar_items(movieid2mid[item_id])]

In [None]:
get_recommendations = lambda user_id, model, mat, col="name" : [movie_info[movie_info["movie_id"] == mid2movieid[i]][col].to_string() 
                                                    for i in model.recommend(userid2uid[user_id], mat)]

In [None]:
get_user_history = lambda user_id, implicit_ratings : [movie_info[movie_info["movie_id"] == mid2movieid[x]]["name"].to_string() 
                                            for x in implicit_ratings[implicit_ratings[:,0] == userid2uid[user_id]][:,1]]

Рекомендовать будем все 4 пользователю со следующей историей:

In [None]:
get_user_history(4, implicit)

['3399    Hustler, The (1961)',
 '1192    Star Wars: Episode VI - Return of the Jedi (1983)',
 '2882    Fistful of Dollars, A (1964)',
 '1196    Alien (1979)',
 '1023    Die Hard (1988)',
 '257    Star Wars: Episode IV - A New Hope (1977)',
 '1959    Saving Private Ryan (1998)',
 '476    Jurassic Park (1993)',
 '1178    Star Wars: Episode V - The Empire Strikes Back...',
 '1180    Raiders of the Lost Ark (1981)',
 '1885    Rocky (1976)',
 '1081    E.T. the Extra-Terrestrial (1982)',
 '3349    Thelma & Louise (1991)',
 '3633    Mad Max (1979)',
 '2297    King Kong (1933)',
 '1366    Jaws (1975)',
 '3458    Predator (1987)',
 '1183    Good, The Bad and The Ugly, The (1966)',
 '2623    Run Lola Run (Lola rennt) (1998)',
 '2878    Goldfinger (1964)',
 '1220    Terminator, The (1984)']

In [397]:
def test_model(model, sim=True, rec=True):
    if sim:
        print(f"{'-'*5}SIMILARS{'-'*5}")
        res = get_similars(1, model)
        for r in get_similars(1, model):
            print(r.split(maxsplit=1)[1])
        
    if rec:
        print(f"{'-'*5}RECOMENDATIONS{'-'*5}")
        res1 = get_recommendations(4, model, train_data_coo, col="category")
        res2 = get_recommendations(4, model, train_data_coo, col="name")
        for r1, r2 in zip(res2, res1):
            print(f"{r1.split(maxsplit=1)[1]:50} {r2.split()[1]}")

В качестве метрик качества возьмем популярные метрики для рекомендаций HR@k и nDCG@k. Для каждого из отложенного фильма, добавим к нему 99 случайных фильмов, который пользователь не смотерл и вот для такого набора из 100 фильмов будем считать эти метрики

In [None]:
def get_hr_ndcg(model, k=10, prnt=True):
    y_true = np.zeros(100, dtype=int)
    y_true[0] = 1 
    ndcg = []
    hr = []  
    for u in tqdm(range(user_count)):
        pos_ex = test_data[u].copy()
        neg_ex = np.vstack((u * np.ones(99), np.random.choice(neg_dict[u], 99, replace=False))).T
        test = np.vstack((pos_ex, neg_ex))
        y_pred = model.predict(test)
        ndcg.append(ndcg_score([y_true], 
                               [y_pred], 
                               k=k))
        hr.append((np.argsort(y_pred)[-10:] == 0).sum())
    if prnt:
        print(f"nDCG@{k}={np.mean(ndcg):4f}")
        print(f"HR@{k}  ={np.mean(hr):4f}")
    else:
        return np.mean(ndcg), np.mean(hr)

# WARP

В качестве простого рекомендера возьмем WARP из lightfm. Лучший результат (по метрикам) приведен ниже.

In [None]:
alpha = 1e-03
epochs = 40
num_components = 64
lr = 0.05

In [None]:
warp_model = LightFM(no_components=num_components,
                    learning_rate = lr,
                    loss='warp',
                    learning_schedule='adagrad',
                    max_sampled=100,
                    user_alpha=alpha,
                    item_alpha=alpha)

In [None]:
warp_model.fit_partial(train_data_coo, epochs=epochs, verbose=True,)

Epoch: 100%|██████████| 40/40 [02:41<00:00,  4.05s/it]


<lightfm.lightfm.LightFM at 0x7f8e521f9400>

In [None]:
class WARPRecommender():
    def __init__(self, warp):
        self.model = warp
        self.u = warp.user_embeddings
        self.v = warp.item_embeddings
        self.bv = warp.item_biases
        self.bu = warp.user_biases
    
    def similar_items(self, item_id, N=10):
        scores = self.v @ self.v[item_id] / np.linalg.norm(self.v, axis=-1)
        ind = np.argsort(scores)[::-1][:N]
        return ind

    def recommend(self, user_id, user_items, N=10):
        unseen = np.array([False if i in user_items[user_id].nonzero()[1] else True for i in np.arange(len(self.v))])
        scores = self.v[unseen] @ self.u[user_id] + self.bv[unseen]
        ind = np.argsort(scores)[::-1]
        real_ind = np.arange(len(self.v))[unseen][ind]
        return real_ind[:N]
    
    def predict(self, test):
        return self.model.predict(test[:, 0], test[:, 1])

In [None]:
warp = WARPRecommender(warp_model)

In [None]:
test_model(warp)

-----SIMILARS-----
Toy Story (1995)
Forrest Gump (1994)
Groundhog Day (1993)
Ghostbusters (1984)
Toy Story 2 (1999)
Nightmare Before Christmas, The (1993)
Aladdin (1992)
Lion King, The (1994)
Bug's Life, A (1998)
Who Framed Roger Rabbit? (1988)
-----RECOMENDATIONS-----
Terminator 2: Judgment Day (1991)                  Action|Sci-Fi|Thriller
Back to the Future (1985)                          Comedy|Sci-Fi
Matrix, The (1999)                                 Action|Sci-Fi|Thriller
American Beauty (1999)                             Comedy|Drama
Godfather, The (1972)                              Action|Crime|Drama
Men in Black (1997)                                Action|Adventure|Comedy|Sci-Fi
Silence of the Lambs, The (1991)                   Drama|Thriller
Braveheart (1995)                                  Action|Drama|War
Terminator, The (1984)                             Action|Sci-Fi|Thriller
Fargo (1996)                                       Crime|Drama|Thriller


Рекоммендации и симилары получились вполне адекватными

In [None]:
get_hr_ndcg(warp)

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))


nDCG@k=0.317216
hr@k  =0.547351


Метрики лучше, чем у случайного рекомендера (для теста, который я провожу), а остальное можно будет сказать только в сравнении с другими моделями

# NCF

небольшой процессинг даты для ncf

In [None]:
class Data:
    def __init__(self, dataset, neg_dict):
        self.pos = dataset.copy()
        self.data = None
        self.neg = None
        self.neg_dict = neg_dict


    def set_neg(self, mode="lazy"):
        if mode == "correct":
            self.neg = self.pos.copy()
            new_negs = None
            for i in range(user_count):
                cur_pos = self.pos[:,0] == i
                len_cur = (self.pos[:,0] == i).sum()
                cur_negs = np.random.choice(self.neg_dict[i], len_cur)
                if new_negs is None:
                    new_negs = cur_negs
                else:
                    new_negs = np.hstack((new_negs, cur_negs))
            self.neg[:,1] = new_negs
        elif mode == "lazy":
            self.neg = self.pos.copy()
            self.neg[:,1] = np.random.choice(np.arange(movie_count), len(self.pos))
        self.data = np.vstack((self.pos, self.neg))
        self.labels = np.hstack(( np.ones(len(self.pos)), np.zeros(len(self.neg)) ))
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, i):
        return (*self.data[i], self.labels[i])
        # pos_ex = self.pos[i]
        # neg_ex = self.neg[i]
        # # negs = self.neg[self.neg[:,0]==pos_ex[0]]
        # # neg_ex = self.neg[np.random.randint(len(self.neg))]

        # return [*pos_ex, neg_ex[1]]

In [None]:
traindata = Data(train_data, neg_dict)

In [None]:
class RecData:
    def __init__(self, uid, unseen):
        self.uid = uid
        self.unseen = unseen
    
    def __len__(self):
        return len(self.unseen)

    def __getitem__(self, i):
        return np.array([self.uid, self.unseen[i]])

NCF берем из [статьи](https://arxiv.org/abs/1708.05031). Она состоит из трех частей: MLP, GMF, NeuMF. GMF аналог простой MF, MLP заменяет скалярное произведение для скора на нейронку, а NeuMF комбинируют их внутри себя. Мы сперва предобучаем GMF и MLP, а потом обучаем NeuMF с предобучеными частями (можно ничего не обучать заранее, но так результаты ниже).

In [None]:
class NCFBase(nn.Module):
    def __init__(self, user_count, movie_count):
        super().__init__()
        self.n = user_count
        self.m = movie_count
    
    
    def part_forward(self, uid, mid):
        raise NotImplementedError()

    def forward(self, uid, mid):
        raise NotImplementedError()
    
    def set_emb(self):
        raise NotImplementedError()

    def predict(self, test):
        loader = torch.utils.data.DataLoader(test, batch_size=512)
        y_pred = None
        
        for batch in loader:
            users = batch[:, 0].to(device)
            items = batch[:, 1].to(device)
            out = self(users.long(), items.long())
            if y_pred is None:
                y_pred = out.squeeze().cpu().detach().numpy()
            else:
                y_pred = np.hstack((y_pred, out.squeeze().cpu().detach().numpy()))
        return y_pred
                

    def fit(self,
            train_data, 
            test_data, 
            num_epochs=100,
            lr=1e-3,
            bs=128, 
            mode='lazy'):    
        opt = optim.Adam(self.parameters(), lr=lr)
        loss_func = nn.BCELoss()
        for epoch in tqdm(range(num_epochs)):
            running_loss = 0
            train_data.set_neg(mode)
            trainloader = torch.utils.data.DataLoader(train_data, 
                                                      batch_size=bs, 
                                                      shuffle=True)

            for (users, items, target) in trainloader:
                users = users.to(device)
                items = items.to(device)
                target = target.to(device)
                    
                out = self(users, items)
                loss = loss_func(out, target.float().view(-1, 1))
                opt.zero_grad()
                loss.backward()
                opt.step()
                running_loss += loss.item()
            # if (epoch % 5 == 4):
            #     ndcg, hr = get_hr_ndcg(self, prnt=False)
            #     print()
            # y_pred = self.predict(test_data)
            # auc = roc_auc_score(test_data[:, 2], y_pred)
            print(f"[{epoch}/{num_epochs}]:loss={running_loss:.4f}")
        
        self.set_emb()

    def similar_items(self, item_id, N=10):
        scores = self.v @ self.v[item_id] / np.linalg.norm(self.v, axis=-1)
        ind = np.argsort(scores)[::-1][:N]
        return ind

    def recommend(self, user_id, user_items, N=10, new=True):
        if new:
            unseen = np.array([i for i in np.arange(len(self.v)) if i not in user_items[user_id].nonzero()[1]])
            td = RecData(user_id, unseen)
            scores = self.predict(td)
        else:
            unseen = np.array([False if i in user_items[user_id].nonzero()[1] else True for i in np.arange(len(self.v))])
            scores = self.v[unseen] @ self.u[user_id]
        ind = np.argsort(scores)[::-1]
        real_ind = np.arange(len(self.v))[unseen][ind]
        return real_ind[:N]

In [None]:
class MLP(NCFBase):
    def __init__(self, user_count, movie_count, ls, hd):
        super().__init__(user_count, movie_count)
        self.ls = ls

        self.embedding_user = nn.Embedding(num_embeddings=self.n, embedding_dim=ls)
        self.embedding_item = nn.Embedding(num_embeddings=self.m, embedding_dim=ls)
        self.model = nn.Sequential(nn.Linear(2 * ls, hd), 
                                   nn.ReLU(), 
                                   nn.Linear(hd, hd),
                                   nn.ReLU(), 
                                   nn.Linear(hd, hd),
                                   nn.ReLU(), 
                                   nn.Linear(hd, hd), 
                                   nn.ReLU())
        
        self.last =  nn.Sequential(nn.Linear(hd, 1), 
                                   nn.Sigmoid())
        
        self.set_emb()
    
    
    def part_forward(self, uid, mid):
        user_embedding = self.embedding_user(uid)
        item_embedding = self.embedding_item(mid)
        vector = torch.cat([user_embedding, item_embedding], dim=-1)
        out = self.model(vector)
        return out

    def forward(self, uid, mid):
        user_embedding = self.embedding_user(uid)
        item_embedding = self.embedding_item(mid)
        vector = torch.cat([user_embedding, item_embedding], dim=-1)
        out = self.model(vector)
        rating = self.last(out)
        return rating

    def set_emb(self):
        self.v = self.embedding_item.weight.detach().cpu().numpy()
        self.u = self.embedding_user.weight.detach().cpu().numpy()

In [None]:
class GMF(NCFBase):
    def __init__(self, user_count, movie_count, ls):
        super().__init__(user_count, movie_count)
        self.ls = ls

        self.embedding_user = nn.Embedding(num_embeddings=self.n, embedding_dim=ls)
        self.embedding_item = nn.Embedding(num_embeddings=self.m, embedding_dim=ls)
        self.last = nn.Sequential(nn.Linear(ls, 1), 
                                 nn.Sigmoid())
        
        self.set_emb()

    
    def part_forward(self, uid, mid):
        user_embedding = self.embedding_user(uid)
        item_embedding = self.embedding_item(mid)
        out = torch.mul(user_embedding, item_embedding)
        return out
    
    def forward(self, uid, mid):
        user_embedding = self.embedding_user(uid)
        item_embedding = self.embedding_item(mid)
        out = torch.mul(user_embedding, item_embedding)
        rating = self.last(out)
        return rating
    
    def set_emb(self):
        self.v = self.embedding_item.weight.detach().cpu().numpy()
        self.u = self.embedding_user.weight.detach().cpu().numpy()

In [None]:
class NeuMF(NCFBase):
    def __init__(self, gmf, mlp, ls, hd):
        super().__init__(0, 0)
        self.gmf = GMF(user_count, movie_count, 32).to(device)
        self.mlp = MLP(user_count, movie_count, ls, hd)

        self.gmf.load_state_dict(torch.load(gmf))
        self.mlp.load_state_dict(torch.load(mlp))

        self.last = nn.Sequential(nn.Linear(ls+hd, 1), 
                                  nn.Sigmoid())
        
        self.set_emb()
    
    
    def forward(self, uid, mid):
        mlp_out = self.mlp.part_forward(uid, mid)
        gmf_out = self.gmf.part_forward(uid, mid)
        vector = torch.cat([mlp_out, gmf_out], dim=-1)
        out = self.last(vector)
        return out
    
    def predict(self, test):
        loader = torch.utils.data.DataLoader(test, batch_size=512)
        y_pred = None
        
        for batch in loader:
            users = batch[:, 0].to(device)
            items = batch[:, 1].to(device)
            out = self(users.long(), items.long())
            if y_pred is None:
                y_pred = out.squeeze().cpu().detach().numpy()
            else:
                y_pred = np.hstack((y_pred, out.squeeze().cpu().detach().numpy()))
        return y_pred
    
    
    def set_emb(self):
        self.v = np.hstack((self.gmf.embedding_item.weight.detach().cpu().numpy(), 
                            self.mlp.embedding_item.weight.detach().cpu().numpy()))
        self.u = np.hstack((self.gmf.embedding_user.weight.detach().cpu().numpy(), 
                            self.mlp.embedding_user.weight.detach().cpu().numpy()))

In [None]:
mlp = MLP(user_count, movie_count, 32, 32).to(device)
mlp.fit(traindata, None, num_epochs=50, bs=512, lr=1e-2)
torch.save(mlp.state_dict(), 'mlp.pcl')

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

[0/50]:loss=2031.2471
[1/50]:loss=1979.6251
[2/50]:loss=1916.5971
[3/50]:loss=1859.7805
[4/50]:loss=1834.4064
[5/50]:loss=1812.4085
[6/50]:loss=1787.8782
[7/50]:loss=1757.8768
[8/50]:loss=1723.3966
[9/50]:loss=1698.1964
[10/50]:loss=1675.8695
[11/50]:loss=1655.7937
[12/50]:loss=1640.1495
[13/50]:loss=1633.4913
[14/50]:loss=1618.4547
[15/50]:loss=1607.1899
[16/50]:loss=1596.9737
[17/50]:loss=1586.3233
[18/50]:loss=1589.2720
[19/50]:loss=1587.4194
[20/50]:loss=1570.6417
[21/50]:loss=1565.2540
[22/50]:loss=1561.1546
[23/50]:loss=1548.9564
[24/50]:loss=1543.6197
[25/50]:loss=1547.4085
[26/50]:loss=1536.5562
[27/50]:loss=1532.1210
[28/50]:loss=1534.2732
[29/50]:loss=1522.7113
[30/50]:loss=1524.7834
[31/50]:loss=1519.3715
[32/50]:loss=1512.9406
[33/50]:loss=1517.2314
[34/50]:loss=1515.4944
[35/50]:loss=1508.1586
[36/50]:loss=1507.1967
[37/50]:loss=1500.4756
[38/50]:loss=1499.8569
[39/50]:loss=1493.7774
[40/50]:loss=1497.0947
[41/50]:loss=1488.4523
[42/50]:loss=1480.6989
[43/50]:loss=1476.199

In [None]:
test_model(mlp)

-----SIMILARS-----
Toy Story (1995)
Aladdin (1992)
Groundhog Day (1993)
Toy Story 2 (1999)
Babe (1995)
Bug's Life, A (1998)
Beauty and the Beast (1991)
Pleasantville (1998)
Lion King, The (1994)
Wizard of Oz, The (1939)
-----RECOMENDATIONS-----
Fausto (1993)                                      Comedy
Project Moon Base (1953)                           Sci-Fi
Uninvited Guest, An (2000)                         Drama
Tainted (1998)                                     Comedy|Thriller
Naked Man, The (1998)                              Drama
Tashunga (1995)                                    Adventure|Western
Price of Glory (2000)                              Drama
New Jersey Drive (1995)                            Crime|Drama
Return with Honor (1998)                           Documentary
Ogre, The (Der Unhold) (1996)                      Drama


Симилары хорошие, рекомендации нет (почему так будет потом)

In [None]:
get_hr_ndcg(mlp)

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))


nDCG@k=0.378524
hr@k  =0.650828


метрики выше, чем у warp вышли

In [None]:
gmf = GMF(user_count, movie_count, 32).to(device)
gmf.fit(traindata, None, num_epochs=50, bs=512, lr=1e-2)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

[0/50]:loss=2699.3218
[1/50]:loss=2097.3683
[2/50]:loss=1993.3396
[3/50]:loss=1916.0854
[4/50]:loss=1784.8657
[5/50]:loss=1714.0966
[6/50]:loss=1656.9484
[7/50]:loss=1614.3511
[8/50]:loss=1577.3499
[9/50]:loss=1546.8125
[10/50]:loss=1525.5604
[11/50]:loss=1511.7216
[12/50]:loss=1496.1344
[13/50]:loss=1482.3676
[14/50]:loss=1473.7327
[15/50]:loss=1466.3357
[16/50]:loss=1457.2535
[17/50]:loss=1454.3055
[18/50]:loss=1445.3621
[19/50]:loss=1445.1067
[20/50]:loss=1438.6987
[21/50]:loss=1436.0245
[22/50]:loss=1433.3581
[23/50]:loss=1429.5126
[24/50]:loss=1427.6231
[25/50]:loss=1427.0428
[26/50]:loss=1426.4441
[27/50]:loss=1420.2050
[28/50]:loss=1418.7742
[29/50]:loss=1417.9611
[30/50]:loss=1417.1271
[31/50]:loss=1415.4214
[32/50]:loss=1414.0894
[33/50]:loss=1414.4089
[34/50]:loss=1415.4257
[35/50]:loss=1410.0749
[36/50]:loss=1412.0815
[37/50]:loss=1410.8522
[38/50]:loss=1411.3642
[39/50]:loss=1409.3220
[40/50]:loss=1407.2053
[41/50]:loss=1406.2625
[42/50]:loss=1407.5091
[43/50]:loss=1406.446

In [None]:
test_model(gmf)

-----SIMILARS-----
Toy Story (1995)
Toy Story 2 (1999)
Groundhog Day (1993)
Full Monty, The (1997)
Babe (1995)
Bug's Life, A (1998)
Monty Python and the Holy Grail (1974)
Pleasantville (1998)
There's Something About Mary (1998)
Clueless (1995)
-----RECOMENDATIONS-----
M�nage (Tenue de soir�e) (1986)                    Comedy|Drama
Terror in a Texas Town (1958)                      Western
Rhyme & Reason (1997)                              Documentary
Bloody Child, The (1996)                           Drama|Thriller
McCullochs, The (1975)                             Drama
Wooden Man's Bride, The (Wu Kui) (1994)            Drama
Resurrection Man (1998)                            Drama|Thriller
Project Moon Base (1953)                           Sci-Fi
Eden (1997)                                        Drama
Song of Freedom (1936)                             Drama


In [None]:
get_hr_ndcg(gmf)

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))


nDCG@k=0.365738
hr@k  =0.656623


все аналогично MLP

In [None]:
neumf = NeuMF('gmf.pcl', 'mlp.pcl', 32, 32).to(device)
neumf.fit(traindata, None, num_epochs=50, bs=512, lr=5e-3)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

[0/50]:loss=1312.1774
[1/50]:loss=1291.7234
[2/50]:loss=1282.7264
[3/50]:loss=1275.0847
[4/50]:loss=1270.6640
[5/50]:loss=1264.8783
[6/50]:loss=1258.4113
[7/50]:loss=1256.6471
[8/50]:loss=1251.5765
[9/50]:loss=1250.0716
[10/50]:loss=1244.3271
[11/50]:loss=1249.5747
[12/50]:loss=1248.0741
[13/50]:loss=1245.1180
[14/50]:loss=1244.0900
[15/50]:loss=1244.5465
[16/50]:loss=1240.6143
[17/50]:loss=1239.4776
[18/50]:loss=1238.4899
[19/50]:loss=1238.2035
[20/50]:loss=1237.5422
[21/50]:loss=1234.7058
[22/50]:loss=1234.4714
[23/50]:loss=1234.8072
[24/50]:loss=1234.8152
[25/50]:loss=1231.4519
[26/50]:loss=1228.1906
[27/50]:loss=1229.0726
[28/50]:loss=1224.6955
[29/50]:loss=1226.7620
[30/50]:loss=1222.6571
[31/50]:loss=1224.1797
[32/50]:loss=1223.6722
[33/50]:loss=1219.0579
[34/50]:loss=1222.4396
[35/50]:loss=1222.5192
[36/50]:loss=1218.8496
[37/50]:loss=1221.0877
[38/50]:loss=1219.9449
[39/50]:loss=1220.7537
[40/50]:loss=1220.2462
[41/50]:loss=1217.4853
[42/50]:loss=1220.9035
[43/50]:loss=1217.869

In [None]:
test_model(neumf)

-----SIMILARS-----
Toy Story (1995)
Toy Story 2 (1999)
Groundhog Day (1993)
Babe (1995)
Clueless (1995)
Wizard of Oz, The (1939)
Beauty and the Beast (1991)
Lion King, The (1994)
Back to the Future (1985)
Pleasantville (1998)
-----RECOMENDATIONS-----
Ring, The (1927)                                   Drama
Bandits (1997)                                     Drama
Eden (1997)                                        Drama
Live Virgin (1999)                                 Comedy
Tainted (1998)                                     Comedy|Thriller
Boy Called Hate, A (1995)                          Drama
Santa with Muscles (1996)                          Comedy
Tough and Deadly (1995)                            Action|Drama|Thriller
Project Moon Base (1953)                           Sci-Fi
Snowriders (1996)                                  Documentary


Симилары и рекомендации аналогично предыдущим двум моделям

In [None]:
get_hr_ndcg(neumf)

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))


nDCG@k=0.378194
hr@k  =0.663245


Обе метрики при объединении выросли

In [None]:
torch.save(neumf.state_dict(), 'neumf.pcl')

Проблема с рекомендациями в том, что они считались через скалярное произведение, а в данных моделях предсказания делаются не так, поэтому само скалярное произведение ничего не говорит о рекомендациях. Правильно считать рекомендации прогоняя все пары через модели, что и показано ниже и дают уже хорошие фильмы

In [398]:
mlp = MLP(user_count, movie_count, 32, 32).to(device)
mlp.load_state_dict((torch.load('mlp.pcl')))
test_model(mlp, sim=False, rec=True)

-----RECOMENDATIONS-----
Godfather, The (1972)                              Action|Crime|Drama
Blade Runner (1982)                                Film-Noir|Sci-Fi
Godfather: Part II, The (1974)                     Action|Crime|Drama
Aliens (1986)                                      Action|Sci-Fi|Thriller|War
Terminator, The (1984)                             Action|Sci-Fi|Thriller
Close Encounters of the Third Kind (1977)          Drama|Sci-Fi
2001: A Space Odyssey (1968)                       Drama|Mystery|Sci-Fi|Thriller
Schindler's List (1993)                            Drama|War
Terminator 2: Judgment Day (1991)                  Action|Sci-Fi|Thriller
L.A. Confidential (1997)                           Crime|Film-Noir|Mystery|Thriller


In [399]:
gmf = GMF(user_count, movie_count, 32).to(device)
gmf.load_state_dict((torch.load('gmf.pcl')))
test_model(mlp, sim=False, rec=True)

-----RECOMENDATIONS-----
Godfather, The (1972)                              Action|Crime|Drama
Blade Runner (1982)                                Film-Noir|Sci-Fi
Godfather: Part II, The (1974)                     Action|Crime|Drama
Aliens (1986)                                      Action|Sci-Fi|Thriller|War
Terminator, The (1984)                             Action|Sci-Fi|Thriller
Close Encounters of the Third Kind (1977)          Drama|Sci-Fi
2001: A Space Odyssey (1968)                       Drama|Mystery|Sci-Fi|Thriller
Schindler's List (1993)                            Drama|War
Terminator 2: Judgment Day (1991)                  Action|Sci-Fi|Thriller
L.A. Confidential (1997)                           Crime|Film-Noir|Mystery|Thriller


In [400]:
neumf = NeuMF('gmf.pcl', 'mlp.pcl', 32, 32).to(device)
neumf.load_state_dict((torch.load('neumf.pcl')))
test_model(mlp, sim=False, rec=True)

-----RECOMENDATIONS-----
Godfather, The (1972)                              Action|Crime|Drama
Blade Runner (1982)                                Film-Noir|Sci-Fi
Godfather: Part II, The (1974)                     Action|Crime|Drama
Aliens (1986)                                      Action|Sci-Fi|Thriller|War
Terminator, The (1984)                             Action|Sci-Fi|Thriller
Close Encounters of the Third Kind (1977)          Drama|Sci-Fi
2001: A Space Odyssey (1968)                       Drama|Mystery|Sci-Fi|Thriller
Schindler's List (1993)                            Drama|War
Terminator 2: Judgment Day (1991)                  Action|Sci-Fi|Thriller
L.A. Confidential (1997)                           Crime|Film-Noir|Mystery|Thriller


# Attention

В целом, подход к attention будет аналогичен [этой работе](https://arxiv.org/abs/1809.07053), хотя они его напрямую не используют. Будем пытаться предсказывать оценку фильма на основе какой-то прошлой истории пользователя и использовать attention для поиска 'похожести' между фильмами из истории и кандидатом. 

*   Теперь есть только один набор эмбедингов для фильмов. Эмбединг для пользователя считается суммой эмбедингов в его истории
*   Датасет теперь пара из фильмов из истории и фильма следующего за ними

*   Число фильмов в истории всегда фиксируются, для работы с бачами

*   Так как получить из исходного датасета все последовательности фильмов какой-то длины долго, ограничемся только половиной их
 
*   Во время обучения для каждой из историй случайно сэмплируем еще по одному фильму, который будем считать неправильным продолжением (те таргет у него 1)

*   Сама модель получает id фильмов из истории и id кандидата, берет их эмбединги и отдает из MultiheadAttention. Как query берется эмбединг кандидата, а как key, value матрица эмбедингов истории (если брать только self-attention, то зависимость от таргета пропадет и для боевика и комедии внимание будет одинаковым, что кажется плохо). Далее находится эмбединг истории с учетом внимания. После этого можно поступить несколькоми способами: взять их сигмоид произведения эмбедингов и выдать как скор (аналог WARP); взять поточечное произведение и отдать в сетку (аналог GMF); сконкатенировать эмбединги и отдать в сетку (аналог MLP). Лучший результат дал последний подход, поэтому его и используем.





In [None]:
class AttentionData:
    def __init__(self, pos_dict, neg_dict, l=10):
        self.data = np.empty((0, l))
        self.pos = []
        for u in tqdm(range(user_count)):
            pos = pos_dict[u]
            for i in range(len(pos) // 2, len(pos) - l - 1):
                self.data = np.vstack((self.data, pos[i:i+l]))
                self.pos.append(pos[i+l])
        self.pos = np.array(self.pos).reshape(-1, 1)



    def set_neg(self, mode="lazy"):
        if mode == "correct":
            raise NotImplementedError()
        elif mode == "lazy":
            self.neg = np.random.choice(np.arange(movie_count), len(self.pos))
            # self.new = np.hstack((self.pos, self.neg))
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, i):
        return (self.data[i], self.pos[i], self.neg[i])

In [None]:
attData = AttentionData(pos_dict, neg_dict, l=19)

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))




In [None]:
class TestAttention:
    def __init__(self, uid, old, new, l=10):
        self.uid = uid
        self.old = old
        self.new = new
        self.l = l
    
    def __len__(self):
        return len(self.new)
    
    def __getitem__(self, i):
        return self.old[-self.l:], self.new[i]

смена функции для подсчета метрик

In [None]:
def get_attention_hr_ndcg(model, l=19, k=10, prnt=True):
    y_true = np.zeros(100, dtype=int)
    y_true[0] = 1 
    ndcg = []
    hr = []  
    for u in tqdm(range(user_count)):
        test = TestAttention(u, pos_dict[u], np.hstack((test_data[u, 1], np.random.choice(neg_dict[u], 99, replace=False))), l=l)
        y_pred = model.predict(test, False, train_data_coo)
        ndcg.append(ndcg_score([y_true], 
                               [y_pred], 
                               k=k))
        hr.append((np.argsort(y_pred)[-k:] == 0).sum())
    if prnt:
        print(f"nDCG@{k}={np.mean(ndcg):4f}")
        print(f"HR@{k}  ={np.mean(hr):4f}")
    else:
        return np.mean(ndcg), np.mean(hr)

In [None]:
class Attention(nn.Module):
    def __init__(self, movie_count, ls, heads=5, hd=32):
        super().__init__()

        self.m = movie_count
        self.ls = ls
        self.heads = heads

        self.embedding_item = nn.Embedding(num_embeddings=self.m, embedding_dim=ls)
        self.model = nn.MultiheadAttention(ls * heads, heads, kdim=ls, vdim=ls)
        self.last = nn.Sequential(nn.Linear(2 * ls, 2 * hd), 
                                   nn.ReLU(),  
                                   nn.Linear(2 * hd, hd),
                                   nn.ReLU(), 
                                   nn.Linear(hd, 1), 
                                   nn.Sigmoid())
                
        self.set_emb()
        
    
    
    def forward(self, old_mid, new_mid):
        old_embedding = self.embedding_item(old_mid).permute(1, 0, 2)
        new_embedding = self.embedding_item(new_mid).unsqueeze(0)
        if len(new_embedding.shape) == 2:
            new_embedding = new_embedding.unsqueeze(0) 
        _, out = self.model(new_embedding.repeat(1, 1, self.heads), 
                            old_embedding, 
                            old_embedding)

        att_embedding = (out * old_embedding.permute(1, 2, 0)).sum(dim=2)
        new_embedding = new_embedding.squeeze()
        vector = torch.cat([new_embedding, att_embedding], dim=-1)
        out = self.last(vector)
        return out.squeeze()

    def set_emb(self):
        self.v = self.embedding_item.weight.detach().cpu().numpy()

    def predict(self, test, simple=False, user_items=None):
    
        loader = torch.utils.data.DataLoader(test, batch_size=128)
        y_pred = None
        
        for (old_batch, new_batch) in loader:
            old_batch = old_batch.to(device)
            new_batch = new_batch.to(device)
            out = self(old_batch.long(), new_batch.long())
            if y_pred is None:
                y_pred = out.squeeze().cpu().detach().numpy()
            else:
                y_pred = np.hstack((y_pred, out.squeeze().cpu().detach().numpy()))
        return y_pred
                

    def fit(self,
            train_data, 
            num_epochs=100,
            lr=1e-3,
            bs=128, 
            mode='lazy'):    
        opt = optim.Adam(self.parameters(), lr=lr)
        loss_func = nn.BCELoss()
        for epoch in tqdm(range(num_epochs)):
            running_loss = 0
            train_data.set_neg(mode)
            trainloader = torch.utils.data.DataLoader(train_data, 
                                                      batch_size=bs, 
                                                      shuffle=True)

            for (old_batch, pos_item, neg_item) in trainloader:
                old_batch = old_batch.to(device).long()

                pos_item = pos_item.to(device).long().squeeze()
                neg_item = neg_item.to(device).long().squeeze()
                target = torch.ones(old_batch.shape[0]).to(device)

                pos_out = self(old_batch, pos_item)
                neg_out = self(old_batch, neg_item)


                pos_loss = loss_func(pos_out, target.float())
                neg_loss = loss_func(neg_out, (1-target).float())
                
                loss = neg_loss + pos_loss
                opt.zero_grad()
                loss.backward()
                opt.step()
                running_loss += loss.item()
            print(f"[{epoch}/{num_epochs}]:loss={running_loss:.4f}")
        
        self.set_emb()

    def similar_items(self, item_id, N=10):
        scores = self.v @ self.v[item_id] / np.linalg.norm(self.v, axis=-1)
        ind = np.argsort(scores)[::-1][:N]
        return ind

    def similar_items2(self, item_id, N=10):
        item_embedding = self.embedding_item(torch.LongTensor([item_id]).to(device))
        data = torch.utils.data.DataLoader(np.arange(self.m), 
                                           batch_size=100)
        
        scores = None
        for batch in data:
            test_emb = self.embedding_item(batch.to(device).long())
            # print(test_emb.shape)
            # print(item_embedding.shape)
            rep_emb = item_embedding.repeat(len(batch), 1)
            vector = torch.cat([rep_emb, test_emb], dim=-1)
            out = self.last(vector).squeeze()
            if scores is None:
                scores = out.cpu().detach().numpy()
            else:
                scores = np.hstack((scores, out.cpu().detach().numpy()))
        print(scores.shape)
        ind = np.argsort(scores)[::-1][:N]
        return ind

    def recommend(self, user_id, user_items, N=10):
        seen = np.array([True if i in user_items[user_id].nonzero()[1] else True for i in np.arange(len(self.v))])
        unseen = np.array([False if i in user_items[user_id].nonzero()[1] else True for i in np.arange(len(self.v))])
        scores = self.v[unseen] @ (self.v[seen].sum(axis=0))
        ind = np.argsort(scores)[::-1]
        real_ind = np.arange(len(self.v))[unseen][ind]
        return real_ind[:N]

In [None]:
model = Attention(movie_count, 32, 10, 32).to(device)
model.fit(attData, num_epochs=20, bs=1024, lr=5e-3)
model.eval()

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

[0/20]:loss=318.4575
[1/20]:loss=213.0389
[2/20]:loss=158.8854
[3/20]:loss=128.6983
[4/20]:loss=111.4927
[5/20]:loss=99.2976
[6/20]:loss=90.7407
[7/20]:loss=85.4807
[8/20]:loss=80.9144
[9/20]:loss=77.6957
[10/20]:loss=74.9587
[11/20]:loss=72.9357
[12/20]:loss=70.9073
[13/20]:loss=68.9574
[14/20]:loss=68.0276
[15/20]:loss=66.3530
[16/20]:loss=64.8377
[17/20]:loss=64.2510
[18/20]:loss=63.4513
[19/20]:loss=62.9676



Attention(
  (embedding_item): Embedding(3706, 32)
  (model): MultiheadAttention(
    (out_proj): _LinearWithBias(in_features=320, out_features=320, bias=True)
  )
  (last): Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

In [None]:
test_model(model)

-----SIMILARS-----
Toy Story (1995)
Scream (1996)
Death Wish II (1982)
Beauty and the Beast (1991)
Hi-Lo Country, The (1998)
Two Much (1996)
Monument Ave. (1998)
Still Crazy (1998)
Death in Brunswick (1991)
Project Moon Base (1953)
-----RECOMENDATIONS-----
Last Klezmer: Leopold Kozlowski, His Life and ...  Documentary
Hot Lead and Cold Feet (1978)                      Comedy|Western
Scorta, La (1993)                                  Thriller
Tashunga (1995)                                    Adventure|Western
Lost Horizon (1937)                                Drama
Princess Caraboo (1994)                            Drama
Six Ways to Sunday (1997)                          Comedy
Repossessed (1990)                                 Comedy
Ghost (1990)                                       Comedy|Romance|Thriller
Angel on My Shoulder (1946)                        Crime|Drama


Рекомендации и симилары ужасные

In [None]:
get_attention_hr_ndcg(model)

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))


nDCG@10=0.629193
HR@10  =0.888742


Но метрики получились очень хорошими и сильно выше, чем у предыдущих моделей (даже больше, чем цифры в статьях про них)

Почему симилары вышли плохими идей нет, но с рекомендациями проблема как и раньше была в том, что рекомендуются не по скалярному произведению, поэтому поменяем подсчет рекомендаций.

In [None]:
def attention_recommend(user_id, model, mat, l=19, N=10):
    uid = userid2uid[user_id]
    unseen = np.arange(len(model.v))[np.array([False if i in mat[uid].nonzero()[1] else True for i in np.arange(len(model.v))])]
    data = TestAttention(uid, pos_dict[uid], unseen, l=l)
    scores = model.predict(data)
    ind = np.argsort(scores)[::-1]
    real_ind = np.arange(len(model.v))[unseen][ind]
    real_ind = real_ind[:N]
    names = [movie_info[movie_info["movie_id"] == mid2movieid[i]]['name'].to_string() for i in real_ind]
    cats = [movie_info[movie_info["movie_id"] == mid2movieid[i]]['category'].to_string() for i in real_ind]
    for r1, r2 in zip(names, cats):
        print(f"{r1.split(maxsplit=1)[1]:50} {r2.split()[1]}")

In [None]:
attention_recommend(4, model, train_data_coo)

Glory (1989)                                       Action|Drama|War
Terminator, The (1984)                             Action|Sci-Fi|Thriller
Voyage to the Bottom of the Sea (1961)             Adventure|Sci-Fi
Flight of the Navigator (1986)                     Adventure|Children's|Sci-Fi
Fantastic Voyage (1966)                            Adventure|Sci-Fi
Abbott and Costello Meet Frankenstein (1948)       Comedy|Horror
Oliver & Company (1988)                            Animation|Children's
Bank Dick, The (1940)                              Comedy
Robocop (1987)                                     Action|Crime|Sci-Fi
Emerald Forest, The (1985)                         Action|Adventure|Drama


Теперь с рекомендации как надо