In [None]:
!pip install lightfm

Collecting lightfm
[?25l  Downloading https://files.pythonhosted.org/packages/e9/8e/5485ac5a8616abe1c673d1e033e2f232b4319ab95424b42499fabff2257f/lightfm-1.15.tar.gz (302kB)
[K     |█                               | 10kB 24.9MB/s eta 0:00:01[K     |██▏                             | 20kB 15.9MB/s eta 0:00:01[K     |███▎                            | 30kB 13.4MB/s eta 0:00:01[K     |████▍                           | 40kB 12.5MB/s eta 0:00:01[K     |█████▍                          | 51kB 8.7MB/s eta 0:00:01[K     |██████▌                         | 61kB 9.0MB/s eta 0:00:01[K     |███████▋                        | 71kB 9.3MB/s eta 0:00:01[K     |████████▊                       | 81kB 10.3MB/s eta 0:00:01[K     |█████████▊                      | 92kB 9.5MB/s eta 0:00:01[K     |██████████▉                     | 102kB 8.4MB/s eta 0:00:01[K     |████████████                    | 112kB 8.4MB/s eta 0:00:01[K     |█████████████                   | 122kB 8.4MB/s eta 0:00:01

In [None]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from lightfm import LightFM
from lightfm.evaluation import auc_score

In [None]:
from lightfm.datasets import fetch_movielens

In [None]:
from tqdm.notebook import tqdm

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
import torchvision.datasets
import torchvision.transforms as transforms
from torch.autograd import Variable
import torch.nn.functional as F

In [None]:
from itertools import chain

In [None]:
!wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# data

Датасет взял MovieLens, потому что он использовался и в статье про NCF, и в первом задании на BPR/WARP.

In [None]:
# Ratings table. '::' is a multi-character separator, which only the
# pure-Python parser supports. The timestamp column is named but not loaded.
ratings = pd.read_csv(
    'ml-1m/ratings.dat',
    sep='::',
    engine='python',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    usecols=['user_id', 'movie_id', 'rating'],
)

In [None]:
# Movie metadata: id, title and genre string.
# movies.dat is not valid UTF-8 (some titles contain accented ISO-8859-1
# characters), so the encoding is given explicitly; without it recent pandas
# versions raise UnicodeDecodeError instead of silently replacing the bytes.
movie_info = pd.read_csv('ml-1m/movies.dat', delimiter='::', header=None,
        names=['movie_id', 'name', 'category'], encoding='latin-1', engine='python')

меняю id, так как исходно id не идут подряд

In [None]:
# Dense user index: a raw id's position in the sorted unique-id array is its
# new id (uid2userid maps dense -> raw, userid2uid maps raw -> dense).
uid2userid = np.unique(ratings["user_id"])
userid2uid = dict(zip(uid2userid, range(len(uid2userid))))

In [None]:
# Dense movie index, built the same way as the user index
# (mid2movieid maps dense -> raw, movieid2mid maps raw -> dense).
mid2movieid = np.unique(ratings["movie_id"])
movieid2mid = dict(zip(mid2movieid, range(len(mid2movieid))))

In [None]:
# Hold out 20% of the ratings. The split is seeded so that Restart & Run All
# reproduces the same train/test rows (and therefore comparable AUC numbers).
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

In [None]:
# Translate raw MovieLens ids of the training rows into dense indices.
train_users = [userid2uid[raw_id] for raw_id in train['user_id'].tolist()]
train_movies = [movieid2mid[raw_id] for raw_id in train['movie_id'].tolist()]

In [None]:
# One row per training rating: (dense user id, dense movie id, rating).
explicit = np.array([train_users, train_movies, train["rating"].to_numpy()]).T

In [None]:
# Explicit user x movie rating matrix in CSR form.
# The shape is pinned to the full id space: without it scipy infers the shape
# from the largest index present, so the matrix would shrink whenever the
# highest-numbered users/movies end up only in the test split.
user_item = sp.coo_matrix(
    (train["rating"], (train_users, train_movies)),
    shape=(len(uid2userid), len(mid2movieid)),
)
user_item_explicit = user_item.tocsr()

In [None]:
# Sizes of the dense id spaces (number of distinct users / movies in ratings).
user_count = len(uid2userid)
movie_count = len(mid2movieid)

In [None]:
# Implicit positives: keep only the (user, movie) columns of training rows
# whose rating is 4 or higher.
implicit_train = explicit[explicit[:, 2] >= 4, :2]

In [None]:
# Binary user x movie interaction matrix used to train LightFM.
# The shape is given explicitly so the model always covers every user and
# movie id; otherwise ids that only occur in the test split (or only with a
# rating below 4) would fall outside the matrix the model is trained on.
train_data = sp.coo_matrix(
    (np.ones_like(implicit_train[:, 0]), (implicit_train[:, 0], implicit_train[:, 1])),
    shape=(user_count, movie_count),
)
train_data = train_data.tocsr()

In [None]:
def get_similars(item_id, model):
    """Return the titles of the movies `model` considers most similar to a movie.

    `item_id` is a raw MovieLens movie id. `model` must provide
    `similar_items(dense_item_id)` returning dense movie indices. Each returned
    string is the `to_string()` rendering of the matching `movie_info` row's
    name (row label followed by the title).
    """
    titles = []
    for dense_id in model.similar_items(movieid2mid[item_id]):
        matching = movie_info[movie_info["movie_id"] == mid2movieid[dense_id]]
        titles.append(matching["name"].to_string())
    return titles

In [None]:
def get_recommendations(user_id, model, mat, col="name"):
    """Return one `movie_info` column for the movies `model` recommends to a user.

    `user_id` is a raw MovieLens user id; `mat` is passed through to
    `model.recommend(dense_user_id, mat)`, which must return dense movie
    indices. `col` selects the `movie_info` column to render; each returned
    string is its `to_string()` form (row label followed by the value).
    """
    rendered = []
    for dense_id in model.recommend(userid2uid[user_id], mat):
        matching = movie_info[movie_info["movie_id"] == mid2movieid[dense_id]]
        rendered.append(matching[col].to_string())
    return rendered

In [None]:
def get_user_history(user_id, implicit_ratings):
    """Return the titles of the movies a user has interacted with.

    `user_id` is a raw MovieLens user id; `implicit_ratings` is an array whose
    first two columns are (dense user id, dense movie id). Each returned string
    is the `to_string()` rendering of the matching `movie_info` name.
    """
    own_rows = implicit_ratings[implicit_ratings[:, 0] == userid2uid[user_id]]
    history = []
    for dense_id in own_rows[:, 1]:
        matching = movie_info[movie_info["movie_id"] == mid2movieid[dense_id]]
        history.append(matching["name"].to_string())
    return history

In [None]:
# Held-out rows in dense ids, plus a binary label: 1 when the rating is 4 or
# higher, 0 otherwise. implicit_test columns are (user, movie, label).
test_users = [userid2uid[raw_id] for raw_id in test['user_id'].tolist()]
test_movies = [movieid2mid[raw_id] for raw_id in test['movie_id'].tolist()]
explicit_test = np.array([test_users, test_movies, test["rating"].to_numpy()]).T
labels = (explicit_test[:, 2] >= 4).astype(int)
implicit_test = np.hstack((explicit_test[:, :2], labels[:, np.newaxis]))

In [None]:
# Sparse matrix marking every held-out (user, movie) pair, with the shape
# pinned to the full id space so it always lines up with train_data.
# NOTE(review): every test row is stored as 1 here, including rows whose label
# is 0 (rating below 4); train_data only contains positives. Confirm whether
# this matrix should be restricted to label == 1 before using it for ranking
# metrics.
test_data = sp.coo_matrix(
    (np.ones_like(implicit_test[:, 0]), (implicit_test[:, 0], implicit_test[:, 1])),
    shape=(user_count, movie_count),
)
test_data = test_data.tocsr()

История пользователя, которому будут даваться рекомендации:

In [None]:
get_user_history(4, implicit_train)

['1183    Good, The Bad and The Ugly, The (1966)',
 '3633    Mad Max (1979)',
 '1180    Raiders of the Lost Ark (1981)',
 '1959    Saving Private Ryan (1998)',
 '1220    Terminator, The (1984)',
 '1196    Alien (1979)',
 '1885    Rocky (1976)',
 '1081    E.T. the Extra-Terrestrial (1982)',
 '3399    Hustler, The (1961)',
 '2878    Goldfinger (1964)',
 '2882    Fistful of Dollars, A (1964)',
 '476    Jurassic Park (1993)',
 '3349    Thelma & Louise (1991)',
 '1366    Jaws (1975)',
 '1023    Die Hard (1988)',
 '2297    King Kong (1933)']

In [None]:
def test_model(model):
    """Print a qualitative sanity check for a recommender.

    Shows the movies most similar to raw movie id 1 and the recommendations
    for raw user id 4 (title and genre side by side). `model` must provide
    `similar_items` and `recommend` as used by `get_similars` and
    `get_recommendations`.
    """
    print("SIMILARS")
    # The helper is called once; the original computed the list twice and
    # discarded the first result.
    for title in get_similars(1, model):
        # Drop the leading DataFrame row label that to_string() prepends.
        print(title.split(maxsplit=1)[1])

    print('RECOMENDATIONS')
    categories = get_recommendations(4, model, user_item_explicit, col="category")
    names = get_recommendations(4, model, user_item_explicit, col="name")
    for name, category in zip(names, categories):
        print(f"{name.split(maxsplit=1)[1]:50} {category.split()[1]}")

# WARP

In [None]:
# Hyperparameters of the LightFM WARP baseline.
alpha = 1e-03  # passed as both user_alpha and item_alpha of LightFM
epochs = 40  # number of training epochs
num_components = 64  # passed as no_components (embedding size)
lr = 0.05  # passed as learning_rate

In [None]:
# LightFM model trained with the WARP loss; users and items share the same
# regularisation strength `alpha`.
warp_model = LightFM(
    loss='warp',
    no_components=num_components,
    learning_rate=lr,
    learning_schedule='adagrad',
    max_sampled=100,
    user_alpha=alpha,
    item_alpha=alpha,
)

In [None]:
# fit() starts from a freshly initialised model, so re-running this cell always
# yields a model trained for exactly `epochs` epochs. fit_partial() would
# instead resume from the current state and train further on every re-run.
warp_model.fit(train_data, epochs=epochs, verbose=True)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39


<lightfm.lightfm.LightFM at 0x7fedc464cb38>

In [None]:
# Score every held-out (user, movie) pair with the WARP model and compute the
# ROC AUC of those scores against the binary labels in column 2.
scores = warp_model.predict(implicit_test[:, 0], implicit_test[:, 1])
roc_auc_score(implicit_test[:, 2], scores)

0.6863134838544033

In [None]:
class WARPRecommender():
    """Wrap a fitted LightFM model behind `similar_items` / `recommend`.

    The wrapper only reads the learned arrays from the model:
    `u` / `v` are the user / item embeddings and `bu` / `bv` the
    user / item biases.
    """

    def __init__(self, warp):
        self.u = warp.user_embeddings
        self.v = warp.item_embeddings
        self.bv = warp.item_biases
        self.bu = warp.user_biases

    def similar_items(self, item_id, N=10):
        """Return the dense indices of the N items most similar to `item_id`.

        Similarity is the dot product with the query item divided by each
        candidate's norm. The query's own norm is a constant factor and is
        left out, so the ordering equals cosine-similarity ordering.
        """
        scores = self.v @ self.v[item_id] / np.linalg.norm(self.v, axis=-1)
        ind = np.argsort(scores)[::-1][:N]
        return ind

    def recommend(self, user_id, user_items, N=10):
        """Return the dense indices of the N best-scoring items the user has not seen.

        `user_items` is a sparse user x item matrix; every non-zero entry in
        row `user_id` is treated as seen and excluded. The score is
        `v . u[user_id] + item bias`; the user bias is the same for every
        item and does not affect the ranking.
        """
        n_items = len(self.v)
        # Read the user's row once and build the mask vectorised; the original
        # recomputed nonzero() for every single item.
        seen = user_items[user_id].nonzero()[1]
        unseen = np.ones(n_items, dtype=bool)
        # Ignore column indices the model has no embedding for.
        unseen[seen[seen < n_items]] = False
        scores = self.v[unseen] @ self.u[user_id] + self.bv[unseen]
        ind = np.argsort(scores)[::-1]
        real_ind = np.arange(n_items)[unseen][ind]
        return real_ind[:N]

In [None]:
warp = WARPRecommender(warp_model)

In [None]:
get_similars(1, warp)

['0    Toy Story (1995)',
 '3045    Toy Story 2 (1999)',
 '1245    Groundhog Day (1993)',
 "2286    Bug's Life, A (1998)",
 "2849    Ferris Bueller's Day Off (1986)",
 '1899    Breakfast Club, The (1985)',
 '1179    Princess Bride, The (1987)',
 '2647    Ghostbusters (1984)',
 '584    Aladdin (1992)',
 '2252    Pleasantville (1998)']

In [None]:
get_recommendations(4, warp, user_item_explicit, col="name")

['585    Terminator 2: Judgment Day (1991)',
 '2502    Matrix, The (1999)',
 '847    Godfather, The (1972)',
 '1250    Back to the Future (1985)',
 '589    Silence of the Lambs, The (1991)',
 '108    Braveheart (1995)',
 '1179    Princess Bride, The (1987)',
 '2789    American Beauty (1999)',
 '1182    Aliens (1986)',
 '537    Blade Runner (1982)']

In [None]:
get_recommendations(4, warp, user_item_explicit, col="category")

['585    Action|Sci-Fi|Thriller',
 '2502    Action|Sci-Fi|Thriller',
 '847    Action|Crime|Drama',
 '1250    Comedy|Sci-Fi',
 '589    Drama|Thriller',
 '108    Action|Drama|War',
 '1179    Action|Adventure|Comedy|Romance',
 '2789    Comedy|Drama',
 '1182    Action|Sci-Fi|Thriller|War',
 '537    Film-Noir|Sci-Fi']

Получились и хорошие рекомендации, и хорошие симилары.

# NCF

In [None]:
class Data:
    """Training set of positive pairs plus freshly sampled negative pairs.

    `dataset` is an integer array whose first two columns are
    (dense user id, dense item id) of positive interactions. `set_neg` must be
    called before the object is indexed: it samples one negative per positive
    and fills `data` (positives followed by negatives) and `labels`
    (1 for positives, 0 for negatives).

    `n_users` / `n_items` default to the notebook-level `user_count` /
    `movie_count`.
    """

    def __init__(self, dataset, n_users=None, n_items=None):
        self.pos = dataset
        self.data = None
        self.neg = None
        self.labels = None
        self.n_users = user_count if n_users is None else n_users
        self.n_items = movie_count if n_items is None else n_items
        # For every user: the items that are NOT among that user's positives.
        self.neg_dict = {}
        all_items = set(range(self.n_items))
        for i in tqdm(range(self.n_users)):
            seen = set(self.pos[self.pos[:, 0] == i][:, 1])
            self.neg_dict[i] = np.array(sorted(all_items - seen), dtype=int)

    def set_neg(self, mode="lazy"):
        """Resample the negative half of the dataset.

        mode="lazy": each positive row gets a uniformly random item, which may
            by chance be one of the user's positives.
        mode="correct": each positive row gets an item drawn from that user's
            own pool in `neg_dict`.

        Raises ValueError for any other mode.
        """
        if mode == "correct":
            self.neg = self.pos.copy()
            for i in range(self.n_users):
                cur_pos = self.pos[:, 0] == i
                n_cur = int(cur_pos.sum())
                if n_cur:
                    # Write the samples into this user's own rows. The original
                    # concatenated them in user order and assigned them
                    # positionally, which pairs negatives with the wrong users
                    # whenever `pos` is not sorted by user.
                    self.neg[cur_pos, 1] = np.random.choice(self.neg_dict[i], n_cur)
        elif mode == "lazy":
            self.neg = self.pos.copy()
            self.neg[:, 1] = np.random.choice(np.arange(self.n_items), len(self.pos))
        else:
            raise ValueError(f"unknown negative sampling mode: {mode!r}")
        self.data = np.vstack((self.pos, self.neg))
        self.labels = np.hstack((np.ones(len(self.pos)), np.zeros(len(self.neg))))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        """Return (user, item, label) for row `i` of the combined data."""
        return (*self.data[i], self.labels[i])

In [None]:
traindata = Data(implicit_train)

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))




Следующие классы — это имплементация моделей из статьи Neural Collaborative Filtering.

In [None]:
class NCFBase(nn.Module):
    """Shared training, prediction and recommendation code for the NCF models.

    Subclasses implement `forward` (probability for a batch of user/item ids),
    `part_forward` (the representation fed to the final layer) and `set_emb`,
    which must store the item / user embedding matrices as numpy arrays in
    `self.v` / `self.u` for `similar_items` and `recommend`.
    """

    def __init__(self, user_count, movie_count):
        super().__init__()
        self.n = user_count
        self.m = movie_count

    def part_forward(self, uid, mid):
        raise NotImplementedError()

    def forward(self, uid, mid):
        raise NotImplementedError()

    def set_emb(self):
        raise NotImplementedError()

    def _model_device(self):
        """Return the device the parameters live on (CPU if there are none)."""
        param = next(self.parameters(), None)
        return param.device if param is not None else torch.device("cpu")

    def predict(self, test):
        """Return the model output for every row of `test` as a 1-D numpy array.

        `test` is an integer array whose first two columns are
        (dense user id, dense item id). The result always has one entry per
        row, including for a single row or for empty input.
        """
        loader = torch.utils.data.DataLoader(test, batch_size=512)
        dev = self._model_device()
        chunks = []
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            for batch in loader:
                users = batch[:, 0].to(dev)
                items = batch[:, 1].to(dev)
                # reshape(-1) keeps a batch of one row 1-D; squeeze() would
                # collapse it to a 0-d scalar.
                chunks.append(self(users, items).reshape(-1).cpu().numpy())
        if not chunks:
            return np.empty(0, dtype=np.float32)
        return np.concatenate(chunks)

    def fit(self, train_data, test_data, num_epochs=100, lr=1e-3, bs=128):
        """Train with Adam and binary cross-entropy, printing loss and test AUC per epoch.

        `train_data` must provide `set_neg` and yield (user, item, label)
        triples; negatives are resampled at the start of every epoch.
        `test_data` is an array with columns (user, item, label). The printed
        loss is the sum of the batch losses of the epoch. `set_emb` is called
        once training has finished.
        """
        dev = self._model_device()
        opt = optim.Adam(self.parameters(), lr=lr)
        loss_func = nn.BCELoss()
        for epoch in range(num_epochs):
            running_loss = 0
            train_data.set_neg('lazy')
            trainloader = torch.utils.data.DataLoader(train_data,
                                                      batch_size=bs,
                                                      shuffle=True)

            for (users, items, target) in trainloader:
                users = users.to(dev)
                items = items.to(dev)
                target = target.to(dev)

                out = self(users, items)
                loss = loss_func(out, target.float().view(-1, 1))
                opt.zero_grad()
                loss.backward()
                opt.step()
                running_loss += loss.item()
            y_pred = self.predict(test_data)
            auc = roc_auc_score(test_data[:, 2], y_pred)
            print(f"[{epoch}/{num_epochs}]:loss={running_loss:.4f}; auc={auc:.4f}")

        self.set_emb()

    def similar_items(self, item_id, N=10):
        """Return the dense indices of the N items most similar to `item_id`.

        Similarity is the dot product with the query item's embedding divided
        by each candidate's norm, which orders items like cosine similarity.
        """
        scores = self.v @ self.v[item_id] / np.linalg.norm(self.v, axis=-1)
        ind = np.argsort(scores)[::-1][:N]
        return ind

    def recommend(self, user_id, user_items, N=10):
        """Return the dense indices of the N best-scoring items the user has not seen.

        `user_items` is a sparse user x item matrix; non-zero entries in row
        `user_id` are excluded.

        NOTE(review): items are ranked by the raw dot product of the stored
        embeddings, not by the output of `forward`, so the trained layers on
        top of the embeddings do not influence the ranking. Confirm whether
        ranking by `predict` scores is wanted instead.
        """
        n_items = len(self.v)
        # Read the user's row once and build the mask vectorised; the original
        # recomputed nonzero() for every single item.
        seen = user_items[user_id].nonzero()[1]
        unseen = np.ones(n_items, dtype=bool)
        # Ignore column indices the model has no embedding for.
        unseen[seen[seen < n_items]] = False
        scores = self.v[unseen] @ self.u[user_id]
        ind = np.argsort(scores)[::-1]
        real_ind = np.arange(n_items)[unseen][ind]
        return real_ind[:N]

In [None]:
class MLP(NCFBase):
    """MLP branch of NCF: concatenated user/item embeddings -> two ReLU layers -> sigmoid.

    `ls` is the embedding size of each side and `hd` the width of the hidden
    layers.
    """

    def __init__(self, user_count, movie_count, ls, hd):
        super().__init__(user_count, movie_count)
        self.ls = ls

        self.embedding_user = nn.Embedding(self.n, ls)
        self.embedding_item = nn.Embedding(self.m, ls)

        hidden_layers = [nn.Linear(2 * ls, hd), nn.ReLU(),
                         nn.Linear(hd, hd), nn.ReLU()]
        self.model = nn.Sequential(*hidden_layers)

        self.last = nn.Sequential(nn.Linear(hd, 1), nn.Sigmoid())

        self.set_emb()

    def part_forward(self, uid, mid):
        """Return the hidden representation (output of the ReLU stack) for each pair."""
        pair = torch.cat([self.embedding_user(uid), self.embedding_item(mid)], dim=-1)
        return self.model(pair)

    def forward(self, uid, mid):
        """Return the sigmoid output for each (user, item) pair, shape (batch, 1)."""
        return self.last(self.part_forward(uid, mid))

    def set_emb(self):
        """Store the current item / user embedding weights as numpy arrays in v / u."""
        self.v = self.embedding_item.weight.detach().cpu().numpy()
        self.u = self.embedding_user.weight.detach().cpu().numpy()

In [None]:
class GMF(NCFBase):
    """GMF branch of NCF: element-wise product of embeddings -> linear layer -> sigmoid.

    `ls` is the embedding size shared by users and items.
    """

    def __init__(self, user_count, movie_count, ls):
        super().__init__(user_count, movie_count)
        self.ls = ls

        self.embedding_user = nn.Embedding(self.n, ls)
        self.embedding_item = nn.Embedding(self.m, ls)

        self.last = nn.Sequential(nn.Linear(ls, 1), nn.Sigmoid())

        self.set_emb()

    def part_forward(self, uid, mid):
        """Return the element-wise product of the user and item embeddings."""
        return torch.mul(self.embedding_user(uid), self.embedding_item(mid))

    def forward(self, uid, mid):
        """Return the sigmoid output for each (user, item) pair, shape (batch, 1)."""
        return self.last(self.part_forward(uid, mid))

    def set_emb(self):
        """Store the current item / user embedding weights as numpy arrays in v / u."""
        self.v = self.embedding_item.weight.detach().cpu().numpy()
        self.u = self.embedding_user.weight.detach().cpu().numpy()

In [None]:
class NeuMF(NCFBase):
    """Fusion of a GMF and an MLP model: their representations are concatenated
    and passed through one linear layer with a sigmoid.

    `gmf` and `mlp` become sub-modules, so their parameters are trained
    together with the fusion layer. `ls` must equal the GMF embedding size and
    `hd` the MLP hidden width, because the fusion layer takes `ls + hd` inputs.

    Batch prediction is inherited from NCFBase; the copy that used to live
    here was identical to the base implementation.
    """

    def __init__(self, gmf, mlp, ls, hd):
        # Take the user/item counts from the wrapped model instead of
        # hard-coding 0 (falls back to 0 if the attributes are absent).
        super().__init__(getattr(gmf, "n", 0), getattr(gmf, "m", 0))
        self.gmf = gmf
        self.mlp = mlp

        self.last = nn.Sequential(nn.Linear(ls+hd, 1),
                                  nn.Sigmoid())

        self.set_emb()

    def forward(self, uid, mid):
        """Return the sigmoid output for each (user, item) pair, shape (batch, 1)."""
        mlp_out = self.mlp.part_forward(uid, mid)
        gmf_out = self.gmf.part_forward(uid, mid)
        vector = torch.cat([mlp_out, gmf_out], dim=-1)
        out = self.last(vector)
        return out

    def set_emb(self):
        """Store the GMF and MLP embeddings, concatenated side by side
        (GMF columns first), as numpy arrays in v (items) / u (users)."""
        self.v = np.hstack((self.gmf.embedding_item.weight.detach().cpu().numpy(),
                            self.mlp.embedding_item.weight.detach().cpu().numpy()))
        self.u = np.hstack((self.gmf.embedding_user.weight.detach().cpu().numpy(),
                            self.mlp.embedding_user.weight.detach().cpu().numpy()))

In [None]:
mlp = MLP(user_count, movie_count, 32, 32).to(device)
mlp.fit(traindata, implicit_test, num_epochs=10, bs=512, lr=1e-2)

[0/10]:loss=870.9032; auc=0.6763
[1/10]:loss=829.4932; auc=0.6779
[2/10]:loss=824.0259; auc=0.6791
[3/10]:loss=810.9609; auc=0.6891
[4/10]:loss=775.5384; auc=0.7013
[5/10]:loss=756.8107; auc=0.6939
[6/10]:loss=739.2789; auc=0.7005
[7/10]:loss=720.4769; auc=0.7027
[8/10]:loss=703.2066; auc=0.7012
[9/10]:loss=688.3626; auc=0.7103


In [None]:
test_model(mlp)

SIMILARS
Toy Story (1995)
Groundhog Day (1993)
Toy Story 2 (1999)
Sixth Sense, The (1999)
Saving Private Ryan (1998)
Stand by Me (1986)
Star Wars: Episode V - The Empire Strikes Back...
Braveheart (1995)
Princess Bride, The (1987)
Chicken Run (2000)
RECOMENDATIONS
Bay of Blood (Reazione a catena) (1971)            Horror
Moonlight Murder (1936)                            Mystery
Brothers in Trouble (1995)                         Drama
Last of the High Kings, The (a.k.a. Summer Fli...  Drama
Rent-A-Cop (1988)                                  Action|Comedy
Digging to China (1998)                            Drama
Buck and the Preacher (1972)                       Western
Bootmen (2000)                                     Comedy|Drama
Let's Talk About Sex (1998)                        Drama
For Ever Mozart (1996)                             Drama


In [None]:
gmf = GMF(user_count, movie_count, 32).to(device)
gmf.fit(traindata, implicit_test, num_epochs=10, bs=512, lr=1e-2)

[0/10]:loss=1249.8260; auc=0.5005
[1/10]:loss=1136.8895; auc=0.6637
[2/10]:loss=849.6198; auc=0.6702
[3/10]:loss=812.2539; auc=0.6844
[4/10]:loss=755.5984; auc=0.6908
[5/10]:loss=708.4372; auc=0.6965
[6/10]:loss=667.6031; auc=0.6957
[7/10]:loss=639.3491; auc=0.6995
[8/10]:loss=616.4024; auc=0.7042
[9/10]:loss=600.1062; auc=0.7045


In [None]:
test_model(gmf)

SIMILARS
Toy Story (1995)
Toy Story 2 (1999)
Aladdin (1992)
Babe (1995)
Bug's Life, A (1998)
Princess Bride, The (1987)
Wrong Trousers, The (1993)
Little Mermaid, The (1989)
Prince of Egypt, The (1998)
Back to the Future (1985)
RECOMENDATIONS
French Kiss (1995)                                 Comedy|Romance
Marlene Dietrich: Shadow and Light (1996)          Documentary
Evita (1996)                                       Drama|Musical
Englishman Who Went Up a Hill, But Came Down a...  Comedy|Romance
Patch Adams (1998)                                 Comedy|Drama
Eye for an Eye (1996)                              Drama|Thriller
Immortal Beloved (1994)                            Drama|Romance
Dolores Claiborne (1994)                           Drama|Thriller
Steel Magnolias (1989)                             Drama
Life Less Ordinary, A (1997)                       Romance|Thriller


In [None]:
neumf = NeuMF(gmf, mlp, 32, 32).to(device)
neumf.fit(traindata, implicit_test, num_epochs=20, bs=512, lr=5e-3)

[0/20]:loss=525.5681; auc=0.7019
[1/20]:loss=516.4890; auc=0.7036
[2/20]:loss=508.0946; auc=0.7038
[3/20]:loss=502.1100; auc=0.7021
[4/20]:loss=497.3195; auc=0.7048
[5/20]:loss=492.0189; auc=0.7087
[6/20]:loss=489.1685; auc=0.7055
[7/20]:loss=484.2637; auc=0.7038
[8/20]:loss=480.2401; auc=0.7034
[9/20]:loss=475.5336; auc=0.7035
[10/20]:loss=470.6948; auc=0.7031
[11/20]:loss=469.4879; auc=0.7031
[12/20]:loss=464.4485; auc=0.7038
[13/20]:loss=464.1475; auc=0.7029
[14/20]:loss=460.7595; auc=0.7040
[15/20]:loss=457.8969; auc=0.7032
[16/20]:loss=457.4002; auc=0.7014
[17/20]:loss=455.8002; auc=0.7029
[18/20]:loss=453.3930; auc=0.7035
[19/20]:loss=450.5301; auc=0.7028


Для симиларов и рекомендаций в полной модели NeuMF эмбеддинги GMF и MLP конкатенируются.

In [None]:
test_model(neumf)

SIMILARS
Toy Story (1995)
Toy Story 2 (1999)
Groundhog Day (1993)
Sixth Sense, The (1999)
Back to the Future (1985)
Who Framed Roger Rabbit? (1988)
Star Wars: Episode IV - A New Hope (1977)
Saving Private Ryan (1998)
Babe (1995)
Silence of the Lambs, The (1991)
RECOMENDATIONS
Jason's Lyric (1994)                               Crime|Drama
Interview with the Vampire (1994)                  Drama|Horror
Astronaut's Wife, The (1999)                       Sci-Fi|Thriller
Mr. Jones (1993)                                   Drama|Romance
Hitch-Hiker, The (1953)                            Film-Noir
Castaway Cowboy, The (1974)                        Comedy|Western
Something Wicked This Way Comes (1983)             Children's|Horror
Friday the 13th: The Final Chapter (1984)          Horror
Jaws 3-D (1983)                                    Action|Horror
Tarantula (1955)                                   Horror|Sci-Fi


В сравнении с WARP AUC у этой модели лучше, но симилары и рекомендации хуже. С рекомендациями плохо у всех частей, и это, скорее всего, из-за отсутствия смещений (bias). Симилары у части GMF хорошие, а у части с MLP не очень, и вместе тоже не очень.