***

# Movie Recommendations
### - dataset : [movielens 1m](https://grouplens.org/datasets/movielens/)
### - method : Item2Vec embedding
### - reference : [ITEM2VEC: NEURAL ITEM EMBEDDING FOR COLLABORATIVE FILTERING](https://arxiv.org/ftp/arxiv/papers/1603/1603.04259.pdf)
### - feature : \[numeric\] implicit rating

***

## 0) Import library

In [1]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from torch.utils.data import IterableDataset
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
pd.options.display.float_format = '{:.4f}'.format

***

## 1) Data processing

### load data

In [3]:
ratings = pd.read_csv('data/movie_lens/ratings.csv')
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0000,964982703
1,1,3,4.0000,964981247
2,1,6,4.0000,964982224
3,1,47,5.0000,964983815
4,1,50,5.0000,964982931
...,...,...,...,...
100831,610,166534,4.0000,1493848402
100832,610,168248,5.0000,1493850091
100833,610,168250,5.0000,1494273047
100834,610,168252,5.0000,1493846352


### transform dictionary id to index

In [4]:
num_user = len(ratings['userId'].unique())
num_item = len(ratings['movieId'].unique())

unique_item_ids = ratings['movieId'].unique()

index_to_id = {i :unique_item_ids[i] for i in range(num_item)}
id_to_index = {unique_item_ids[i] :i for i in range(num_item)}

ratings['movieId'] = ratings['movieId'].apply(lambda x: id_to_index[x])
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,0,4.0000,964982703
1,1,1,4.0000,964981247
2,1,2,4.0000,964982224
3,1,3,5.0000,964983815
4,1,4,5.0000,964982931
...,...,...,...,...
100831,610,3120,4.0000,1493848402
100832,610,2035,5.0000,1493850091
100833,610,3121,5.0000,1494273047
100834,610,1392,5.0000,1493846352


### user as session

In [5]:
sessions = [ratings.loc[ratings['userId']==i]['movieId'].tolist() for i in range(1, num_user+1)]
sessions[0]

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


### define data iterator

In [6]:
class DataGenerator(IterableDataset):
    def __init__(self, sessions, n_item, n_neg):
        super().__init__()

        self.sessions = sessions
        self.num_item = n_item
        self.num_neg = n_neg

        self.count_df = self._count_fw(sessions)
        self.max_length = max(map(len, sessions))
        
        self.num_data = None

    def __len__(self):
        return self.num_data

    def __iter__(self):
        for i in tqdm(range(len(self.sessions))):
            discarded = self._discard(self.sessions[i])
            pos_items = discarded

            prob = np.ones(self.num_item)
            prob[pos_items] = 0
            prob /= prob.sum()

            num_pos = len(pos_items)
            if num_pos <2:continue

            num_sample = (self.num_neg * num_pos) * num_pos

            neg_items = np.random.choice(self.num_item, int(num_sample), replace=True, p=prob)
            neg_items = neg_items.reshape(num_pos, -1)
            neg_items = neg_items.tolist()

            pos_pad = [self.num_item]*(self.max_length-num_pos+1)
            neg_pad = [self.num_item]*(self.max_length-num_pos)*self.num_neg

            for j in range(len(pos_items)):
                item = torch.LongTensor(torch.tensor(pos_items[j]))
                pos = torch.LongTensor(torch.tensor(pos_items[:j]+pos_items[j+1:] + pos_pad))
                neg = torch.LongTensor(torch.tensor(neg_items[j] + neg_pad))

                yield item, pos, neg

    def _count_fw(self, sessions):
        unnested = [item for session in sessions for item in session]
        count_df = pd.DataFrame({"item_id": unnested,
                                 "count": [1] * len(unnested)}).groupby("item_id").count()
        return count_df

    def _discard(self, session, rho=1e-5):
        fw = [self.count_df.loc[w] for w in session]
        probs = [1 - (rho / c) ** 0.5 for c in fw]
        discard_mask = [np.random.choice([False, True], 1, [p, 1 - p])[0] for p in probs]

        ret = [item for item in session]
        ret = np.array(ret)[discard_mask].tolist()

        return ret

***

## 2) Modeling

### define model

In [7]:
class Item2Vec(nn.Module):
    def __init__(self, n_item, dim_embedding, n_neg ,epochs=5, print_step=1):
        super().__init__()

        self.name = 'Item2Vec'

        self.n_item = n_item
        self.dim_embedding = dim_embedding
        self.epochs = epochs
        self.print_step = print_step
        self.num_neg = n_neg

        self.in_embedding = nn.Embedding(self.n_item + 1, dim_embedding, padding_idx=self.n_item)
        self.out_embedding = nn.Embedding(self.n_item + 1, dim_embedding, padding_idx=self.n_item)

        self.similarity = None
        self.measure_train = {}

        self._set_optimizer()
        self._set_loss_fn()

    def _init_weight(self):
        pass

    def _set_optimizer(self, method="Adam", lr=0.01, reg=1e-5):
        optimizer = getattr(torch.optim, method)
        self.optimizer = optimizer(self.parameters(),
                                   lr=lr,
                                   weight_decay=reg)

    def _set_loss_fn(self):
        self.measure_train['measure'] = "SGNS loss"
        self.loss_fn = lambda pos, neg: \
            -(torch.log(torch.sigmoid(pos)) + torch.log(torch.sigmoid(-neg)).sum(2)).mean()

    def fit(self, train_loader):
        print(f'\n[{self.name}]' + ' Train '.center(100, '='))
        print(f"[{self.name}] ".ljust(20, ' ') + \
              f"(train: {self.measure_train['measure']})".ljust(20, ' '))

        for epoch in range(1, self.epochs + 1):

            train_loss = 0
            train_cnt = 0

            for batch_idx, samples in enumerate(train_loader):
                self.train()

                item, pos, neg = samples
                pos_prefer, neg_prefer = self.forward(item=item, pos=pos, neg=neg)
                loss = self.backward(pos_prefer, neg_prefer)

                train_loss += loss
                train_cnt += 1

            if epoch % self.print_step == 0:
                print(f"(epoch {epoch}) ".rjust(20, ' ') + \
                      f"{train_loss / train_cnt:.4f}".ljust(20, ' '))

        self.similarity = cosine_similarity(self.in_embedding.weight.data, self.in_embedding.weight.data)
        np.fill_diagonal(self.similarity, -np.inf)

    def forward(self, item, pos, neg):
        embedded_item = self.in_embedding(item)
        embedded_pos = self.out_embedding(pos)
        embedded_neg = self.out_embedding(neg)

        num_batch = embedded_item.shape[0]

        prefer_pos = torch.mul(embedded_item.unsqueeze(1), embedded_pos).sum(dim=2)
        prefer_neg = torch.mul(embedded_item.unsqueeze(1), embedded_neg).sum(dim=2).view(num_batch, -1, self.num_neg)

        return prefer_pos, prefer_neg

    def backward(self, pos_pred, neg_pred):
        self.optimizer.zero_grad()
        loss = self.loss_fn(pos_pred, neg_pred)
        loss.backward()
        self.optimizer.step()

        return loss

    def get_knn(self, item_index, k=100):
        similarities = self.similarity[item_index]
        knn = np.argsort(similarities)[-1::-1][:k]

        return knn

### train model

In [8]:
dataset = DataGenerator(sessions=sessions, n_item=num_item, n_neg=2)
loader = DataLoader(dataset, batch_size=128)
model = Item2Vec(n_item=num_item, n_neg=2, dim_embedding=100)
model.fit(train_loader=loader)


[Item2Vec]          (train: SGNS loss)  


100%|████████████████████████████████████████████████████████████████████████████████| 610/610 [03:58<00:00,  2.56it/s]


          (epoch 1) 2.1657              


100%|████████████████████████████████████████████████████████████████████████████████| 610/610 [03:56<00:00,  2.58it/s]


          (epoch 2) 2.0543              


100%|████████████████████████████████████████████████████████████████████████████████| 610/610 [03:51<00:00,  2.63it/s]


          (epoch 3) 2.0530              


100%|████████████████████████████████████████████████████████████████████████████████| 610/610 [03:52<00:00,  2.63it/s]


          (epoch 4) 2.0520              


100%|████████████████████████████████████████████████████████████████████████████████| 610/610 [03:52<00:00,  2.62it/s]


          (epoch 5) 2.0517              


### see similarity

In [9]:
similarity_df = pd.DataFrame(model.similarity[:num_item, :num_item], index=range(num_item), columns=range(num_item))
similarity_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9714,9715,9716,9717,9718,9719,9720,9721,9722,9723
0,-inf,0.9580,0.9859,0.9985,0.9907,0.9575,0.9235,0.9924,0.9788,0.8927,...,0.9989,0.9976,0.8366,0.6573,0.9969,0.9168,0.9002,0.9990,0.7210,0.5684
1,0.9580,-inf,0.9919,0.9709,0.9863,0.9999,0.9946,0.9856,0.9954,0.9843,...,0.9573,0.9744,0.9448,0.8272,0.9758,0.8039,0.7838,0.9622,0.8457,0.7394
2,0.9859,0.9919,-inf,0.9930,0.9974,0.9919,0.9740,0.9985,0.9976,0.9546,...,0.9843,0.9942,0.9048,0.7579,0.9948,0.8551,0.8359,0.9876,0.7948,0.6674
3,0.9985,0.9709,0.9930,-inf,0.9958,0.9706,0.9412,0.9972,0.9874,0.9137,...,0.9976,0.9995,0.8598,0.6898,0.9993,0.9010,0.8833,0.9987,0.7460,0.6004
4,0.9907,0.9863,0.9974,0.9958,-inf,0.9855,0.9639,0.9994,0.9971,0.9421,...,0.9915,0.9974,0.8976,0.7462,0.9978,0.8845,0.8667,0.9933,0.7936,0.6597
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9719,0.9168,0.8039,0.8551,0.9010,0.8845,0.7990,0.7432,0.8807,0.8546,0.6989,...,0.9259,0.9014,0.6627,0.4617,0.8994,-inf,0.9967,0.9171,0.5740,0.3966
9720,0.9002,0.7838,0.8359,0.8833,0.8667,0.7786,0.7222,0.8626,0.8365,0.6775,...,0.9104,0.8844,0.6431,0.4429,0.8829,0.9967,-inf,0.9014,0.5571,0.3801
9721,0.9990,0.9622,0.9876,0.9987,0.9933,0.9615,0.9288,0.9941,0.9825,0.8992,...,0.9995,0.9986,0.8474,0.6726,0.9982,0.9171,0.9014,-inf,0.7368,0.5859
9722,0.7210,0.8457,0.7948,0.7460,0.7936,0.8439,0.8687,0.7836,0.8219,0.8843,...,0.7306,0.7579,0.9403,0.9470,0.7612,0.5740,0.5571,0.7368,-inf,0.9310


***

## 3) Recommendation

### define function for recommendation

In [10]:
def get_similar(item_id, top_k):
    similarities = similarity_df[item_id]
    ret = similarities.sort_values(ascending=False)

    return list(map(str,ret[:top_k].index))

### example

In [11]:
movie_id = 10
reco = get_similar(movie_id, top_k=30)
print(f"Similar movies :")
print("\n".join(reco))

Similar movies :
35
2426
1602
2627
144
610
537
3240
8426
217
935
2424
3478
1592
1012
872
889
2229
3785
2709
2320
1034
1639
2151
19
13
439
2376
1938
2342
