In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pds
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
device = torch.device('cuda:0')
from torch.nn import Module
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm

In [2]:
df = pds.read_csv('data/ratings.csv')
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [3]:
df['uid2idx'] = LabelEncoder().fit_transform(df['userId'].values)
df['iid2idx'] = LabelEncoder().fit_transform(df['movieId'].values)

In [4]:
df

Unnamed: 0,userId,movieId,rating,timestamp,uid2idx,iid2idx
0,1,1,4.0,964982703,0,0
1,1,3,4.0,964981247,0,2
2,1,6,4.0,964982224,0,5
3,1,47,5.0,964983815,0,43
4,1,50,5.0,964982931,0,46
...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,609,9416
100832,610,168248,5.0,1493850091,609,9443
100833,610,168250,5.0,1494273047,609,9444
100834,610,168252,5.0,1493846352,609,9445


In [5]:
num_users = df['uid2idx'].unique().shape[0]
num_items = df['iid2idx'].unique().shape[0]

In [6]:
sparsity = 1 - len(df) / (num_users * num_items) # 데이터에서 rating이 빈 것의 비율(행렬 만들었을 때)

print(f'# of users : {num_users}') 
print(f'# of contents (movie) : {num_items}')
print(f'sparsity of matrix: {sparsity}')

# of users : 610
# of contents (movie) : 9724
sparsity of matrix: 0.9830003169443864


In [7]:
class CustomDataset(Dataset):
    def __init__(
        self,
        data, 
        train = False, 
        test_ratio = 0.3,
        random_state = 123
    ):
        
        self.train_data, self.test_data = train_test_split(
            data, 
            test_size=test_ratio, 
            random_state=random_state,
            stratify=data.userId
        )
        
        if train:
            self.data = self.train_data
        else:
            self.data = self.test_data
              
        self.users = torch.tensor(self.data['uid2idx'].values) 
        self.items = torch.tensor(self.data['iid2idx'].values)
        self.ratings = torch.tensor(self.data['rating'].values) 
    
    
    def __len__(self):
        return len(self.data)
    
    
    def __getitem__(self,idx):
        users = self.users[idx]
        items = self.items[idx]
        ratings = self.ratings[idx]
        return users, items, ratings.float()


In [8]:
Train_DL = DataLoader(CustomDataset(df, True), 64, True)
Test_DL = DataLoader(CustomDataset(df, False), 64, True)

In [9]:
from torch.nn import Embedding as embedding

class MF(Module):
    '''
    Matrix Factorization
    '''
    def __init__(
        self, 
        num_factors, 
        num_users, 
        num_items, 
        mu
    ):
        super(MF,self).__init__()
        
        self.P = embedding(num_users, num_factors)
        self.Q = embedding(num_items, num_factors)
        
        self.user_bias = embedding(num_users,  1)
        self.item_bias = embedding(num_items,  1)
        
        self.mu = mu
        
    def forward(self, user_id, item_id):
        P_u = self.P(user_id)
        Q_i = self.Q(item_id)
        b_u = self.user_bias(user_id)
        b_i = self.item_bias(item_id)

        outputs = torch.sum((P_u*Q_i), axis=1) + \
                  torch.squeeze(b_u) +\
                  torch.squeeze(b_i) +\
                  self.mu
                  
        
        return outputs.flatten()

In [10]:
class torch_trainer:
    def __init__(self, model, data, batch_size, loss_fn):
        self.model = model
        self.dataloader = DataLoader(data, batch_size, True)
        self.loss_fn = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr = .001)
        
        
    def __training_batch_step__(self, batch):
        u, i, r = batch # u:user, i:item, r:rating
        
        users = u.to(device)
        items = i.to(device)
        ratings = r.to(device)

        preds = self.model(users, items)
        
        L2_reg=0
        for params in self.model.parameters():
            L2_reg += params.norm(2)
        
        losses = self.loss_fn(preds, ratings) + L2_reg
        
        self.optimizer.zero_grad()
        losses.backward()
        self.optimizer.step()
        
        return losses.item()

    
    def __training_epoch__(self, epoch_numb):
        loss_lists = []
        TQ = tqdm(self.dataloader)
        for n, batch in enumerate(TQ,1):
            loss_lists.append(self.__training_batch_step__(batch))
            TQ.set_description_str(f'Epoch : {epoch_numb}')
            TQ.set_postfix_str(f'Loss : {sum(loss_lists) / n:.4}')
        return sum(loss_lists) / n
    
    
    def get_model(self):
        return self.model
    
    
    def fit(self, loop_numb):
        self.model.train()
        history = dict(
            loss = []
        )
        for n in range(loop_numb):
            history['loss'].append(
                self.__training_epoch__(n)
            )
        return history

In [11]:
trainer = torch_trainer(
    model = MF(
        num_factors=6,
        num_users=num_users,
        num_items=num_items,
        mu=torch.tensor(df['rating'].mean())
    ).to(device),
    data = CustomDataset(df, True),
    batch_size = 64,
    loss_fn = torch.nn.MSELoss()
)

In [12]:
losses = trainer.fit(20)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1103.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1103.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1103.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1103.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1103.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1103.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1103.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1103.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1103.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1103.0), HTML(value='')))




KeyboardInterrupt: 

In [13]:
def evaluate(model, loader): 
    model.eval()
    loss_fn = torch.nn.MSELoss()
    infos = dict()
    loss_sum=0
    
    with torch.no_grad():   
        
        for idx, (u,i,r) in enumerate(loader):
            users = u.to(device)
            items = i.to(device)
            ratings = r.to(device)
            
            preds = model(users, items)
            
            losses = loss_fn(preds, ratings)
            loss_sum += losses.item()

    infos['loss'] = (loss_sum / len(loader))
    
    return infos

In [14]:
evaluate(
    model=trainer.get_model(),
    loader=DataLoader(CustomDataset(df, False), 64, True)
)

{'loss': 1.0752955410495622}

In [15]:
test = CustomDataset(df, False)

In [16]:
ratings_hat=trainer.get_model()(test.users.to(device), test.items.to(device))

In [17]:
test.users

tensor([335, 446, 402,  ..., 476, 598, 372])

In [18]:
test.items

tensor([   0,  364, 2155,  ...,  914, 1054,  153])

In [19]:
ratings_hat

tensor([3.5015, 3.5015, 3.5015,  ..., 3.5018, 3.5011, 3.5016], device='cuda:0',
       grad_fn=<AddBackward0>)

In [19]:
df.loc[np.where(df['uid2idx']==335)[0]]

Unnamed: 0,userId,movieId,rating,timestamp,uid2idx,iid2idx
51837,336,1,4.0,1122227329,335,0
51838,336,6,4.0,1122227549,335,5
51839,336,47,4.5,1122227343,335,43
51840,336,50,5.0,1120568496,335,46
51841,336,70,4.0,1120568169,335,62
51842,336,110,4.0,1122227307,335,97
51843,336,150,4.0,1122227547,335,123
51844,336,163,5.0,1120567987,335,136
51845,336,168,4.0,1120568038,335,140
51846,336,186,4.0,1120568049,335,157
