In [109]:
import os
import pandas as pd
import numpy as np

import zipfile
from urllib import request

from torch.utils.data import Dataset, DataLoader
import torch
from torch import nn
from torchvision import transforms

In [110]:
def download_ml100k():
    # download
    url = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
    savename = "ml-100k.zip"
    request.urlretrieve(url, savename)
    print('Complete!')
    # unzip
    file_name = os.path.join('./', savename)
    file_zip = zipfile.ZipFile(file_name)
    file_zip.extractall('./')
    file_zip.close()

def read_data_ml100k():
    if not os.path.isfile(os.path.join('./ml-100k/', 'u.data')):
        print('Download ...')
        download_ml100k()
    names = ['user_id', 'item_id', 'rating', 'timestamp']
    data = pd.read_csv(os.path.join('./ml-100k/', 'u.data'), '\t', names=names,
                       engine='python')
    num_users = data.user_id.unique().shape[0]
    num_items = data.item_id.unique().shape[0]
    return data, num_users, num_items

In [111]:
def load_data_ml100k(data, num_users, num_items, feedback='explicit'):
    users, items, scores = [], [], []
    inter = np.zeros((num_items, num_users)) if feedback == 'explicit' else {}
    for line in data.itertuples():
        user_index, item_index = int(line[1] - 1), int(line[2] - 1)
        score = int(line[3]) if feedback == 'explicit' else 1
        users.append(user_index)
        items.append(item_index)
        scores.append(score)
        if feedback == 'implicit':
            inter.setdefault(user_index, []).append(item_index)
        else:
            inter[item_index, user_index] = score
    return users, items, scores, inter

In [112]:
def split_data_ml100k(data, num_users, num_items,
                      split_mode='random', test_ratio=0.1):
    """Split the dataset in random mode or seq-aware mode."""
    if split_mode == 'seq-aware':
        train_items, test_items, train_list = {}, {}, []
        for line in data.itertuples():
            u, i, rating, time = line[1], line[2], line[3], line[4]
            train_items.setdefault(u, []).append((u, i, rating, time))
            if u not in test_items or test_items[u][-1] < time:
                test_items[u] = (i, rating, time)
        for u in range(1, num_users + 1):
            train_list.extend(sorted(train_items[u], key=lambda k: k[3]))
        test_data = [(key, *value) for key, value in test_items.items()]
        train_data = [item for item in train_list if item not in test_data]
        train_data = pd.DataFrame(train_data)
        test_data = pd.DataFrame(test_data)
    else:
        mask = [True if x == 1 else False for x in np.random.uniform(
            0, 1, (len(data))) < 1 - test_ratio]
        neg_mask = [not x for x in mask]
        train_data, test_data = data[mask], data[neg_mask]
    return train_data, test_data

In [113]:
class ArrayDataset(Dataset):

    def __init__(self, user, item, rating, transform=None):
        self.user = user
        self.item = item
        self.rating = rating
        self.transform = transform

    def __getitem__(self, idx):
        arr = np.column_stack((self.user, self.item, self.rating))
        if self.transform:
            arr = self.transform(arr)
        return torch.Tensor(arr[idx])

    def __len__(self):
        return len(self.user)

In [156]:
def collate_batch(batch):
    # print('fn', type(batch), len(batch), batch[0])
    # batch는 tensor가 batch size만큼 들어간 리스트
    batch = torch.stack(batch)
    return batch[:, 0], batch[:, 1], batch[:, 2]

In [157]:
def split_and_load_ml100k(split_mode='seq-aware', feedback='explicit',
                          test_ratio=0.1, batch_size=256):
    data, num_users, num_items = read_data_ml100k()
    # df load
    train_data, test_data = split_data_ml100k(
        data, num_users, num_items, split_mode, test_ratio)
    # user, item, rating load
    train_u, train_i, train_r, _ = load_data_ml100k(
        train_data, num_users, num_items, feedback)
    test_u, test_i, test_r, _ = load_data_ml100k(
        test_data, num_users, num_items, feedback)
    # u,i,r을 묶어서 해당 idx 반환
    train_set = ArrayDataset(
        np.array(train_u), np.array(train_i), np.array(train_r))
    test_set = ArrayDataset(
        np.array(test_u), np.array(test_i), np.array(test_r))
    # u, i, r를 batch 단위의 tensor로 반환
    train_iter = DataLoader(
        train_set, shuffle=True,
        batch_size=batch_size, 
        collate_fn=collate_batch
        )
    test_iter = DataLoader(
        test_set, batch_size=batch_size, 
        collate_fn=collate_batch
        )
    return num_users, num_items, train_iter, test_iter

In [158]:
from torch import nn

class MF(nn.Module):
    def __init__(self, num_factors, num_users, num_items, **kwargs):
        super(MF, self).__init__(**kwargs)
        self.P = nn.Embedding(num_users, num_factors)
        self.Q = nn.Embedding(num_items, num_factors)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)

    def forward(self, user_id, item_id):
        P_u = self.P(user_id)
        Q_i = self.Q(item_id)
        b_u = self.user_bias(user_id)
        b_i = self.item_bias(item_id)
        outputs = (P_u * Q_i).sum(axis=1) + np.squeeze(b_u) + np.squeeze(b_i)
        return outputs.flatten()

In [159]:
class RMSELoss(torch.nn.Module):
    def __init__(self):
        super(RMSELoss, self).__init__()

    def forward(self, x, y):
        criterion = nn.MSELoss()
        eps = 1e-6
        loss = torch.sqrt(criterion(x, y) + eps)
        return loss

In [160]:
num_users, num_items, train_iter, test_iter = split_and_load_ml100k(
    test_ratio=0.1, batch_size=512)


  data, num_users, num_items = read_data_ml100k()


In [161]:
batch = next(iter(train_iter))

In [170]:
model = MF(30, num_users, num_items)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


MF(
  (P): Embedding(943, 30)
  (Q): Embedding(1682, 30)
  (user_bias): Embedding(943, 1)
  (item_bias): Embedding(1682, 1)
)

In [179]:
lr, num_epochs, wd = 0.002, 20, 1e-5

loss_func = RMSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = lr)

In [181]:
from tqdm import tqdm
import glob

train_loss = []
best_val_epoch_loss = int(1e9)
for epoch in tqdm(range(num_epochs)):
  loss_arr = []
  for i, values in enumerate(train_iter):
    train_user = values[0].long().to(device)
    train_item = values[1].long().to(device)
    labels = values[2].to(device)
    preds = model(train_user, train_item)
    loss = loss_func(preds, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    loss_arr.append(loss.detach().item())
    if i%10 == 0:
      train_loss.append(np.mean(loss_arr))   
  with torch.no_grad():
    model.eval()
    val_epoch_loss = 0
    for i, values in enumerate(test_iter):
      test_user = values[0].long().to(device)
      test_item = values[1].long().to(device)
      labels = values[2].to(device)
      preds = model(test_user, test_item)
      loss = loss_func(preds, labels)
      val_epoch_loss += loss.detach().item()
  val_epoch_loss /= len(test_iter)
  if val_epoch_loss < best_val_epoch_loss:
    best_val_epoch_loss = val_epoch_loss
    print(f'New best model loss: {best_val_epoch_loss}')
    if not os.path.exists('model'):
      os.mkdir('model')

    if os.path.exists('model/best.pth'):
      os.remove('model/best.pth')
    torch.save(model.state_dict(), 'model/best.pth')
    print('best model is saved!')
    

100%|██████████| 20/20 [18:30<00:00, 55.55s/it]


validation 재는거 추가
loss print 추가 
