In [1]:
import os
import pandas as pd
import numpy as np

import zipfile
from urllib import request

def download_ml100k():
    """Download the MovieLens 100K archive and unpack it in the working directory.

    The archive is saved as ``ml-100k.zip`` in the current directory and its
    contents are extracted to ``./``. Returns nothing.
    """
    # download
    url = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
    savename = "ml-100k.zip"
    request.urlretrieve(url, savename)
    print('Complete!')
    # unzip -- the context manager closes the archive even if extraction fails
    file_name = os.path.join('./', savename)
    with zipfile.ZipFile(file_name) as file_zip:
        file_zip.extractall('./')

def read_data_ml100k():
    """Load the MovieLens 100K ratings table, downloading it first if missing.

    Returns:
        tuple: ``(data, num_users, num_items)`` where ``data`` is a DataFrame
        with the columns ``user_id``, ``item_id``, ``rating`` and
        ``timestamp``, and the two counts are the numbers of distinct user
        ids and item ids found in it.
    """
    data_path = os.path.join('./ml-100k/', 'u.data')
    if not os.path.isfile(data_path):
        print('Download ...')
        download_ml100k()
    names = ['user_id', 'item_id', 'rating', 'timestamp']
    # The separator must be given by keyword: passing it positionally is
    # deprecated in pandas 1.x and rejected by pandas 2.x.
    data = pd.read_csv(data_path, sep='\t', names=names, engine='python')
    num_users = data.user_id.unique().shape[0]
    num_items = data.item_id.unique().shape[0]
    return data, num_users, num_items

In [2]:
def load_data_ml100k(data, num_users, num_items, feedback='explicit'):
    """Turn a ratings table into index lists plus an interaction structure.

    The first three columns of ``data`` are read positionally as user id,
    item id and rating; ids are shifted down by one to become indices.

    Returns:
        tuple: ``(users, items, scores, inter)``. With ``feedback='explicit'``
        ``inter`` is a ``(num_items, num_users)`` array holding the rating at
        ``[item, user]``; with ``feedback='implicit'`` it is a dict mapping
        each user index to the list of item indices seen, and every score is 1.
    """
    is_explicit = feedback == 'explicit'
    is_implicit = feedback == 'implicit'
    if is_explicit:
        inter = np.zeros((num_items, num_users))
    else:
        inter = {}
    users = []
    items = []
    scores = []
    for record in data.itertuples():
        # record[0] is the DataFrame index; the data columns start at 1.
        u_idx = int(record[1] - 1)
        i_idx = int(record[2] - 1)
        value = int(record[3]) if is_explicit else 1
        users.append(u_idx)
        items.append(i_idx)
        scores.append(value)
        if is_implicit:
            inter.setdefault(u_idx, []).append(i_idx)
        else:
            inter[i_idx, u_idx] = value
    return users, items, scores, inter

In [3]:
def split_data_ml100k(data, num_users, num_items,
                      split_mode='random', test_ratio=0.1):
    """Split the dataset in random mode or seq-aware mode.

    Args:
        data: DataFrame whose first four columns are read positionally as
            user id, item id, rating and timestamp.
        num_users: Accepted for interface compatibility; the users are taken
            from ``data`` itself.
        num_items: Accepted for interface compatibility; not used.
        split_mode: ``'seq-aware'`` holds out, per user, the interaction with
            the largest timestamp; any other value does a random row split.
        test_ratio: Expected fraction of rows sent to the test set in random
            mode.

    Returns:
        tuple: ``(train_data, test_data)``. In seq-aware mode both are new
        DataFrames with default integer column labels; in random mode both
        are row subsets of ``data``.
    """
    if split_mode == 'seq-aware':
        train_items, test_items = {}, {}
        for line in data.itertuples():
            u, i, rating, time = line[1], line[2], line[3], line[4]
            train_items.setdefault(u, []).append((u, i, rating, time))
            # Keep the latest interaction seen so far for this user.
            if u not in test_items or test_items[u][-1] < time:
                test_items[u] = (i, rating, time)
        test_data = [(key, *value) for key, value in test_items.items()]
        # A set makes the "is this row held out?" check O(1) instead of
        # scanning the whole list of held-out rows for every candidate row.
        held_out = set(test_data)
        train_data = []
        # Walk the user ids actually present (ascending) rather than assuming
        # they are exactly 1..num_users, which raised KeyError on gaps.
        for u in sorted(train_items):
            ordered = sorted(train_items[u], key=lambda k: k[3])
            train_data.extend(row for row in ordered if row not in held_out)
        train_data = pd.DataFrame(train_data)
        test_data = pd.DataFrame(test_data)
    else:
        # Boolean mask: True keeps the row for training.
        keep = np.random.uniform(0, 1, len(data)) < 1 - test_ratio
        train_data, test_data = data[keep], data[~keep]
    return train_data, test_data

In [4]:
from torch import nn
class AutoRec(nn.Module):
    """Autoencoder with one sigmoid hidden layer that reconstructs its input.

    Args:
        num_hidden: Width of the hidden layer.
        num_users: Length of each input (and output) vector.
        dropout: Dropout probability applied to the hidden activations.
        type: ``'train'`` masks the output with the sign of the input; any
            other value returns the raw reconstruction.
    """

    def __init__(self, num_hidden, num_users, dropout=0.05, type='train'):
        super().__init__()
        self.encoder = nn.Linear(num_users, num_hidden)
        self.decoder = nn.Linear(num_hidden, num_users)
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(dropout)
        # NOTE(review): this string attribute hides nn.Module.type() on
        # instances of this class.
        self.type = type

    def forward(self, input):
        """Encode, apply dropout, decode; mask the result when type is 'train'."""
        activated = self.sigmoid(self.encoder(input))
        pred = self.decoder(self.dropout(activated))
        if self.type != 'train':
            return pred
        # Mask the gradient during training: predictions at positions where
        # the input is 0 are multiplied by sign(0) = 0 and thus ignored.
        return pred * torch.sign(input)

In [5]:
# Load the ratings table together with the distinct user and item counts.
data, num_users, num_items = read_data_ml100k()

# Sparsity = 1 - (rows in the table) / (size of the user-item grid).
sparsity = 1 - data.shape[0] / (num_users * num_items)

print(f'number of users: {num_users}, number of items: {num_items}')
print(f'matrix sparsity: {sparsity:f}')
# head() defaults to the first 5 rows.
print(data.head())

  """Entry point for launching an IPython kernel.


number of users: 943, number of items: 1682
matrix sparsity: 0.936953
   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596


In [6]:
import torch
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the MovieLens 100K dataset
df, num_users, num_items = read_data_ml100k()
train_data, test_data = split_data_ml100k(df, num_users, num_items)
# Only the interaction matrices are needed here; with the default explicit
# feedback each one has shape (num_items, num_users).
_, _, _, train_inter_mat = load_data_ml100k(train_data, num_users, num_items)
_, _, _, test_inter_mat = load_data_ml100k(test_data, num_users, num_items)
# Each DataLoader sample is one row of the matrix, so a batch is
# (batch_size, num_users).
train_iter = DataLoader(train_inter_mat, shuffle=True, batch_size=256)
# Evaluation gains nothing from shuffling; a fixed order keeps it repeatable.
test_iter = DataLoader(test_inter_mat, shuffle=False, batch_size=256)

  


In [7]:
# Peek at one training batch to confirm its dimensions.
next(iter(train_iter)).size()

torch.Size([256, 943])

In [8]:
# Model initialization, training, and evaluation

# Hyperparameters
lr = 0.002
num_epoch = 100
weight_decay = 1e-5
num_hidden = 500

# Build the network, move it to the selected device, and pair it with an
# MSE objective and an Adam optimizer.
net = AutoRec(num_hidden, num_users)
net = net.to(device)
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(
    net.parameters(),
    lr=lr,
    weight_decay=weight_decay,
)

In [9]:
from tqdm import tqdm
# Train for num_epoch epochs; the mean batch loss of every 10th epoch
# (epochs 0, 10, 20, ...) is recorded in train_loss.
train_loss = []
for i in tqdm(range(num_epoch)):
    loss_arr = []
    for x in train_iter:
        x = x.to(device).float()
        # ===================forward=====================
        # Call the module itself rather than .forward() so that registered
        # hooks are run.
        output = net(x)
        loss = loss_func(output, x)
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # =================loss calculate==================
        # .item() yields a plain Python float, detached from the graph.
        loss_arr.append(loss.item())
    if i % 10 == 0:
        train_loss.append(np.mean(loss_arr))

100%|██████████| 100/100 [00:01<00:00, 50.21it/s]


In [14]:
# Evaluate on the test batches. The loss is computed per test batch here;
# previously the cell printed the stale `loss` left over from training.
was_training = net.training
net.eval()  # turn dropout off while evaluating
try:
    with torch.no_grad():
        for j, x in enumerate(test_iter):
            x = x.to(device).float()
            print(x.shape)
            output = net(x)
            # NOTE(review): if net was built with type='train' the output is
            # still masked by sign(x), so this measures error on the non-zero
            # entries of x only -- confirm that is the intended metric.
            test_loss = loss_func(output, x)
            print(test_loss)
finally:
    # Restore the previous mode so a later training cell is unaffected.
    net.train(was_training)

AttributeError: 'DataLoader' object has no attribute 'shape'

In [11]:
def train_recsys_rating(net, train_iter, test_iter, loss, trainer, num_epochs,
                        devices=d2l.try_all_gpus(), evaluator=None,
                        **kwargs):
    """Train ``net`` for ``num_epochs`` epochs, plotting train loss and test RMSE.

    NOTE(review): this function relies on ``d2l``, ``gluon`` and ``autograd``,
    none of which are imported in the visible notebook (they look like the
    MXNet/Gluon APIs -- confirm). Because the default for ``devices`` is
    evaluated when the ``def`` statement runs, the cell fails with
    ``NameError: name 'd2l' is not defined`` before the function even exists.

    Args:
        net: Callable model; invoked as ``net(*features)`` per device shard.
        train_iter: Iterable of batches; each batch is a list of arrays or a
            single array.
        test_iter: Passed through to ``evaluator``.
        loss: Callable ``loss(pred, label)``.
        trainer: Object with ``step(batch_size)``.
        num_epochs: Number of passes over ``train_iter``.
        devices: Devices the batches are split across.
        evaluator: Callable returning the test RMSE. It is called
            unconditionally, so leaving the default ``None`` will fail.
        **kwargs: If non-empty, ``kwargs['inter_mat']`` is forwarded to
            ``evaluator``.
    """
    timer = d2l.Timer()
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 2],
                            legend=['train loss', 'test RMSE'])
    for epoch in range(num_epochs):
        # metric receives three values per batch (see metric.add below);
        # l is the running loss total for this epoch.
        metric, l = d2l.Accumulator(3), 0.
        for i, values in enumerate(train_iter):
            timer.start()
            input_data = []
            # Normalize a single-array batch to a one-element list.
            values = values if isinstance(values, list) else [values]
            for v in values:
                input_data.append(gluon.utils.split_and_load(v, devices))
            # The last element is the label; everything before it is a
            # feature. A one-element batch serves as both feature and label.
            train_feat = input_data[0:-1] if len(values) > 1 else input_data
            train_label = input_data[-1]
            with autograd.record():
                preds = [net(*t) for t in zip(*train_feat)]
                ls = [loss(p, s) for p, s in zip(preds, train_label)]
            # The comprehension variable `l` is local to each comprehension,
            # so the outer running total `l` is not overwritten here.
            [l.backward() for l in ls]
            l += sum([l.asnumpy() for l in ls]).mean() / len(devices)
            trainer.step(values[0].shape[0])
            # NOTE(review): `l` is the cumulative epoch loss, not this
            # batch's loss, so metric[0] sums running totals -- confirm intent.
            metric.add(l, values[0].shape[0], values[0].size)
            timer.stop()
        if len(kwargs) > 0:  # Extra kwargs: forward the interaction matrix (used for AutoRec)
            test_rmse = evaluator(net, test_iter, kwargs['inter_mat'],
                                  devices)
        else:
            test_rmse = evaluator(net, test_iter, devices)
        # Average loss per batch over the epoch (i is the last batch index).
        train_l = l / (i + 1)
        animator.add(epoch + 1, (train_l, test_rmse))
    # Final report uses the accumulator and RMSE from the last epoch.
    print(f'train loss {metric[0] / metric[1]:.3f}, '
          f'test RMSE {test_rmse:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(devices)}')

NameError: name 'd2l' is not defined