In [1]:
# -*- coding:utf-8 -*-
# Modified Author: Inyong Hwang (inyong1020@gmail.com)
# Date: 2019-08-08-Thu
# 파이토치 첫걸음 Chapter 6. 추천 시스템과 행렬 분해
# In Colab

# 6.1 행렬 인수분해

import torch
from torch import nn, optim
from torch.utils.data import (Dataset,
                              DataLoader,
                              TensorDataset)
import tqdm
import pandas as pd
from sklearn import model_selection

from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
!ls '/content/drive/My Drive/PUBLIC/GITHUB/Study/Book/C. Korean/파이토치 첫걸음/Chapter_6_input/ml-20m'

genome-scores.csv  links.csv   ratings.csv  tags.csv
genome-tags.csv    movies.csv  README.txt


In [0]:
df = pd.read_csv('/content/drive/My Drive/PUBLIC/GITHUB/Study/Book/C. Korean/파이토치 첫걸음/Chapter_6_input/ml-20m/ratings.csv')
x = df[["userId", "movieId"]].values
y = df[["rating"]].values

train_x, test_x, train_y, test_y = model_selection.train_test_split(x, y, test_size=0.1)

train_dataset = TensorDataset(torch.tensor(train_x, dtype=torch.int64), torch.tensor(train_y, dtype=torch.float32))
test_dataset = TensorDataset(torch.tensor(test_x, dtype=torch.int64), torch.tensor(test_y, dtype=torch.float32))

train_loader = DataLoader(train_dataset, batch_size=1024, num_workers=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1024, num_workers=4)

In [0]:
class MatrixFactorization(nn.Module):
    def __init__(self, max_user, max_item, k=20):
        super().__init__()
        self.max_user = max_user
        self.max_item = max_item
        self.user_emb = nn.Embedding(max_user, k, 0)
        self.item_emb = nn.Embedding(max_item, k, 0)
        
    def forward(self, x):
        user_idx = x[:, 0]
        item_idx = x[:, 1]
        user_feature = self.user_emb(user_idx)
        item_feature = self.item_emb(item_idx)
        
        out = torch.sum(user_feature * item_feature, 1)
        
        out = nn.functional.sigmoid(out) * 5
        # out = torch.sigmoid(5) * 5
        return out

In [0]:
max_user, max_item = x.max(0)
max_user = int(max_user)
max_item = int(max_item)
net = MatrixFactorization(max_user+1, max_item+1)

In [0]:
def eval_net(net, loader, score_fn=nn.functional.l1_loss, device="cpu"):
    ys = []
    ypreds = []
    for x, y in loader:
        x = x.to(device)
        ys.append(y)
        with torch.no_grad():
            ypred = net(x).to("cpu").view(-1)
        ypreds.append(ypred)
    score = score_fn(torch.cat(ys).squeeze(), torch.cat(ypreds))
    return score.item()

In [7]:
from statistics import mean

net.to("cuda:0")
opt = optim.Adam(net.parameters(), lr=0.01)
loss_f = nn.MSELoss()

for epoch in range(5):
    loss_log = []
    for x, y in tqdm.tqdm(train_loader):
        x = x.to("cuda:0")
        y = y.to("cuda:0")
        o = net(x)
        loss = loss_f(o, y.view(-1))
        net.zero_grad()
        loss.backward()
        opt.step()
        loss_log.append(loss.item())
    test_score = eval_net(net, test_loader, device="cuda:0")
    print(epoch, mean(loss_log), test_score, flush=True)

100%|██████████| 17579/17579 [02:47<00:00, 105.17it/s]


0 1.6050101528531748 0.7351292967796326


100%|██████████| 17579/17579 [02:53<00:00, 101.27it/s]


1 0.885911953236315 0.7105944156646729


100%|██████████| 17579/17579 [02:56<00:00, 99.75it/s]


2 0.8392625856849818 0.7011438608169556


100%|██████████| 17579/17579 [02:56<00:00, 99.55it/s]


3 0.8146167476773025 0.6961936354637146


100%|██████████| 17579/17579 [02:57<00:00, 99.15it/s]


4 0.8003950544175071 0.6935644745826721


In [8]:
net.to("cpu")

query = (1, 10)

query = torch.tensor(query, dtype=torch.int64).view(1, -1)

net(query)



tensor([3.5506], grad_fn=<MulBackward0>)

In [0]:
# 6.2 신경망 행렬 인수분해

class NeuralMatrixFactorization(nn.Module):
    def __init__(self, max_user, max_item, user_k=10, item_k=10, hidden_dim=50):
        super().__init__()
        self.user_emb = nn.Embedding(max_user, user_k, 0)
        self.item_emb = nn.Embedding(max_item, item_k, 0)
        self.mlp = nn.Sequential(
            nn.Linear(user_k + item_k, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, 1)
        )
    
    def forward(self, x):
        user_idx = x[:, 0]
        item_idx = x[:, 1]
        user_feature = self.user_emb(user_idx)
        item_feature = self.item_emb(item_idx)
        out = torch.cat([user_feature, item_feature], 1)
        out = self.mlp(out)
        out = nn.functional.sigmoid(out) * 5
        # out = torch.sigmoid(out) * 5
        return out.squeeze()

In [0]:
import csv
from sklearn.feature_extraction.text import CountVectorizer

with open('/content/drive/My Drive/PUBLIC/GITHUB/Study/Book/C. Korean/파이토치 첫걸음/Chapter_6_input/ml-20m/movies.csv') as fp:
    reader = csv.DictReader(fp)
    def parse(d):
        movieId = int(d["movieId"])
        genres = d["genres"]
        return movieId, genres
    data = [parse(d) for d in reader]

movieIds = [x[0] for x in data]
genres = [x[1] for x in data]

cv = CountVectorizer(dtype="f4").fit(genres)
num_genres = len(cv.get_feature_names())

it = cv.transform(genres).toarray()
it = (torch.tensor(g, dtype=torch.float32) for g in it)
genre_dict = dict(zip(movieIds, it))

In [0]:
def first(xs):
    it = iter(xs)
    return next(it)

class MovieLensDataset(Dataset):
    def __init__(self, x, y, genres):
        assert len(x) == len(y)
        self.x = x
        self.y = y
        self.genres = genres
        
        self.null_genre = torch.zeros_like(first(genres.values()))

    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        x = self.x[idx]
        y = self.y[idx]
        # x = (userId, movieId)
        movieId = x[1]
        g = self.genres.get(movieId, self.null_genre)
        return x, y, g

In [0]:
train_dataset = MovieLensDataset(
    torch.tensor(train_x, dtype=torch.int64),
    torch.tensor(train_y, dtype=torch.float32),
    genre_dict
)
test_dataset = MovieLensDataset(
    torch.tensor(test_x, dtype=torch.int64),
    torch.tensor(test_y, dtype=torch.float32),
    genre_dict
)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False, num_workers=4)

In [0]:
class NeuralMatrixFactorization2(nn.Module):
    def __init__(self, max_user, max_item, num_genres, user_k=10, item_k=10, hidden_dim=50):
        super().__init__()
        self.user_emb = nn.Embedding(max_user, user_k, 0)
        self.item_emb = nn.Embedding(max_item, item_k, 0)
        self.mlp = nn.Sequential(
            nn.Linear(user_k + item_k + num_genres, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, x, g):
        user_idx = x[:, 0]
        item_idx = x[:, 1]
        user_feature = self.user_emb(user_idx)
        item_feature = self.item_emb(item_idx)
        out = torch.cat([user_feature, item_feature, g], 1)
        out = nn.functional.sigmoid(out) * 5
        # out = torch.sigmoid(out) * 5
        return out.squeeze()

In [0]:
def eval_net(net, loader, score_fn=nn.functional.l1_loss, device="cpu"):
    ys = []
    ypreds = []
    for x, y, g in loader:
        x = x.to(device)
        g = g.to(device)
        ys.append(y)
        with torch.no_grad():
            ypred = net(x, g).to("cpu")
        ypreds.append(ypred)
    score = score_fn(torch.cat(ys).squeeze(), torch.cat(ypreds))
    return score

In [22]:
net = NeuralMatrixFactorization2(max_user+1, max_item+1, num_genres)
opt = optim.Adam(net.parameters(), lr=0.01)
loss_f = nn.MSELoss()

net.to("cuda:0")
for epoch in range(5):
    loss_log = []
    net.train()
    for x, y, g in tqdm.tqdm(train_loader):
        x = x.to("cuda:0")
        y = y.to("cuda:0")
        g = g.to("cuda:0")
        o = net(x, g)
        loss = loss_f(o, y.view(-1))
        net.zero_grad()
        loss.backward()
        opt.step()
        loss_log.append(loss.item())
    net.eval()
    test_score = eval_net(net, test_loader, device="cuda:0")
    print(epoch, mean(loss_log), test_score.item(), flush=True)

  return F.mse_loss(input, target, reduction=self.reduction)



RuntimeError: ignored