In [43]:
import torch

In [16]:
ratings_path = 'data/ml-1m/ratings.dat'


In [59]:
import pandas as pd
ratings = pd.read_csv(ratings_path, names=['user', 'item', 'rating', 'timestamp'], sep='::', engine='python')
ratings

Unnamed: 0,user,item,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [99]:
uidx_map = {user:i for i, user in enumerate(set(ratings['user']))}
iidx_map = {item:i for i, item in enumerate(set(ratings['item']))}
ratings['uidx'] = ratings['user'].map(uidx_map)
ratings['iidx'] = ratings['item'].map(iidx_map)

In [100]:
n_user = ratings['uidx'].nunique()
n_item = ratings['iidx'].nunique()

In [101]:
import torch.nn as nn

In [None]:
emb_dim = 64


In [67]:
class NeuMF(nn.Module):
    def __init__(self, n_user, n_item, emb_dim):
        super().__init__()
        # GMF
        self.GMF_user = nn.Embedding(n_user, emb_dim)
        self.GMF_item = nn.Embedding(n_item, emb_dim)

        # MLP
        self.MLP_user = nn.Embedding(n_user, emb_dim)
        self.MLP_item = nn.Embedding(n_item, emb_dim)
        self.MLP_linear = nn.Sequential(
            nn.Linear(emb_dim*2, emb_dim//2),
            nn.ReLU(),
            nn.Linear(emb_dim//2, emb_dim//(2*2)),
            nn.ReLU(),
            nn.Linear(emb_dim//(2*2), emb_dim//(2*2*2)),
        )

        # output layer
        self.output_layer = nn.Linear(emb_dim + (emb_dim//(2*2*2)), 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, user, item):
        # GMF
        phi_gmf = torch.mul(self.GMF_user(user), self.GMF_item(item))

        # MLP
        concat = torch.cat([self.MLP_user(user), self.MLP_item(item)], dim=1)
        phi_mlp = self.MLP_linear(concat)
        
        # output layer
        output = self.output_layer(torch.cat([phi_gmf, phi_mlp], dim=1))
        logit = self.sigmoid(output)

        return logit




In [68]:
model = NeuMF(n_user, n_item, emb_dim)

In [75]:
logits = model(torch.tensor([3]), torch.tensor([4]))

In [87]:
logits

tensor([[0.8450]], grad_fn=<SigmoidBackward0>)

In [81]:
logits.size()

torch.Size([1, 1])

In [85]:
torch.tensor([1]).view(-1, 1).size()

torch.Size([1, 1])

In [92]:
from torch.utils.data import Dataset

In [94]:
ratings

Unnamed: 0,user,item,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [190]:
class CustomDataset(Dataset):
    def __init__(self, user, item, label):
        self.user = user
        self.item = item
        self.label = label

    def __getitem__(self, i):
        return self.user[i], self.item[i], self.label[i]
        
    def __len__(self):
        return len(self.user)


In [191]:
pos_items = ratings.groupby('uidx')['iidx'].agg(lambda x: set(x)).to_dict()

In [192]:
u = 1
u_pos_items = pos_items[u]

In [193]:
import numpy as np
neg_ratio = 3

neg_user, neg_item = [], []
for u in range(n_user):
    neg_items = list(set(range(n_item)) - u_positive_items)
    neg_samples = np.random.choice(neg_items, min(len(u_pos_items) * neg_ratio, len(neg_items)), replace=False)
    neg_user.extend([u] * len(neg_samples))
    neg_item.extend(neg_samples)



In [194]:
dataset = CustomDataset(ratings['uidx'].tolist() + neg_user,
            ratings['iidx'].tolist() + neg_item,
            [1] * len(ratings) + [0] * len(neg_user)
)

In [195]:
from torch.utils.data import DataLoader

In [196]:
from torch.utils.data import Dataset, DataLoader, random_split

In [197]:
n_train_sample = len(dataset)
n_train_sample


3337689

In [198]:
n_sample = len(dataset)
n_train = int(n_sample*0.9)
n_valid = n_sample - n_train
train_dataset, test_dataset = random_split(dataset, [n_train, n_valid])  # del dataset? memory problem

In [199]:
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [202]:
label

tensor([1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1,
        0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0])

In [218]:
import torch.optim as optim


In [220]:

label

tensor([0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0,
        1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [222]:

from tqdm.auto import tqdm

In [223]:
criterion = nn.BCELoss()
learning_rate = 0.01
n_epoch = 30
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

for epoch in range(1, n_epoch+1):
    print(f"epoch : {epoch}")
    for user, item, label in tqdm(train_dataloader):
        optimizer.zero_grad() 
        logits = model(user, item)
        label = label.type(torch.FloatTensor)
        loss = criterion(logits.flatten(), label)
        loss.backward()
        optimizer.step()



 30%|███       | 14096/46937 [00:29<01:09, 470.02it/s]


KeyboardInterrupt: 

tensor([0.4288, 0.2783, 0.6990, 0.4985, 0.5362, 0.6302, 0.5806, 0.6039, 0.5049,
        0.5777, 0.5041, 0.4923, 0.4877, 0.5171, 0.5693, 0.7296, 0.4382, 0.5482,
        0.3729, 0.5660, 0.4740, 0.3764, 0.3212, 0.5239, 0.2964, 0.5475, 0.3921,
        0.4462, 0.5564, 0.4387, 0.7376, 0.3994, 0.5036, 0.6234, 0.6680, 0.3902,
        0.2830, 0.6937, 0.5037, 0.4346, 0.4465, 0.1807, 0.1832, 0.3297, 0.4973,
        0.2980, 0.5144, 0.6504, 0.2807, 0.6000, 0.5585, 0.6143, 0.5434, 0.4911,
        0.4733, 0.3918, 0.2482, 0.5487, 0.3957, 0.5818, 0.5712, 0.6077, 0.4549,
        0.3707], grad_fn=<ReshapeAliasBackward0>)

tensor(0.7841, grad_fn=<BinaryCrossEntropyBackward0>)