In [19]:
# Standard library imports
import random
import time

# Third-party imports
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader
import torch_geometric
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.utils import degree

from tqdm.notebook import tqdm
from sklearn import preprocessing as pp
from sklearn.model_selection import train_test_split
import scipy.sparse as sp

In [20]:
columns_name=['user_id','item_id','rating','timestamp']
df = pd.read_csv("../../data/raw/ml-100k/u.data",sep="\t",names=columns_name)
print(len(df))
display(df.head(5))

100000


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [21]:
# Perform a 80/20 train-test split on the interactions in the dataset
train, test = train_test_split(df.values, test_size=0.2, random_state=16)
train_df = pd.DataFrame(train, columns=df.columns)
test_df = pd.DataFrame(test, columns=df.columns)

In [22]:
print("Train Size  : ", len(train_df))
print("Test Size : ", len (test_df))

Train Size  :  80000
Test Size :  20000


In [23]:
le_user = pp.LabelEncoder()
le_item = pp.LabelEncoder()
train_df['user_id_idx'] = le_user.fit_transform(train_df['user_id'].values)
train_df['item_id_idx'] = le_item.fit_transform(train_df['item_id'].values)
train_user_ids = train_df['user_id'].unique()
train_item_ids = train_df['item_id'].unique()

print(len(train_user_ids), len(train_item_ids))

test_df = test_df[
  (test_df['user_id'].isin(train_user_ids)) & \
  (test_df['item_id'].isin(train_item_ids))
]
print(len(test))

943 1645
20000


In [24]:
test_df['user_id_idx'] = le_user.transform(test_df['user_id'].values)
test_df['item_id_idx'] = le_item.transform(test_df['item_id'].values)

In [25]:
n_users = train_df['user_id_idx'].nunique()
n_items = train_df['item_id_idx'].nunique()
print("Number of Unique Users : ", n_users)
print("Number of unique Items : ", n_items)

Number of Unique Users :  943
Number of unique Items :  1645


In [26]:
def data_loader(data, batch_size, n_usr, n_itm):

    def sample_neg(x):
        while True:
            neg_id = random.randint(0, n_itm - 1)
            if neg_id not in x:
                return neg_id

    interected_items_df = data.groupby('user_id_idx')['item_id_idx'].apply(list).reset_index()
    indices = [x for x in range(n_usr)]

    if n_usr < batch_size:
        users = [random.choice(indices) for _ in range(batch_size)]
    else:
        users = random.sample(indices, batch_size)
    users.sort()
    users_df = pd.DataFrame(users,columns = ['users'])

    interected_items_df = pd.merge(interected_items_df, users_df, how = 'right', left_on = 'user_id_idx', right_on = 'users')
    pos_items = interected_items_df['item_id_idx'].apply(lambda x : random.choice(x)).values
    neg_items = interected_items_df['item_id_idx'].apply(lambda x: sample_neg(x)).values

    return (
        torch.LongTensor(list(users)), 
        torch.LongTensor(list(pos_items)) + n_usr, 
        torch.LongTensor(list(neg_items)) + n_usr
    )

data_loader(train_df, 16, n_users, n_items)

(tensor([ 68, 169, 182, 207, 229, 238, 389, 392, 444, 595, 623, 799, 821, 879,
         894, 931]),
 tensor([1179, 1815, 1317, 1721, 1562, 1121, 1244, 1438, 1684, 1619, 1815, 1236,
         2029,  945, 1093, 1401]),
 tensor([1833, 2454, 1106, 1768, 1387, 1513, 1435, 1854, 2208, 1963, 2241, 1403,
         2353, 1562, 2295, 1023]))

In [27]:
u_t = torch.LongTensor(train_df.user_id_idx)
i_t = torch.LongTensor(train_df.item_id_idx) + n_users

train_edge_index = torch.stack((
  torch.cat([u_t, i_t]),
  torch.cat([i_t, u_t])
))
train_edge_index

tensor([[ 523,  428,  504,  ..., 1128, 1164, 1242],
        [1356, 1034, 1114,  ...,  637,  837,   11]])

In [28]:
train_edge_index[:,-1], train_edge_index[:, 0]

(tensor([1242,   11]), tensor([ 523, 1356]))

In [29]:
train_edge_index[:, len(train)-1], train_edge_index[:, len(train)]

(tensor([  11, 1242]), tensor([1356,  523]))

In [30]:
class LightGCNConv(MessagePassing):
  def __init__(self, **kwargs):  
    super().__init__(aggr='add')

  def forward(self, x, edge_index):
    # Compute normalization
    from_, to_ = edge_index
    deg = degree(to_, x.size(0), dtype=x.dtype)
    deg_inv_sqrt = deg.pow(-0.5)
    deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
    norm = deg_inv_sqrt[from_] * deg_inv_sqrt[to_]

    # Start propagating messages (no update after aggregation)
    return self.propagate(edge_index, x=x, norm=norm)

  def message(self, x_j, norm):
    return norm.view(-1, 1) * x_j

In [None]:
class LightGCNModel(nn.Module):
  def __init__(
      self,
      latent_dim, 
      num_layers,
      num_users,
      num_items,
  ):
    super(LightGCNModel, self).__init__()

    self.embedding = nn.Embedding(num_users + num_items, latent_dim)
    self.convs = nn.ModuleList(LightGCNConv() for _ in range(num_layers))
    self.init_parameters()


  def init_parameters(self):
    ## przetestowac inicjalizacje
    #   nn.init.xavier_uniform_(self.embedding.weight, gain=1)
    # Authors of LightGCN report higher results with normal initialization
    nn.init.normal_(self.embedding.weight, std=0.1) 


  def forward(self, edge_index):
    emb0 = self.embedding.weight
    embs = [emb0]

    emb = emb0
    for conv in self.convs:
      emb = conv(x=emb, edge_index=edge_index)
      embs.append(emb)

    out = torch.mean(torch.stack(embs, dim=0), dim=0)
    
    return emb0, out


  def encode_minibatch(self, users, pos_items, neg_items, edge_index):
    emb0, out = self(edge_index)
    return (
        out[users], 
        out[pos_items], 
        out[neg_items], 
        emb0[users],
        emb0[pos_items],
        emb0[neg_items]
    )