In [1]:
# Standard library imports
import random
import time

# Third-party imports
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader
import torch_geometric
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.utils import degree
from torch_geometric.nn.models import lightgcn

from tqdm.notebook import tqdm
from sklearn import preprocessing as pp
from sklearn.model_selection import train_test_split
import scipy.sparse as sp

In [2]:
columns_name=['user_id','item_id','rating','timestamp']
df = pd.read_csv("../data/ml-100k/u.data",sep="\t",names=columns_name)
print(len(df))
display(df.head(5))

100000


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
# remove low rating
df = df[df['rating']>=3]
print(len(df))

82520


In [4]:
# train test split
train, test = train_test_split(df.values, test_size=0.2, random_state=16)
train_df = pd.DataFrame(train, columns=df.columns)
test_df = pd.DataFrame(test, columns=df.columns)

In [5]:
print("Train Size  : ", len(train_df))
print("Test Size : ", len (test_df))

Train Size  :  66016
Test Size :  16504


In [6]:
# relabeling nodes
le_user = pp.LabelEncoder()
le_item = pp.LabelEncoder()
train_df['user_id_idx'] = le_user.fit_transform(train_df['user_id'].values)
train_df['item_id_idx'] = le_item.fit_transform(train_df['item_id'].values)

In [7]:
# remove user item in test but not in train
train_user_ids = train_df['user_id'].unique()
train_item_ids = train_df['item_id'].unique()

test_df = test_df[
  (test_df['user_id'].isin(train_user_ids)) & \
  (test_df['item_id'].isin(train_item_ids))
]
print('Size of test set before/ after(remove user/item not in train set):', len(test), len(test_df))

Size of test set before/ after(remove user/item not in train set): 16504 16472


In [8]:
test_df['user_id_idx'] = le_user.transform(test_df['user_id'].values)
test_df['item_id_idx'] = le_item.transform(test_df['item_id'].values)

In [9]:
n_users = train_df['user_id_idx'].nunique()
n_items = train_df['item_id_idx'].nunique()
print("Number of Unique Users : ", n_users)
print("Number of unique Items : ", n_items)

Number of Unique Users :  943
Number of unique Items :  1546


In [10]:
interected_items_df = train_df.groupby('user_id_idx')['item_id_idx'].apply(list).reset_index()
interected_items_df

Unnamed: 0,user_id_idx,item_id_idx
0,0,"[37, 112, 206, 79, 189, 13, 32, 57, 3, 247, 194, 269, 221, 186, 29, 200, 160, 54, 158, 67, 155, 223, 89, 27, 187, 249, 138, 234, 87, 179, 81, 49, 14, 126, 31, 47, 133, 52, 63, 53, 240, 44, 113, 256, 58, 6, 245, 66, 238, 86, 199, 209, 116, 170, 192, 205, 75, 157, 225, 180, 195, 153, 105, 150, 241, 85, 9, 210, 2, 17, 69, 173, 182, 222, 177, 5, 226, 134, 15, 204, 255, 235, 45, 41, 167, 12, 184, 71, 207, 203, 132, 55, 183, 233, 171, 190, 122, 215, 188, 88, ...]"
1,1,"[304, 283, 18, 281, 305, 311, 296, 290, 289, 302, 287, 284, 295, 24, 298, 279, 285, 250, 13, 309, 0, 314, 12, 278, 273, 110, 275, 310, 282, 256, 286, 297, 291, 312, 292, 49, 99, 307, 126, 300, 268, 294, 254, 271, 272, 299, 257, 301, 288]"
2,2,"[341, 346, 327, 337, 345, 302, 347, 349, 343, 306, 353, 180, 325, 267, 344, 319, 270, 332, 348, 320, 318, 338, 342, 340, 298]"
3,3,"[326, 359, 270, 352, 302, 358, 357, 257, 355, 299, 360, 354, 259, 325, 263, 287, 49, 300, 209, 327, 10]"
4,4,"[180, 20, 94, 266, 436, 226, 182, 420, 41, 238, 184, 150, 120, 173, 61, 427, 49, 372, 93, 209, 415, 397, 424, 377, 249, 210, 24, 412, 143, 382, 69, 416, 16, 361, 142, 97, 373, 229, 162, 1, 134, 89, 430, 410, 432, 185, 406, 371, 411, 399, 256, 203, 448, 425, 153, 78, 207, 418, 167, 378, 433, 365, 100, 138, 99, 429, 232, 364, 104, 152, 171, 218, 213, 381, 426, 428, 168, 417, 101, 225, 439, 440, 221, 403, 188, 423, 421, 23]"
...,...,...
938,938,"[297, 469, 273, 282, 251, 409, 590, 8, 830, 978, 464, 219, 1259, 681, 253, 221, 274, 284, 808, 403, 1013, 126, 117, 14, 407, 748, 734, 919, 1008, 236, 105, 1173, 324, 120, 256, 584, 539, 254, 279]"
939,939,"[65, 136, 314, 190, 355, 475, 167, 49, 203, 199, 501, 99, 171, 6, 643, 345, 13, 193, 94, 173, 212, 859, 237, 1150, 309, 88, 258, 865, 738, 380, 69, 204, 288, 299, 268, 509, 146, 313, 701, 169, 301, 215, 271, 620, 95, 315, 11, 843, 670, 175, 684, 284, 433, 464, 192, 312, 522, 467, 743, 427, 561, 7, 700, 182, 160, 649, 293, 1122, 180, 424, 97, 647]"
940,940,"[904, 123, 257, 978, 14, 116, 0, 406, 297, 256, 293, 180, 448, 272, 755]"
941,941,"[49, 30, 878, 489, 477, 432, 70, 320, 264, 326, 214, 130, 513, 316, 930, 599, 258, 480, 697, 504, 209, 196, 654, 491, 532, 192, 865, 424, 742, 314, 313, 954, 321, 116, 312, 199, 507, 653, 473, 215, 596, 493, 98, 521, 257, 94, 1187, 1204, 182, 577, 123, 355, 651, 607, 1013, 309, 96, 472, 271]"


In [11]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

# data loader

In [12]:
def data_loader(data, batch_size, n_usr, n_itm):

    def sample_neg(x):
        while True:
            neg_id = random.randint(0, n_itm - 1)
            if neg_id not in x:
                return neg_id

    interected_items_df = data.groupby('user_id_idx')['item_id_idx'].apply(list).reset_index()
    indices = [x for x in range(n_usr)]

    if n_usr < batch_size:
        users = [random.choice(indices) for _ in range(batch_size)]
    else:
        users = random.sample(indices, batch_size)
    users.sort()
    users_df = pd.DataFrame(users,columns = ['users'])

    interected_items_df = pd.merge(interected_items_df, users_df, how = 'right', left_on = 'user_id_idx', right_on = 'users')
    pos_items = interected_items_df['item_id_idx'].apply(lambda x : random.choice(x)).values
    neg_items = interected_items_df['item_id_idx'].apply(lambda x: sample_neg(x)).values

    print(
        torch.LongTensor(list(users)).to(device),'\n',
        torch.LongTensor(list(pos_items)).to(device) + n_usr,'\n',
        torch.LongTensor(list(neg_items)).to(device) + n_usr, '\n'
    )
    return (
        torch.LongTensor(list(users)).to(device),
        torch.LongTensor(list(pos_items)).to(device) + n_usr,
        torch.LongTensor(list(neg_items)).to(device) + n_usr
    )

# data_loader(train_df, 16, n_users, n_items)

# Edge Index

In [13]:
u_t = torch.LongTensor(train_df.user_id_idx)
i_t = torch.LongTensor(train_df.item_id_idx) + n_users

train_edge_index = torch.stack((
  torch.cat([u_t, i_t]),
  torch.cat([i_t, u_t])
)).to(device)
train_edge_index

tensor([[ 769,  168,  326,  ..., 1683, 2006,  989],
        [1192, 1272, 1085,  ...,  601,  621,   59]])

# LGConv layer

In [14]:
test_x = torch.Tensor(np.eye(5))
test_edge_index = torch.LongTensor(np.array([
  [0, 0, 1, 1, 2, 3, 3, 4],
  [2, 3, 3, 4, 0, 0, 1, 1]
]))

In [15]:
from torch_geometric.nn import LGConv
LGConv()(test_x, test_edge_index)

tensor([[0.0000, 0.0000, 0.7071, 0.5000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.5000, 0.7071],
        [0.7071, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.7071, 0.0000, 0.0000, 0.0000]])

# BPR Loss + Eval_Metrics

# Train_and_eval

In [44]:
users, pos_items, neg_items = data_loader(train_df, , n_users, n_items)

tensor([232, 295, 308, 451, 544, 546, 566, 702, 712, 843]) 
 tensor([1319, 1122, 1275, 1410, 1084, 1286, 1434, 1975, 1286, 1059]) 
 tensor([2317, 1422, 1545, 1087, 1386, 1151, 1602, 1881, 1053, 1132]) 



In [16]:
from torch_geometric.nn import LightGCN
from torch_geometric.loader import DataLoader
from torch_geometric.utils import negative_sampling

In [17]:
latent_dim = 64
n_layers = 3

EPOCHS = 50
BATCH_SIZE = 10
LR = 0.005
K = 20

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [19]:
model = LightGCN(num_nodes=n_users+n_items, embedding_dim=latent_dim, num_layers=n_layers).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
print("Size of Learnable Embedding : ", [x.shape for x in list(model.parameters())])

Size of Learnable Embedding :  [torch.Size([2489, 64])]


In [90]:
# edge_label need to indicate positive and negative edges
# edge_index for the graph
# data = dataset[0].to(device)

index = [[0, 0, 1, 1, 2, 3, 3, 4], [2, 3, 3, 4, 0, 0, 1, 1]]

loader = DataLoader(index, batch_size=2, shuffle=True)
# loader = DataLoader(train_edge_index, batch_size=BATCH_SIZE, shuffle=True)

In [93]:
from torch_geometric.datasets import LastFM

In [94]:
dataset = LastFM(root="./LastFM")

In [100]:
loader = DataLoader(dataset, batch_size=2, shuffle=True)

In [99]:
dataset[0]

HeteroData(
  [1muser[0m={ num_nodes=1892 },
  [1martist[0m={ num_nodes=17632 },
  [1mtag[0m={ num_nodes=1088 },
  [1m(user, to, artist)[0m={
    train_neg_edge_index=[2, 33294760],
    val_pos_edge_index=[2, 9283],
    val_neg_edge_index=[2, 9283],
    test_pos_edge_index=[2, 18567],
    test_neg_edge_index=[2, 18567],
    edge_index=[2, 64984]
  },
  [1m(user, to, user)[0m={ edge_index=[2, 25434] },
  [1m(artist, to, user)[0m={ edge_index=[2, 64984] },
  [1m(artist, to, tag)[0m={ edge_index=[2, 23253] },
  [1m(tag, to, artist)[0m={ edge_index=[2, 23253] }
)

In [104]:
for batch in loader:
    print(batch.num_edges)

33431684


In [46]:
model.train()

for epoch in range(EPOCHS):
    for batch in loader:
        optimizer.zero_grad()
        edge_index = train_edge_index
        pos_label =
        neg_label = negative_sampling(pos_label)
        total_label =
        out = model(edge_index, total_label)
        loss = model.recommendation_loss(out[pos_label], out[neg_label])
        loss.backward()
        optimizer.step()

In [51]:
# cc = LGCN.get_embedding(train_edge_index)
# cc

tensor([[-2.9700e-03,  6.7198e-03, -7.0369e-03,  ..., -4.2175e-03,
         -1.1041e-02, -1.3709e-02],
        [-6.5093e-03,  7.2192e-03, -1.0881e-02,  ...,  9.0594e-03,
         -8.7273e-03,  1.1111e-02],
        [-3.9874e-03, -2.0468e-03,  7.7710e-03,  ..., -1.7289e-03,
          1.8313e-04,  4.8628e-03],
        ...,
        [-7.5256e-03,  1.3774e-03,  9.3665e-03,  ...,  1.3144e-02,
          1.3366e-03,  8.6798e-03],
        [-3.0878e-04, -4.0927e-03, -3.4533e-05,  ...,  5.2982e-03,
          2.0726e-03,  6.3662e-03],
        [ 5.5121e-03, -1.9212e-03,  1.1485e-02,  ..., -1.0579e-02,
         -8.9293e-03, -3.5650e-03]], grad_fn=<AddBackward0>)

In [52]:
# cc.shape

torch.Size([2489, 64])

In [1]:
# light_loss, light_bpr, light_reg, light_recall, light_precision = train_and_eval(lightgcn, optimizer, train_df)

In [None]:
LGCN.eval()

# pred = model(data).argmax(dim=1)
# correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
# acc = int(correct) / int(data.test_mask.sum())
# print(f'Accuracy: {acc:.4f}')

In [20]:
# usr: 0, 1; itm: 2, 3, 4
# positive: 0 - 2,3 ; 1 - 3, 4
# negative: 0 - 4; 1 - 2

test_edge_index

tensor([[0, 0, 1, 1, 2, 3, 3, 4],
        [2, 3, 3, 4, 0, 0, 1, 1]])

In [45]:
test_edge_index_1 = torch.as_tensor([[0,  0,  1,  1,  2,  3,  3,  4],
                                     [12, 13, 13, 14, 10, 10, 11, 11]])

In [46]:
from torch_geometric.utils import negative_sampling
from torch_geometric.utils import structured_negative_sampling
neg_edge_index = structured_negative_sampling(test_edge_index_1)
neg_edge_index

(tensor([0, 0, 1, 1, 2, 3, 3, 4]),
 tensor([12, 13, 13, 14, 10, 10, 11, 11]),
 tensor([4, 0, 3, 8, 0, 6, 1, 2]))

In [52]:
from torch_geometric.data import Data
data = Data(x = ,edge_index=test_edge_index_1)

In [53]:
from torch_geometric.transforms import RandomLinkSplit

transform = RandomLinkSplit(is_undirected=False, add_negative_train_samples=True)
train_data, val_data, test_data = transform(data)



In [56]:
train_data.edge_index

tensor([[ 0,  4,  3,  3,  2,  1,  0],
        [12, 11, 10, 11, 10, 13, 13]])

In [57]:
val_data.edge_index

tensor([[ 0,  4,  3,  3,  2,  1,  0],
        [12, 11, 10, 11, 10, 13, 13]])

In [58]:
test_data.edge_index

tensor([[ 0,  4,  3,  3,  2,  1,  0],
        [12, 11, 10, 11, 10, 13, 13]])

In [59]:
x_s = torch.randn(2, 16)

In [60]:
x_s

tensor([[-0.3352,  1.6670, -1.1732,  0.2516, -0.1516,  0.6130,  1.2359,  0.2416,
          0.1629,  0.7882,  1.4856, -0.1718, -0.4131, -0.2350,  1.1422,  1.0242],
        [-0.3668, -0.1371,  0.9067, -0.9148,  0.3964, -1.0921, -0.7567, -0.1679,
          0.6423, -0.1423, -0.9084, -0.0144, -0.9528, -0.6947, -0.4912,  1.5782]])