### LightGCN
- https://arxiv.org/abs/2002.02126

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from torch_geometric.nn import GCNConv

# Load and preprocess data
data_dir = 'data/'

# Keep preferring the second GPU, but fall back gracefully: 'cuda:1' is an
# invalid device ordinal on single-GPU machines.
if torch.cuda.device_count() > 1:
    device = torch.device('cuda:1')
elif torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

all_data = pd.read_csv(os.path.join(data_dir, "train_data.csv"), names=None)
test_data = pd.read_csv(os.path.join(data_dir, "test_data.csv"), names=None)

# Split while the frame still holds the raw IDs; stratifying on user_id keeps
# every user represented in both splits.
train, valid = train_test_split(all_data, test_size=0.2, stratify=all_data['user_id'], random_state=1234)
train = train[['user_id', 'item_id']].to_numpy()
valid = valid[['user_id', 'item_id']].to_numpy()
print(train.shape)
print(valid.shape)

user_list = list(all_data['user_id'].unique())
item_list = list(all_data['item_id'].unique())

num_users = len(user_list)
num_items = len(item_list)
num_ratings = len(all_data)

# Encode user and item IDs.
# The encoders are fitted on the FULL raw ID sets (not just the train split):
#   * an item that only occurs in `valid` no longer makes transform() raise;
#   * the encoders know exactly num_users / num_items classes, so every index
#     into the embedding tables can be decoded back to a raw ID.
# This must happen before all_data is remapped to dense IDs below.
user_encoder = LabelEncoder().fit(all_data['user_id'])
item_encoder = LabelEncoder().fit(all_data['item_id'])

# Dense, order-of-appearance IDs for all_data itself (train/valid keep raw IDs
# until they are passed through the encoders).
user_id_dict = {old_uid: new_uid for new_uid, old_uid in enumerate(user_list)}
all_data.user_id = [user_id_dict[x] for x in all_data.user_id.tolist()]

item_id_dict = {old_uid: new_uid for new_uid, old_uid in enumerate(item_list)}
all_data.item_id = [item_id_dict[x] for x in all_data.item_id.tolist()]

train[:, 0] = user_encoder.transform(train[:, 0])
train[:, 1] = item_encoder.transform(train[:, 1])

valid[:, 0] = user_encoder.transform(valid[:, 0])
valid[:, 1] = item_encoder.transform(valid[:, 1])

# Convert train and valid data to TensorDataset (torch.tensor copies the arrays)
train_dataset = TensorDataset(torch.tensor(train, dtype=torch.long))
valid_dataset = TensorDataset(torch.tensor(valid, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=80, shuffle=True)
# Validation only accumulates a loss, so shuffling buys nothing.
valid_loader = DataLoader(valid_dataset, batch_size=40, shuffle=False)

import torch
import torch.nn as nn
import torch_geometric.nn as pyg_nn
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

# class GCNRec(nn.Module):
#     def __init__(self, num_users, num_items, embedding_size, n_layers, dropout=0.5):
#         super(GCNRec, self).__init__()
#         self.embedding_size = embedding_size
#         self.n_layers = n_layers
#         self.user_embeddings = nn.Embedding(num_users, embedding_size)
#         self.item_embeddings = nn.Embedding(num_items, embedding_size)
#         self.dropout = nn.Dropout(dropout)
#         self.layers = nn.ModuleList([
#             GCNConv(embedding_size, embedding_size) for _ in range(n_layers)
#         ])
#         self.output_layer = nn.Linear(embedding_size, 1)

#     def forward(self, users, items, edge_index):
#         user_embeddings = self.user_embeddings(users)
#         item_embeddings = self.item_embeddings(items)
#         x = torch.cat([user_embeddings, item_embeddings], dim=0)
#         for layer in self.layers:
#             x = layer(x, edge_index)
#             x = F.relu(x)
#             x = self.dropout(x)
#         user_item_embeddings = self.output_layer(x[:num_users] * x[num_users:])
#         return user_item_embeddings.squeeze()

class LightGCN(nn.Module):
    """LightGCN recommender (https://arxiv.org/abs/2002.02126).

    The only trainable parameters are the user and item embedding tables.
    Embeddings are propagated over the user-item interaction graph with a
    symmetrically normalized adjacency matrix (no feature transformation, no
    non-linearity), and the final representation of a node is the mean of its
    embeddings across layers 0..n_layers.

    Args:
        num_users: Number of distinct users (rows of the user table).
        num_items: Number of distinct items (rows of the item table).
        embedding_size: Dimensionality of the embeddings.
        n_layers: Number of propagation steps; 0 disables propagation.
        dropout: Dropout probability applied to the final user/item
            representations of a batch (active in training mode only).
    """

    def __init__(self, num_users, num_items, embedding_size, n_layers, dropout=0.5):
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_size = embedding_size
        self.n_layers = n_layers
        self.user_embeddings = nn.Embedding(num_users, embedding_size)
        self.item_embeddings = nn.Embedding(num_items, embedding_size)
        self.dropout = nn.Dropout(dropout)

    def _normalized_adjacency(self, edge_index, num_nodes, dtype):
        """Build the sparse matrix D^-1/2 A D^-1/2 for the undirected graph."""
        # Add the reverse direction of every edge so messages flow both
        # user -> item and item -> user.
        src = torch.cat([edge_index[0], edge_index[1]])
        dst = torch.cat([edge_index[1], edge_index[0]])

        degree = torch.zeros(num_nodes, dtype=dtype, device=edge_index.device)
        degree.index_add_(0, dst, torch.ones(dst.numel(), dtype=dtype, device=edge_index.device))
        # Isolated nodes have degree 0; clamp so the inverse stays finite
        # (they receive no messages anyway).
        inv_sqrt_degree = degree.clamp(min=1.0).pow(-0.5)

        weights = inv_sqrt_degree[src] * inv_sqrt_degree[dst]
        adjacency = torch.sparse_coo_tensor(
            torch.stack([dst, src]), weights, (num_nodes, num_nodes)
        )
        return adjacency.coalesce()

    def _propagate(self, edge_index):
        """Return layer-averaged embeddings for all nodes, users first."""
        base = torch.cat([self.user_embeddings.weight, self.item_embeddings.weight], dim=0)
        if self.n_layers <= 0 or edge_index is None or edge_index.numel() == 0:
            return base

        edge_index = edge_index.to(base.device)
        adjacency = self._normalized_adjacency(edge_index, base.size(0), base.dtype)

        layer_output = base
        layer_total = base
        for _ in range(self.n_layers):
            layer_output = torch.sparse.mm(adjacency, layer_output)
            layer_total = layer_total + layer_output
        return layer_total / (self.n_layers + 1)

    def forward(self, users, items, edge_index):
        """Score (user, item) pairs.

        Args:
            users: 1-D LongTensor of user indices in [0, num_users).
            items: 1-D LongTensor of item indices in [0, num_items), same
                length as `users` (NOT shifted by num_users).
            edge_index: LongTensor of shape (2, num_edges). Row 0 holds user
                node ids, row 1 holds item node ids already shifted by
                num_users. One direction per interaction is enough; the
                reverse direction is added internally.

        Returns:
            1-D tensor with one dot-product score per (user, item) pair.
        """
        node_embeddings = self._propagate(edge_index)
        user_vectors = self.dropout(node_embeddings[users])
        # Item nodes live after the user nodes in the stacked matrix.
        item_vectors = self.dropout(node_embeddings[self.num_users + items])
        return (user_vectors * item_vectors).sum(dim=1)
# Build the user-item interaction graph from the training pairs.
# Users keep node ids [0, num_users); items are shifted to
# [num_users, num_users + num_items) so both share one node id space.
train_users = train[:, 0]
# `train[:, 1]` is a view: an in-place `+=` would silently rewrite `train`
# (and shift the ids again every time this cell is re-run), so build a new
# array instead.
train_items = train[:, 1] + num_users
edge_index = torch.tensor(
    np.vstack([train_users, train_items]).astype(np.int64), dtype=torch.long
)

# Hyperparameters
embedding_size = 128
n_layers = 3
dropout = 0.3
lr = 0.001
weight_decay = 1e-5
epochs = 100
# batch_size = 256
top_k = 100
model = LightGCN(num_users, num_items, embedding_size=embedding_size, n_layers=n_layers, dropout=dropout).to(device)

# model = GCNRec(num_users,num_items,embedding_size=embedding_size,n_layers=n_layers,dropout=dropout).to(device)


In [None]:
import torch.nn.functional as F

# Set up the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

checkpoint_path = 'best_model.pth'
# The graph never changes during training: move it to the device once instead
# of once per forward pass.
graph = edge_index.to(device)

# Train the model and evaluate on valid data
best_val_loss = float('inf')
patience = 20
cnt = 0
best_model_state = None
checkpoint_saved = False
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for batch in train_loader:
        users, items = batch[0][:, 0].to(device), batch[0][:, 1].to(device)
        # Generate negative samples (uniform over all items; may occasionally
        # hit a true positive, which BPR tolerates)
        negative_items = torch.randint(0, num_items, size=(len(items),), device=device)

        positive_preds = model(users, items, graph)
        negative_preds = model(users, negative_items, graph)

        # Compute BPR loss
        loss = -F.logsigmoid(positive_preds - negative_preds).sum()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    train_loss /= max(len(train_loader), 1)

    # Evaluate on validation data
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in valid_loader:
            # Score the validation batch itself; reusing the `users`/`items`
            # left over from the last training batch would make the reported
            # validation loss meaningless.
            val_users, val_items = batch[0][:, 0].to(device), batch[0][:, 1].to(device)
            # Generate negative samples
            val_negative_items = torch.randint(0, num_items, size=(len(val_items),), device=device)

            positive_preds = model(val_users, val_items, graph)
            negative_preds = model(val_users, val_negative_items, graph)

            # Compute BPR loss
            loss = -F.logsigmoid(positive_preds - negative_preds).sum()
            val_loss += loss.item()
    val_loss /= max(len(valid_loader), 1)

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {train_loss:.4f}, Validation Loss : {val_loss:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), checkpoint_path)
        checkpoint_saved = True
        cnt = 0
    else:
        cnt += 1
    # Early stopping is intentionally left disabled, as in the original run.
    # if cnt == patience:
    #     print("Early stopping")
    #     break

## Load the best model
# Reload the weights of the best epoch when one was saved during this run;
# otherwise (e.g. epochs == 0 or the loss never became finite) fall back to
# the model as it stands.
if checkpoint_saved:
    best_model_state = torch.load(checkpoint_path, map_location=device)
    best_model = LightGCN(num_users, num_items, embedding_size, n_layers, dropout=dropout).to(device)
    best_model.load_state_dict(best_model_state)
else:
    best_model = model

In [None]:
from tqdm import tqdm

# Preprocess the test data
test_user_ids = test_data['user_id'].to_numpy()  # raw IDs, needed for the submission
test_users = user_encoder.transform(test_user_ids)
test_users_tensor = torch.tensor(test_users, dtype=torch.long).to(device)

# Only rank items the encoder can decode back to a raw ID, and never more
# than the embedding table holds.
num_scored_items = min(len(item_encoder.classes_), best_model.item_embeddings.num_embeddings)
k = min(top_k, num_scored_items)
candidate_items = torch.arange(num_scored_items, dtype=torch.long, device=device)
graph = edge_index.to(device)

# Score in user chunks so the (users x items) pair list stays bounded.
users_per_call = max(1, 500_000 // max(num_scored_items, 1))

# Make predictions for each user in the test data.
# Scores come from the model's own forward pass - the same function the BPR
# loss was trained on - rather than from a dot product of the raw embedding
# tables, which skips whatever forward() applies on top of them.
best_model.eval()
top_k_chunks = []
with torch.no_grad():
    for start in tqdm(range(0, len(test_users_tensor), users_per_call)):
        chunk_users = test_users_tensor[start:start + users_per_call]
        pair_users = chunk_users.repeat_interleave(num_scored_items)
        pair_items = candidate_items.repeat(len(chunk_users))
        scores = best_model(pair_users, pair_items, graph).view(len(chunk_users), num_scored_items)
        top_k_chunks.append(torch.topk(scores, k, dim=1).indices.cpu())

if top_k_chunks:
    top_k_items = torch.cat(top_k_chunks, dim=0)
else:
    top_k_items = torch.empty((0, k), dtype=torch.long)

# Convert the predicted item IDs back to their original IDs
predicted_item_ids = item_encoder.inverse_transform(top_k_items.numpy().flatten()).reshape(top_k_items.shape)

# One row per (user, recommended item). The user column carries the raw test
# user ID of each row, not its position in the prediction matrix.
submission = pd.DataFrame({
    "user_id": np.repeat(test_user_ids, predicted_item_ids.shape[1]),
    "item_id": predicted_item_ids.reshape(-1),
})

# Save the predictions as a CSV file
submission.to_csv('submission_lightgcn7.csv', index=False)


### Embarrassingly Shallow Autoencoders for Sparse Data
- https://arxiv.org/abs/1905.03375

In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from ease_rec.model import EASE

# Load and preprocess data
data_dir = 'data/'
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

all_data = pd.read_csv(os.path.join(data_dir, "train_data.csv"), names=None)
test_data = pd.read_csv(os.path.join(data_dir, "test_data.csv"), names=None)

# Split while the frame still holds the raw IDs; stratifying on user_id keeps
# every user represented in both splits.
train, valid = train_test_split(all_data, test_size=0.2, stratify=all_data['user_id'], random_state=42)
train = train[['user_id', 'item_id']].to_numpy()
valid = valid[['user_id', 'item_id']].to_numpy()
print(train.shape)
print(valid.shape)

user_list = list(all_data['user_id'].unique())
item_list = list(all_data['item_id'].unique())

num_users = len(user_list)
num_items = len(item_list)
num_ratings = len(all_data)

# Encode user and item IDs.
# Fit on the FULL raw ID sets (not just the train split) so that an item seen
# only in `valid` does not make transform() raise, and so the encoders cover
# exactly num_users / num_items classes. This has to happen before all_data
# is remapped to dense IDs below.
user_encoder = LabelEncoder().fit(all_data['user_id'])
item_encoder = LabelEncoder().fit(all_data['item_id'])

# Dense, order-of-appearance IDs for all_data itself. Note this is a different
# ID space from the LabelEncoder one (which follows sorted raw IDs).
user_id_dict = {old_uid: new_uid for new_uid, old_uid in enumerate(user_list)}
all_data.user_id = [user_id_dict[x] for x in all_data.user_id.tolist()]

item_id_dict = {old_uid: new_uid for new_uid, old_uid in enumerate(item_list)}
all_data.item_id = [item_id_dict[x] for x in all_data.item_id.tolist()]

train[:, 0] = user_encoder.transform(train[:, 0])
train[:, 1] = item_encoder.transform(train[:, 1])

valid[:, 0] = user_encoder.transform(valid[:, 0])
valid[:, 1] = item_encoder.transform(valid[:, 1])

# Convert train and valid data to TensorDataset (torch.tensor copies the arrays)
train_dataset = TensorDataset(torch.tensor(train, dtype=torch.long))
valid_dataset = TensorDataset(torch.tensor(valid, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
# Validation order is irrelevant, so the loader is not shuffled.
valid_loader = DataLoader(valid_dataset, batch_size=128, shuffle=False)

(677494, 2)
(169374, 2)


In [3]:
# Instantiate an object of the EASE class
ease = EASE()

# Fit the EASE model on the training data.
# all_data carries the dense IDs assigned through user_id_dict / item_id_dict,
# so every ID handed to or received from `ease` below must live in that same
# ID space (NOT the LabelEncoder one, which orders IDs differently).
ease.fit(all_data)

# Preprocess the test data: raw test user IDs -> dense IDs used in all_data
test_user_ids = test_data['user_id'].to_numpy()
test_users = np.array([user_id_dict[raw_id] for raw_id in test_user_ids])
test_items = np.arange(num_items)

# Use the EASE object to get the top-k recommendations for each user in the test data
top_k = 20
predictions = ease.predict(all_data, test_users, test_items, k=top_k)

# Convert the predicted IDs back to their original IDs. The dense ID n was
# assigned to user_list[n] / item_list[n], so those lists are the inverse maps.
# NOTE(review): assumes ease.predict reports IDs in the ID space of the frame
# it was given - confirm against the installed ease_rec version.
dense_to_raw_item = np.asarray(item_list)
dense_to_raw_user = np.asarray(user_list)
predicted_item_ids = dense_to_raw_item[predictions['item_id'].to_numpy().astype(np.int64)]

if 'user_id' in predictions.columns:
    # Take each row's user from the prediction frame itself instead of
    # assuming a fixed "top_k rows per test user, in test order" layout.
    predicted_user_ids = dense_to_raw_user[predictions['user_id'].to_numpy().astype(np.int64)]
else:
    # Fallback: rely on the layout of exactly top_k rows per test user, in
    # test_users order.
    predicted_item_ids = predicted_item_ids.reshape((len(test_users), top_k)).reshape(-1)
    predicted_user_ids = np.repeat(test_user_ids, top_k)

# Kept so that later cells relying on this import keep working.
from tqdm import tqdm

# Save the predictions as a CSV file
submission = pd.DataFrame({"user_id": predicted_user_ids, "item_id": predicted_item_ids})
submission.to_csv('submission_ease4.csv', index=False)

18397it [00:00, 180963.40it/s]
