In [27]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import random
from sklearn.metrics import roc_auc_score

In [28]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [29]:
# Load the three source tables. Only four columns of the places file are read.
user_df = pd.read_csv(
    "./data/user.csv",
)
rating_df = pd.read_csv(
    "./data/tourism_rating.csv",
)
place_df = pd.read_csv(
    "./data/tourism_with_id.csv",
    usecols=["Place_Id", "Place_Name", "City", "Rating"],
)

In [30]:
# Normalise every column name to snake_case so the three tables share key names.
user_df = user_df.rename(
    columns=dict(User_Id="user_id", Location="location", Age="age"),
)
place_df = place_df.rename(
    columns=dict(Place_Id="place_id", Place_Name="place_name", City="city", Rating="rating"),
)
rating_df = rating_df.rename(
    columns=dict(User_Id="user_id", Place_Id="place_id", Place_Ratings="place_rating"),
)

In [31]:
# print(user_df.describe(), "\n")
# print(user_df.isnull().sum(), "\n")
# print(user_df[user_df.duplicated(keep=False)], "\n")

# print(rating_df.describe(), "\n")
# print(rating_df.isnull().sum(), "\n")
# print(rating_df[rating_df.duplicated(keep=False)], "\n")
# De-duplicate each table on its key column(s); when a key repeats, the last
# occurrence in file order is the one that is kept.
user_df = user_df.drop_duplicates(
    subset=["user_id"],
    keep="last",
)
place_df = place_df.drop_duplicates(
    subset=["place_id"],
    keep="last",
)
rating_df = rating_df.drop_duplicates(
    subset=["user_id", "place_id"],
    keep="last",
)


# print(place_df.describe(), "\n")
# print(place_df.isnull().sum(), "\n")
# print(place_df[place_df.duplicated(keep=False)], "\n")


In [32]:
# Converting rating to interaction bool. (Explicit to Implicit)
# A rating of 4 or higher becomes a positive interaction (1); anything lower is 0.
#
# NOTE: this cell is not idempotent. Re-running it appends the default user's
# rows again and shifts every id down by one more step, so run it exactly once
# per fresh kernel.
rating_df["implicit_label"] = (rating_df["place_rating"] >= 4).astype(int)

# The ten places with the most rating rows; used as the synthetic interaction
# history of the default user below.
popular_places = (
    rating_df['place_id']
    .value_counts()
    .sort_values(ascending=False)
    .head(10) 
    .index
    .tolist()
)

# Synthetic "default" user that rates every popular place with the top score.
# presumably 301 is one past the largest real user id — TODO confirm against user.csv
default_user_id = 301
default_rating = 5

new_user_data = pd.DataFrame({
    'user_id': [default_user_id] * len(popular_places),
    'place_id': popular_places,
    'place_rating': [default_rating] * len(popular_places),
    'implicit_label': [1] * len(popular_places)  # or 0 if you want to distinguish
})

rating_df = pd.concat([rating_df, new_user_data], ignore_index=True)

# Shift all ids down by one so they can index embedding tables directly
# (assumes the ids in the CSV files start at 1 — TODO confirm).
rating_df["user_id"] = rating_df["user_id"] - 1
rating_df["place_id"] = rating_df["place_id"] - 1

user_df["user_id"] = user_df["user_id"] - 1
# Fix: shift place_df's OWN ids. The previous code assigned
# rating_df["place_id"] - 1 here, which index-aligned an unrelated (and already
# shifted) column onto place_df and overwrote its ids with wrong values.
place_df["place_id"] = place_df["place_id"] - 1

# Embedding table sizes: largest zero-based id seen in the ratings, plus one.
user_len = rating_df["user_id"].max() + 1
place_len = rating_df["place_id"].max() + 1
# rating_df
print(place_len)
print(user_len)

437
301


In [33]:
# rating_df[rating_df['implicit_label'] == 1].groupby('user_id')['place_id'].apply(set).to_dict()
# Display the ratings table to inspect the shifted ids and the implicit labels.
rating_df
# rating_df[rating_df['implicit_label'] == 1].describe()


Unnamed: 0,user_id,place_id,place_rating,implicit_label
0,0,178,3,0
1,0,343,2,0
2,0,4,5,1
3,0,372,3,0
4,0,100,4,1
...,...,...,...,...
9602,300,343,5,1
9603,300,207,5,1
9604,300,401,5,1
9605,300,293,5,1


In [34]:
#Dataset Class

class RatingDataset(torch.utils.data.Dataset):
    """Training samples of (user, place, label) with on-the-fly negative sampling.

    Every positive (user, place) pair owns a group of ``1 + num_negatives``
    consecutive indices: the first index of the group yields the positive pair
    with label 1.0, the remaining indices yield the same user paired with a
    randomly drawn place that is not among that user's positives, label 0.0.

    Args:
        num_negatives: Number of sampled negatives per positive pair.
        ratings: DataFrame with ``user_id``, ``place_id`` and ``implicit_label``
            columns. Defaults to the notebook-level ``rating_df``.
        num_items: Number of distinct place ids; negatives are drawn from
            ``[0, num_items - 1]``. Defaults to the notebook-level ``place_len``
            when ``ratings`` is omitted, otherwise to ``ratings["place_id"].max() + 1``.

    Raises:
        ValueError: If ``num_negatives`` is negative, or if negatives are
            requested for a user whose positives already cover every place id
            (the sampling loop could never terminate for such a user).
    """

    def __init__(self, num_negatives=5, ratings=None, num_items=None):
        if num_negatives < 0:
            raise ValueError("num_negatives must be non-negative")

        if ratings is None:
            # Backward-compatible default: read the notebook globals.
            ratings = rating_df
            if num_items is None:
                num_items = place_len
        elif num_items is None:
            num_items = int(ratings["place_id"].max()) + 1

        # Build the positive mask once instead of once per attribute.
        positive_rows = ratings[ratings['implicit_label'] == 1]
        self.users = positive_rows["user_id"].values
        self.positives = positive_rows.groupby('user_id')['place_id'].apply(set).to_dict()

        self.num_negatives = num_negatives
        self.num_items = int(num_items)

        # Sorted so that the index -> pair mapping is reproducible across runs.
        self.user_item_pairs = [
            (int(user), int(item))
            for user, pos_items in self.positives.items()
            for item in sorted(pos_items)
        ]

        if self.num_negatives > 0:
            # A user who is positive on every place has no valid negative; fail
            # here rather than spin forever inside __getitem__.
            saturated = [
                user for user, pos_items in self.positives.items()
                if pos_items.issuperset(range(self.num_items))
            ]
            if saturated:
                raise ValueError(
                    f"cannot sample negatives: users {saturated} are positive on all "
                    f"{self.num_items} items"
                )

    def __len__(self):
        # Must be derived from the list __getitem__ indexes into; counting raw
        # positive rows over-counts when a (user, place) pair is repeated.
        return len(self.user_item_pairs) * (1 + self.num_negatives)

    def __getitem__(self, idx):
        group_size = 1 + self.num_negatives
        pos_idx, offset = divmod(idx, group_size)

        user, pos_item = self.user_item_pairs[pos_idx]

        if offset == 0:
            # Positive sample
            return torch.tensor(user), torch.tensor(pos_item), torch.tensor(1.0)

        # Negative sample: redraw until the place is not one of the user's positives.
        seen = self.positives.get(user, set())
        while True:
            neg_item = random.randint(0, self.num_items - 1)
            if neg_item not in seen:
                return torch.tensor(user), torch.tensor(neg_item), torch.tensor(0.0)
    
# Dataset instance built with the default num_negatives=5.
dataset = RatingDataset()

In [35]:
class EvaluationDataset(torch.utils.data.Dataset):
    """Every row of the notebook-level ``rating_df`` as a (user, place, label) triple.

    No negative sampling happens here: the label is the row's own
    ``implicit_label`` value, stored as a float tensor.
    """

    def __init__(self):
        def column(name, dtype):
            # Materialise one rating_df column as a tensor of the given dtype.
            return torch.tensor(rating_df[name].values, dtype=dtype)

        self.users = column("user_id", torch.long)
        self.places = column("place_id", torch.long)
        self.labels = column("implicit_label", torch.float)

    def __len__(self):
        return self.users.size(0)

    def __getitem__(self, idx):
        sample = (self.users[idx], self.places[idx], self.labels[idx])
        return sample

In [36]:
class NeuMF(nn.Module):
    """Two-branch recommender: a GMF branch and an MLP branch fused by one linear layer.

    Each branch has its own user and item embedding tables. The GMF branch is
    the element-wise product of its two embeddings; the MLP branch pushes the
    concatenated embeddings through three linear layers (LeakyReLU after the
    first two). ``forward`` returns one raw logit per (user, item) pair.
    """

    def __init__(self, num_users, num_items, embedding_dim=32):
        super().__init__()

        # Embedding tables for the GMF branch.
        self.gmf_user = nn.Embedding(num_users, embedding_dim)
        self.gmf_item = nn.Embedding(num_items, embedding_dim)

        # Separate embedding tables for the MLP branch.
        self.mlp_user = nn.Embedding(num_users, embedding_dim)
        self.mlp_item = nn.Embedding(num_items, embedding_dim)

        # MLP tower: 2 * embedding_dim -> 64 -> 32 -> 16.
        self.linear_1 = nn.Linear(embedding_dim * 2, 64)
        self.relu_1 = nn.LeakyReLU()
        self.linear_2 = nn.Linear(64, 32)
        self.relu_2 = nn.LeakyReLU()
        self.linear_3 = nn.Linear(32, 16)

        # Fusion layer over [GMF vector (embedding_dim) | MLP vector (16)].
        self.output_layer = nn.Linear(embedding_dim + 16, 1)

    def forward(self, user_ids, item_ids):
        # GMF branch: element-wise product of the two embeddings.
        gmf_vector = torch.mul(self.gmf_user(user_ids), self.gmf_item(item_ids))

        # MLP branch: concatenate the embeddings, then run the tower.
        hidden = torch.cat((self.mlp_user(user_ids), self.mlp_item(item_ids)), dim=-1)
        hidden = self.relu_1(self.linear_1(hidden))
        hidden = self.relu_2(self.linear_2(hidden))
        mlp_vector = self.linear_3(hidden)

        # Fuse both branches into a single raw score (no sigmoid applied).
        fused = torch.cat((gmf_vector, mlp_vector), dim=-1)
        return self.output_layer(fused)
    
# Size the embedding tables from the data, then move the network to the chosen device.
model = NeuMF(user_len, place_len).to(device)

In [37]:
# class NeuralCF(nn.Module):
#     def __init__(self, num_users, num_items, embedding_dim=32):
#         super().__init__()
#         self.user_embedding = nn.Embedding(num_users, embedding_dim)
#         self.item_embedding = nn.Embedding(num_items, embedding_dim)

#         # MLP layers (simple 2-layer example)
#         self.fc1 = nn.Linear(embedding_dim * 2, 64)
#         self.relu1 = nn.ReLU()
#         self.fc2 = nn.Linear(64, 32)
#         self.relu2 = nn.ReLU()
#         self.output = nn.Linear(32, 1)  # output raw logits for BCEWithLogitsLoss

#     def forward(self, user_ids, item_ids):
#         user_vec = self.user_embedding(user_ids)
#         item_vec = self.item_embedding(item_ids)
#         x = torch.cat([user_vec, item_vec], dim=-1)  # concat user and item embeddings
#         x = self.relu1(self.fc1(x))
#         x = self.relu2(self.fc2(x))
#         logits = self.output(x)
#         return logits  # raw score, use sigmoid or BCEWithLogitsLoss
    
# model = NeuralCF(num_users=user_len, num_items=place_len, embedding_dim=32).to(device)

In [38]:
# Binary cross-entropy computed directly on the raw logits the model returns.
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.0001)

epochs = 100

# NOTE(review): the 80/20 random split computed here is discarded — both
# train_dataset and test_dataset are rebound a few lines further down, so no
# held-out subset is actually used.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

# Training data: a fresh RatingDataset with 10 sampled negatives per positive pair.
train_dataset = RatingDataset(num_negatives=10)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True)

# NOTE(review): evaluation iterates over every row of rating_df, the same table
# the training positives are drawn from, so the reported metrics are not
# measured on held-out interactions.
test_dataset = EvaluationDataset()
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=False)

In [39]:
def hit_at_k(user_item_scores, user_item_labels, k):
    """Fraction of users with at least one positive item among their top-k scores.

    Args:
        user_item_scores: Mapping of user -> sequence of predicted scores.
        user_item_labels: Mapping of user -> sequence of 0/1 labels, aligned
            position-by-position with that user's scores.
        k: Number of highest-scoring items to inspect per user.

    Returns:
        Mean hit indicator over the users that have at least one positive
        label. Users without any positive are skipped. Returns 0.0 when no
        user qualifies (including empty input) instead of the NaN and
        RuntimeWarning that a mean over an empty list would produce.
    """
    hits = []
    for user, raw_scores in user_item_scores.items():
        labels = np.asarray(user_item_labels[user])
        if labels.sum() == 0:  # no positives for user, skip
            continue
        scores = np.asarray(raw_scores)
        # Indices of the k highest scores, best first.
        top_k_indices = scores.argsort()[::-1][:k]
        # A hit means at least one of those k items is a positive.
        hits.append(1 if labels[top_k_indices].sum() > 0 else 0)

    if not hits:
        return 0.0
    return np.mean(hits)

In [40]:
# Cut-off used for Hit@K; it does not change between epochs, so set it once.
k = 5

for epoch in range(epochs):
    # ---- Training pass -------------------------------------------------
    model.train()
    train_loss_total = 0.0

    for x1, x2, y in train_loader:
        x1, x2, y = x1.to(device), x2.to(device), y.to(device)
        # Labels arrive as (batch,); add a trailing dim to match the (batch, 1) logits.
        y = y.unsqueeze(1).float()
        y_logits = model(x1, x2)

        train_loss = loss_fn(y_logits, y)
        train_loss_total += train_loss.item()

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

    # ---- Evaluation pass -----------------------------------------------
    model.eval()

    all_scores = []
    all_labels = []
    all_users = []

    with torch.inference_mode():
        for x1, x2, y in test_loader:
            x1, x2 = x1.to(device), x2.to(device)
            y_logits = model(x1, x2)

            # reshape(-1) always yields a 1-D tensor. A bare squeeze() turns a
            # final batch of size 1 into a 0-d tensor whose tolist() is a
            # plain float, which list.extend() rejects with a TypeError.
            all_scores.extend(y_logits.reshape(-1).cpu().tolist())
            all_labels.extend(y.reshape(-1).float().tolist())
            all_users.extend(x1.reshape(-1).cpu().tolist())

    # ROC-AUC (on entire test set)
    roc_auc = roc_auc_score(all_labels, all_scores)

    # Group scores and labels by user for hit@k
    user_item_scores = {}
    user_item_labels = {}

    for user, score, label in zip(all_users, all_scores, all_labels):
        user_item_scores.setdefault(user, []).append(score)
        user_item_labels.setdefault(user, []).append(label)

    hit_k = hit_at_k(user_item_scores, user_item_labels, k)

    print(f"Epoch {epoch+1}/{epochs} | "
          f"Train Loss: {train_loss_total / len(train_loader):.4f} | "
          f"ROC-AUC: {roc_auc:.4f} | "
          f"Hit@{k}: {hit_k:.4f}")

Epoch 1/100 | Train Loss: 0.5304 | ROC-AUC: 0.4933 | Hit@5: 0.9203
Epoch 2/100 | Train Loss: 0.3187 | ROC-AUC: 0.4982 | Hit@5: 0.9236
Epoch 3/100 | Train Loss: 0.3129 | ROC-AUC: 0.5043 | Hit@5: 0.9103
Epoch 4/100 | Train Loss: 0.3106 | ROC-AUC: 0.5095 | Hit@5: 0.9169
Epoch 5/100 | Train Loss: 0.3089 | ROC-AUC: 0.5146 | Hit@5: 0.9435
Epoch 6/100 | Train Loss: 0.3078 | ROC-AUC: 0.5194 | Hit@5: 0.9402
Epoch 7/100 | Train Loss: 0.3063 | ROC-AUC: 0.5240 | Hit@5: 0.9435
Epoch 8/100 | Train Loss: 0.3059 | ROC-AUC: 0.5278 | Hit@5: 0.9502
Epoch 9/100 | Train Loss: 0.3053 | ROC-AUC: 0.5317 | Hit@5: 0.9435
Epoch 10/100 | Train Loss: 0.3047 | ROC-AUC: 0.5356 | Hit@5: 0.9502
Epoch 11/100 | Train Loss: 0.3044 | ROC-AUC: 0.5394 | Hit@5: 0.9502
Epoch 12/100 | Train Loss: 0.3040 | ROC-AUC: 0.5428 | Hit@5: 0.9535
Epoch 13/100 | Train Loss: 0.3036 | ROC-AUC: 0.5466 | Hit@5: 0.9701
Epoch 14/100 | Train Loss: 0.3034 | ROC-AUC: 0.5496 | Hit@5: 0.9767
Epoch 15/100 | Train Loss: 0.3032 | ROC-AUC: 0.5523 | Hit

In [41]:
# Persist only the learned parameters (the state dict), not the module object itself.
torch.save(obj=model.state_dict(), f="model.pth")

In [42]:
# del model
# del train_loader
# torch.cuda.empty_cache()