In [1]:
# Matrix Factorization Model for Friend Recommendation

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
# -----------------------------
# 1. Load relationship data
# -----------------------------
relationships = pd.read_csv('../data/pokec/train_relationships.txt', sep='\t', names=['user_id', 'friend_id'])
test_edges = pd.read_csv('../data/pokec/test_relationships.txt', sep='\t', names=['user_id', 'friend_id'])


In [4]:
# Create unique user ID map
unique_users = pd.Index(pd.concat([relationships['user_id'], relationships['friend_id']]).unique())
user_to_index = {user: idx for idx, user in enumerate(unique_users)}
index_to_user = {idx: user for user, idx in user_to_index.items()}

relationships['user_idx'] = relationships['user_id'].map(user_to_index)
relationships['friend_idx'] = relationships['friend_id'].map(user_to_index)

n_users = len(user_to_index)

In [6]:
# -----------------------------
# 2. Matrix Factorization Model
# -----------------------------
class MF(nn.Module):
    def __init__(self, num_users, embedding_dim):
        super(MF, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.friend_embedding = nn.Embedding(num_users, embedding_dim)

    def forward(self, user_idx, friend_idx):
        user_vec = self.user_embedding(user_idx)
        friend_vec = self.friend_embedding(friend_idx)
        return (user_vec * friend_vec).sum(1)

In [None]:
# -----------------------------
# 3. Train the Model
# -----------------------------
embedding_dim = 64
model = MF(n_users, embedding_dim)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.BCEWithLogitsLoss()

# Create training data
user_tensor = torch.LongTensor(relationships['user_idx'].values)
friend_tensor = torch.LongTensor(relationships['friend_idx'].values)
labels = torch.FloatTensor([1.0] * len(user_tensor))

# Generate negative samples
neg_samples = []
for u in relationships['user_idx'].unique():
    positives = relationships[relationships['user_idx'] == u]['friend_idx'].values
    negatives = np.setdiff1d(np.arange(n_users), positives)
    sampled = np.random.choice(negatives, size=min(5, len(negatives)), replace=False)
    for f in sampled:
        neg_samples.append((u, f))

neg_user_tensor = torch.LongTensor([x[0] for x in neg_samples])
neg_friend_tensor = torch.LongTensor([x[1] for x in neg_samples])
neg_labels = torch.FloatTensor([0.0] * len(neg_user_tensor))

# Combine
train_user = torch.cat([user_tensor, neg_user_tensor])
train_friend = torch.cat([friend_tensor, neg_friend_tensor])
train_labels = torch.cat([labels, neg_labels])

# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    preds = model(train_user, train_friend)
    loss = criterion(preds, train_labels)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

Epoch 1/5, Loss: 0.6931
Epoch 2/5, Loss: 0.6712
Epoch 3/5, Loss: 0.6489
Epoch 4/5, Loss: 0.6305
Epoch 5/5, Loss: 0.6123


In [None]:
# -----------------------------
# 4. Save Model and Metadata
# -----------------------------
save_path = '../models/saved_models/pokec/'
os.makedirs(save_path, exist_ok=True)

torch.save(model.state_dict(), save_path + 'mf_model.pth')
joblib.dump(user_to_index, save_path + 'user_to_index.pkl')
joblib.dump(index_to_user, save_path + 'index_to_user.pkl')
print(f'✅ Matrix Factorization model and metadata saved to {save_path}')

✅ Matrix Factorization model and metadata saved to ../models/saved_models/pokec/


In [None]:
# -----------------------------
# 5. Evaluation
# -----------------------------
hits = 0
total = 0
N = 10
model.eval()
    
# Filter test set
valid_test = test_edges[test_edges['user_id'].isin(user_to_index) & test_edges['friend_id'].isin(user_to_index)]
print(f"✅ Test relationships loaded: {len(valid_test)}")

for _, row in tqdm(valid_test.iterrows(), total=len(valid_test), desc="🔎 Evaluating (Matrix Factorization)"):
    u = user_to_index[row['user_id']]
    actual_friend = user_to_index[row['friend_id']]

    # Predict scores with all other users
    user_vec = model.user_embedding(torch.LongTensor([u]))
    all_friends = model.friend_embedding(torch.arange(n_users))
    scores = (user_vec * all_friends).sum(1)

    top_n = torch.topk(scores, N + 1).indices.tolist()
    top_n = [i for i in top_n if i != u][:N]  # remove self if present

    if actual_friend in top_n:
        hits += 1
    total += 1

if total > 0:
    precision_at_n = hits / (total * N)
    recall_at_n = hits / total
    print(f'\n✅ Matrix Factorization Model Results:')
    print(f'Precision@{N}: {precision_at_n:.4f}')
    print(f'Recall@{N}: {recall_at_n:.4f}')
else:
    print("⚠️ No valid test edges found for evaluation.")


✅ Test relationships loaded: 789669
🔎 Evaluating (Matrix Factorization): 100%|████████████████████████████████████████| 789669/789669 [07:09<00:00, 1839.02it/s]

✅ Matrix Factorization Model Results:
Precision@10: 0.0179
Recall@10: 0.0964
