In [5]:
# Cosine Similarity Model for Friend Recommendation

import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# -----------------------------
# 1. Load Data
# -----------------------------
profiles = pd.read_csv('../data/pokec/profiles.csv')
relationships = pd.read_csv('../data/pokec/relationships.txt', sep='\t', names=['user_id', 'friend_id'])
test_edges = pd.read_csv('../data/pokec/test_relationships.txt', sep='\t', names=['user_id', 'friend_id'])



In [3]:
# -----------------------------
# 2. Select and Clean Features
# -----------------------------
selected_cols = [
    'user_id', 'age', 'height', 'weight', 'gender', 'region',
    'hobbies', 'favorite_movie_genres', 'music_genres'
]
df = profiles[selected_cols].dropna()

In [4]:

# Scale numeric features
num_scaler = StandardScaler()
num_scaled = num_scaler.fit_transform(df[['age', 'height', 'weight']])


In [5]:
# Encode categorical features
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_encoded = ohe.fit_transform(df[['gender', 'region']])

In [6]:
# Vectorize textual features
vectorizer = TfidfVectorizer(max_features=50)
hobbies_vec = vectorizer.fit_transform(df['hobbies'].astype(str)).toarray()
movies_vec = vectorizer.fit_transform(df['favorite_movie_genres'].astype(str)).toarray()
music_vec = vectorizer.fit_transform(df['music_genres'].astype(str)).toarray()

In [7]:
# Combine all features
X = np.hstack([num_scaled, cat_encoded, hobbies_vec, movies_vec, music_vec])
user_ids = df['user_id'].values
user_id_to_index = {uid: idx for idx, uid in enumerate(user_ids)}

In [1]:
# -----------------------------
# 3. Recommend Top-N Friends (Efficient)
# -----------------------------
def recommend_top_n(user_idx, X, user_ids, n=10):
    user_vector = X[user_idx].reshape(1, -1)
    similarities = cosine_similarity(user_vector, X)[0]
    top_indices = np.argsort(similarities)[::-1]
    top_indices = [i for i in top_indices if i != user_idx][:n]
    return user_ids[top_indices]

In [None]:
from tqdm import tqdm
import pandas as pd

# -----------------------------
# 4. Evaluate on Test Set (Full Evaluation)
# -----------------------------

# Filter test edges for valid users
test_edges = test_edges[test_edges['user_id'].isin(user_id_to_index)]
test_edges = test_edges[test_edges['friend_id'].isin(user_id_to_index)]

# Log filtered size
print(f"✅ Test relationships loaded: {len(test_edges)}")
print(f"✅ Users in features: {len(user_ids)}")

# 🚫 No sampling — evaluate on full test set
print(f"🔁 Using full test set of {len(test_edges)} edges for evaluation.")

# Setup metrics
hits = 0
total = 0
N = 10  # Top-N friends to recommend

# Progress bar evaluation loop
for _, row in tqdm(test_edges.iterrows(), total=len(test_edges), desc="🔎 Evaluating"):
    user = row['user_id']
    actual_friend = row['friend_id']

    if user not in user_id_to_index or actual_friend not in user_id_to_index:
        continue

    uidx = user_id_to_index[user]
    recommended = recommend_top_n(uidx, X, user_ids, n=N)

    if actual_friend in recommended:
        hits += 1
    total += 1

# Final metrics
if total > 0:
    precision_at_n = hits / (total * N)
    recall_at_n = hits / total

    print(f'\n✅ Cosine Similarity Model Results:')
    print(f'Precision@{N}: {precision_at_n:.4f}')
    print(f'Recall@{N}: {recall_at_n:.4f}')
else:
    print("⚠️ No valid test edges found. Precision and Recall cannot be computed.")

✅ Test relationships loaded: 789669
✅ Users in features: 427990
🔁 Using full test set of 789669 edges for evaluation.
🔎 Evaluating: 100%|████████████████████████████████████████| 789669/789669 [06:11<00:00, 2123.45it/s]

✅ Cosine Similarity Model Results:
Precision@10: 0.0058
Recall@10: 0.0612


In [None]:
# -----------------------------
# 5. Save the Model Components
# -----------------------------
save_path = '../models/saved_models/pokec/'
os.makedirs(save_path, exist_ok=True)

np.save(save_path + 'cosine_features.npy', X)
joblib.dump(user_ids, save_path + 'user_ids.pkl')
joblib.dump(num_scaler, save_path + 'scaler.pkl')
joblib.dump(ohe, save_path + 'encoder.pkl')
joblib.dump(vectorizer, save_path + 'tfidf_vectorizer.pkl')

print(f'✅ Model and assets saved to {save_path}')

✅ Model and assets saved to ../models/saved_models/pokec/
