In [1]:
# KNN Model for Friend Recommendation

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import precision_score, recall_score
import joblib
import os
from tqdm import tqdm

In [2]:
# -----------------------------
# 1. Load Data
# -----------------------------
profiles = pd.read_csv('../data/pokec/profiles.csv')
test_edges = pd.read_csv('../data/pokec/test_relationships.txt', sep='\t', names=['user_id', 'friend_id'])


In [3]:
# -----------------------------
# 2. Select and Clean Features
# -----------------------------
selected_cols = [
    'user_id', 'age', 'height', 'weight', 'gender', 'region',
    'hobbies', 'favorite_movie_genres', 'music_genres'
]
df = profiles[selected_cols]
df = df[df['age'].notna() & df['gender'].notna() & df['region'].notna()]  # Essential features


In [4]:
from scipy import sparse

# Numeric features
scaler = StandardScaler()
numeric = scaler.fit_transform(df[['age', 'height', 'weight']].fillna(0))
numeric_sparse = sparse.csr_matrix(numeric)

# Categorical features
encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
categorical = encoder.fit_transform(df[['gender', 'region']])

# Textual features (already sparse)
vectorizer = TfidfVectorizer(max_features=50)
hobbies_vec = vectorizer.fit_transform(df['hobbies'].fillna('').astype(str))
movies_vec = vectorizer.fit_transform(df['favorite_movie_genres'].fillna('').astype(str))
music_vec = vectorizer.fit_transform(df['music_genres'].fillna('').astype(str))

# Combine all as sparse matrix
X = sparse.hstack([
    numeric_sparse,
    categorical,
    hobbies_vec,
    movies_vec,
    music_vec
]).tocsr()

# Final mapping
user_ids = df['user_id'].values
user_id_to_index = {uid: idx for idx, uid in enumerate(user_ids)}


In [5]:
# -----------------------------
# 3. Train KNN Model
# -----------------------------
k = 11  # 10 neighbors + self
knn = NearestNeighbors(n_neighbors=k, metric='cosine')
knn.fit(X)

print(f"✅ KNN model trained with k={k}")


✅ KNN model trained with k=11


In [1]:
# -----------------------------
# 4. Recommend Top-N Friends
# -----------------------------
def recommend_knn(user_idx, X, user_ids, n=10):
    distances, indices = knn.kneighbors(X[user_idx].reshape(1, -1))
    neighbors = indices[0][1:]  # Exclude the user itself
    return user_ids[neighbors[:n]]


In [None]:
from tqdm import tqdm

# -----------------------------
# 5. Evaluate on Full Test Set (KNN)
# -----------------------------

# Filter test edges to include only valid users
test_edges = test_edges[test_edges['user_id'].isin(user_id_to_index)]
test_edges = test_edges[test_edges['friend_id'].isin(user_id_to_index)]

# ✅ No sampling — use the entire test set
print(f"🔁 Using full test set of {len(test_edges)} edges for evaluation.")

hits = 0
total = 0
N = 10  # Top-N recommendations

# Progress bar for the loop
for _, row in tqdm(test_edges.iterrows(), total=len(test_edges), desc="🔎 Evaluating (KNN - full set)"):
    user = row['user_id']
    friend = row['friend_id']

    if user not in user_id_to_index or friend not in user_id_to_index:
        continue

    uidx = user_id_to_index[user]
    recommended = recommend_knn(uidx, X, user_ids, n=N)

    if friend in recommended:
        hits += 1
    total += 1

if total > 0:
    precision_at_n = hits / (total * N)
    recall_at_n = hits / total
    print(f'\n✅ KNN Model Results:')
    print(f'Precision@{N}: {precision_at_n:.4f}')
    print(f'Recall@{N}: {recall_at_n:.4f}')
else:
    print("⚠️ No valid test edges found for evaluation.")

🔁 Using full test set of 789669 edges for evaluation.
🔎 Evaluating (KNN - full set): 100%|████████████████████████████████████████| 789669/789669 [12:10<00:00, 1081.52it/s]

✅ KNN Model Results:
Precision@10: 0.0117
Recall@10: 0.0521


In [None]:
# -----------------------------
# 6. Save Model and Components
# -----------------------------
save_path = '../models/saved_models/pokec/'
os.makedirs(save_path, exist_ok=True)

joblib.dump(knn, save_path + 'knn_model.pkl')
np.save(save_path + 'knn_features.npy', X)
joblib.dump(user_ids, save_path + 'user_ids.pkl')
joblib.dump(scaler, save_path + 'scaler.pkl')
joblib.dump(encoder, save_path + 'encoder.pkl')
joblib.dump(vectorizer, save_path + 'tfidf_vectorizer.pkl')

print(f'✅ KNN model and features saved to {save_path}')


✅ KNN model and features saved to ../models/saved_models/pokec/
