In [3]:
# KMeans Clustering for Friend Recommendation

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import precision_score, recall_score
import joblib
import os

In [4]:
# -----------------------------
# 1. Load Data
# -----------------------------
profiles = pd.read_csv('../data/pokec/profiles.csv')
test_edges = pd.read_csv('../data/pokec/test_relationships.txt', sep='\t', names=['user_id', 'friend_id'])


In [5]:
# -----------------------------
# 2. Select and Clean Features
# -----------------------------
selected_cols = [
    'user_id', 'age', 'height', 'weight', 'gender', 'region',
    'hobbies', 'favorite_movie_genres', 'music_genres'
]
df = profiles[selected_cols]
df = df[df['age'].notna() & df['gender'].notna() & df['region'].notna()]  # Keep essential fields


In [6]:
# Scale numeric features
scaler = StandardScaler()
numeric = scaler.fit_transform(df[['age', 'height', 'weight']].fillna(0))


In [7]:
# One-hot categorical
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
categorical = encoder.fit_transform(df[['gender', 'region']])


In [8]:
# TF-IDF text
vectorizer = TfidfVectorizer(max_features=50)
hobbies_vec = vectorizer.fit_transform(df['hobbies'].fillna('').astype(str)).toarray()
movies_vec = vectorizer.fit_transform(df['favorite_movie_genres'].fillna('').astype(str)).toarray()
music_vec = vectorizer.fit_transform(df['music_genres'].fillna('').astype(str)).toarray()


In [9]:
# Combine all features
X = np.hstack([numeric, categorical, hobbies_vec, movies_vec, music_vec])
user_ids = df['user_id'].values
user_id_to_index = {uid: idx for idx, uid in enumerate(user_ids)}


In [10]:
# -----------------------------
# 3. Train KMeans Model
# -----------------------------
n_clusters = 200
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X)
labels = kmeans.labels_

print(f"✅ KMeans clustering completed. {n_clusters} clusters formed.")


✅ KMeans clustering completed. 200 clusters formed.


In [11]:
# -----------------------------
# 4. Recommend Users from Same Cluster
# -----------------------------
def recommend_from_cluster(user_idx, n=10):
    user_cluster = labels[user_idx]
    cluster_indices = np.where(labels == user_cluster)[0]
    similar_indices = [i for i in cluster_indices if i != user_idx]
    top_indices = np.random.choice(similar_indices, min(n, len(similar_indices)), replace=False)
    return user_ids[top_indices]

In [None]:
from tqdm import tqdm

# -----------------------------
# 5. Evaluate on Full Test Set
# -----------------------------

# Filter test edges to include only valid user IDs
test_edges = test_edges[test_edges['user_id'].isin(user_id_to_index)]
test_edges = test_edges[test_edges['friend_id'].isin(user_id_to_index)]

print(f"✅ Test relationships loaded: {len(test_edges)}")
print(f"✅ Users in features: {len(user_ids)}")

hits = 0
total = 0
N = 10  # top-N recommendations

# Evaluate with full loop and progress bar
for _, row in tqdm(test_edges.iterrows(), total=len(test_edges), desc="🔎 Evaluating (KMeans - full set)"):
    user = row['user_id']
    friend = row['friend_id']

    if user not in user_id_to_index or friend not in user_id_to_index:
        continue

    uidx = user_id_to_index[user]
    recommended = recommend_from_cluster(uidx, n=N)

    if friend in recommended:
        hits += 1
    total += 1

# -----------------------------
# Final metrics
# -----------------------------
if total > 0:
    precision_at_n = hits / (total * N)
    recall_at_n = hits / total
    print(f'\n✅ KMeans Model Results (Full Test Set):')
    print(f'Precision@{N}: {precision_at_n:.4f}')
    print(f'Recall@{N}: {recall_at_n:.4f}')
else:
    print("⚠️ No valid test edges found for evaluation.")

✅ Test relationships loaded: 789669
✅ Users in features: 427990
🔎 Evaluating (KMeans - full set): 100%|████████████████████████████████████████| 789669/789669 [26:09<00:00, 502.20it/s]

✅ KMeans Model Results (Full Test Set):
Precision@10: 0.0083
Recall@10: 0.0297


In [4]:
# -----------------------------
# 6. Save Model and Components
# -----------------------------
save_path = '../models/saved_models/pokec/'
# os.makedirs(save_path, exist_ok=True)

# joblib.dump(kmeans, save_path + 'kmeans_model.pkl')
# np.save(save_path + 'kmeans_features.npy', X)
# joblib.dump(user_ids, save_path + 'user_ids.pkl')
# joblib.dump(scaler, save_path + 'scaler.pkl')
# joblib.dump(encoder, save_path + 'encoder.pkl')
# joblib.dump(vectorizer, save_path + 'tfidf_vectorizer.pkl')

print(f'✅ Model and features saved to {save_path}')

✅ Model and features saved to ../models/saved_models/pokec/
