#No LLM-enhanced Movie Recommendation System

**Our Goal**

1. Build a baseline Colloborative Filtering
2. Include Knowledge Graph + Semantic search on graph using FAISS
3. Build a Graph based on user-item and use retreival from Graph

All these are effectively carried out without real LLM just to study/understand the efficiency of graph embeddings. We plan to use LLM retreival later. This is a POC as experimenting with LLMs anything >1B is extememely resource intensive

**I hope if the above results are obtained as per expectation then we can expect the same upon including LLM**

**Let us start with baseline CF and later add KG**

**Push all required files to folder ml10k**

In [1]:
import os
import shutil

# Create the destination folder
os.makedirs("ml-100k", exist_ok=True)

# Move all files (except folders and notebooks) to the ml-100k folder
for file in os.listdir():
    if os.path.isfile(file) and file != "ml-100k":
        shutil.move(file, os.path.join("ml-100k", file))


**CF + LLM+KG**

In [2]:
import pandas as pd
from collections import defaultdict
from math import sqrt
import random

# Load the ratings data (assuming 'u.data' is available in the current directory)
# The u.data file has columns: user_id, item_id, rating, timestamp, separated by tabs.
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])

# Shuffle the data to ensure randomness (especially if splitting globally; we will split per user here)
ratings = ratings.sample(frac=1, random_state=42).reset_index(drop=True)

# Split 80/20 for each user:
train_list = []
test_list = []
random.seed(42)
user_groups = ratings.groupby('user_id')
for user, group in user_groups:
    n_ratings = len(group)
    # Determine number of test items (20% of this user's ratings, at least 1)
    n_test = max(1, int(0.2 * n_ratings))
    # Randomly sample test indices for this user
    test_idx = set(random.sample(list(group.index), n_test))
    for idx, row in group.iterrows():
        if idx in test_idx:
            test_list.append(row)
        else:
            train_list.append(row)

train_df = pd.DataFrame(train_list).reset_index(drop=True)
test_df  = pd.DataFrame(test_list).reset_index(drop=True)

print(f"Total ratings: {len(ratings)}")
print(f"Training set size: {len(train_df)}, Testing set size: {len(test_df)}")

# Quick sanity check: ensure no user has zero train or test ratings
train_counts = train_df['user_id'].value_counts()
test_counts = test_df['user_id'].value_counts()
users_no_train = [u for u in range(1,944) if u not in train_counts.index]
users_no_test  = [u for u in range(1,944) if u not in test_counts.index]
print(f"Users with no train ratings: {users_no_train}")
print(f"Users with no test ratings: {users_no_test}")


Total ratings: 100000
Training set size: 80367, Testing set size: 19633
Users with no train ratings: []
Users with no test ratings: []


**Collaborative Filtering Baseline (User-Based CF)**

In [3]:
# Compute user mean ratings in training set for normalization
user_mean = train_df.groupby('user_id')['rating'].mean().to_dict()

# Build a dictionary of user -> {item: rating_diff} for fast lookup (rating_diff = rating - user_mean)
user_ratings = defaultdict(dict)
for _, row in train_df.iterrows():
    u, i, r = int(row['user_id']), int(row['item_id']), row['rating']
    user_ratings[u][i] = r - user_mean[u]

# Precompute denominator (norm) for each user's rating vector (for Pearson correlation)
user_norm = {}
for u, items in user_ratings.items():
    # Norm is sqrt of sum of squared deviations
    norm = sqrt(sum((diff)**2 for diff in items.values()))
    user_norm[u] = norm

# Function to compute Pearson similarity between two users
def user_similarity(u, v):
    if u not in user_ratings or v not in user_ratings:
        return 0.0
    # Find common items both u and v rated
    common_items = set(user_ratings[u].keys()) & set(user_ratings[v].keys())
    if len(common_items) == 0:
        return 0.0
    # Sum of product of rating diffs for common items
    num = sum(user_ratings[u][i] * user_ratings[v][i] for i in common_items)
    den = user_norm[u] * user_norm[v]
    return num/den if den != 0 else 0.0

# Compute top-N neighbors for each user (to limit computation during predictions)
N = 20  # number of neighbors to use
neighbors = {}
for u in user_ratings:
    # Compute similarity of u with every other user
    sims = []
    for v in user_ratings:
        if u == v:
            continue
        sim_uv = user_similarity(u, v)
        if sim_uv > 0:  # only consider positive similarities for neighbors
            sims.append((sim_uv, v))
    # Sort by similarity and take top N
    sims.sort(reverse=True, key=lambda x: x[0])
    neighbors[u] = sims[:N]

# Function to get top-K recommendations for a user using user-based CF
def recommend_cf(user, K=5):
    # Items the user already rated in train
    seen_items = set(train_df[train_df['user_id'] == user]['item_id'])
    # Predict scores for unseen items
    scores = {}
    for sim, v in neighbors.get(user, []):
        for item, rating_diff in user_ratings[v].items():
            if item in seen_items:
                continue
            # Neighbor v's predicted contribution = neighbor's mean + diff, but we will add diff then add our user's mean later
            # Accumulate weighted sum of (neighbor's rating diff) * sim
            if item not in scores:
                scores[item] = {'num': 0.0, 'den': 0.0}
            scores[item]['num'] += sim * rating_diff
            scores[item]['den'] += abs(sim)
    # Compute final predicted rating = user_mean + weighted sum / sum of weights
    preds = []
    for item, agg in scores.items():
        if agg['den'] == 0:
            pred_rating = user_mean.get(user, 0)  # if no neighbor info, use user mean
        else:
            pred_rating = user_mean.get(user, 0) + agg['num'] / agg['den']
        preds.append((pred_rating, item))
    # Sort predictions by predicted rating descending
    preds.sort(reverse=True, key=lambda x: x[0])
    # Return top K item IDs
    return [item for _, item in preds[:K]]

# Example: get top-5 recommendations for user 1 using CF
print("User 1 CF recommendations:", recommend_cf(1, K=5))


User 1 CF recommendations: [646, 750, 1495, 317, 408]


**Preparing Genre List for KG**

In [4]:
# Load movie genre data (u.item) to build a knowledge graph of movies by genre
genre_list = ["Unknown","Action","Adventure","Animation","Children's","Comedy","Crime",
              "Documentary","Drama","Fantasy","Film-Noir","Horror","Musical",
              "Mystery","Romance","Sci-Fi","Thriller","War","Western"]

# Read u.item (movie metadata). This file is '|' delimited.
# Columns: movie_id | title | release_date | video_release_date | IMDb_url | genre_flags...
movies_df = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1', header=None)
movies_df.columns = ['movie_id','title','release_date','video_release','IMDb_url'] + genre_list

# Build dictionary: movie_id -> set of genres for that movie
movie_genres = {}
for _, row in movies_df.iterrows():
    mid = int(row['movie_id'])
    # Collect all genres where value is 1
    genres = {genre for genre in genre_list if row[genre] == 1}
    movie_genres[mid] = genres

# Precompute popularity (or average rating) for tie-breaking: use training data
movie_train_ratings = train_df.groupby('item_id')['rating'].agg(['count','mean']).reset_index()
movie_popularity = movie_train_ratings.set_index('item_id')['count'].to_dict()  # could use 'mean' instead

# Knowledge-Graph-based recommendation for a user:
def recommend_kg(user, K=5):
    # Identify genres liked by the user (from training data, rating >=4)
    liked_genres = set()
    user_train_ratings = train_df[train_df['user_id'] == user]
    for _, row in user_train_ratings.iterrows():
        if row['rating'] >= 4:
            mid = int(row['item_id'])
            liked_genres |= movie_genres.get(mid, set())
    # If no liked genres (user rated nothing >=4), we default to recommending popular movies
    if not liked_genres:
        popular_items = movie_train_ratings.sort_values('count', ascending=False)['item_id'].tolist()
        return [item for item in popular_items if item not in user_train_ratings['item_id'].values][:K]
    # Gather candidate movies that have at least one of these genres and that user hasn't seen
    seen = set(user_train_ratings['item_id'])
    candidates = []
    for mid, genres in movie_genres.items():
        if mid in seen:
            continue
        overlap = len(genres & liked_genres)
        if overlap > 0:
            # Use overlap count as primary score, with popularity as secondary
            pop = movie_popularity.get(mid, 0)
            candidates.append((overlap, pop, mid))
    # Sort candidates: first by overlap (descending), then by popularity (descending)
    candidates.sort(reverse=True, key=lambda x: (x[0], x[1]))
    # Return top K movie IDs
    return [mid for _, _, mid in candidates[:K]]

# Example: get top-5 recommendations for user 1 using LLM+KG simulation
print("User 1 KG recommendations:", recommend_kg(1, K=5))


User 1 KG recommendations: [426, 560, 820, 755, 21]


**Final Eval**

In [5]:
import math

In [6]:
# Prepare a dictionary of test relevant items for each user (items in test with rating >= 4)
test_relevant = defaultdict(set)
for _, row in test_df.iterrows():
    u = int(row['user_id']); i = int(row['item_id']); r = row['rating']
    if r >= 4:
        test_relevant[u].add(i)

# Helper functions for metrics
def precision_at_k(recommended_items, relevant_items, k):
    if k == 0:
        return 0.0
    recommended_items = recommended_items[:k]
    # Count how many of the recommended items are relevant
    hits = sum(1 for item in recommended_items if item in relevant_items)
    return hits / k

def recall_at_k(recommended_items, relevant_items, k):
    if len(relevant_items) == 0:
        return None  # No relevant items to recall (we'll handle this case separately)
    recommended_items = recommended_items[:k]
    hits = sum(1 for item in recommended_items if item in relevant_items)
    return hits / len(relevant_items)

def ndcg_at_k(recommended_items, relevant_items, k):
    recommended_items = recommended_items[:k]
    DCG = 0.0
    for rank, item in enumerate(recommended_items, start=1):
        if item in relevant_items:
            # relevance = 1 if relevant
            DCG += 1.0 / (math.log2(rank + 1))
    # Compute ideal DCG (IDCG) for up to k relevant items
    # In the ideal ranking, all relevant items would be ranked first (up to k)
    relevant_count = min(len(relevant_items), k)
    IDCG = 0.0
    for rank in range(1, relevant_count + 1):
        IDCG += 1.0 / (math.log2(rank + 1))
    return DCG / IDCG if IDCG > 0 else 0.0

# Evaluate both models for K=5 and K=10
Ks = [5, 10]
metrics = { 'Model': [], 'Precision@5': [], 'Recall@5': [], 'NDCG@5': [],
            'Precision@10': [], 'Recall@10': [], 'NDCG@10': [] }

# Iterate over models
for model_name, recommend_func in [("Collaborative Filtering", recommend_cf),
                                   ("KG (Genres)", recommend_kg)]:
    # accumulate metrics for each user
    prec_k = {5: [], 10: []}
    rec_k  = {5: [], 10: []}
    ndcg_k = {5: [], 10: []}
    for user in train_df['user_id'].unique():  # evaluate for all users present in train (which is all users)
        rel_items = test_relevant.get(user, set())
        # Generate recommendations for this user
        recs5 = recommend_func(user, K=5)
        recs10 = recommend_func(user, K=10)
        # Precision and NDCG can be calculated for all; recall only if user has relevant items
        p5 = precision_at_k(recs5, rel_items, 5)
        p10 = precision_at_k(recs10, rel_items, 10)
        ndcg5 = ndcg_at_k(recs5, rel_items, 5)
        ndcg10 = ndcg_at_k(recs10, rel_items, 10)
        # Only compute recall if relevant items exist for user (skip otherwise to avoid division by zero)
        if len(rel_items) > 0:
            r5 = recall_at_k(recs5, rel_items, 5)
            r10 = recall_at_k(recs10, rel_items, 10)
        else:
            r5 = None
            r10 = None
        # Collect metrics
        prec_k[5].append(p5); prec_k[10].append(p10)
        ndcg_k[5].append(ndcg5); ndcg_k[10].append(ndcg10)
        if r5 is not None:  # only include if not None
            rec_k[5].append(r5)
        if r10 is not None:
            rec_k[10].append(r10)
    # Calculate average metrics (treat missing recall as 0 or skip? Here we skipped None, so we average over users with relevant items)
    avg_p5 = sum(prec_k[5]) / len(prec_k[5])
    avg_p10 = sum(prec_k[10]) / len(prec_k[10])
    avg_ndcg5 = sum(ndcg_k[5]) / len(ndcg_k[5])
    avg_ndcg10 = sum(ndcg_k[10]) / len(ndcg_k[10])
    avg_r5 = sum(rec_k[5]) / len(rec_k[5]) if rec_k[5] else 0.0
    avg_r10 = sum(rec_k[10]) / len(rec_k[10]) if rec_k[10] else 0.0

    metrics['Model'].append(model_name)
    metrics['Precision@5'].append(round(avg_p5, 4))
    metrics['Recall@5'].append(round(avg_r5, 4))
    metrics['NDCG@5'].append(round(avg_ndcg5, 4))
    metrics['Precision@10'].append(round(avg_p10, 4))
    metrics['Recall@10'].append(round(avg_r10, 4))
    metrics['NDCG@10'].append(round(avg_ndcg10, 4))

# Convert metrics to DataFrame for a nice display
metrics_df = pd.DataFrame(metrics)
metrics_df


Unnamed: 0,Model,Precision@5,Recall@5,NDCG@5,Precision@10,Recall@10,NDCG@10
0,Collaborative Filtering,0.0115,0.0057,0.0111,0.0138,0.0141,0.0149
1,KG (Genres),0.0549,0.0306,0.063,0.0469,0.0495,0.0622


**Now let us use embeddings ---> user-item graph for retreival**

**Data Loading again! as the format is different now**

In [7]:
import pandas as pd

def load_movielens100k(train_file, test_file, item_file):
    """Load ratings and movie genres from MovieLens 100K files."""
    # Load movie genre information
    genre_names = []
    movie_genres = {}
    # Read genre names from u.genre
    with open(item_file.replace('u.item', 'u.genre'), 'r') as fg:
        genre_names = [line.split('|')[0] for line in fg if line.strip()]
    # Read movie info from u.item
    with open(item_file, 'r', encoding='latin-1') as f:
        for line in f:
            parts = line.strip().split('|')
            if len(parts) < 6:
                continue
            movie_id = int(parts[0])
            genre_flags = list(map(int, parts[5:5+19]))
            # Map genre flags to genre names
            genres = [genre_names[i] for i, flag in enumerate(genre_flags) if flag == 1]
            movie_genres[movie_id] = genres

    # Load train and test ratings into dictionaries
    def load_ratings(file_path):
        ratings = {}
        with open(file_path, 'r') as f:
            for line in f:
                user, item, rating, _ = line.split()
                user, item, rating = int(user), int(item), int(rating)
                ratings.setdefault(user, {})[item] = rating
        return ratings

    train_ratings = load_ratings(train_file)
    test_ratings  = load_ratings(test_file)
    return train_ratings, test_ratings, movie_genres

# Load the data (assuming the files are available in the current directory)
train_ratings, test_ratings, movie_genres = load_movielens100k('ml-100k/u1.base',
                                                               'ml-100k/u1.test',
                                                               'ml-100k/u.item')
print(f"Loaded train users: {len(train_ratings)}, test users: {len(test_ratings)}")
print(f"Example movie genres: movie 1 -> {movie_genres[1]}")


Loaded train users: 943, test users: 459
Example movie genres: movie 1 -> ['Animation', "Children's", 'Comedy']


**Building User Taste Profiles**

In [8]:
def build_user_profiles(train_ratings, movie_genres):
    """Build a taste profile of preferred genres for each user based on top-rated movies."""
    user_profiles = {}
    for user, item_dict in train_ratings.items():
        # Determine the rating threshold for "top-rated" (use 5 and 4 as top ratings)
        max_rating = max(item_dict.values())
        threshold = max(4, max_rating)  # use 4 if user gave no 5s
        # Collect the user's favorite movies (ratings >= threshold)
        favorite_items = [item for item, r in item_dict.items() if r >= threshold]
        # Count genre frequencies from favorite movies
        genre_counts = {}
        for item in favorite_items:
            for genre in movie_genres.get(item, []):
                genre_counts[genre] = genre_counts.get(genre, 0) + 1
        user_profiles[user] = genre_counts
    return user_profiles

user_profiles = build_user_profiles(train_ratings, movie_genres)
# Example: print profile for user 1
print(f"User 1 profile: {user_profiles[1]}")


User 1 profile: {'Animation': 2, "Children's": 1, 'Comedy': 15, 'Drama': 23, 'Romance': 12, 'Documentary': 4, 'Action': 8, 'Adventure': 4, 'Sci-Fi': 10, 'War': 4, 'Crime': 2, 'Thriller': 6, 'Film-Noir': 1, 'Mystery': 2}


**Constructing the Metadata Graph**

In [9]:
import networkx as nx

def build_metadata_graph(movie_genres):
    """Build an undirected bipartite graph linking movies to genre nodes."""
    G = nx.Graph()
    for movie, genres in movie_genres.items():
        movie_node = f"movie_{movie}"
        G.add_node(movie_node, type='movie', movie_id=movie)
        for genre in genres:
            genre_node = f"genre_{genre}"
            G.add_node(genre_node, type='genre', name=genre)
            # Connect movie to each of its genres
            G.add_edge(movie_node, genre_node)
    return G

G = build_metadata_graph(movie_genres)
print(f"Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")


Graph has 1701 nodes and 2893 edges.


**Generating Candidate Movies via Graph Traversal**

In [10]:
def get_candidate_movies(user_profile, G):
    """Retrieve candidate movies from the graph based on a user's preferred genres."""
    candidates = set()
    for genre, weight in user_profile.items():
        genre_node = f"genre_{genre}"
        if G.has_node(genre_node):
            # Add all movies connected to this genre node
            for neighbor in G.neighbors(genre_node):
                if G.nodes[neighbor].get('type') == 'movie':
                    candidates.add(G.nodes[neighbor]['movie_id'])
    return candidates

# Example: get candidates for user 1
candidates_user1 = get_candidate_movies(user_profiles[1], G)
print(f"User 1 candidate movies (count): {len(candidates_user1)}")


User 1 candidate movies (count): 1618


**Rank the candidates**

In [11]:
def rank_candidates_for_user(user_id, candidates, user_profile, movie_genres, train_ratings):
    """Score and rank candidate movies for a given user."""
    # Precompute popularity (number of ratings in training) for each movie
    popularity = {}
    for u, items in train_ratings.items():
        for item in items:
            popularity[item] = popularity.get(item, 0) + 1

    scored = []
    for movie in candidates:
        if movie in train_ratings.get(user_id, {}):
            continue  # skip movies user already rated
        # Score based on genre overlap
        score = 0
        for genre in movie_genres.get(movie, []):
            if genre in user_profile:
                score += user_profile[genre]  # add weight for each preferred genre
        # Add a small factor for popularity
        score += 0.001 * popularity.get(movie, 0)
        scored.append((movie, score))
    # Sort candidates by score descending
    scored.sort(key=lambda x: x[1], reverse=True)
    return [movie for movie, score in scored]

# Example: rank top 5 candidates for user 1
ranked_user1 = rank_candidates_for_user(1, candidates_user1, user_profiles[1], movie_genres, train_ratings)
print("Top 5 recommendations for User 1:", ranked_user1[:5])


Top 5 recommendations for User 1: [855, 692, 170, 517, 778]


**Top k recommendations for all users**

In [12]:
def recommend_top_k(user_profiles, G, movie_genres, train_ratings, K=10):
    """Generate top-K recommended movies for each user."""
    recommendations = {}
    for user, profile in user_profiles.items():
        candidates = get_candidate_movies(profile, G)
        ranked_list = rank_candidates_for_user(user, candidates, profile, movie_genres, train_ratings)
        recommendations[user] = ranked_list[:K]
    return recommendations

# Get top-10 recommendations for all users
recommendations_10 = recommend_top_k(user_profiles, G, movie_genres, train_ratings, K=10)
# (For evaluation, we'll also get top-5 lists)
recommendations_5 = recommend_top_k(user_profiles, G, movie_genres, train_ratings, K=5)


**Final Eval Now!**

In [13]:
import math

def evaluate_recommendations(recommendations, test_ratings, K):
    total_precision = total_recall = total_ndcg = 0.0
    user_count = 0
    for user, rec_list in recommendations.items():
        if user not in test_ratings:
            continue
        # Relevant test items for this user (rating >= 4)
        relevant = {item for item, r in test_ratings[user].items() if r >= 4}
        if not relevant:
            continue  # skip users with no relevant items in test
        # Compute Precision@K and Recall@K
        hits = 0
        dcg = 0.0
        for rank, item in enumerate(rec_list[:K], start=1):
            if item in relevant:
                hits += 1
                dcg += 1.0 / math.log2(rank + 1)  # gain discounted by log(rank)
        ideal_hits = min(len(relevant), K)
        # Ideal DCG (all top-K are relevant)
        idcg = sum(1.0 / math.log2(r + 1) for r in range(1, ideal_hits+1))
        precision = hits / K
        recall    = hits / len(relevant)
        ndcg      = dcg / idcg if idcg > 0 else 0.0
        total_precision += precision
        total_recall    += recall
        total_ndcg      += ndcg
        user_count += 1
    # Average metrics across users
    return (total_precision / user_count,
            total_recall / user_count,
            total_ndcg / user_count)

# Evaluate for K=5 and K=10
p5, r5, ndcg5   = evaluate_recommendations(recommendations_5, test_ratings, K=5)
p10, r10, ndcg10 = evaluate_recommendations(recommendations_10, test_ratings, K=10)
#print(f"Precision@5 = {p5:.3f}, Recall@5 = {r5:.3f}, NDCG@5 = {ndcg5:.3f}")
#print(f"Precision@10 = {p10:.3f}, Recall@10 = {r10:.3f}, NDCG@10 = {ndcg10:.3f}")


**Final Results**

In [14]:
import pandas as pd

# Assuming metrics_df is already defined with the two rows
new_row = {
    'Model': 'Graph Retrieval',
    'Precision@5': p5,
    'Recall@5': r5,
    'NDCG@5': ndcg5,
    'Precision@10':p10,
    'Recall@10': r10,
    'NDCG@10': ndcg10
}

# Append the new row
#metrics_df = metrics_df.concat(new_row, ignore_index=True)

metrics_df=pd.concat([metrics_df, pd.DataFrame([new_row])], ignore_index=True)

# View the updated DataFrame
metrics_df


Unnamed: 0,Model,Precision@5,Recall@5,NDCG@5,Precision@10,Recall@10,NDCG@10
0,Collaborative Filtering,0.0115,0.0057,0.0111,0.0138,0.0141,0.0149
1,KG (Genres),0.0549,0.0306,0.063,0.0469,0.0495,0.0622
2,Graph Retrieval,0.098246,0.026714,0.114434,0.077412,0.041151,0.100099


**Generating Item Embeddings with a Pre-trained MiniLM**

In [15]:
!pip install -q sentence_transformers faiss-cpu

from sentence_transformers import SentenceTransformer
import numpy as np
import torch

# Use GPU if available for faster encoding
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Load a lightweight pre-trained model for sentence embeddings (MiniLM)
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(model_name, device=device)

# Prepare textual descriptions for each movie: "Title, Genre1, Genre2, ..."
id_to_title = dict(zip(movies_df['movie_id'], movies_df['title']))
descriptions = []
movie_ids = sorted(movie_genres.keys())  # all movie IDs present
for mid in movie_ids:
    title = id_to_title.get(mid, "")
    genres = ", ".join(sorted(movie_genres.get(mid, [])))
    desc = title
    if genres:
        desc += ", " + genres
    descriptions.append(desc)

print(f"Encoding {len(descriptions)} movie descriptions into embeddings...")
item_emb_matrix = model.encode(descriptions, batch_size=64, show_progress_bar=True)
item_emb_matrix = np.array(item_emb_matrix)
item_emb_matrix.shape

# item_emb_matrix = np.array(item_emb_matrix)  # In case it’s a list
# item_emb_matrix = np.ascontiguousarray(item_emb_matrix.astype('float32'))


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m69.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m81.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m83.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding 1682 movie descriptions into embeddings...


Batches:   0%|          | 0/27 [00:00<?, ?it/s]

(1682, 384)

In [16]:
# Build a dictionary of each user's highly-rated (>=4) movies in the training data
user_liked_items = {}
for u, i, r in train_df[['user_id','item_id','rating']].itertuples(index=False):
    if r >= 4:  # consider ratings 4 or 5 as "liked"
        user_liked_items.setdefault(int(u), []).append(int(i))

# Compute user embedding as average of liked item embeddings
user_embeddings = {}
for user, liked_list in user_liked_items.items():
    if len(liked_list) == 0:
        continue  # user has no liked movies (we will handle this case later)
    # Average the item embeddings for this user's liked movies
    liked_vectors = item_emb_matrix[[mid-1 for mid in liked_list]]
    user_vector = liked_vectors.mean(axis=0)
    # Normalize the user vector to unit length (for cosine similarity comparisons)
    norm = np.linalg.norm(user_vector)
    if norm > 0:
        user_vector = user_vector / norm
    user_embeddings[user] = user_vector

print(f"Computed embeddings for {len(user_embeddings)} users out of {train_df['user_id'].nunique()}.")


Computed embeddings for 942 users out of 943.


In [17]:
import faiss

# Normalize item embedding matrix for cosine similarity (if not already unit norm)
item_norms = np.linalg.norm(item_emb_matrix, axis=1, keepdims=True)
item_norms[item_norms == 0] = 1e-9
item_unit_matrix = item_emb_matrix / item_norms

# Build a FAISS index for fast cosine similarity search (dot product on normalized vectors)
d = item_unit_matrix.shape[1]
index = faiss.IndexFlatIP(d)  # Inner-product index (works as cosine similarity on unit vectors)
index.add(item_unit_matrix.astype('float32'))
# item_unit_matrix = np.ascontiguousarray(item_unit_matrix, dtype=np.float32)
# index.add(item_unit_matrix)


# import faiss
# import numpy as np

# # Normalize item embeddings
# item_norms = np.linalg.norm(item_emb_matrix, axis=1, keepdims=True)
# item_norms[item_norms == 0] = 1e-9
# item_unit_matrix = item_emb_matrix / item_norms

# # Convert to float32, contiguous array
# item_unit_matrix = np.ascontiguousarray(item_unit_matrix.astype('float32'))

# # FAISS GPU setup
# res = faiss.StandardGpuResources()
# d = item_unit_matrix.shape[1]
# flat_config = faiss.GpuIndexFlatConfig()
# flat_config.useFloat16 = False
# flat_config.device = 0  # Assuming you're using CUDA:0

# # Create GPU index
# gpu_index = faiss.GpuIndexFlatIP(res, d, flat_config)

# # Add embeddings to index
# gpu_index.add(item_unit_matrix)



def retrieve_candidates(user_id, N=50):
    """Retrieve top-N candidate item IDs for the user based on cosine similarity."""
    if user_id not in user_embeddings or len(user_liked_items.get(user_id, [])) == 0:
        # If no profile (no liked items), return N most popular unseen movies as fallback
        seen = set(train_df[train_df['user_id'] == user_id]['item_id'])
        popular_items = movie_train_ratings.sort_values('count', ascending=False)['item_id']
        candidates = [int(item) for item in popular_items if item not in seen]
        return candidates[:N]
    # Get user vector and retrieve nearest neighbors by dot product
    user_vec = user_embeddings[user_id].astype('float32')
    # Use Faiss to get top-N item indices (and similarity scores, which we ignore here)
    _, idxs = index.search(user_vec.reshape(1, -1), N)
    idxs = idxs[0]
    candidate_ids = [int(movie_ids[idx]) for idx in idxs]  # map index back to movie ID
    # Exclude any items the user has already seen in training
    seen = set(train_df[train_df['user_id'] == user_id]['item_id'])
    candidates = [mid for mid in candidate_ids if mid not in seen]
    return candidates[:N]

# Example: retrieve top-10 candidates for a sample user (e.g., user 1)
sample_user = 1
candidates = retrieve_candidates(sample_user, N=10)
print(f"User {sample_user} candidate IDs (embedding similarity): {candidates}")


User 1 candidate IDs (embedding similarity): [1069, 129, 1328, 772, 512, 1180]


In [18]:
from sentence_transformers import CrossEncoder

# Load a cross-encoder model for re-ranking (MiniLM cross-encoder fine-tuned for relevance)
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device=device)

def get_profile_text(user_id):
    """Construct a textual profile of the user's preferences from their liked movies."""
    liked_items = user_liked_items.get(user_id, [])
    if not liked_items:
        return ""
    # Sort liked movies by rating (descending) so that favorites come first
    user_ratings = train_df[train_df['user_id'] == user_id][['item_id','rating']]
    top_liked = user_ratings[user_ratings['rating'] >= 4].sort_values('rating', ascending=False)['item_id'].tolist()
    # Limit the number of movies to include to avoid very long input
    if len(top_liked) > 30:
        top_liked = top_liked[:30]
    # Join the titles (with genres) of the liked movies
    profile_list = []
    for mid in top_liked:
        title = id_to_title.get(int(mid), "")
        genres = ", ".join(sorted(movie_genres.get(int(mid), [])))
        profile_item = title
        if genres:
            profile_item += f" ({genres})"
        profile_list.append(profile_item)
    profile_text = "; ".join(profile_list)
    return profile_text

def rerank_candidates(user_id, candidates):
    """Re-rank a list of candidate movie IDs for the user using the cross-encoder model."""
    if not candidates:
        return []
    profile_text = get_profile_text(user_id)
    if profile_text == "":
        # No profile available, return candidates as is (or could sort by popularity as secondary criterion)
        return candidates
    # Prepare input pairs: (user_profile, item_description) for each candidate
    candidate_descs = [descriptions[movie_ids.index(mid)] for mid in candidates]  # get description by movie ID
    pairs = [(profile_text, desc) for desc in candidate_descs]
    # Get relevance scores from the cross-encoder
    scores = cross_encoder.predict(pairs, batch_size=16)
    # Sort candidates by score (higher = more relevant)
    ranked_pairs = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
    ranked_ids = [mid for mid, score in ranked_pairs]
    return ranked_ids

# Example: re-rank the previously retrieved candidates for the sample user
ranked_candidates = rerank_candidates(sample_user, candidates)
print(f"User {sample_user} re-ranked candidates: {ranked_candidates}")


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

User 1 re-ranked candidates: [129, 1069, 512, 772, 1328, 1180]


In [19]:
# Function to get final top-K recommendations for a user using the LLM-enhanced pipeline
def recommend_llm(user_id, K=10):
    # Retrieve top-N candidates via embedding similarity
    candidates = retrieve_candidates(user_id, N=50)
    # Re-rank the candidates using the cross-encoder
    ranked = rerank_candidates(user_id, candidates)
    return ranked[:K]

# Evaluate Precision, Recall, NDCG for LLM-enhanced model
prec_k = {5: [], 10: []}
rec_k  = {5: [], 10: []}
ndcg_k = {5: [], 10: []}

for user in train_df['user_id'].unique():
    # Get top-10 recommendations
    recs = recommend_llm(user, K=10)
    recs5 = recs[:5]
    recs10 = recs[:10]
    # Relevant items for this user in test set (rating >=4)
    relevant_items = test_relevant.get(user, set())
    # Calculate Precision and NDCG for K=5 and K=10
    p5   = precision_at_k(recs5, relevant_items, 5)
    p10  = precision_at_k(recs10, relevant_items, 10)
    n5   = ndcg_at_k(recs5, relevant_items, 5)
    n10  = ndcg_at_k(recs10, relevant_items, 10)
    # Calculate Recall only if there are relevant items to recall
    if len(relevant_items) > 0:
        r5 = recall_at_k(recs5, relevant_items, 5)
        r10 = recall_at_k(recs10, relevant_items, 10)
    else:
        r5 = None
        r10 = None
    # Collect metrics
    prec_k[5].append(p5); prec_k[10].append(p10)
    ndcg_k[5].append(n5); ndcg_k[10].append(n10)
    if r5 is not None: rec_k[5].append(r5)
    if r10 is not None: rec_k[10].append(r10)

# Compute average metrics across all users
avg_p5  = sum(prec_k[5]) / len(prec_k[5])
avg_p10 = sum(prec_k[10]) / len(prec_k[10])
avg_n5  = sum(ndcg_k[5]) / len(ndcg_k[5])
avg_n10 = sum(ndcg_k[10]) / len(ndcg_k[10])
avg_r5  = sum(rec_k[5]) / len(rec_k[5]) if rec_k[5] else 0.0
avg_r10 = sum(rec_k[10]) / len(rec_k[10]) if rec_k[10] else 0.0

print(f"LLM-enhanced model: Precision@5={avg_p5:.4f}, Recall@5={avg_r5:.4f}, NDCG@5={avg_n5:.4f}")
print(f"LLM-enhanced model: Precision@10={avg_p10:.4f}, Recall@10={avg_r10:.4f}, NDCG@10={avg_n10:.4f}")


LLM-enhanced model: Precision@5=0.0341, Recall@5=0.0216, NDCG@5=0.0379
LLM-enhanced model: Precision@10=0.0304, Recall@10=0.0428, NDCG@10=0.0420


In [20]:
import pandas as pd

# Prepare the new metrics row for the LLM-enhanced model
new_metrics = {
    'Model': 'LLM-Enhanced (Embedding + Re-rank)',
    'Precision@5': round(avg_p5, 4),
    'Recall@5': round(avg_r5, 4),
    'NDCG@5': round(avg_n5, 4),
    'Precision@10': round(avg_p10, 4),
    'Recall@10': round(avg_r10, 4),
    'NDCG@10': round(avg_n10, 4)
}
# Append to the existing metrics DataFrame (assumes metrics_df from baseline exists)
if 'metrics_df' in globals():
    metrics_df = pd.concat([metrics_df, pd.DataFrame([new_metrics])], ignore_index=True)
else:
    metrics_df = pd.DataFrame([new_metrics])

metrics_df


Unnamed: 0,Model,Precision@5,Recall@5,NDCG@5,Precision@10,Recall@10,NDCG@10
0,Collaborative Filtering,0.0115,0.0057,0.0111,0.0138,0.0141,0.0149
1,KG (Genres),0.0549,0.0306,0.063,0.0469,0.0495,0.0622
2,Graph Retrieval,0.098246,0.026714,0.114434,0.077412,0.041151,0.100099
3,LLM-Enhanced (Embedding + Re-rank),0.0341,0.0216,0.0379,0.0304,0.0428,0.042
