In [1]:
import json
import os
import random
from typing import Dict, List, Tuple, Any
from collections import defaultdict, Counter
import networkx as nx
from openai import OpenAI


In [2]:
import pandas as pd

In [None]:
import pandas as pd
import json

class NewsRecommendationSystem:
    def __init__(self, kg_analysis_path: str):
        """Initialize the recommendation system with KG analysis data"""
        # Load API key
        self.client = OpenAI(base_url="https://api.groq.com/openai/v1", api_key= "*******")
        
        # Load KG analysis file
        with open('../data/processed/kg_analysis1.json', 'r') as f:
            self.kg_data = json.load(f)
        
        # Extract context
        self.top_entities = {entity[0]: {'name': entity[2], 'frequency': entity[1]} 
                           for entity in self.kg_data['top_entities']}
        self.stats = self.kg_data['stats']
        
        # Initialize data structure with KG context
        self.user_interactions = self._load_user_interactions()
        self.news_articles = self._load_news_articles()
        self.entity_cooccurrence = self._load_entity_cooccurrence()
        
    def _parse_entities(self, entity_str):
        """Helper function to parse entity JSON strings"""
        if pd.isna(entity_str) or entity_str == '[]':
            return []
        
        try:
            entities = json.loads(entity_str)
            return [entity.get('WikidataId', '') for entity in entities if entity.get('WikidataId')]
        except:
            return []

    def _get_article_entities(self, article_id):
        """Helper function to get entities for a specific article"""
        if hasattr(self, 'news_articles') and article_id in self.news_articles:
            return self.news_articles[article_id]["entities"]
        
        # Fallback: load article on demand if not in memory
        try:
            news_path = '../data/mind/MINDsmall_train/news.tsv'
            news_df = pd.read_csv(news_path, sep='\t', 
                                names=['news_id', 'category', 'subcategory', 'title', 'abstract', 
                                       'url', 'title_entities', 'abstract_entities'])
            
            article_row = news_df[news_df['news_id'] == article_id]
            if not article_row.empty:
                row = article_row.iloc[0]
                title_entities = self._parse_entities(row['title_entities'])
                abstract_entities = self._parse_entities(row['abstract_entities'])
                return list(set(title_entities + abstract_entities))
        except:
            pass
        
        return []
        
    def _load_user_interactions(self) -> Dict:
        """Load actual user interaction data from MIND dataset"""
        try:
            behaviors_path = '../data/mind/MINDsmall_train/behaviors.tsv'
            behaviors_df = pd.read_csv(behaviors_path, sep='\t', 
                                     names=['impression_id', 'user_id', 'timestamp', 'history', 'impressions'])
            
            user_interactions = {}
            
            print(f"Loading user interactions from {len(behaviors_df)} behavior records...")
            
            # Process subset of users to avoid memory issues
            for idx, row in behaviors_df.head(1000).iterrows():
                user_id = row['user_id']
                
                if user_id not in user_interactions:
                    user_interactions[user_id] = {
                        "clicked_entities": [],
                        "viewed_entities": [],
                        "interaction_counts": {}
                    }
                
                # Process history (articles user previously viewed)
                if pd.notna(row['history']):
                    history_articles = row['history'].split()
                    for article_id in history_articles[:10]:  # Limit to recent 10 articles
                        entities = self._get_article_entities(article_id)
                        user_interactions[user_id]["viewed_entities"].extend(entities)
                        for entity in entities:
                            if entity in self.top_entities:  # Only count entities in our KG
                                user_interactions[user_id]["interaction_counts"][entity] = \
                                    user_interactions[user_id]["interaction_counts"].get(entity, 0) + 1
                
                # Process impressions (articles shown with click labels)
                if pd.notna(row['impressions']):
                    impressions = row['impressions'].split()
                    for impression in impressions:
                        if '-' in impression:
                            article_id, clicked = impression.split('-')
                            entities = self._get_article_entities(article_id)
                            
                            # Add to viewed entities
                            user_interactions[user_id]["viewed_entities"].extend(entities)
                            
                            # If clicked, add higher weight
                            if clicked == '1':
                                user_interactions[user_id]["clicked_entities"].extend(entities)
                                for entity in entities:
                                    if entity in self.top_entities:  # Only count entities in our KG
                                        user_interactions[user_id]["interaction_counts"][entity] = \
                                            user_interactions[user_id]["interaction_counts"].get(entity, 0) + 3
            
            # Clean up duplicates
            for user_id in user_interactions:
                user_interactions[user_id]["clicked_entities"] = list(set(user_interactions[user_id]["clicked_entities"]))
                user_interactions[user_id]["viewed_entities"] = list(set(user_interactions[user_id]["viewed_entities"]))
            
            print(f"Loaded interactions for {len(user_interactions)} users")
            return user_interactions
            
        except Exception as e:
            print(f"Error loading user interactions: {e}")
            print("Falling back to mock data...")
            # Fallback to mock data if real data fails
            return {
                f"user_{i}": {
                    "clicked_entities": random.sample(list(self.top_entities.keys()), 
                                                    min(random.randint(3, 8), len(self.top_entities.keys()))),
                    "viewed_entities": random.sample(list(self.top_entities.keys()), 
                                                   min(random.randint(5, 10), len(self.top_entities.keys()))),
                    "interaction_counts": {entity: random.randint(1, 10) 
                                         for entity in random.sample(list(self.top_entities.keys()), 
                                                                    min(random.randint(5, 10), len(self.top_entities.keys())))}
                }
                for i in range(100) 
            }
    
    def _load_news_articles(self) -> Dict:
        """Load actual news articles from MIND dataset"""
        try:
            news_path = '../data/mind/MINDsmall_train/news.tsv'
            news_df = pd.read_csv(news_path, sep='\t', 
                                names=['news_id', 'category', 'subcategory', 'title', 'abstract', 
                                       'url', 'title_entities', 'abstract_entities'])
            
            news_articles = {}
            
            print(f"Loading {len(news_df)} news articles...")
            
            for _, row in news_df.iterrows():
                news_id = row['news_id']
                
                # Parse entities from JSON strings
                title_entities = self._parse_entities(row['title_entities'])
                abstract_entities = self._parse_entities(row['abstract_entities'])
                all_entities = list(set(title_entities + abstract_entities))
                
                # Filter entities to only include those in our KG
                kg_entities = [e for e in all_entities if e in self.top_entities]
                
                news_articles[news_id] = {
                    "title": row['title'] if pd.notna(row['title']) else f"News Article {news_id}",
                    "abstract": row['abstract'] if pd.notna(row['abstract']) else f"Abstract for {news_id}",
                    "entities": kg_entities,
                    "category": row['category'] if pd.notna(row['category']) else "general"
                }
            
            print(f"Loaded {len(news_articles)} articles with entities")
            return news_articles
            
        except Exception as e:
            print(f"Error loading news articles: {e}")
            print("Falling back to mock data...")
            # Fallback to mock data if real data fails
            return {
                f"news_{i}": {
                    "title": f"Sample News Article {i}",
                    "abstract": f"This is a sample abstract for news article {i}",
                    "entities": random.sample(list(self.top_entities.keys()), 
                                            random.randint(2, 6)),
                    "category": random.choice(["politics", "sports", "technology", "world"])
                }
                for i in range(1000) 
            }
    
    def _load_entity_cooccurrence(self) -> Dict:
        """Build entity co-occurrence matrix from real news articles"""
        print("Building entity co-occurrence matrix...")
        cooccurrence = defaultdict(dict)
        
        # Count entity co-occurrences across all articles
        for article_id, article_data in self.news_articles.items():
            entities = article_data["entities"]
            
            # For each pair of entities in the same article
            for i, entity1 in enumerate(entities):
                for entity2 in entities[i+1:]:
                    if entity1 in self.top_entities and entity2 in self.top_entities:
                        if entity2 not in cooccurrence[entity1]:
                            cooccurrence[entity1][entity2] = 0
                        if entity1 not in cooccurrence[entity2]:
                            cooccurrence[entity2][entity1] = 0
                        
                        cooccurrence[entity1][entity2] += 1
                        cooccurrence[entity2][entity1] += 1
        
        # Filter weak connections
        filtered_cooccurrence = {}
        min_cooccurrence = 2
        
        for entity1, connections in cooccurrence.items():
            filtered_cooccurrence[entity1] = {}
            for entity2, count in connections.items():
                if count >= min_cooccurrence:
                    filtered_cooccurrence[entity1][entity2] = count
        
        print(f"Built co-occurrence matrix for {len(filtered_cooccurrence)} entities")
        return filtered_cooccurrence

    def extract_kg_context(self, user_id: str, candidate_articles: List[str]) -> str:
        """Extract rich KG context for a user and candidate articles"""
        user_data = self.user_interactions.get(user_id, {})
        
        # Get user's top entities based on interaction 
        user_entities = []
        clicked_entities = user_data.get("clicked_entities", [])
        interaction_counts = user_data.get("interaction_counts", {})
        
        # Prioritize clicked entities with interaction counts
        for entity in clicked_entities[:5]: 
            entity_info = self.top_entities.get(entity, {})
            count = interaction_counts.get(entity, 1)
            user_entities.append(f"{entity_info.get('name', entity)} (clicked {count} times)")
        
        # Get co-occurring entities for interests shown by user
        related_entities = []
        for entity in clicked_entities[:3]:
            if entity in self.entity_cooccurrence:
                for related, strength in list(self.entity_cooccurrence[entity].items())[:3]:
                    related_info = self.top_entities.get(related, {})
                    related_entities.append(f"{related_info.get('name', related)} (co-occurs {strength} times)")
        
        # Analyze candidate articles
        candidate_context = []
        for article_id in candidate_articles:
            article = self.news_articles.get(article_id, {})
            article_entities = []
            for entity in article.get("entities", []):
                entity_info = self.top_entities.get(entity, {})
                article_entities.append(entity_info.get('name', entity))
            
            candidate_context.append({
                "title": article.get("title", ""),
                "category": article.get("category", ""),
                "entities": article_entities
            })
        
        # Construct KG context
        context = f"""
USER PROFILE FROM KNOWLEDGE GRAPH:
Primary Interests (Clicked Entities): {', '.join(user_entities)}
Related Interests (Co-occurring Entities): {', '.join(related_entities)}

CANDIDATE ARTICLES WITH ENTITY ANALYSIS:
"""
        
        for i, article in enumerate(candidate_context, 1):
            context += f"""
Article {i}: {article['title']}
Category: {article['category']}
Key Entities: {', '.join(article['entities'])}
"""
        
        context += f"""
ENTITY IMPORTANCE CONTEXT:
Global trending entities: {', '.join([self.top_entities[e]['name'] for e in list(self.top_entities.keys())[:5]])}

RELATIONSHIP INSIGHTS:
The knowledge graph shows {self.stats['nodes']} entities connected through {self.stats['edges']} relationships, 
indicating rich contextual associations between topics.
"""
        
        return context

    def extract_llm_only_context(self, user_id: str, candidate_articles: List[str]) -> str:
        """Extract basic context without KG insights (for comparison)"""
        user_data = self.user_interactions.get(user_id, {})
        
        # Get user history (basic)
        clicked_entities = user_data.get("clicked_entities", [])[:3]
        basic_interests = [self.top_entities.get(e, {}).get('name', e) for e in clicked_entities]
        
        # Get article information (basic)
        candidate_context = []
        for article_id in candidate_articles:
            article = self.news_articles.get(article_id, {})
            candidate_context.append({
                "title": article.get("title", ""),
                "abstract": article.get("abstract", ""),
                "category": article.get("category", "")
            })
        
        context = f"""
User History:
Previously interested in: {', '.join(basic_interests)}

Candidate Articles:
"""
        
        for i, article in enumerate(candidate_context, 1):
            context += f"""
Article {i}: {article['title']}
Summary: {article['abstract']}
Category: {article['category']}
"""
        
        return context

    def generate_recommendations(self, context: str, method: str) -> Dict[str, Any]:
        """Generate recommendations using Groq LLM"""
        
        system_prompt = f"""You are a news recommendation system using {method} approach. 
        Analyze the provided context and recommend the most relevant articles for this user.
        
        Provide your response in this format:
        RECOMMENDATIONS: [List top 3 recommended articles with brief reasoning]
        EXPLANATIONS: [Explain why these articles match user interests]
        DIVERSITY: [Comment on topic diversity in recommendations]
        CONFIDENCE: [Rate confidence 1-10 and explain]
        """
        
        user_prompt = f"""
        Context: {context}
        
        Please provide personalized news recommendations based on this context.
        Focus on relevance, diversity, and explainability.
        """
        
        try:
            response = self.client.chat.completions.create(
                model="llama3-8b-8192",  # or llama2-70b-4096
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0.3,
                max_tokens=1000
            )
            
            return {
                "method": method,
                "response": response.choices[0].message.content,
                "success": True
            }
        
        except Exception as e:
            return {
                "method": method,
                "response": f"Error: {str(e)}",
                "success": False
            }

    def compare_methods(self, user_id: str, num_candidates: int = 5) -> Dict[str, Any]:
        """Compare KG+LLM vs LLM-only recommendations"""
        
        # Get candidate articles
        candidate_articles = random.sample(list(self.news_articles.keys()), num_candidates)
        
        # Extract contexts using both methods
        kg_context = self.extract_kg_context(user_id, candidate_articles)
        llm_context = self.extract_llm_only_context(user_id, candidate_articles)
        
        # Generate recommendations using both methods
        kg_llm_results = self.generate_recommendations(kg_context, "KG+LLM")
        llm_only_results = self.generate_recommendations(llm_context, "LLM-Only")
        
        return {
            "user_id": user_id,
            "candidate_articles": candidate_articles,
            "kg_context_length": len(kg_context),
            "llm_context_length": len(llm_context),
            "kg_llm_recommendations": kg_llm_results,
            "llm_only_recommendations": llm_only_results,
            "context_comparison": {
                "kg_context_preview": kg_context[:500] + "..." if len(kg_context) > 500 else kg_context,
                "llm_context_preview": llm_context[:500] + "..." if len(llm_context) > 500 else llm_context
            }
        }

    def run_evaluation(self, num_users: int = 5) -> List[Dict]:
        """Run evaluation comparing both methods across multiple users"""
        results = []
        
        print("Starting KG+LLM vs LLM-Only Evaluation...")
        print("=" * 60)
        
        # Get actual user IDs from loaded data
        user_ids = list(self.user_interactions.keys())[:num_users]
        
        for i, user_id in enumerate(user_ids):
            print(f"\nEvaluating User {i+1}/{num_users}: {user_id}")
            
            comparison_result = self.compare_methods(user_id)
            results.append(comparison_result)
            
            # Print summary
            print(f"KG Context Length: {comparison_result['kg_context_length']} chars")
            print(f"LLM Context Length: {comparison_result['llm_context_length']} chars")
            print(f"KG+LLM Success: {comparison_result['kg_llm_recommendations']['success']}")
            print(f"LLM-Only Success: {comparison_result['llm_only_recommendations']['success']}")
        
        return results

    def analyze_results(self, results: List[Dict]) -> Dict[str, Any]:
        """Analyze and summarize comparison results"""
        successful_kg = sum(1 for r in results if r['kg_llm_recommendations']['success'])
        successful_llm = sum(1 for r in results if r['llm_only_recommendations']['success'])
        
        avg_kg_context_length = sum(r['kg_context_length'] for r in results) / len(results)
        avg_llm_context_length = sum(r['llm_context_length'] for r in results) / len(results)
        
        analysis = {
            "total_evaluations": len(results),
            "kg_llm_success_rate": successful_kg / len(results),
            "llm_only_success_rate": successful_llm / len(results),
            "avg_kg_context_length": avg_kg_context_length,
            "avg_llm_context_length": avg_llm_context_length,
            "context_enrichment_ratio": avg_kg_context_length / avg_llm_context_length,
            "detailed_results": results
        }
        
        return analysis

In [None]:
import os
os.environ["GROQ_API_KEY"] = "**************"

In [5]:

def main():
    
    if not os.getenv("GROQ_API_KEY"):
        print("ERROR: Please set GROQ_API_KEY environment variable")
        print("Example: export GROQ_API_KEY='your_api_key_here'")
        return
    
    # Initialize system
    system = NewsRecommendationSystem("kg_analysis.json")
    
    # Evaluation
    results = system.run_evaluation(num_users=3)
    
    # Results
    analysis = system.analyze_results(results)
    
    print("\n")
    print(" Summary")
    print("\n")
    print(f"Total Evaluations: {analysis['total_evaluations']}")
    print(f"KG+LLM Success Rate: {analysis['kg_llm_success_rate']:.2%}")
    print(f"LLM-Only Success Rate: {analysis['llm_only_success_rate']:.2%}")
    print(f"Average KG Context Length: {analysis['avg_kg_context_length']:.0f} chars")
    print(f"Average LLM Context Length: {analysis['avg_llm_context_length']:.0f} chars")
    print(f"Context Enrichment Ratio: {analysis['context_enrichment_ratio']:.2f}x")
    
    # Show recommendations
    if results:
        print("\n")
        print("Recommendation Comparison")
        print("\n")
        sample = results[0]
        
        print("\nKG+LLM Recommendation:")
        print("\n")
        if sample['kg_llm_recommendations']['success']:
            print(sample['kg_llm_recommendations']['response'])
        else:
            print("Failed to generate recommendation")
        
        print("\nLLM-ONLY Recommendation:")
        print("\n")
        if sample['llm_only_recommendations']['success']:
            print(sample['llm_only_recommendations']['response'])
        else:
            print("Failed to generate recommendation")

if __name__ == "__main__":               
    main()

Loading user interactions from 156965 behavior records...
Loaded interactions for 988 users
Loading 51282 news articles...
Loaded 51282 articles with entities
Building entity co-occurrence matrix...
Built co-occurrence matrix for 10 entities
Starting KG+LLM vs LLM-Only Evaluation...

Evaluating User 1/3: U13740
KG Context Length: 1102 chars
LLM Context Length: 1737 chars
KG+LLM Success: True
LLM-Only Success: True

Evaluating User 2/3: U91836
KG Context Length: 1079 chars
LLM Context Length: 1915 chars
KG+LLM Success: True
LLM-Only Success: True

Evaluating User 3/3: U73700
KG Context Length: 1196 chars
LLM Context Length: 2135 chars
KG+LLM Success: True
LLM-Only Success: True


 Summary


Total Evaluations: 3
KG+LLM Success Rate: 100.00%
LLM-Only Success Rate: 100.00%
Average KG Context Length: 1126 chars
Average LLM Context Length: 1929 chars
Context Enrichment Ratio: 0.58x


Recommendation Comparison



KG+LLM Recommendation:


RECOMMENDATIONS:
[1] Article 3: Scientists Discover Wha

In [6]:
#EVALUATION 

In [7]:
import pandas as pd
from sklearn.metrics import roc_auc_score
import numpy as np

In [None]:

# Load MIND validation data
print("Loading MIND validation data for evaluation...")

# user interactions
behaviors_val = pd.read_csv('../data/mind/MINDsmal_test/behaviors.tsv', 
                           sep='\t', 
                           names=['impression_id', 'user_id', 'timestamp', 'history', 'impressions'])

# validation news articles
news_val = pd.read_csv('../data/mind/MINDsmal_test/news.tsv', 
                      sep='\t', 
                      names=['news_id', 'category', 'subcategory', 'title', 'abstract', 
                             'url', 'title_entities', 'abstract_entities'])

print(f"Validation behaviors: {len(behaviors_val)} records")
print(f"Validation news articles: {len(news_val)} unique articles")
print(f"Unique users in validation: {behaviors_val['user_id'].nunique()}")

# Display data structure
print("\nSample behavior record:")
print(behaviors_val.head(1).to_string())
print("\nSample news record:")
print(news_val.head(1).to_string())

Loading MIND validation data for evaluation...
Validation behaviors: 73152 records
Validation news articles: 42416 unique articles
Unique users in validation: 50000

Sample behavior record:
   impression_id user_id               timestamp                                                                                                history                                                                                                                                                                                        impressions
0              1  U80234  11/15/2019 12:37:50 PM  N55189 N46039 N51741 N53234 N11276 N264 N40716 N28088 N43955 N6616 N47686 N63573 N38895 N30924 N35671  N28682-0 N48740-0 N31958-1 N34130-0 N6916-0 N5472-0 N50775-0 N24802-0 N19990-0 N33176-0 N62365-0 N5940-0 N6400-0 N58098-0 N42844-0 N49285-0 N51470-0 N53572-0 N11930-0 N21679-0 N55237-0 N29862-0

Sample news record:
  news_id   category      subcategory                                                          

In [None]:
# Process validation data for evaluation
print("Processing validation data for AUC calculation...")

def parse_impressions(impression_str):
    """Parse impression data as string to get news_ids and click labels"""
    if pd.isna(impression_str):
        return [], []
    
    items = impression_str.split()
    news_ids = []
    labels = []
    
    for item in items:
        if '-' in item:
            news_id, label = item.split('-')
            news_ids.append(news_id)
            labels.append(int(label))
    
    return news_ids, labels

def parse_entities(entity_str):
    """Parse entity JSON string to extract WikidataIds"""
    if pd.isna(entity_str) or entity_str == '[]':
        return []
    
    try:
        import json
        entities = json.loads(entity_str)
        return [entity.get('WikidataId', '') for entity in entities if entity.get('WikidataId')]
    except:
        return []

# Processed evaluation dataset
evaluation_data = []
processed_users = 0

print("Processing user impressions and clicks...")

for idx, row in behaviors_val.head(1000).iterrows(): 
    user_id = row['user_id']
    impression_str = row['impressions']
    
    if pd.notna(impression_str):
        news_ids, click_labels = parse_impressions(impression_str)
        
        if len(news_ids) > 0:  # Only users with valid impressions ids
            evaluation_data.append({
                'user_id': user_id,
                'news_ids': news_ids,
                'click_labels': click_labels,
                'num_impressions': len(news_ids),
                'num_clicks': sum(click_labels)
            })
            processed_users += 1

print(f"Processed {processed_users} users with valid impressions")
print(f"Average impressions per user: {np.mean([d['num_impressions'] for d in evaluation_data]):.1f}")
print(f"Average clicks per user: {np.mean([d['num_clicks'] for d in evaluation_data]):.1f}")

# news entity mapping for scoring
print("\nCreating news-entity mapping...")
news_entities = {}

for idx, row in news_val.iterrows():
    news_id = row['news_id']
    title_entities = parse_entities(row['title_entities'])
    abstract_entities = parse_entities(row['abstract_entities'])
    
    # Merge title and abstract entities
    all_entities = list(set(title_entities + abstract_entities))
    news_entities[news_id] = all_entities

print(f"Mapped entities for {len(news_entities)} news articles")
print(f"Sample news entities: {list(news_entities.items())[:2]}")

# evaluation 
if evaluation_data:
    sample = evaluation_data[0]
    print(f"\nSample evaluation record:")
    print(f"User: {sample['user_id']}")
    print(f"Impressions: {sample['num_impressions']}")
    print(f"Clicks: {sample['num_clicks']}")
    print(f"First 5 news_ids: {sample['news_ids'][:5]}")
    print(f"First 5 labels: {sample['click_labels'][:5]}")

Processing validation data for AUC calculation...
Processing user impressions and clicks...
Processed 1000 users with valid impressions
Average impressions per user: 35.9
Average clicks per user: 1.5

Creating news-entity mapping...
Mapped entities for 42416 news articles
Sample news entities: [('N55528', ['Q43274', 'Q9682', 'Q80976']), ('N18955', ['Q622899'])]

Sample evaluation record:
User: U80234
Impressions: 22
Clicks: 1
First 5 news_ids: ['N28682', 'N48740', 'N31958', 'N34130', 'N6916']
First 5 labels: [0, 0, 1, 0, 0]


In [10]:
#Evaluation of recommendations generated by system
if 'system' not in locals():
    system = NewsRecommendationSystem("../data/processed/kg_analysis1.json") 

print("Using existing recommendation system for scoring...")

Loading user interactions from 156965 behavior records...
Loaded interactions for 988 users
Loading 51282 news articles...
Loaded 51282 articles with entities
Building entity co-occurrence matrix...
Built co-occurrence matrix for 10 entities
Using existing recommendation system for scoring...


In [15]:
# user entity from training data and calculate prediction scores
print("Creating user entity profiles for scoring...")

def get_user_entity_profile(user_id, system):
    """Get user entity profile from existing KG system"""
    # user interactions from the system
    user_data = system.user_interactions.get(user_id, {})
    
    if not user_data:
        # Fallback to random user for testing
        user_data = system.user_interactions.get('user_0', {})
    
    clicked_entities = user_data.get('clicked_entities', [])
    interaction_counts = user_data.get('interaction_counts', {})
    
    # weighted entity profile
    entity_profile = {}
    for entity in clicked_entities:
        weight = interaction_counts.get(entity, 1)
        entity_profile[entity] = weight
    
    return entity_profile

def calculate_entity_overlap_score(user_profile, article_entities, system):
    """Calculate relevance score based on entity overlap"""
    if not user_profile or not article_entities:
        return 0.0
    
    # Calculate weighted overlap
    overlap_score = 0.0
    total_user_weight = sum(user_profile.values())
    
    for entity in article_entities:
        if entity in user_profile:
            # Entity relevance from KG analysis
            entity_importance = system.top_entities.get(entity, {}).get('frequency', 1)
            user_weight = user_profile[entity]
            
            # user preference and their relevance
            entity_score = (user_weight / total_user_weight) * np.log(1 + entity_importance)
            overlap_score += entity_score
    
    # Normalize by article entity count
    if len(article_entities) > 0:
        overlap_score = overlap_score / len(article_entities)
    
    return min(overlap_score, 1.0) 

# Prediction scores for AUC evaluation
print("Calculating prediction scores for evaluation users...")

auc_evaluation_data = []

for user_data in evaluation_data[:100]:  
    user_id = user_data['user_id']
    news_ids = user_data['news_ids']
    click_labels = user_data['click_labels']
    
    # User entity profile
    user_profile = get_user_entity_profile(user_id, system)
    
    # Prediction scores for each article
    prediction_scores = []
    valid_labels = []
    
    for news_id, label in zip(news_ids, click_labels):
        article_entities = news_entities.get(news_id, [])
        score = calculate_entity_overlap_score(user_profile, article_entities, system)
        
        prediction_scores.append(score)
        valid_labels.append(label)
    
    if len(prediction_scores) > 0:
        auc_evaluation_data.append({
            'user_id': user_id,
            'prediction_scores': prediction_scores,
            'true_labels': valid_labels,
            'num_impressions': len(prediction_scores)
        })

print(f"Generated prediction scores for {len(auc_evaluation_data)} users")

# Results
if auc_evaluation_data:
    sample = auc_evaluation_data[0]
    print(f"\nSample scoring results:")
    print(f"User: {sample['user_id']}")
    print(f"Prediction scores: {sample['prediction_scores'][:5]}")
    print(f"True labels: {sample['true_labels'][:5]}")
    print(f"Score range: {min(sample['prediction_scores']):.3f} - {max(sample['prediction_scores']):.3f}")
    
    # articles clicked vs predicted scores
    clicked_indices = [i for i, label in enumerate(sample['true_labels']) if label == 1]
    if clicked_indices:
        print(f"Clicked article scores: {[sample['prediction_scores'][i] for i in clicked_indices]}")

Creating user entity profiles for scoring...
Calculating prediction scores for evaluation users...
Generated prediction scores for 100 users

Sample scoring results:
User: U80234
Prediction scores: [0.0, 0.0, 0.0, 0.0, 0.0]
True labels: [0, 0, 1, 0, 0]
Score range: 0.000 - 0.000
Clicked article scores: [0.0]


In [16]:
# AUC for the KG+LLM system
print(" AUC results..")

from sklearn.metrics import roc_auc_score

def calculate_auc_per_user(prediction_scores, true_labels):
    """Calculate AUC for a single user"""
    #  at least one positive and one negative sample
    if len(set(true_labels)) < 2:
        return None  # Cannot calculate AUC with only one class
    
    try:
        return roc_auc_score(true_labels, prediction_scores)
    except:
        return None

# AUC for each user
user_aucs = []
valid_users = 0

for user_data in auc_evaluation_data:
    prediction_scores = user_data['prediction_scores']
    true_labels = user_data['true_labels']
    
    auc_score = calculate_auc_per_user(prediction_scores, true_labels)
    
    if auc_score is not None:
        user_aucs.append(auc_score)
        valid_users += 1

#  overall metrics
if user_aucs:
    mean_auc = np.mean(user_aucs)
    median_auc = np.median(user_aucs)
    std_auc = np.std(user_aucs)
    
    print(f"\n AUC ")
    print(f"Valid users for AUC calculation: {valid_users}/{len(auc_evaluation_data)}")
    print(f"Mean AUC: {mean_auc:.4f}")
    print(f"Median AUC: {median_auc:.4f}")
    print(f"Standard Deviation: {std_auc:.4f}")
    print(f"Min AUC: {min(user_aucs):.4f}")
    print(f"Max AUC: {max(user_aucs):.4f}")
    
    # Distribution analysis
    auc_ranges = {
        "0.0-0.3 (Poor)": sum(1 for auc in user_aucs if 0.0 <= auc < 0.3),
        "0.3-0.5 (Below Random)": sum(1 for auc in user_aucs if 0.3 <= auc < 0.5),
        "0.5-0.7 (Fair)": sum(1 for auc in user_aucs if 0.5 <= auc < 0.7),
        "0.7-0.9 (Good)": sum(1 for auc in user_aucs if 0.7 <= auc < 0.9),
        "0.9-1.0 (Excellent)": sum(1 for auc in user_aucs if 0.9 <= auc <= 1.0),
    }
    
    print(f"\n AUC Distribution")
    for range_name, count in auc_ranges.items():
        percentage = (count / len(user_aucs)) * 100

 AUC results..

 AUC 
Valid users for AUC calculation: 100/100
Mean AUC: 0.5000
Median AUC: 0.5000
Standard Deviation: 0.0000
Min AUC: 0.5000
Max AUC: 0.5000

 AUC Distribution


In [13]:
# MRR for the KG+LLM system
print("Calculating MRR")

def calculate_mrr_per_user(prediction_scores, true_labels):
    """Calculate MRR for a single user"""
    # list of (score, label) and sort them
    scored_items = list(zip(prediction_scores, true_labels))
    scored_items.sort(key=lambda x: x[0], reverse=True)
    
    # first clicked item in ranked list
    for rank, (score, label) in enumerate(scored_items, 1):
        if label == 1:  # First clicked item
            return 1.0 / rank
    
    return 0.0  # No clicked items foubd

# MRR for each user
user_mrrs = []

for user_data in auc_evaluation_data:
    prediction_scores = user_data['prediction_scores']
    true_labels = user_data['true_labels']
    
    mrr_score = calculate_mrr_per_user(prediction_scores, true_labels)
    user_mrrs.append(mrr_score)

# MRR metrics
mean_mrr = np.mean(user_mrrs)
median_mrr = np.median(user_mrrs)
std_mrr = np.std(user_mrrs)

print(f"\n MRR Results")
print(f"Mean MRR: {mean_mrr:.4f}")
print(f"Median MRR: {median_mrr:.4f}")
print(f"Standard Deviation: {std_mrr:.4f}")
print(f"Min MRR: {min(user_mrrs):.4f}")
print(f"Max MRR: {max(user_mrrs):.4f}")

# MRR Distribution
mrr_ranges = {
    "0.0 (No clicks ranked first)": sum(1 for mrr in user_mrrs if mrr == 0.0),
    "0.0-0.2 (Poor ranking)": sum(1 for mrr in user_mrrs if 0.0 < mrr <= 0.2),
    "0.2-0.5 (Fair ranking)": sum(1 for mrr in user_mrrs if 0.2 < mrr <= 0.5),
    "0.5-1.0 (Good ranking)": sum(1 for mrr in user_mrrs if 0.5 < mrr <= 1.0),
}

print(f"\n MRR Distribution ")
for range_name, count in mrr_ranges.items():
    percentage = (count / len(user_mrrs)) * 100
    print(f"{range_name}: {count} users ({percentage:.1f}%)")

# sample rankings
print(f"\n Examples")
for i, user_data in enumerate(auc_evaluation_data[:3]):
    prediction_scores = user_data['prediction_scores']
    true_labels = user_data['true_labels']
    
    # ranked list
    scored_items = list(zip(prediction_scores, true_labels, range(len(true_labels))))
    scored_items.sort(key=lambda x: x[0], reverse=True)
    
    # clicked items and their ranks
    clicked_ranks = []
    for rank, (score, label, orig_idx) in enumerate(scored_items, 1):
        if label == 1:
            clicked_ranks.append(rank)
    
    mrr = user_mrrs[i]
    print(f"User {i+1}: MRR = {mrr:.3f}, Clicked items at ranks: {clicked_ranks}")

Calculating MRR

 MRR Results
Mean MRR: 0.2848
Median MRR: 0.1667
Standard Deviation: 0.3005
Min MRR: 0.0103
Max MRR: 1.0000

 MRR Distribution 
0.0 (No clicks ranked first): 0 users (0.0%)
0.0-0.2 (Poor ranking): 58 users (58.0%)
0.2-0.5 (Fair ranking): 30 users (30.0%)
0.5-1.0 (Good ranking): 12 users (12.0%)

 Examples
User 1: MRR = 0.333, Clicked items at ranks: [3]
User 2: MRR = 0.500, Clicked items at ranks: [2]
User 3: MRR = 0.071, Clicked items at ranks: [14]


In [14]:
# nDCG@5 and nDCG@10 for the KG+LLM system
print("Calculating nDCG@5 and nDCG@10...")

def calculate_dcg(relevances, k):
    """Calculate DCG@k"""
    dcg = 0.0
    for i in range(min(k, len(relevances))):
        dcg += relevances[i] / np.log2(i + 2)  
    return dcg

def calculate_ndcg_per_user(prediction_scores, true_labels, k):
    """Calculate nDCG@k for a single user"""
    # ranked list by prediction scores
    scored_items = list(zip(prediction_scores, true_labels))
    scored_items.sort(key=lambda x: x[0], reverse=True)
    
    # labels in ranked order
    ranked_relevances = [label for score, label in scored_items]
    
    # DCG@k
    dcg_k = calculate_dcg(ranked_relevances, k)
    
    # IDCG@k 
    ideal_relevances = sorted(true_labels, reverse=True)
    idcg_k = calculate_dcg(ideal_relevances, k)
    
    #  nDCG@k
    if idcg_k == 0:
        return 0.0
    return dcg_k / idcg_k

#  nDCG@5 and nDCG@10 for each user
user_ndcg5 = []
user_ndcg10 = []

for user_data in auc_evaluation_data:
    prediction_scores = user_data['prediction_scores']
    true_labels = user_data['true_labels']
    
    ndcg5 = calculate_ndcg_per_user(prediction_scores, true_labels, 5)
    ndcg10 = calculate_ndcg_per_user(prediction_scores, true_labels, 10)
    
    user_ndcg5.append(ndcg5)
    user_ndcg10.append(ndcg10)

# Calculate overall nDCG metrics
print(f"\n nDCG@5 Results")
print(f"Mean nDCG@5: {np.mean(user_ndcg5):.4f}")
print(f"Median nDCG@5: {np.median(user_ndcg5):.4f}")
print(f"Standard Deviation: {np.std(user_ndcg5):.4f}")
print(f"Min nDCG@5: {min(user_ndcg5):.4f}")
print(f"Max nDCG@5: {max(user_ndcg5):.4f}")

print(f"\n nDCG@10 Results ")
print(f"Mean nDCG@10: {np.mean(user_ndcg10):.4f}")
print(f"Median nDCG@10: {np.median(user_ndcg10):.4f}")
print(f"Standard Deviation: {np.std(user_ndcg10):.4f}")
print(f"Min nDCG@10: {min(user_ndcg10):.4f}")
print(f"Max nDCG@10: {max(user_ndcg10):.4f}")

# Distribution analysis
ndcg5_ranges = {
    "0.0-0.2": sum(1 for ndcg in user_ndcg5 if 0.0 <= ndcg < 0.2),
    "0.2-0.4": sum(1 for ndcg in user_ndcg5 if 0.2 <= ndcg < 0.4),
    "0.4-0.6": sum(1 for ndcg in user_ndcg5 if 0.4 <= ndcg < 0.6),
    "0.6-0.8": sum(1 for ndcg in user_ndcg5 if 0.6 <= ndcg < 0.8),
    "0.8-1.0": sum(1 for ndcg in user_ndcg5 if 0.8 <= ndcg <= 1.0),
}

print(f"\n nDCG@5 Distribution")
for range_name, count in ndcg5_ranges.items():
    percentage = (count / len(user_ndcg5)) * 100
    print(f"{range_name}: {count} users ({percentage:.1f}%)")

# Summary table
print(f"\n PERFORMANCE SUMMARY")
print(f"{'Metric':<12} {'Mean':<8} {'Median':<8} {'Std':<8}")
print(f"{'-'*40}")
print(f"{'AUC':<12} {np.mean([auc for auc in user_aucs]):<8.4f} {np.median([auc for auc in user_aucs]):<8.4f} {np.std([auc for auc in user_aucs]):<8.4f}")
print(f"{'MRR':<12} {np.mean(user_mrrs):<8.4f} {np.median(user_mrrs):<8.4f} {np.std(user_mrrs):<8.4f}")
print(f"{'nDCG@5':<12} {np.mean(user_ndcg5):<8.4f} {np.median(user_ndcg5):<8.4f} {np.std(user_ndcg5):<8.4f}")
print(f"{'nDCG@10':<12} {np.mean(user_ndcg10):<8.4f} {np.median(user_ndcg10):<8.4f} {np.std(user_ndcg10):<8.4f}")

Calculating nDCG@5 and nDCG@10...

 nDCG@5 Results
Mean nDCG@5: 0.2770
Median nDCG@5: 0.0000
Standard Deviation: 0.3387
Min nDCG@5: 0.0000
Max nDCG@5: 1.0000

 nDCG@10 Results 
Mean nDCG@10: 0.3395
Median nDCG@10: 0.3244
Standard Deviation: 0.3202
Min nDCG@10: 0.0000
Max nDCG@10: 1.0000

 nDCG@5 Distribution
0.0-0.2: 55 users (55.0%)
0.2-0.4: 8 users (8.0%)
0.4-0.6: 15 users (15.0%)
0.6-0.8: 12 users (12.0%)
0.8-1.0: 10 users (10.0%)

 PERFORMANCE SUMMARY
Metric       Mean     Median   Std     
----------------------------------------
AUC          0.5000   0.5000   0.0000  
MRR          0.2848   0.1667   0.3005  
nDCG@5       0.2770   0.0000   0.3387  
nDCG@10      0.3395   0.3244   0.3202  


In [17]:
# LLM-only Baseline Evaluation 


print("Calculating AUC, MRR, nDCG@5, nDCG@10 for LLM-only approach...")

def create_llm_only_user_profiles(evaluation_data, behaviors_val, news_entities):
    """ simple user profiles without KG context"""
    print("Creating LLM-only user profiles...")
    
    llm_user_profiles = {}
    
    for user_data in evaluation_data:
        user_id = user_data['user_id']
        
        # users history from behaviors data 
        user_behaviors = behaviors_val[behaviors_val['user_id'] == user_id]
        
        if not user_behaviors.empty:
            history = user_behaviors.iloc[0]['history']
            
            # entity counting from history 
            entity_counts = defaultdict(int)
            
            if pd.notna(history):
                history_articles = history.split()[:10]  
                
                for article_id in history_articles:
                    if article_id in news_entities:
                        entities = news_entities[article_id]
                        for entity in entities:
                            entity_counts[entity] += 1
            
            llm_user_profiles[user_id] = dict(entity_counts)
        else:
            llm_user_profiles[user_id] = {}
    
    print(f"Created profiles for {len(llm_user_profiles)} users")
    return llm_user_profiles

def calculate_llm_only_score(user_profile, article_entities):
    """Calculate simple relevance score without KG context"""
    if not user_profile or not article_entities:
        return 0.0
    
    # overlap scoring  (count matching entities only)
    overlap_score = 0.0
    total_user_interactions = sum(user_profile.values())
    
    if total_user_interactions > 0:
        for entity in article_entities:
            if entity in user_profile:
                # normalized score
                overlap_score += user_profile[entity] / total_user_interactions
        
        # Normalize by article entity count
        overlap_score = overlap_score / len(article_entities)
    
    return min(overlap_score, 1.0) 

def calculate_auc_per_user(prediction_scores, true_labels):
    """Calculate AUC for a single user"""
    if len(set(true_labels)) < 2:
        return None  
    
    try:
        return roc_auc_score(true_labels, prediction_scores)
    except:
        return None

def calculate_mrr_per_user(prediction_scores, true_labels):
    """Calculate MRR for a single user"""
    scored_items = list(zip(prediction_scores, true_labels))
    scored_items.sort(key=lambda x: x[0], reverse=True)
    
    for rank, (score, label) in enumerate(scored_items, 1):
        if label == 1:  # first clicked item by user
            return 1.0 / rank
    
    return 0.0  # in case no clicked items found

def calculate_dcg(relevances, k):
    """Calculate DCG@k"""
    dcg = 0.0
    for i in range(min(k, len(relevances))):
        dcg += relevances[i] / np.log2(i + 2)
    return dcg

def calculate_ndcg_per_user(prediction_scores, true_labels, k):
    """Calculate nDCG@k for a single user"""
    scored_items = list(zip(prediction_scores, true_labels))
    scored_items.sort(key=lambda x: x[0], reverse=True)
    
    ranked_relevances = [label for score, label in scored_items]
    dcg_k = calculate_dcg(ranked_relevances, k)
    
    ideal_relevances = sorted(true_labels, reverse=True)
    idcg_k = calculate_dcg(ideal_relevances, k)
    
    if idcg_k == 0:
        return 0.0
    return dcg_k / idcg_k



Calculating AUC, MRR, nDCG@5, nDCG@10 for LLM-only approach...


In [18]:

# create user profiles fro LLM-only approach
llm_user_profiles = create_llm_only_user_profiles(evaluation_data, behaviors_val, news_entities)

# prediction scores for LLM-only approach
print("Calculating LLM-only prediction scores...")

llm_evaluation_data = []

for user_data in evaluation_data[:100]:  #take first 100 users
    user_id = user_data['user_id']
    news_ids = user_data['news_ids']
    click_labels = user_data['click_labels']
    
    # Get user profile
    user_profile = llm_user_profiles.get(user_id, {})
    
    # Calculate prediction scores for each article
    prediction_scores = []
    
    for news_id in news_ids:
        article_entities = news_entities.get(news_id, [])
        score = calculate_llm_only_score(user_profile, article_entities)
        prediction_scores.append(score)
    
    if len(prediction_scores) > 0:
        llm_evaluation_data.append({
            'user_id': user_id,
            'prediction_scores': prediction_scores,
            'true_labels': click_labels,
            'num_impressions': len(prediction_scores)
        })

print(f"Generated LLM-only scores for {len(llm_evaluation_data)} users")



# METRICS 
# AUC

llm_user_aucs = []
valid_users_auc = 0

for user_data in llm_evaluation_data:
    prediction_scores = user_data['prediction_scores']
    true_labels = user_data['true_labels']
    
    auc_score = calculate_auc_per_user(prediction_scores, true_labels)
    
    if auc_score is not None:
        llm_user_aucs.append(auc_score)
        valid_users_auc += 1

# MRR 
llm_user_mrrs = []

for user_data in llm_evaluation_data:
    prediction_scores = user_data['prediction_scores']
    true_labels = user_data['true_labels']
    
    mrr_score = calculate_mrr_per_user(prediction_scores, true_labels)
    llm_user_mrrs.append(mrr_score)


# nDCG@k
llm_user_ndcg5 = []
llm_user_ndcg10 = []

for user_data in llm_evaluation_data:
    prediction_scores = user_data['prediction_scores']
    true_labels = user_data['true_labels']
    
    ndcg5 = calculate_ndcg_per_user(prediction_scores, true_labels, 5)
    ndcg10 = calculate_ndcg_per_user(prediction_scores, true_labels, 10)
    
    llm_user_ndcg5.append(ndcg5)
    llm_user_ndcg10.append(ndcg10)

print("\n")

# results

print("LLM-only Baseline Results")
print("\n")

if llm_user_aucs:
    print(f"\nAUC Results:")
    print(f"Valid users for AUC calculation: {valid_users_auc}/{len(llm_evaluation_data)}")
    print(f"Mean AUC: {np.mean(llm_user_aucs):.4f}")
    print(f"Median AUC: {np.median(llm_user_aucs):.4f}")
    print(f"Standard Deviation: {np.std(llm_user_aucs):.4f}")
    print(f"Min AUC: {min(llm_user_aucs):.4f}")
    print(f"Max AUC: {max(llm_user_aucs):.4f}")

print(f"\nMRR Results:")
print(f"Mean MRR: {np.mean(llm_user_mrrs):.4f}")
print(f"Median MRR: {np.median(llm_user_mrrs):.4f}")
print(f"Standard Deviation: {np.std(llm_user_mrrs):.4f}")
print(f"Min MRR: {min(llm_user_mrrs):.4f}")
print(f"Max MRR: {max(llm_user_mrrs):.4f}")

print(f"\nnDCG@5 Results:")
print(f"Mean nDCG@5: {np.mean(llm_user_ndcg5):.4f}")
print(f"Median nDCG@5: {np.median(llm_user_ndcg5):.4f}")
print(f"Standard Deviation: {np.std(llm_user_ndcg5):.4f}")
print(f"Min nDCG@5: {min(llm_user_ndcg5):.4f}")
print(f"Max nDCG@5: {max(llm_user_ndcg5):.4f}")

print(f"\nnDCG@10 Results:")
print(f"Mean nDCG@10: {np.mean(llm_user_ndcg10):.4f}")
print(f"Median nDCG@10: {np.median(llm_user_ndcg10):.4f}")
print(f"Standard Deviation: {np.std(llm_user_ndcg10):.4f}")
print(f"Min nDCG@10: {min(llm_user_ndcg10):.4f}")
print(f"Max nDCG@10: {max(llm_user_ndcg10):.4f}")

# Summary Table
print(f"\n" + "="*60)
print("LLM-ONLY BASELINE SUMMARY")
print("="*60)
print(f"{'Metric':<12} {'Mean':<8} {'Median':<8} {'Std':<8}")
print(f"{'-'*40}")
if llm_user_aucs:
    print(f"{'AUC':<12} {np.mean(llm_user_aucs):<8.4f} {np.median(llm_user_aucs):<8.4f} {np.std(llm_user_aucs):<8.4f}")
print(f"{'MRR':<12} {np.mean(llm_user_mrrs):<8.4f} {np.median(llm_user_mrrs):<8.4f} {np.std(llm_user_mrrs):<8.4f}")
print(f"{'nDCG@5':<12} {np.mean(llm_user_ndcg5):<8.4f} {np.median(llm_user_ndcg5):<8.4f} {np.std(llm_user_ndcg5):<8.4f}")
print(f"{'nDCG@10':<12} {np.mean(llm_user_ndcg10):<8.4f} {np.median(llm_user_ndcg10):<8.4f} {np.std(llm_user_ndcg10):<8.4f}")

# Distribution Analysis
print(f"\nPerformance Distribution:")
if llm_user_aucs:
    auc_ranges = {
        "0.0-0.3 (Poor)": sum(1 for auc in llm_user_aucs if 0.0 <= auc < 0.3),
        "0.3-0.5 (Below Random)": sum(1 for auc in llm_user_aucs if 0.3 <= auc < 0.5),
        "0.5-0.7 (Fair)": sum(1 for auc in llm_user_aucs if 0.5 <= auc < 0.7),
        "0.7-0.9 (Good)": sum(1 for auc in llm_user_aucs if 0.7 <= auc < 0.9),
        "0.9-1.0 (Excellent)": sum(1 for auc in llm_user_aucs if 0.9 <= auc <= 1.0),
    }
    
    print("AUC Distribution:")
    for range_name, count in auc_ranges.items():
        percentage = (count / len(llm_user_aucs)) * 100
        print(f"  {range_name}: {count} users ({percentage:.1f}%)")

# Sample Analysis
print(f"\nSample User Analysis (First 3 Users):")
for i, user_data in enumerate(llm_evaluation_data[:3]):
    prediction_scores = user_data['prediction_scores']
    true_labels = user_data['true_labels']
    
    clicked_count = sum(true_labels)
    avg_score = np.mean(prediction_scores)
    
    print(f"User {i+1}: {clicked_count} clicks out of {len(true_labels)} impressions")
    print(f"  Average prediction score: {avg_score:.4f}")
    print(f"  Score range: {min(prediction_scores):.4f} - {max(prediction_scores):.4f}")
    
    if clicked_count > 0:
        clicked_scores = [prediction_scores[j] for j, label in enumerate(true_labels) if label == 1]
        print(f"  Clicked article scores: {[f'{score:.4f}' for score in clicked_scores]}")

print("\n")
print("LLM-only Baseline Evaluation done!")

Creating LLM-only user profiles...
Created profiles for 996 users
Calculating LLM-only prediction scores...
Generated LLM-only scores for 100 users


LLM-only Baseline Results



AUC Results:
Valid users for AUC calculation: 100/100
Mean AUC: 0.4999
Median AUC: 0.5000
Standard Deviation: 0.1088
Min AUC: 0.3205
Max AUC: 1.0000

MRR Results:
Mean MRR: 0.2870
Median MRR: 0.1667
Standard Deviation: 0.3100
Min MRR: 0.0103
Max MRR: 1.0000

nDCG@5 Results:
Mean nDCG@5: 0.2823
Median nDCG@5: 0.0000
Standard Deviation: 0.3441
Min nDCG@5: 0.0000
Max nDCG@5: 1.0000

nDCG@10 Results:
Mean nDCG@10: 0.3340
Median nDCG@10: 0.3155
Standard Deviation: 0.3271
Min nDCG@10: 0.0000
Max nDCG@10: 1.0000

LLM-ONLY BASELINE SUMMARY
Metric       Mean     Median   Std     
----------------------------------------
AUC          0.4999   0.5000   0.1088  
MRR          0.2870   0.1667   0.3100  
nDCG@5       0.2823   0.0000   0.3441  
nDCG@10      0.3340   0.3155   0.3271  

Performance Distribution:
AUC Distributio