# <span style="color:#FF8888;">FastText Model using <span style="color: #1E90FF;"> Semantic</span> and <span style="color: #1E90FF;">Statistical</span> Features </span>

# 📥 Install Libraries

In [1]:
!pip install fasttext
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


# 📚 Import Libraries

In [2]:
import os
import re
import json
import math
import copy
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from collections import Counter, defaultdict
import glob
from nltk.util import ngrams
import fasttext
import fasttext.util
import sys
from contextlib import redirect_stdout, redirect_stderr
from tqdm import tqdm
from rouge import Rouge

# ⚙️ Settings

In [3]:
# Label of the sentence-representation technique used by this notebook.
representation_technique = "fasttext"
similarity_measure = 1 # 1 means original cosine similarity and 2 means modified cosine similarity
# When True, build_graph_with_fasttext only keeps edges whose normalized weight exceeds edge_weight_threshold.
with_edge_thresholding = True
edge_weight_threshold = 0.9
# Fraction of the original document's sentence count that sentence_selection targets for the summary.
summary_ratio = 0.3
ranking_method = 1 # 1 means weighted degree centrality, 2 means semantic TextRank, any other value (e.g. 3) means PageRank
# Iteration cap used by the semantic TextRank loop and passed to nx.pagerank as max_iter.
max_iterations = 100
# When True, sentence_selection skips a candidate whose cosine similarity to an already
# selected sentence exceeds redundancy_threshold.
with_redundancy_reduction = False
redundancy_threshold = 0.98

# 📂 Load JSON Data and Extract Sentences with Indexing

In [4]:
def load_json_file(file_path):
    """Read the UTF-8 encoded file at ``file_path`` and return its decoded JSON content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def extract_sentences(data, skip_title=True):
    """Flatten the nested paragraph -> sentence mapping into a list of sentences.

    Args:
        data: Mapping of paragraph key -> mapping of sentence key -> value
        skip_title: When True, the paragraph stored under key '0' (the title) is ignored

    Returns:
        A list of tuples (sentence_text, (paragraph_idx, sentence_idx)), in the
        iteration order of ``data``. Only values that are strings are kept.
    """
    collected = []

    for paragraph_key in data:
        # Paragraph '0' holds the title
        if not (skip_title and paragraph_key == '0'):
            paragraph = data[paragraph_key]
            collected.extend(
                (paragraph[sentence_key], (paragraph_key, sentence_key))
                for sentence_key in paragraph
                if isinstance(paragraph[sentence_key], str)
            )

    return collected

# 🧠📥 Load FastText Arabic Model

In [5]:
def load_fasttext_model(model_path):
    """
    Load a pre-trained FastText model from disk.
    
    Args:
        model_path: Path to a pre-trained FastText model file; it is passed
            unchanged to ``fasttext.load_model``
        
    Returns:
        Loaded FastText model
    """
    # Load model from specified path
    return fasttext.load_model(model_path)

# 🧮🧠 Compute FastText Embeddings

In [6]:
def compute_sentence_embeddings(sentence_data, model):
    """
    Build one feature row per sentence and normalize the rows.

    Each row is the FastText sentence vector with the sentence's statistical
    score appended as one extra component; the stacked matrix is then passed
    through sklearn's ``normalize`` row-wise (axis=1, default norm).

    Args:
        sentence_data: Iterable of tuples
            (sentence_text, (para_idx, sent_idx), statistical_score)
        model: Object exposing ``get_sentence_vector(text)``

    Returns:
        2-D array with one normalized row per sentence
    """
    feature_rows = [
        np.append(model.get_sentence_vector(text), stat_score)
        for text, (_para_idx, _sent_idx), stat_score in sentence_data
    ]
    return normalize(np.vstack(feature_rows), axis=1)

def compute_title_embedding(title_text, model):
    """
    Embed the title in the same feature space as the sentence embeddings.

    The FastText sentence vector of the title is extended with a placeholder
    statistical score of 0.0 (so its length matches the sentence rows) and then
    normalized with sklearn's ``normalize``.

    Returns:
        1-D np.ndarray of length vector_dim + 1
    """
    title_vector = model.get_sentence_vector(title_text)

    # The title has no statistical score; pad with 0.0 to keep dimensions aligned
    padded = np.append(title_vector, 0.0)

    # normalize expects a 2-D input, so wrap the vector as a single row and unwrap it again
    as_row = padded.reshape(1, -1)
    return normalize(as_row, axis=1)[0]


# Location of the pre-trained FastText binary inside the Kaggle input directory.
model_path = "/kaggle/input/fastetxt-model/cc.ar.300.bin"    
# Load FastText model
# The resulting module-level `model` is read as a global by process_all_files_with_fasttext.
model = load_fasttext_model(model_path)

# 📏 Compute Similarity

In [7]:
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between v1 and v2, or 0 if either has zero length."""
    numerator = np.dot(v1, v2)
    lengths = (np.linalg.norm(v1), np.linalg.norm(v2))

    # A zero-length vector has no direction; report "no similarity" instead of dividing by zero
    if any(length == 0 for length in lengths):
        return 0

    return numerator / (lengths[0] * lengths[1])

def modified_similarity(v1, v2, title, keyphrase_score_v2):
    """
    Cosine similarity of v1 and v2, scaled up by properties of the second sentence.

    The scaling factor is 1 + keyphrase_score_v2 + cosine_similarity(v2, title),
    so the result is not symmetric in (v1, v2).
    """
    pair_similarity = cosine_similarity(v1, v2)
    title_affinity = cosine_similarity(v2, title)
    boost = 1 + keyphrase_score_v2 + title_affinity
    return pair_similarity * boost

# 📈 Build the Graph

In [8]:
def build_graph_with_fasttext(sentence_data, title_text, scores, model):
    """
    Build an undirected sentence graph from FastText-based embeddings.

    Every sentence becomes a node. Every unordered pair of sentences gets a
    similarity value: plain cosine similarity, or ``modified_similarity`` when
    the global ``similarity_measure`` equals 2. The pairwise values are min-max
    normalized to [0, 1] and added as weighted edges; when the global
    ``with_edge_thresholding`` is set, only edges whose normalized weight is
    above the global ``edge_weight_threshold`` are kept.

    Args:
        sentence_data: List of tuples (sentence_text, (para_idx, sent_idx))
        title_text: The text of the document title
        scores: Nested mapping scores[para_idx][sent_idx] -> dict that may carry
            'statistical_score' and 'keyphrase_score'; anything missing counts as 0
        model: FastText model

    Returns:
        Tuple of (graph, embeddings)
    """
    def score_for(para_idx, sent_idx, field):
        # A missing paragraph, sentence or field all fall back to 0
        if para_idx in scores and sent_idx in scores[para_idx]:
            return scores[para_idx][sent_idx].get(field, 0)
        return 0

    # Sentence embeddings carry the statistical score as an extra component
    scored_sentences = [
        (text, (para_idx, sent_idx), score_for(para_idx, sent_idx, 'statistical_score'))
        for text, (para_idx, sent_idx) in sentence_data
    ]
    embeddings = compute_sentence_embeddings(scored_sentences, model)

    # One node per sentence; vector_idx links the node back to its embedding row
    G = nx.Graph()
    node_ids = []
    for position, (text, (para_idx, sent_idx)) in enumerate(sentence_data):
        node_id = f"{para_idx}_{sent_idx}"
        node_ids.append(node_id)
        G.add_node(
            node_id,
            text=text,
            para_idx=para_idx,
            sent_idx=sent_idx,
            vector_idx=position,
            keyphrase_score=score_for(para_idx, sent_idx, 'keyphrase_score'),
        )

    title_embedding = compute_title_embedding(title_text, model)

    # Raw similarity for every unordered pair (first < second)
    pair_nodes = []
    pair_scores = []
    sentence_count = len(node_ids)
    for first in range(sentence_count):
        for second in range(first + 1, sentence_count):
            if similarity_measure == 2:
                value = modified_similarity(
                    embeddings[first],
                    embeddings[second],
                    title_embedding,
                    G.nodes[node_ids[second]]['keyphrase_score'],
                )
            else:
                value = cosine_similarity(embeddings[first], embeddings[second])

            pair_scores.append(value)
            pair_nodes.append((node_ids[first], node_ids[second]))

    if pair_scores:
        lowest = min(pair_scores)
        highest = max(pair_scores)

        if highest > lowest:
            spread = highest - lowest
            scaled = [(value - lowest) / spread for value in pair_scores]
        else:
            # All pairs are equally similar; min-max scaling is undefined, so use 1.0 everywhere
            scaled = [1.0] * len(pair_scores)

        for (node_a, node_b), weight in zip(pair_nodes, scaled):
            if not with_edge_thresholding or weight > edge_weight_threshold:
                G.add_edge(node_a, node_b, weight=weight)

    return G, embeddings

# 👀 Visualize Graph

In [9]:
def visualize_graph(G, filename=None):
    """Draw the sentence graph with matplotlib and show the figure.

    Args:
        G: Graph whose nodes carry 'para_idx' / 'sent_idx' attributes and whose
            edges carry a 'weight' attribute
        filename: Name used in the plot title; 'Sample' is used when it is falsy
    """
    plt.figure(figsize=(12, 10))

    # A fixed seed keeps the spring layout identical between runs
    layout = nx.spring_layout(G, seed=42)

    nx.draw_networkx_nodes(G, layout, node_size=300, node_color='lightblue')

    # Line thickness grows with the edge weight
    edge_widths = [G[u][v]['weight'] * 3 for u, v in G.edges()]
    nx.draw_networkx_edges(G, layout, width=edge_widths, alpha=0.7)

    # Label every node with "<paragraph>_<sentence>"
    labels = {}
    for node, attrs in G.nodes(data=True):
        labels[node] = f"{attrs['para_idx']}_{attrs['sent_idx']}"
    nx.draw_networkx_labels(G, layout, labels=labels, font_size=10)

    graph_name = filename if filename else 'Sample'
    plt.title(f"Sentence Similarity Graph for {graph_name}")
    plt.axis('off')

    # The figure is displayed, not saved
    plt.show()

# 🛠️ Full Graph Construction Pipeline

In [10]:
def process_file_with_fasttext(file_path, scores_folder, model):
    """
    Turn one document JSON file into a sentence graph built on FastText embeddings.

    Args:
        file_path: Path to the JSON file to process
        scores_folder: Folder holding a scores JSON with the same file name;
            an empty score mapping is used when that file does not exist
        model: FastText model

    Returns:
        Dictionary with keys 'graph', 'vectors', 'sentences', 'title',
        'keyphrase_scores' and 'filename', or None when the document has no
        sentences besides the title
    """
    filename = os.path.basename(file_path)
    data = load_json_file(file_path)

    # Sentence scores live in a sibling folder under the same file name
    scores_path = os.path.join(scores_folder, filename)
    keyphrase_scores = load_json_file(scores_path) if os.path.exists(scores_path) else {}

    # The title is sentence '0' of paragraph '0'
    has_title = '0' in data and '0' in data['0']
    title_text = data['0']['0'] if has_title else ""

    sentence_data = extract_sentences(data, skip_title=True)
    if not sentence_data:
        print(f"No sentences found in {filename} after skipping title.")
        return None

    graph, embeddings = build_graph_with_fasttext(
        sentence_data, title_text, keyphrase_scores, model
    )

    return {
        'graph': graph,
        'vectors': embeddings,  # FastText-based sentence embeddings
        'sentences': sentence_data,
        'title': title_text,
        'keyphrase_scores': keyphrase_scores,
        'filename': filename,
    }

def process_all_files_with_fasttext(folder_path, scores_folder, fasttext_model=None):
    """
    Process all JSON files in the given folder with FastText embeddings.
    
    Args:
        folder_path: Path to the folder containing document JSON files
        scores_folder: Path to the folder containing sentence scores
        fasttext_model: FastText model to embed sentences with. When omitted,
            the notebook-level global ``model`` is used, which keeps existing
            two-argument calls working.
        
    Returns:
        Dictionary of processing results by filename. Files for which
        ``process_file_with_fasttext`` returns a falsy value are left out.
    """
    # Prefer the explicitly passed model; only fall back to hidden notebook state when none is given
    active_model = model if fasttext_model is None else fasttext_model

    # glob returns paths in arbitrary order; sorting makes the processing
    # (and therefore the result/printing) order reproducible across runs
    file_paths = sorted(glob.glob(os.path.join(folder_path, "*.json")))

    results = {}
    for file_path in tqdm(file_paths):
        result = process_file_with_fasttext(file_path, scores_folder, active_model)
        if result:
            results[result['filename']] = result
    
    return results

if __name__ == "__main__":
    # Kaggle input folders: the preprocessed documents and the per-document sentence scores
    documents_folder = "/kaggle/input/graduation-project/preprocessed_classical"
    scores_folder = "/kaggle/input/graduation-project/sentence_scores"
    
    # Process files with FastText
    # `results` is kept at module level; the ranking / sentence-selection cell reads it
    results = process_all_files_with_fasttext(documents_folder, scores_folder)

    print(f"Successfully processed {len(results)} files.")

100%|██████████| 153/153 [00:02<00:00, 60.69it/s]

Successfully processed 153 files.





# 🏆 Ranking

## Weighted Degree Centrality

In [11]:
def weighted_centrality(results, scores_dir="/kaggle/input/graduation-project/sentence_scores"):
    """
    Rank sentences in each file by weighted degree centrality.

    The score of a node is the sum, over its neighbors, of
    (edge weight) * (neighbor's 'statistical_score'). Nodes without neighbors
    therefore score 0.
    
    Args:
        results: Dictionary of results from process_all_files_with_fasttext
            (filename -> dict holding at least the 'graph')
        scores_dir: Folder containing one '<base filename>.json' statistical
            score file per document. Defaults to the Kaggle path used by this
            notebook.
        
    Returns:
        Dictionary with filename as key and list of (sentence_id, score, text) tuples as value,
        sorted by score in descending order
    """
    rankings = {}
    
    for filename, result in results.items():
        graph = result['graph']
        
        # Score files are named after the document, without path and extension
        base_filename = os.path.splitext(os.path.basename(filename))[0]
        scores_path = os.path.join(scores_dir, f"{base_filename}.json")
        try:
            statistical_scores = load_json_file(scores_path)
        except FileNotFoundError:
            print(f"Warning: Statistical scores file not found for {filename}. Using empty scores.")
            statistical_scores = {}
        
        sentence_scores = []
        for node_id in graph.nodes():
            centrality_score = 0
            
            for neighbor in graph.neighbors(node_id):
                # Edge weight is the (normalized) similarity between the two sentences
                similarity = graph[node_id][neighbor]['weight']
                
                # Neighbors without a recorded statistical score contribute 0
                neighbor_data = graph.nodes[neighbor]
                neighbor_stat_score = (
                    statistical_scores
                    .get(neighbor_data['para_idx'], {})
                    .get(neighbor_data['sent_idx'], {})
                    .get('statistical_score', 0)
                )
                
                centrality_score += similarity * neighbor_stat_score
            
            sentence_scores.append((node_id, centrality_score, graph.nodes[node_id]['text']))
        
        # Sort by score in descending order (stable: ties keep graph node order)
        sentence_scores.sort(key=lambda item: item[1], reverse=True)
        rankings[filename] = sentence_scores
    
    return rankings

## Semantic TextRank

In [12]:
def normalize_statistical_scores(statistical_scores):
    """
    Min-max scale every 'statistical_score' of a document to the range [0, 1].

    Args:
        statistical_scores: Nested mapping para_idx -> sent_idx -> dict; a
            missing 'statistical_score' entry is treated as 0

    Returns:
        A nested defaultdict with the same para/sent keys holding only the
        rescaled 'statistical_score'. If there is no sentence entry at all, the
        input object itself is returned unchanged.
    """
    raw_values = [
        entry.get('statistical_score', 0)
        for paragraph in statistical_scores.values()
        for entry in paragraph.values()
    ]

    if not raw_values:
        return statistical_scores  # nothing to rescale

    lowest = min(raw_values)
    # Floor the range so identical scores do not cause a division by zero
    span = max(max(raw_values) - lowest, 1e-10)

    rescaled = defaultdict(lambda: defaultdict(dict))
    for para_idx, paragraph in statistical_scores.items():
        for sent_idx, entry in paragraph.items():
            shifted = entry.get('statistical_score', 0) - lowest
            rescaled[para_idx][sent_idx]['statistical_score'] = shifted / span

    return rescaled


def semantic_TextRank(results, damping=0.85, alpha=0.5,
                      scores_dir="/kaggle/input/graduation-project/sentence_scores",
                      title_alpha=0.4):
    """
    Rank sentences by a TextRank-style graph score blended with statistical scores.

    For every document:
      1. the statistical scores are loaded from ``scores_dir`` and min-max
         normalized with ``normalize_statistical_scores``;
      2. node scores start at the graph's degree centrality and are updated as
         ``(1 - damping) + damping * sum_j(w_ij * score_j / sqrt(W_i * W_j))``,
         where ``W_x`` is the summed edge weight of node x, until the scores
         stop changing (atol 1e-6) or the global ``max_iterations`` is reached;
      3. the final score is ``a * graph_score + (1 - a) * statistical_score``
         with ``a = title_alpha`` for sentences of paragraph 0 and ``a = alpha``
         otherwise.

    Args:
        results: Dictionary filename -> result dict holding at least the 'graph'
        damping: Damping factor of the iterative update
        alpha: Weight of the graph score in the final blend
        scores_dir: Folder containing '<base filename>.json' score files.
            Defaults to the Kaggle path used by this notebook.
        title_alpha: Weight of the graph score for paragraph-0 (title) sentences

    Returns:
        Dictionary with filename as key and list of (sentence_id, score, text)
        tuples as value, sorted by score in descending order
    """
    rankings = {}
    
    for filename, result in results.items():
        graph = result['graph']
        
        base_filename = os.path.splitext(os.path.basename(filename))[0]
        scores_path = os.path.join(scores_dir, f"{base_filename}.json")
        
        try:
            statistical_scores = load_json_file(scores_path)
        except FileNotFoundError:
            print(f"Warning: Statistical scores file not found for {filename}. Using default scores.")
            statistical_scores = {}
        
        statistical_scores = normalize_statistical_scores(statistical_scores)
        
        nodes = list(graph.nodes())
        node_to_idx = {node: i for i, node in enumerate(nodes)}
        n = len(nodes)
        
        if n == 0:
            rankings[filename] = []
            continue
        
        # --- Degree centrality as initial score ---
        degree_centrality = nx.degree_centrality(graph)
        scores = np.array([degree_centrality.get(node, 0.0) for node in nodes])

        # --- Precompute summed edge weights for symmetric normalization ---
        # Floored at 1e-10 so isolated nodes never produce a zero denominator
        out_weights_sum = {
            node: max(sum(graph[node][nbr]['weight'] for nbr in graph.neighbors(node)), 1e-10)
            for node in nodes
        }
        
        # --- TextRank iterations with symmetric weight normalization ---
        for _ in range(max_iterations):
            new_scores = np.ones(n) * (1 - damping)
            for i, node_i in enumerate(nodes):
                score_sum = 0
                for neighbor in graph.neighbors(node_i):
                    j = node_to_idx[neighbor]
                    weight = graph[node_i][neighbor]['weight']
                    norm_factor = np.sqrt(out_weights_sum[node_i] * out_weights_sum[neighbor])
                    score_sum += (weight * scores[j]) / norm_factor
                new_scores[i] += damping * score_sum
            if np.allclose(scores, new_scores, atol=1e-6):
                break
            scores = new_scores
        
        # --- Combine with normalized statistical scores ---
        final_scores = np.zeros(n)
        for i, node in enumerate(nodes):
            node_data = graph.nodes[node]
            para_idx = node_data['para_idx']
            sent_idx = node_data['sent_idx']
            stat_score = statistical_scores.get(para_idx, {}).get(sent_idx, {}).get('statistical_score', 0)

            # Paragraph indices originate from JSON object keys and are therefore
            # strings; comparing against the int 0 could never match, so compare
            # the string form (this also accepts an int 0).
            is_title_sentence = str(para_idx) == '0'
            adaptive_alpha = title_alpha if is_title_sentence else alpha

            final_scores[i] = adaptive_alpha * scores[i] + (1 - adaptive_alpha) * stat_score
        
        # --- Store results ---
        sentence_scores = [
            (node, float(final_scores[i]), graph.nodes[node]['text']) for i, node in enumerate(nodes)
        ]
        sentence_scores.sort(key=lambda x: x[1], reverse=True)
        rankings[filename] = sentence_scores
    
    return rankings

## PageRank

In [13]:
def rank_sentences_by_textrank(results):
    """
    Rank sentences in each file with NetworkX's PageRank.

    The iteration cap is taken from the module-level ``max_iterations``
    setting; it is not a parameter of this function.
    
    Args:
        results: Dictionary filename -> result dict holding at least the 'graph'
        
    Returns:
        Dictionary with filename as key and list of (sentence_id, score, text) tuples as value,
        sorted by score in descending order
    """
    rankings = {}
    
    for filename, result in results.items():
        graph = result['graph']
        
        # PageRank over the sentence graph, capped at max_iterations
        textrank_scores = nx.pagerank(graph, max_iter=max_iterations)
        
        sentence_scores = [
            (node_id, score, graph.nodes[node_id]['text'])
            for node_id, score in textrank_scores.items()
        ]
        
        # Sort by score in descending order
        sentence_scores.sort(key=lambda item: item[1], reverse=True)
        rankings[filename] = sentence_scores
    
    return rankings

# Sentence Selection

In [14]:
def sentence_selection(rankings, file_results, original_folder):
    """
    Generate extractive summaries for each file by selecting top-ranked sentences
    and emitting their text from the original (non-preprocessed) files.

    The summary length is ``max(1, round(total_sentences * summary_ratio))``,
    where ``summary_ratio`` is the module-level setting. When the module-level
    ``with_redundancy_reduction`` flag is on, a candidate is skipped if its
    cosine similarity to any already selected sentence exceeds the module-level
    ``redundancy_threshold``. Selected sentences are emitted in document order.
    
    Args:
        rankings: Dictionary filename -> list of (sentence_id, score, ...) tuples,
            best first, as produced by the ranking functions
        file_results: Dictionary filename -> result dict holding 'graph' and 'vectors'
        original_folder: Path to folder containing the original sentence files
        
    Returns:
        Dictionary with filename as key and summary text as value. Files whose
        original file is missing are skipped with a warning.
    """
    summaries = {}
    
    for filename, ranked_sentences in rankings.items():
        # Load original sentences file
        original_file_path = os.path.join(original_folder, filename)
        try:
            with open(original_file_path, 'r', encoding='utf-8') as f:
                original_data = json.load(f)
        except FileNotFoundError:
            print(f"Warning: Original file {filename} not found in {original_folder}")
            continue
        
        # Count total sentences in original file
        total_sentences = sum(len(original_data.get(para, {})) for para in original_data)
        
        # Number of sentences for the summary; always at least one
        target_sentences = max(1, round(total_sentences * summary_ratio))
        
        # Graph nodes map a sentence id to the row of its embedding vector
        graph = file_results[filename]['graph']
        vectors = file_results[filename]['vectors']
        
        selected_sentences = []
        selected_vectors = []
        
        # Walk the ranking best-first until the target size is reached
        for sent_id, _score, *_ in ranked_sentences:
            if len(selected_sentences) >= target_sentences:
                break
                
            # Sentence ids have the form "<paragraph>_<sentence>"
            para_idx, sent_idx = sent_id.split('_')
            
            current_vector = vectors[graph.nodes[sent_id]['vector_idx']]
            
            # Skip candidates that are near-duplicates of an already selected sentence
            if with_redundancy_reduction and any(
                cosine_similarity(current_vector, kept_vector) > redundancy_threshold
                for kept_vector in selected_vectors
            ):
                continue
            
            # Extract original sentence from the original data
            try:
                original_text = original_data[para_idx][sent_idx]
            except KeyError:
                print(f"Warning: Sentence ID {sent_id} not found in original file {filename}")
                continue

            selected_sentences.append((int(para_idx), int(sent_idx), original_text))
            selected_vectors.append(current_vector)
        
        # Sort by paragraph index first, then sentence index for proper document flow
        selected_sentences.sort()
        
        # Combine sentences into summary
        summary = " ".join(text for _, _, text in selected_sentences)
        summaries[filename] = summary
        
        # An original file without sentences would otherwise divide by zero here
        if total_sentences:
            coverage = round(len(selected_sentences) / total_sentences * 100, 1)
        else:
            coverage = 0.0
        print(f"\nExtractive summary for {filename} ({len(selected_sentences)} of {total_sentences} sentences, {coverage}%):")
        print(summary)
    
    return summaries

In [15]:
# Example usage
if __name__ == "__main__": 
    # Pick the ranking algorithm from the `ranking_method` setting;
    # `results` is the module-level output of the graph-construction cell.
    if(ranking_method == 1):
        rankings = weighted_centrality(results)
    elif(ranking_method == 2):
        rankings = semantic_TextRank(results)
    else:
        # Any value other than 1 or 2 falls through to PageRank
        print("pagerank")
        rankings = rank_sentences_by_textrank(results)
    
    # Path to original sentences
    original_folder = "/kaggle/input/graduation-project/original_sentences"
    
    # Build (and print) one extractive summary per ranked file
    summaries = sentence_selection(
        rankings, 
        results, 
        original_folder, 
    )


Extractive summary for file143.json (4 of 13 sentences, 30.8%):
أول اسم ثابت لمدينة القدس هو أورسالم قبل خمسة آلاف عام ،  ويعنى أسسها سالم و قيل مدينة السلام ،  ثم ما لبثت تلك المدينة أن أخذت اسم يبوس نسبة إلى يبوسيون| و الذين يعتقد أنهم متفرعين من الكنعانيين ،  وقد بنوا قلعتها صهيون والتى تعنى بالكنعانية مرتفع كما بنوا هيكلا لإلههم سالم فكان بيتا للعبادة تسمى المدينة في الترجمة العربية للنصوص القديمة من الإنجيل والعهد القديم باسم أورشليم أو يروشليم, ويرى البعض أن اسم المدينة تعريب للاسم الكنعاني والعبري يروشلايم الذي معناه غير واضح ،  وقد يشير إلى إله كنعاني قديم اسمه شاليم ،  أو إلى العبارة بلد السلام بالعبرية أو بلغة سامية أخرى نسخة قديمة لإنجيل مرقس بالعربية 1590م يظهر فيها اسم يروشليم بناء على سفر الملوك الثاني فإن القدس كانت تعرف ب القدس نسبة إلى يبوسيون الذين يعتقد أنهم متفرعين من الكنعانيين و الذين يشار لهم بأنهم عربيو الأصول ،  ثم قام النبي داوود عليه السلام بالسيطرة على المنطقة و القدس و ذلك في عام 1004 قبل الميلاد في رسائل إسلامية باللغة العربية من القرون الوسطى ،  وبخصوص ف

# ⚖️ Evaluation

In [16]:
_ROUGE_METRICS = ('rouge-1', 'rouge-2', 'rouge-l')
_ROUGE_SCORE_TYPES = ('f', 'p', 'r')


def _new_score_table():
    """Return an empty ``{metric: {score_type: []}}`` accumulator."""
    return {
        metric: {score_type: [] for score_type in _ROUGE_SCORE_TYPES}
        for metric in _ROUGE_METRICS
    }


def _mean_score_table(score_table):
    """Collapse every score list to its mean; an empty list becomes 0.0."""
    return {
        metric: {
            score_type: (np.mean(values) if values else 0.0)
            for score_type, values in per_type.items()
        }
        for metric, per_type in score_table.items()
    }


def _print_rouge_table(scores):
    """Print one F1/precision/recall line per ROUGE metric."""
    for metric in _ROUGE_METRICS:
        row = scores[metric]
        print(f"  {metric.upper()}: F1: {row['f']:.4f}, Precision: {row['p']:.4f}, Recall: {row['r']:.4f}")


def _load_reference_summaries(reference_folder, ref_files):
    """Read the given reference files and return their non-empty, stripped contents."""
    loaded = []
    for ref_file in ref_files:
        ref_path = os.path.join(reference_folder, ref_file)
        try:
            with open(ref_path, 'r', encoding='utf-8') as f:
                ref_content = f.read().strip()
        except (OSError, UnicodeError) as e:
            # An unreadable reference is reported and ignored, not fatal.
            print(f"Error reading reference file {ref_file}: {e}")
            continue

        if not ref_content:
            print(f"Warning: Empty reference file {ref_file}. Ignoring this reference.")
            continue

        loaded.append(ref_content)
    return loaded


def evaluate_summaries_with_rouge(generated_summaries, reference_folder, scorer=None):
    """
    Evaluate generated summaries against reference summaries using ROUGE metrics.

    For each generated summary, every file in ``reference_folder`` whose name
    starts with ``"<base filename>_sum"`` is treated as a reference. The scores
    against all usable references are averaged per file, and the per-file
    averages are then averaged into an overall score. Progress, warnings and
    the final scores are printed.

    Args:
        generated_summaries: Dictionary with filename as key and generated summary as value
        reference_folder: Path to folder containing reference summaries
        scorer: Optional object exposing ``get_scores(hypothesis, reference)``
            that returns a list whose first element maps 'rouge-1', 'rouge-2'
            and 'rouge-l' to dicts with 'f', 'p' and 'r' entries.
            Defaults to a fresh ``Rouge()`` instance.

    Returns:
        Dictionary with keys:
            'average': overall mean scores, ``{metric: {'f', 'p', 'r'}}``
            'per_file': ``{filename: {metric: {'f', 'p', 'r'}}}`` for evaluated files
            'evaluated_files_count': number of files that produced scores
            'total_files_count': number of generated summaries received
    """
    if scorer is None:
        scorer = Rouge()

    all_scores = {}
    aggregate_scores = _new_score_table()

    # os.listdir returns entries in arbitrary order; sort so that references
    # are always processed (and averaged) in the same order.
    reference_files = sorted(os.listdir(reference_folder))

    for filename, generated_summary in generated_summaries.items():
        # Checked once per file, before any file I/O: an empty summary cannot
        # be scored against any reference.
        if not isinstance(generated_summary, str) or not generated_summary.strip():
            print(f"Warning: Empty generated summary for {filename}. Skipping this file.")
            continue

        base_filename = os.path.splitext(filename)[0]
        file_refs = [f for f in reference_files if f.startswith(f"{base_filename}_sum")]

        if not file_refs:
            print(f"Warning: No reference summaries found for {filename}. Skipping this file.")
            continue

        valid_reference_summaries = _load_reference_summaries(reference_folder, file_refs)

        if not valid_reference_summaries:
            print(f"Warning: No valid reference summaries for {filename}. Skipping this file.")
            continue

        # Score the generated summary against each reference
        file_scores = _new_score_table()
        for ref_summary in valid_reference_summaries:
            try:
                scores = scorer.get_scores(generated_summary, ref_summary)[0]
                # Pull out every value before appending anything, so a
                # malformed result cannot leave the lists with unequal lengths.
                extracted = [
                    (metric, score_type, scores[metric][score_type])
                    for metric in _ROUGE_METRICS
                    for score_type in _ROUGE_SCORE_TYPES
                ]
            except Exception as e:
                # Deliberately broad: a scorer failure on one reference must
                # not abort the whole evaluation run.
                print(f"Error calculating ROUGE for {filename} with a reference: {e}")
                continue

            for metric, score_type, value in extracted:
                file_scores[metric][score_type].append(value)

        # Skip this file if no valid scores were calculated
        if not any(
            file_scores[metric][score_type]
            for metric in _ROUGE_METRICS
            for score_type in _ROUGE_SCORE_TYPES
        ):
            print(f"Warning: Could not calculate any valid ROUGE scores for {filename}. Skipping this file.")
            continue

        # Average across references, and feed the averages into the overall mean
        file_avg_scores = _mean_score_table(file_scores)
        for metric in _ROUGE_METRICS:
            for score_type in _ROUGE_SCORE_TYPES:
                if file_scores[metric][score_type]:
                    aggregate_scores[metric][score_type].append(file_avg_scores[metric][score_type])

        all_scores[filename] = file_avg_scores

        print(f"\nROUGE scores for {filename}:")
        _print_rouge_table(file_avg_scores)

    # Overall average of the per-file averages
    average_scores = _mean_score_table(aggregate_scores)

    evaluated_files_count = len(all_scores)
    total_files_count = len(generated_summaries)
    print(f"\nEvaluated {evaluated_files_count} out of {total_files_count} files.")

    print("\n" + "="*60)
    print("OVERALL AVERAGE ROUGE SCORES:")
    _print_rouge_table(average_scores)
    print("="*60)

    return {
        'average': average_scores,
        'per_file': all_scores,
        'evaluated_files_count': evaluated_files_count,
        'total_files_count': total_files_count
    }

# Run the ROUGE evaluation, then list the best and worst files by ROUGE-1 score
if __name__ == "__main__":
    reference_folder = "/kaggle/input/voted-sentences"

    # `summaries` holds the generated summaries from the extractive
    # summarization step in the previous cell.
    evaluation_results = evaluate_summaries_with_rouge(summaries, reference_folder)
    per_file_scores = evaluation_results['per_file']

    # One (filename, f1, precision, recall) tuple per evaluated file, ROUGE-1 only
    file_metrics = [
        (filename, scores['rouge-1']['f'], scores['rouge-1']['p'], scores['rouge-1']['r'])
        for filename, scores in per_file_scores.items()
    ]

    # Ascending rankings: worst files first, best files last
    f1_scores = sorted(((entry[0], entry[1]) for entry in file_metrics), key=lambda pair: pair[1])
    precision_scores = sorted(((entry[0], entry[2]) for entry in file_metrics), key=lambda pair: pair[1])
    recall_scores = sorted(((entry[0], entry[3]) for entry in file_metrics), key=lambda pair: pair[1])

    print("\nFILE PERFORMANCE ANALYSIS:")

    # Show at most three files at each end; fewer when fewer were evaluated
    worst_count = min(3, len(f1_scores))
    best_count = min(3, len(f1_scores))

    for label, ranked in (
        ("F1", f1_scores),
        ("Precision", precision_scores),
        ("Recall", recall_scores),
    ):
        if worst_count > 0:
            print(f"\nWorst performing files (ROUGE-1 {label}):")
            for filename, score in ranked[:worst_count]:
                print(f"  {filename}: {score:.4f}")

        if best_count > 0:
            print(f"\nBest performing files (ROUGE-1 {label}):")
            for filename, score in ranked[-best_count:]:
                print(f"  {filename}: {score:.4f}")


ROUGE scores for file123.json:
  ROUGE-1: F1: 0.7048, Precision: 1.0000, Recall: 0.5441
  ROUGE-2: F1: 0.7006, Precision: 0.9910, Recall: 0.5419
  ROUGE-L: F1: 0.7048, Precision: 1.0000, Recall: 0.5441

ROUGE scores for file129.json:
  ROUGE-1: F1: 0.4486, Precision: 0.6957, Recall: 0.3310
  ROUGE-2: F1: 0.3835, Precision: 0.6000, Recall: 0.2818
  ROUGE-L: F1: 0.4393, Precision: 0.6812, Recall: 0.3241

ROUGE scores for file134.json:
  ROUGE-1: F1: 0.3665, Precision: 0.2991, Recall: 0.4730
  ROUGE-2: F1: 0.2510, Precision: 0.2027, Recall: 0.3297
  ROUGE-L: F1: 0.3351, Precision: 0.2735, Recall: 0.4324

ROUGE scores for file150.json:
  ROUGE-1: F1: 0.4730, Precision: 0.3098, Recall: 1.0000
  ROUGE-2: F1: 0.4399, Precision: 0.2832, Recall: 0.9846
  ROUGE-L: F1: 0.4730, Precision: 0.3098, Recall: 1.0000

ROUGE scores for file25.json:
  ROUGE-1: F1: 0.2657, Precision: 0.1895, Recall: 0.4444
  ROUGE-2: F1: 0.1671, Precision: 0.1189, Recall: 0.2816
  ROUGE-L: F1: 0.2362, Precision: 0.1684, R