<a href="https://colab.research.google.com/github/Nanda654/HEADS/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 2: Import Libraries and Load Models

import torch
from transformers import LongformerModel, LongformerTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import numpy as np
import spacy

print("Loading Longformer model and tokenizer...")
# Checkpoint shared by the tokenizer and the encoder below.
# 'allenai/longformer-base-4096' is the base-size checkpoint; the larger
# 'allenai/longformer-large-4096' is an alternative when enough GPU memory is available.
model_name = 'allenai/longformer-base-4096'
tokenizer = LongformerTokenizer.from_pretrained(model_name)
model = LongformerModel.from_pretrained(model_name)

# Pick CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Both calls return the module itself: inference (eval) mode, then placement on `device`.
model = model.eval().to(device)
print(f"Using device: {device}")
print("Longformer model loaded.")

print("Loading spaCy model for sentence segmentation...")
# spaCy pipeline loaded here for sentence segmentation.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raised OSError (treated here as "package not installed"):
    # download the package once and retry the load.
    print("spaCy model 'en_core_web_sm' not found. Downloading...")
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
print("spaCy model loaded.")

Loading Longformer model and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/597M [00:00<?, ?B/s]

Using device: cpu
Longformer model loaded.
Loading spaCy model for sentence segmentation...
spaCy model loaded.


In [2]:
# Cell 4: Centroid-Based Summarization Function

def centroid_summarization(text, num_sentences=3, lambda_param=0.7):
    """
    Generates an extractive summary using a centroid-based approach.

    Every sentence is scored by the cosine similarity of its embedding to the
    mean of all sentence embeddings (the document centroid). Sentences are then
    picked greedily with Maximal Marginal Relevance (MMR), which is always
    applied: the first pick is the most relevant sentence, and every later pick
    maximizes

        lambda_param * relevance - (1 - lambda_param) * redundancy

    where redundancy is the highest cosine similarity between the candidate and
    any sentence already picked.

    NOTE(review): this function relies on `get_sentence_embeddings(text)`, which
    is not defined in the visible part of this notebook; it must be defined in
    an earlier cell before this function is called. It is assumed to return
    `(sentences, embeddings)` with one embedding row per sentence (NumPy array)
    -- TODO confirm.

    Args:
        text (str): The input document.
        num_sentences (int): The desired number of sentences in the summary.
            Capped at the number of sentences in the document.
        lambda_param (float): Relevance/diversity trade-off in [0, 1].
            0 favours diversity, 1 favours relevance. Defaults to 0.7.

    Returns:
        list: A list of strings, where each string is a sentence in the summary,
        in original document order. Empty when there is nothing to summarize or
        an argument is invalid.
    """
    print("\n--- Starting Centroid-Based Summarization ---")
    sentences, embeddings = get_sentence_embeddings(text)

    if not sentences or embeddings.shape[0] == 0:
        print("  No sentences or embeddings generated. Cannot summarize.")
        return []

    if num_sentences <= 0:
        print("  Number of sentences for summary must be positive.")
        return []

    if not 0.0 <= lambda_param <= 1.0:
        print("  lambda_param must be between 0 and 1.")
        return []

    num_sentences_to_extract = min(num_sentences, len(sentences))

    # Relevance of each sentence = cosine similarity to the centroid of all embeddings.
    document_centroid = np.mean(embeddings, axis=0)
    similarities = cosine_similarity(embeddings, document_centroid.reshape(1, -1)).flatten()

    # All per-candidate arrays below are kept in descending-relevance order so that
    # np.argmax (first maximum wins) breaks score ties in favour of the more relevant sentence.
    ranked_indices = np.argsort(similarities)[::-1]
    ranked_relevance = similarities[ranked_indices]

    is_available = np.ones(len(ranked_indices), dtype=bool)
    ranked_redundancy = None  # running max similarity to the sentences picked so far
    selected_indices = []

    for _ in range(num_sentences_to_extract):
        if selected_indices:
            # Only the most recent pick can raise a candidate's redundancy, so one
            # vectorized similarity call per round keeps the running maximum current.
            newest_embedding = embeddings[selected_indices[-1]].reshape(1, -1)
            similarity_to_newest = cosine_similarity(embeddings, newest_embedding).flatten()[ranked_indices]
            if ranked_redundancy is None:
                ranked_redundancy = similarity_to_newest
            else:
                ranked_redundancy = np.maximum(ranked_redundancy, similarity_to_newest)
            ranked_scores = lambda_param * ranked_relevance - (1 - lambda_param) * ranked_redundancy
        else:
            # Nothing picked yet: the score is plain relevance.
            ranked_scores = ranked_relevance.astype(float)

        # -inf marks candidates that are already picked or have an undefined (NaN) score,
        # so every finite score -- including the theoretical minimum of -1.0 -- can still win.
        ranked_scores = np.where(is_available & ~np.isnan(ranked_scores), ranked_scores, -np.inf)
        best_position = int(np.argmax(ranked_scores))
        if ranked_scores[best_position] == -np.inf:
            # No selectable candidate remains.
            break

        selected_indices.append(int(ranked_indices[best_position]))
        is_available[best_position] = False

    # Present the summary in original document order.
    selected_indices.sort()
    final_summary = [sentences[idx] for idx in selected_indices]

    print("--- Centroid-Based Summarization Complete ---")
    return final_summary

In [3]:
# Cell 5: K-Means Based Summarization Function

def kmeans_summarization(text, num_clusters=5, num_sentences_per_cluster=1):
    """
    Builds an extractive summary by clustering sentence embeddings with K-Means.

    Sentences are grouped into clusters; within each cluster the sentences whose
    embeddings have the smallest cosine distance to the cluster centre are taken
    as its representatives. The chosen sentences are returned in the order in
    which they appear in the document.

    NOTE(review): relies on `get_sentence_embeddings(text)`, which is not defined
    in the visible part of this notebook and must exist before this is called;
    it is assumed to return `(sentences, embeddings)` with one embedding row per
    sentence -- TODO confirm.

    Args:
        text (str): The input document.
        num_clusters (int): Requested number of clusters (topics). Capped at the
                            number of sentences; indirectly sets summary length.
        num_sentences_per_cluster (int): How many sentences to take from each cluster.

    Returns:
        list: The summary as a list of sentence strings (empty if nothing can be
        summarized or an argument is not positive).
    """
    print("\n--- Starting K-Means Based Summarization ---")
    sentences, embeddings = get_sentence_embeddings(text)

    if not sentences or embeddings.shape[0] == 0:
        print("  No sentences or embeddings generated. Cannot summarize.")
        return []

    if num_clusters <= 0 or num_sentences_per_cluster <= 0:
        print("  Number of clusters and sentences per cluster must be positive.")
        return []

    # Never ask for more clusters than there are sentences.
    n_clusters = min(num_clusters, len(sentences))
    if n_clusters == 0:
        print("  Not enough sentences to form clusters.")
        return []

    # Fixed random_state keeps the clustering repeatable between runs.
    fitted = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto').fit(embeddings)
    labels = fitted.labels_
    centers = fitted.cluster_centers_

    # Maps original sentence index -> sentence text; the keys double as the "already chosen" set.
    chosen = {}

    for cluster_id in range(n_clusters):
        member_indices = np.flatnonzero(labels == cluster_id)
        if member_indices.size == 0:
            continue

        # Cosine distance (1 - cosine similarity) from each member to this cluster's centre.
        centre_row = centers[cluster_id][np.newaxis, :]
        member_distances = cdist(embeddings[member_indices], centre_row, 'cosine').ravel()

        # Walk the members from nearest to farthest until the per-cluster quota is met.
        nearest_first = member_indices[np.argsort(member_distances)]
        taken = 0
        for sentence_idx in nearest_first:
            if sentence_idx in chosen:
                continue
            chosen[sentence_idx] = sentences[sentence_idx]
            taken += 1
            if taken >= num_sentences_per_cluster:
                break

    # Restore original document order.
    final_summary = [chosen[sentence_idx] for sentence_idx in sorted(chosen)]

    print("--- K-Means Based Summarization Complete ---")
    return final_summary

In [5]:
# Cell 6: Example Usage and Testing

# Sample input for the demo below: a four-paragraph English text about AI
# (healthcare, finance, everyday use and ethics, research outlook).
# The literal starts and ends with a newline because of the triple-quote layout.
long_document = """
Artificial intelligence (AI) has rapidly transformed various sectors, revolutionizing industries from healthcare to finance. In healthcare, AI assists in diagnosing diseases earlier and more accurately, personalizing treatment plans, and accelerating drug discovery. Machine learning algorithms, a subset of AI, analyze vast amounts of patient data to identify patterns that human doctors might miss, leading to more effective interventions. For instance, AI-powered tools can detect subtle signs of retinopathy from eye scans, potentially preventing blindness. The integration of AI into electronic health records is also streamlining administrative tasks, freeing up medical professionals to focus more on patient care. This technological leap promises to enhance diagnostic capabilities and optimize treatment protocols significantly.

The financial industry also heavily leverages AI for fraud detection, algorithmic trading, and personalized financial advice. AI systems can monitor transactions in real-time, identifying unusual patterns indicative of fraudulent activity with high precision. Furthermore, robo-advisors powered by AI provide automated, data-driven investment advice tailored to individual risk tolerance and financial goals, making financial planning more accessible to a wider demographic. The use of AI in predicting market trends and managing portfolios is becoming increasingly sophisticated, offering new avenues for investors.

Beyond these, AI is deeply embedded in everyday life through virtual assistants like Siri and Alexa, recommendation engines on streaming platforms, and autonomous vehicles. AI's role in natural language processing (NLP) has led to advancements in language translation and sentiment analysis, impacting global communication and customer service. The ethical implications of AI, however, are a growing concern among researchers and policymakers. Issues such as algorithmic bias, job displacement due to automation, and privacy breaches require careful consideration and robust regulation. Ensuring transparency, fairness, and accountability in AI development is paramount to harnessing its benefits responsibly and mitigating potential harm to society.

Research in AI continues to advance at an astonishing pace, focusing on areas like explainable AI (XAI) to make AI decisions more understandable, and robust AI to improve performance in real-world, unpredictable environments. Novel architectures like generative adversarial networks (GANs) and reinforcement learning are pushing the boundaries of what AI can achieve, from creating realistic imagery to mastering complex games. The future of AI promises even more integration into society, with potential breakthroughs in areas like general artificial intelligence (AGI) and enhanced human-computer interaction, leading to smarter cities and more efficient resource management. However, achieving these advancements responsibly will necessitate ongoing collaboration between technologists, policymakers, and ethicists to address the complex challenges that arise. The rapid pace of development means that continuous public discourse and legislative adaptation are critical to navigate the challenges and maximize the societal benefits of AI, ensuring it serves humanity's best interests.
"""

# len() of a str is its character count, so the figure is labelled as characters.
print("Original Document Length (characters):", len(long_document))


# --- Test Centroid-Based Summarization ---
print("\n" + "="*80)
print("Centroid-Based Summary:")
# Experiment with different `num_sentences` values
centroid_summary = centroid_summarization(long_document, num_sentences=5)
for i, sent in enumerate(centroid_summary):
    print(f"{i+1}. {sent}")


# --- Test K-Means Based Summarization ---
print("\n" + "="*80)
print("K-Means Based Summary:")
# Experiment with different `num_clusters` and `num_sentences_per_cluster`.
# The summary holds at most num_clusters * num_sentences_per_cluster sentences, e.g.:
# num_clusters=5, num_sentences_per_cluster=1 -> up to 5 sentences (one per cluster)
# num_clusters=3, num_sentences_per_cluster=2 -> up to 6 sentences (a cluster with a
#                                                 single member contributes only one)
kmeans_summary = kmeans_summarization(long_document, num_clusters=4, num_sentences_per_cluster=1)
for i, sent in enumerate(kmeans_summary):
    print(f"{i+1}. {sent}")

print("\n" + "="*80)
print("\nSummarization process complete.")

Original Document Length (sentences): 3298

Centroid-Based Summary:

--- Starting Centroid-Based Summarization ---


NameError: name 'get_sentence_embeddings' is not defined