<a href="https://colab.research.google.com/github/Nanda654/HEADS/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:


import torch
from transformers import LongformerModel, LongformerTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import numpy as np

# --- Load Longformer Model and Tokenizer ---
# The tokenizer and the encoder are both fetched from the same checkpoint name.
print("Loading Longformer model and tokenizer...")
model_name = 'allenai/longformer-base-4096'
tokenizer = LongformerTokenizer.from_pretrained(model_name)
model = LongformerModel.from_pretrained(model_name)

# --- Set Device (GPU if available, else CPU) ---
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
model.eval()  # inference only: switch the module to evaluation mode
print(f"Using device: {device}")
print("Longformer model loaded.")

# --- Helper Function for Sentence Embeddings ---
def get_sentence_embeddings(text, batch_size=4):
    """
    Split text into sentences and embed each sentence with the Longformer model.

    Sentences are taken from ``nlp(text).sents``, stripped, and empty ones are
    dropped. They are then tokenized and run through the module-level ``model``
    in groups of ``batch_size``. The embedding of a sentence is the last hidden
    state at token position 0.

    A batch that raises during tokenization or the forward pass is reported on
    stdout and replaced by zero vectors, so the returned array always has one
    row per returned sentence.

    Args:
        text (str): Document to split and embed.
        batch_size (int): Number of sentences per forward pass. Must be >= 1.

    Returns:
        sentences (list): List of original sentence strings.
        sentence_embeddings (np.array): One row per sentence; an empty 1-D
            array when no sentence was found.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently return sentences
    # without any embeddings, so reject it up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # NOTE(review): `nlp` is not created anywhere in this cell. It must already
    # exist as a module-level callable whose result exposes `.sents`
    # (presumably a spaCy pipeline loaded in an earlier cell -- confirm).
    doc = nlp(text)
    stripped_sentences = (sent.text.strip() for sent in doc.sents)
    sentences = [sentence for sentence in stripped_sentences if sentence]

    if not sentences:
        print("Warning: No valid sentences found in the input text.")
        return [], np.array([])

    print(f"Total sentences to process: {len(sentences)}")

    batch_embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i:i + batch_size]
        try:
            inputs = tokenizer(
                batch_sentences,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=tokenizer.model_max_length
            ).to(device)

            with torch.no_grad():
                outputs = model(**inputs)

            # Hidden state of the first token of every sequence in the batch.
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        except Exception as e:
            # Deliberate best-effort: keep going and keep rows aligned with
            # `sentences` by substituting zero vectors for the failed batch.
            print(f"Error processing batch of sentences (index {i}-{i+len(batch_sentences)-1}): {e}")
            # float32 so one failed batch does not upcast the whole result.
            cls_embeddings = np.zeros(
                (len(batch_sentences), model.config.hidden_size), dtype=np.float32
            )
        batch_embeddings.append(cls_embeddings)

    return sentences, np.concatenate(batch_embeddings, axis=0)

# --- Centroid-Based Summarization Function (Optimized to accept pre-calculated embeddings) ---
def centroid_summarization_optimized(sentences, embeddings, num_sentences=3, lambda_param=0.7):
    """
    Generates an extractive summary using a centroid-based approach.
    Accepts pre-calculated sentences and embeddings.

    Relevance of a sentence is the cosine similarity between its embedding and
    the mean of all embeddings. Sentences are then picked greedily with
    Maximal Marginal Relevance (MMR): the first pick is the most relevant
    sentence, and every later pick maximizes
    ``lambda_param * relevance - (1 - lambda_param) * redundancy``, where
    redundancy is the highest cosine similarity to any sentence already picked.

    Args:
        sentences (list): Sentence strings, aligned row-by-row with `embeddings`.
        embeddings (np.array): 2-D array with one embedding row per sentence.
        num_sentences (int): Maximum number of sentences to return.
        lambda_param (float): Relevance weight of the MMR score; the redundancy
            penalty is weighted by ``1 - lambda_param``.

    Returns:
        (summary_sentences, summary_indices): The selected sentences and their
        positions in `sentences`, both in document order. Two empty lists when
        the input is empty or `num_sentences` is not positive.
    """
    print("\n--- Starting Centroid-Based Summarization ---")
    if not sentences or embeddings.shape[0] == 0:
        print("  No sentences or embeddings provided. Cannot summarize.")
        return [], []

    if num_sentences <= 0:
        print("  Number of sentences for summary must be positive.")
        return [], []

    # Cannot pick more items than there are (sentence, embedding) pairs.
    num_sentences_to_extract = min(num_sentences, len(sentences), embeddings.shape[0])

    document_centroid = np.mean(embeddings, axis=0)
    similarities = cosine_similarity(embeddings, document_centroid.reshape(1, -1)).flatten()

    # Candidates are scanned in descending relevance so that ties in the MMR
    # score are resolved in favour of the more relevant sentence.
    ranked_indices = np.argsort(similarities)[::-1]

    # max_redundancy[i] = highest similarity of sentence i to any picked
    # sentence. It is updated once per round instead of being recomputed for
    # every candidate.
    max_redundancy = np.full(similarities.shape, -np.inf)
    is_available = np.ones(similarities.shape, dtype=bool)
    selected_indices = []

    for _ in range(num_sentences_to_extract):
        if selected_indices:
            newest_pick = embeddings[selected_indices[-1]].reshape(1, -1)
            max_redundancy = np.maximum(
                max_redundancy, cosine_similarity(embeddings, newest_pick).flatten()
            )
            mmr_scores = lambda_param * similarities - (1 - lambda_param) * max_redundancy
        else:
            # Nothing picked yet: there is no redundancy to penalize.
            mmr_scores = similarities

        # Mask picked sentences with -inf. A fixed sentinel such as -1 could
        # wrongly reject a valid score, because the MMR range includes -1.
        mmr_scores = np.where(is_available, mmr_scores, -np.inf)
        best_sentence_idx = int(ranked_indices[np.argmax(mmr_scores[ranked_indices])])
        selected_indices.append(best_sentence_idx)
        is_available[best_sentence_idx] = False

    # Present the summary in original document order.
    final_summary_indices = sorted(selected_indices)
    final_summary_sents = [sentences[idx] for idx in final_summary_indices]

    print("--- Centroid-Based Summarization Complete ---")
    return final_summary_sents, final_summary_indices

# --- K-Means Based Summarization Function (Optimized to accept pre-calculated embeddings) ---
def kmeans_summarization_optimized(sentences, embeddings, num_clusters=5, num_sentences_per_cluster=1):
    """
    Generates an extractive summary using K-Means clustering.
    Accepts pre-calculated sentences and embeddings.

    The embeddings are clustered with K-Means (fixed ``random_state=42``). From
    every non-empty cluster, the `num_sentences_per_cluster` members with the
    smallest cosine distance to the cluster centre are selected.

    Args:
        sentences (list): Sentence strings, aligned row-by-row with `embeddings`.
        embeddings (np.array): 2-D array with one embedding row per sentence.
        num_clusters (int): Requested number of clusters; capped at the number
            of available sentences/embedding rows.
        num_sentences_per_cluster (int): Sentences taken from each cluster.

    Returns:
        (summary_sentences, summary_indices): The selected sentences and their
        positions in `sentences`, both in document order. Two empty lists when
        the input is empty or a count argument is not positive.
    """
    print("\n--- Starting K-Means Based Summarization ---")
    if not sentences or embeddings.shape[0] == 0:
        print("  No sentences or embeddings provided. Cannot summarize.")
        return [], []

    if num_clusters <= 0 or num_sentences_per_cluster <= 0:
        print("  Number of clusters and sentences per cluster must be positive.")
        return [], []

    # K-Means cannot build more clusters than there are samples. The guards
    # above guarantee this value is at least 1.
    effective_num_clusters = min(num_clusters, len(sentences), embeddings.shape[0])

    kmeans = KMeans(n_clusters=effective_num_clusters, random_state=42, n_init='auto')
    kmeans.fit(embeddings)
    labels = kmeans.labels_
    centroids = kmeans.cluster_centers_

    selected_indices = []
    for cluster_id in range(effective_num_clusters):
        member_indices = np.flatnonzero(labels == cluster_id)
        if member_indices.size == 0:
            continue

        distances = cdist(
            embeddings[member_indices], centroids[cluster_id].reshape(1, -1), 'cosine'
        ).flatten()
        closest_first = member_indices[np.argsort(distances)]

        # Every sentence has exactly one label, so clusters never share
        # members and no cross-cluster de-duplication is needed.
        selected_indices.extend(
            int(idx) for idx in closest_first[:num_sentences_per_cluster]
        )

    # Present the summary in original document order.
    final_summary_indices = sorted(selected_indices)
    final_summary_sents = [sentences[idx] for idx in final_summary_indices]

    print("--- K-Means Based Summarization Complete ---")
    return final_summary_sents, final_summary_indices

# --- Combined Extractive Summarization Function (Optimized to accept pre-calculated embeddings) ---
def combined_extractive_summary_optimized(sentences, embeddings, total_summary_sentences=7,
                                centroid_sentences_to_propose=5,
                                kmeans_clusters_to_propose=4,
                                kmeans_sentences_per_cluster_to_propose=1,
                                lambda_param_mmr=0.7):
    """
    Generates a single extractive summary by combining candidates from
    both centroid-based and K-Means approaches, then using MMR for final selection.
    Accepts pre-calculated sentences and embeddings.

    The candidate pool is the union (by sentence index) of the sentences
    proposed by `centroid_summarization_optimized` and
    `kmeans_summarization_optimized`. Candidates are scored by cosine
    similarity to the mean of ALL document embeddings and then picked greedily
    by Maximal Marginal Relevance.

    Args:
        sentences (list): Sentence strings, aligned row-by-row with `embeddings`.
        embeddings (np.array): 2-D array with one embedding row per sentence.
        total_summary_sentences (int): Maximum length of the final summary.
        centroid_sentences_to_propose (int): Candidates requested from the
            centroid method.
        kmeans_clusters_to_propose (int): Clusters requested from the K-Means
            method.
        kmeans_sentences_per_cluster_to_propose (int): Candidates requested
            per K-Means cluster.
        lambda_param_mmr (float): Relevance weight used in the final MMR
            selection only; redundancy is weighted by ``1 - lambda_param_mmr``.

    Returns:
        list: The selected sentences in document order (empty when there is
        no input or no candidate).
    """
    print("\n--- Starting Combined Extractive Summarization ---")
    if not sentences or embeddings.shape[0] == 0:
        print("  No sentences or embeddings provided. Cannot summarize combined.")
        return []

    centroid_candidates_sents, centroid_candidates_indices = centroid_summarization_optimized(
        sentences, embeddings, num_sentences=centroid_sentences_to_propose
    )
    print(f"  Centroid proposed {len(centroid_candidates_sents)} candidates.")

    kmeans_candidates_sents, kmeans_candidates_indices = kmeans_summarization_optimized(
        sentences, embeddings, num_clusters=kmeans_clusters_to_propose, num_sentences_per_cluster=kmeans_sentences_per_cluster_to_propose
    )
    print(f"  K-Means proposed {len(kmeans_candidates_sents)} candidates.")

    # Keying by original sentence index merges both proposals and removes
    # duplicates in one step.
    combined_candidates_map = dict(zip(centroid_candidates_indices, centroid_candidates_sents))
    combined_candidates_map.update(zip(kmeans_candidates_indices, kmeans_candidates_sents))

    if not combined_candidates_map:
        print("  No unique candidates found after combining. Cannot generate combined summary.")
        return []

    # Candidate position p refers to original sentence
    # all_candidate_indices_sorted[p]; positions are in document order.
    all_candidate_indices_sorted = sorted(combined_candidates_map)
    all_candidate_sentences = [combined_candidates_map[idx] for idx in all_candidate_indices_sorted]
    all_candidate_embeddings = np.array([embeddings[idx] for idx in all_candidate_indices_sorted])

    num_sentences_to_extract = min(total_summary_sentences, len(all_candidate_sentences))
    print(f"  Total unique candidates: {len(all_candidate_sentences)}. Extracting {num_sentences_to_extract} for combined summary.")

    document_centroid = np.mean(embeddings, axis=0)
    candidate_similarities = cosine_similarity(all_candidate_embeddings, document_centroid.reshape(1, -1)).flatten()

    # Candidates are scanned in descending relevance so that ties in the MMR
    # score are resolved in favour of the more relevant candidate.
    ranked_positions = np.argsort(candidate_similarities)[::-1]

    # max_redundancy[p] = highest similarity of candidate p to any picked
    # candidate. It is updated once per round instead of being recomputed for
    # every candidate.
    max_redundancy = np.full(candidate_similarities.shape, -np.inf)
    is_available = np.ones(candidate_similarities.shape, dtype=bool)
    selected_positions = []

    # range() of a non-positive count is empty, so a non-positive
    # total_summary_sentences yields an empty summary.
    for _ in range(num_sentences_to_extract):
        if selected_positions:
            newest_pick = all_candidate_embeddings[selected_positions[-1]].reshape(1, -1)
            max_redundancy = np.maximum(
                max_redundancy, cosine_similarity(all_candidate_embeddings, newest_pick).flatten()
            )
            mmr_scores = lambda_param_mmr * candidate_similarities - (1 - lambda_param_mmr) * max_redundancy
        else:
            # Nothing picked yet: there is no redundancy to penalize.
            mmr_scores = candidate_similarities

        # Mask picked candidates with -inf. A fixed sentinel such as -1 could
        # wrongly reject a valid score, because the MMR range includes -1.
        mmr_scores = np.where(is_available, mmr_scores, -np.inf)
        best_position = int(ranked_positions[np.argmax(mmr_scores[ranked_positions])])
        selected_positions.append(best_position)
        is_available[best_position] = False

    # Candidate positions follow document order, so sorting them restores the
    # original sentence order.
    selected_positions.sort()
    final_summary = [all_candidate_sentences[position] for position in selected_positions]

    print("--- Combined Extractive Summarization Complete ---")
    return final_summary

# --- Example Usage and Testing ---
# Sample input for the demo: a four-paragraph English text about artificial
# intelligence. The leading and trailing newlines are part of the string.
long_document = """
Artificial intelligence (AI) has rapidly transformed various sectors, revolutionizing industries from healthcare to finance. In healthcare, AI assists in diagnosing diseases earlier and more accurately, personalizing treatment plans, and accelerating drug discovery. Machine learning algorithms, a subset of AI, analyze vast amounts of patient data to identify patterns that human doctors might miss, leading to more effective interventions. For instance, AI-powered tools can detect subtle signs of retinopathy from eye scans, potentially preventing blindness. The integration of AI into electronic health records is also streamlining administrative tasks, freeing up medical professionals to focus more on patient care. This technological leap promises to enhance diagnostic capabilities and optimize treatment protocols significantly.

The financial industry also heavily leverages AI for fraud detection, algorithmic trading, and personalized financial advice. AI systems can monitor transactions in real-time, identifying unusual patterns indicative of fraudulent activity with high precision. Furthermore, robo-advisors powered by AI provide automated, data-driven investment advice tailored to individual risk tolerance and financial goals, making financial planning more accessible to a wider demographic. The use of AI in predicting market trends and managing portfolios is becoming increasingly sophisticated, offering new avenues for investors.

Beyond these, AI is deeply embedded in everyday life through virtual assistants like Siri and Alexa, recommendation engines on streaming platforms, and autonomous vehicles. AI's role in natural language processing (NLP) has led to advancements in language translation and sentiment analysis, impacting global communication and customer service. The ethical implications of AI, however, are a growing concern among researchers and policymakers. Issues such as algorithmic bias, job displacement due to automation, and privacy breaches require careful consideration and robust regulation. Ensuring transparency, fairness, and accountability in AI development is paramount to harnessing its benefits responsibly.

Research in AI continues to advance at an astonishing pace, focusing on areas like explainable AI (XAI) to make AI decisions more understandable, and robust AI to improve performance in real-world, unpredictable environments. Novel architectures like generative adversarial networks (GANs) and reinforcement learning are pushing the boundaries of what AI can achieve, from creating realistic imagery to mastering complex games. The future of AI promises even more integration into society, with potential breakthroughs in areas like general artificial intelligence (AGI) and enhanced human-computer interaction, leading to smarter cities and more efficient resource management. However, achieving these advancements responsibly will necessitate ongoing collaboration between technologists, policymakers, and ethicists to address the complex challenges that arise. The rapid pace of development means that continuous public discourse and legislative adaptation are critical to navigate the challenges and maximize the societal benefits of AI, ensuring it serves humanity's best interests.
"""

# Count the sentences the segmenter finds in the sample document.
document_sentence_count = len(list(nlp(long_document).sents))
print("Original Document Length (sentences):", document_sentence_count)

# --- OPTIMIZATION: embed the document a single time and reuse the result ---
print("\nCalculating document embeddings (this might take a while for long texts)...")
sentences_list, embeddings_array = get_sentence_embeddings(long_document, batch_size=8)
print("Embeddings calculation complete.")


# --- Summary 1: centroid-based method on its own ---
print("\n" + "="*80)
print("Individual Centroid-Based Summary:")
centroid_summary, _ = centroid_summarization_optimized(
    sentences_list, embeddings_array, num_sentences=5
)
for position, sentence in enumerate(centroid_summary, start=1):
    print(f"{position}. {sentence}")


# --- Summary 2: K-Means method on its own ---
print("\n" + "="*80)
print("Individual K-Means Based Summary:")
kmeans_summary, _ = kmeans_summarization_optimized(
    sentences_list, embeddings_array, num_clusters=4, num_sentences_per_cluster=1
)
for position, sentence in enumerate(kmeans_summary, start=1):
    print(f"{position}. {sentence}")


# --- Summary 3: both candidate sources merged, then re-ranked ---
print("\n" + "="*80)
print("Combined Extractive Summary:")
combined_summary = combined_extractive_summary_optimized(
    sentences_list,
    embeddings_array,
    total_summary_sentences=6,
    centroid_sentences_to_propose=7,
    kmeans_clusters_to_propose=5,
    kmeans_sentences_per_cluster_to_propose=1
)
for position, sentence in enumerate(combined_summary, start=1):
    print(f"{position}. {sentence}")

print("\n" + "="*80)
print("\nAll summarization processes complete.")

Loading Longformer model and tokenizer...


Input ids are automatically padded to be a multiple of `config.attention_window`: 512


Using device: cpu
Longformer model loaded.
Original Document Length (sentences): 20

Calculating document embeddings (this might take a while for long texts)...
Total sentences to process: 20
Embeddings calculation complete.

Individual Centroid-Based Summary:

--- Starting Centroid-Based Summarization ---
--- Centroid-Based Summarization Complete ---
1. Artificial intelligence (AI) has rapidly transformed various sectors, revolutionizing industries from healthcare to finance.
2. The integration of AI into electronic health records is also streamlining administrative tasks, freeing up medical professionals to focus more on patient care.
3. Furthermore, robo-advisors powered by AI provide automated, data-driven investment advice tailored to individual risk tolerance and financial goals, making financial planning more accessible to a wider demographic.
4. The future of AI promises even more integration into society, with potential breakthroughs in areas like general artificial intelligen