<a href="https://colab.research.google.com/github/Nanda654/HEADS/blob/main/Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset preparation using TF-IDF cosine similarity to the reference summary

In [None]:
import os
import json
from datasets import load_dataset, DatasetDict, Dataset
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer # Explicitly import PunktSentenceTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm # Import tqdm for progress bar

# --- 0. Configuration ---
# Directory holding the existing govreport_raw JSON files (train/validation/test).
GOVREPORT_RAW_DIR = "./govreport_raw"
# Directory that receives the newly generated dataset with extractive summaries.
GENERATED_DATASET_DIR = "./govreport_tfidf_vscode2" # New output folder name for TF-IDF approach
os.makedirs(GENERATED_DATASET_DIR, exist_ok=True)

# Language label forwarded to the processing function.
LANGUAGE = "english"
# Similarity threshold to determine if an original sentence is "related" to the abstractive summary.
# Adjust this value (0.0 to 1.0) to control how strict the relation must be.
# Higher values mean only very similar sentences are included.
SIMILARITY_THRESHOLD = 0.5

# --- NLTK Resource Download and Tokenizer Initialization ---
# The tokenizer is created once here and shared by everything below.
punkt_tokenizer_instance = None
try:
    # nltk.download() signals failure through its boolean return value rather
    # than by raising, so the result has to be checked before claiming success.
    if nltk.download('punkt', quiet=True):
        print("NLTK 'punkt' downloaded successfully.")
    else:
        print("Warning: NLTK 'punkt' could not be downloaded; continuing with the default tokenizer.")
    # NOTE(review): PunktSentenceTokenizer() built with no arguments appears to
    # start from default (untrained) parameters instead of loading the
    # downloaded English model -- confirm whether the pretrained model was intended.
    punkt_tokenizer_instance = PunktSentenceTokenizer()
    print("Punkt tokenizer instance created successfully.")
except Exception as e:
    print(f"Error downloading or initializing NLTK 'punkt' tokenizer: {e}")
    print("NLTK functions will likely fail. Please check internet connection and NLTK setup.")
    # The rest of the script depends on the tokenizer, so stop here with a
    # non-zero status (the interactive exit() helper would report success).
    raise SystemExit(1) from e

# --- 1. Load the existing govreport_raw dataset ---
print(f"Loading existing JSON files from: {GOVREPORT_RAW_DIR}")
# One JSON file per split, all living directly under GOVREPORT_RAW_DIR.
data_files = {
    "train": os.path.join(GOVREPORT_RAW_DIR, "train.json"),
    "validation": os.path.join(GOVREPORT_RAW_DIR, "validation.json"),
    "test": os.path.join(GOVREPORT_RAW_DIR, "test.json"),
}

try:
    # Only the load itself is guarded, so unrelated errors are not misreported
    # as dataset-loading failures.
    govreport_raw_data = load_dataset("json", data_files=data_files)
except Exception as e:
    print(f"\nError loading raw GovReport dataset from {GOVREPORT_RAW_DIR}: {e}")
    print("Please ensure the directory exists and contains 'train.json', 'validation.json', 'test.json'.")
    print("Exiting as the dataset could not be loaded.")
    # Exit with a non-zero status so callers can detect the failure.
    raise SystemExit(1) from e
else:
    print("\nRaw GovReport data loaded successfully!")
    print(govreport_raw_data)

# --- 2. Define a Processing Function to Generate Reference-Based Extractive Summaries ---
# Added 'language' and 'similarity_threshold' as arguments
def generate_reference_extractive_and_combine(example, language, similarity_threshold):
    """
    Build a reference-guided extractive summary for one dataset example.

    Every sentence of ``example["document"]`` is compared, via TF-IDF cosine
    similarity, with every sentence of ``example["summary"]``. A document
    sentence is kept when its best similarity to any summary sentence is at
    least ``similarity_threshold``; kept sentences are joined with single
    spaces in their original document order.

    Args:
        example: Mapping with a "document" and a "summary" entry.
        language: Accepted for interface compatibility; not used by this
            function (sentence splitting uses the module-level
            ``punkt_tokenizer_instance``).
        similarity_threshold: Minimum best-match cosine similarity for a
            document sentence to be selected.

    Returns:
        A dict with keys "original_text", "extractive_summary" and
        "abstractive_summary". "extractive_summary" is "" when either input
        is missing, not a string, blank, or yields no usable vocabulary.
    """
    original_text = example["document"]
    abstractive_summary = example["summary"]

    def _result(extractive_summary):
        # Single place that defines the output schema.
        return {
            "original_text": original_text,
            "extractive_summary": extractive_summary,
            "abstractive_summary": abstractive_summary
        }

    # Validate BEFORE tokenizing: the tokenizer cannot handle non-string input,
    # so this guard has to run first to be of any use.
    if not isinstance(original_text, str) or not original_text.strip() or \
       not isinstance(abstractive_summary, str) or not abstractive_summary.strip():
        return _result("")

    sentences_original = punkt_tokenizer_instance.tokenize(original_text)
    sentences_abstractive = punkt_tokenizer_instance.tokenize(abstractive_summary)

    if not sentences_original or not sentences_abstractive:
        return _result("")

    # Fit on both sentence sets so document and summary share one vocabulary.
    try:
        vectorizer = TfidfVectorizer().fit(sentences_original + sentences_abstractive)
    except ValueError:
        # Raised when no usable term survives tokenization (empty vocabulary),
        # e.g. text made only of punctuation or single characters.
        return _result("")

    original_vectors = vectorizer.transform(sentences_original)
    abstractive_vectors = vectorizer.transform(sentences_abstractive)

    # Shape: (num_original_sentences, num_abstractive_sentences).
    similarity_matrix = cosine_similarity(original_vectors, abstractive_vectors)

    # Best match of each document sentence against any summary sentence.
    best_similarity_per_sentence = np.max(similarity_matrix, axis=1)

    # zip() walks the sentences in document order, so the selection keeps it.
    extractive_summary = " ".join(
        sentence
        for sentence, best_similarity in zip(sentences_original, best_similarity_per_sentence)
        if best_similarity >= similarity_threshold
    )

    return _result(extractive_summary)

# --- 3. Main Dataset Processing ---
# For each split: build the extractive summaries, wrap them in a Dataset,
# show a sample, and write the split to GENERATED_DATASET_DIR.
processed_data = DatasetDict()

for split_name in ["train", "validation", "test"]:
    print(f"\nProcessing '{split_name}' split to generate reference-based extractive summaries (TF-IDF Cosine Similarity)...")

    # Examples are processed one by one so tqdm can report progress.
    current_split_data = [
        generate_reference_extractive_and_combine(
            example,
            language=LANGUAGE,
            similarity_threshold=SIMILARITY_THRESHOLD
        )
        for example in tqdm(govreport_raw_data[split_name], desc=f"Generating {split_name} extractive summaries")
    ]

    # Convert the list of dictionaries to a Dataset object
    processed_split = Dataset.from_list(current_split_data)
    processed_data[split_name] = processed_split

    print(f"'{split_name}' split processed. New features: {processed_split.column_names}")
    # Indexing an empty split would raise, so only show a sample when one exists.
    if len(processed_split) > 0:
        print(f"Sample from '{split_name}' split (first entry):\n{processed_split[0]}")
    else:
        print(f"'{split_name}' split is empty; no sample to show.")

    # Save the processed split to disk in JSON format
    save_path = os.path.join(GENERATED_DATASET_DIR, f"{split_name}.json")
    processed_split.to_json(save_path) # Save as JSONL by default for Datasets.to_json()
    print(f"'{split_name}' split saved to: {save_path}")

print(f"\nAll splits processed and saved to: {GENERATED_DATASET_DIR}")
print("\nSummary of final dataset structure:")
print(processed_data)

# The reload hints name each split explicitly in data_files: a single unnamed
# file is exposed as the 'train' split, so asking for split='validation' or
# split='test' on it would fail.
print("\nTo load this generated dataset later, you can use:")
print("from datasets import load_dataset")
print(f"train_data = load_dataset('json', data_files={{'train': '{GENERATED_DATASET_DIR}/train.json'}}, split='train')")
print(f"validation_data = load_dataset('json', data_files={{'validation': '{GENERATED_DATASET_DIR}/validation.json'}}, split='validation')")
print(f"test_data = load_dataset('json', data_files={{'test': '{GENERATED_DATASET_DIR}/test.json'}}, split='test')")


# Dataset generation using both TF-IDF centroid and greedy ROUGE approach

In [None]:
import os
import json
from datasets import load_dataset, DatasetDict, Dataset
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer # Explicitly import PunktSentenceTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from rouge_score import rouge_scorer # For ROUGE score calculation
from tqdm import tqdm # Import tqdm for progress bar

# --- 0. Configuration ---
# Directory holding the existing govreport_raw JSON files (train/validation/test).
GOVREPORT_RAW_DIR = "./govreport_raw"
# Directory that receives the newly generated dataset with the combined extractive summary.
GENERATED_DATASET_DIR = "./govreport_combined_extractive" # New output folder name for combined approach
os.makedirs(GENERATED_DATASET_DIR, exist_ok=True)

# Language label for the pipeline.
LANGUAGE = "english"

# Configuration for Combined Extractive Summarization
NUM_COMBINED_SENTENCES = 7 # Number of sentences for the combined hybrid summary
WEIGHT_ROUGE = 0.6 # Weight for ROUGE score in combined calculation (0.0 to 1.0)
WEIGHT_CENTROID = 0.4 # Weight for Centroid similarity in combined calculation (0.0 to 1.0)
# Ensure WEIGHT_ROUGE + WEIGHT_CENTROID = 1.0 for normalized combined score
total_weight = WEIGHT_ROUGE + WEIGHT_CENTROID
if not np.isclose(total_weight, 1.0):
    # A non-positive total cannot be rescaled to 1.0 (division by zero or a
    # sign flip), so reject it instead of normalizing.
    if total_weight <= 0:
        raise ValueError("WEIGHT_ROUGE + WEIGHT_CENTROID must be positive.")
    print("Warning: WEIGHT_ROUGE and WEIGHT_CENTROID do not sum to 1.0. Normalizing...")
    WEIGHT_ROUGE /= total_weight
    WEIGHT_CENTROID /= total_weight
    print(f"Normalized weights: WEIGHT_ROUGE={WEIGHT_ROUGE:.2f}, WEIGHT_CENTROID={WEIGHT_CENTROID:.2f}")


# --- NLTK Resource Download and Tokenizer Initialization ---
# The tokenizer is created once here and shared by everything below.
punkt_tokenizer_instance = None
try:
    # nltk.download() signals failure through its boolean return value rather
    # than by raising, so the result has to be checked before claiming success.
    if nltk.download('punkt', quiet=True):
        print("NLTK 'punkt' downloaded successfully.")
    else:
        print("Warning: NLTK 'punkt' could not be downloaded; continuing with the default tokenizer.")
    # NOTE(review): PunktSentenceTokenizer() built with no arguments appears to
    # start from default (untrained) parameters instead of loading the
    # downloaded English model -- confirm whether the pretrained model was intended.
    punkt_tokenizer_instance = PunktSentenceTokenizer()
    print("Punkt tokenizer instance created successfully.")
except Exception as e:
    print(f"Error downloading or initializing NLTK 'punkt' tokenizer: {e}")
    print("NLTK functions will likely fail. Please check internet connection and NLTK setup.")
    # The rest of the script depends on the tokenizer, so stop here with a
    # non-zero status (the interactive exit() helper would report success).
    raise SystemExit(1) from e

# --- 1. Load the existing govreport_raw dataset ---
print(f"Loading existing JSON files from: {GOVREPORT_RAW_DIR}")
# One JSON file per split, all living directly under GOVREPORT_RAW_DIR.
data_files = {
    "train": os.path.join(GOVREPORT_RAW_DIR, "train.json"),
    "validation": os.path.join(GOVREPORT_RAW_DIR, "validation.json"),
    "test": os.path.join(GOVREPORT_RAW_DIR, "test.json"),
}

try:
    # Only the load itself is guarded, so unrelated errors are not misreported
    # as dataset-loading failures.
    govreport_raw_data = load_dataset("json", data_files=data_files)
except Exception as e:
    print(f"\nError loading raw GovReport dataset from {GOVREPORT_RAW_DIR}: {e}")
    print("Please ensure the directory exists and contains 'train.json', 'validation.json', 'test.json'.")
    print("Exiting as the dataset could not be loaded.")
    # Exit with a non-zero status so callers can detect the failure.
    raise SystemExit(1) from e
else:
    print("\nRaw GovReport data loaded successfully!")
    print(govreport_raw_data)

# --- Helper for ROUGE (needed for combined_extractive_summary) ---
def postprocess_text_for_rouge(preds, labels):
    """
    Normalize prediction and label texts for ROUGE scoring.

    Each text is stripped of surrounding whitespace, split into sentences
    with the module-level ``punkt_tokenizer_instance``, and re-joined with
    one sentence per line (the layout ROUGE-Lsum style metrics expect).

    Returns:
        A ``(preds, labels)`` tuple of two new lists of strings.
    """
    def _one_sentence_per_line(texts):
        # Strip everything first, then split each text and join with newlines.
        stripped_texts = [text.strip() for text in texts]
        return [
            "\n".join(punkt_tokenizer_instance.tokenize(text))
            for text in stripped_texts
        ]

    return _one_sentence_per_line(preds), _one_sentence_per_line(labels)

# --- 2. Combined Extractive Summary Function ---
def combined_extractive_summary(document, reference_summary, num_sentences, weight_rouge, weight_centroid):
    """
    Select the top-scoring document sentences as an extractive summary.

    Each sentence of ``document`` receives a combined score:
    ``weight_rouge * ROUGE-L F1 (against reference_summary)
    + weight_centroid * cosine similarity to the document's TF-IDF centroid``.
    The ``num_sentences`` best sentences are returned, joined with single
    spaces in their original document order.

    Args:
        document: Source text to extract sentences from.
        reference_summary: Reference text used for the ROUGE-L component.
        num_sentences: Maximum number of sentences to keep.
        weight_rouge: Weight of the ROUGE-L F1 component.
        weight_centroid: Weight of the centroid-similarity component.

    Returns:
        The extractive summary string, or "" when an input is not a string,
        is blank, ``num_sentences`` is not positive, or the document yields
        no usable TF-IDF vocabulary.
    """
    # Cheap input validation first, before any tokenization or scoring.
    if not isinstance(document, str) or not isinstance(reference_summary, str):
        return ""
    # A non-positive count must yield nothing; slicing with a negative bound
    # would otherwise drop sentences from the wrong end of the ranking.
    if num_sentences <= 0 or not document.strip() or not reference_summary.strip():
        return ""

    sentences = punkt_tokenizer_instance.tokenize(document)
    if not sentences:
        return ""

    # --- TF-IDF centroid similarity for each sentence ---
    try:
        vectorizer = TfidfVectorizer().fit(sentences)
    except ValueError:
        # Raised when no usable term survives tokenization (empty vocabulary).
        return ""
    original_vectors = vectorizer.transform(sentences)

    # Densify once and reuse the result for the centroid.
    document_centroid = np.mean(original_vectors.toarray(), axis=0)
    if np.all(document_centroid == 0):
        # An all-zero centroid has no direction to compare sentences against.
        return ""

    sentence_centroid_similarities = cosine_similarity(
        document_centroid.reshape(1, -1), original_vectors
    )[0]

    # --- ROUGE-L F1 of each sentence against the reference ---
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    # The reference does not change per sentence, so everything is normalized
    # in one call instead of re-processing the reference inside the loop.
    processed_sentences, processed_references = postprocess_text_for_rouge(
        sentences, [reference_summary]
    )
    processed_reference = processed_references[0]
    sentence_rouge_scores = [
        rouge_scorer_instance.score(processed_reference, processed_sentence)['rougeL'].fmeasure
        for processed_sentence in processed_sentences
    ]

    # --- Combine scores and select top sentences ---
    combined_scores = [
        (weight_rouge * rouge_f1) + (weight_centroid * centroid_similarity)
        for rouge_f1, centroid_similarity in zip(sentence_rouge_scores, sentence_centroid_similarities)
    ]

    # Stable descending sort: sentences with equal scores keep document order.
    ranked_indices = sorted(
        range(len(sentences)), key=combined_scores.__getitem__, reverse=True
    )

    # Restore document order among the selected sentences.
    selected_indices = sorted(ranked_indices[:num_sentences])

    return " ".join(sentences[i] for i in selected_indices)

# --- 3. Main Processing Function to Generate the Combined Summary ---
def generate_combined_extractive_summary_for_example(example):
    """
    Produce the output record for one example, including its combined
    (ROUGE + centroid) extractive summary.

    Reads ``example["document"]`` and ``example["summary"]``. When both are
    non-blank strings, the summary comes from ``combined_extractive_summary``
    using the module-level NUM_COMBINED_SENTENCES, WEIGHT_ROUGE and
    WEIGHT_CENTROID settings; otherwise the summary field is "".

    Returns:
        A dict with keys "original_text", "extractive_summary_combined" and
        "abstractive_summary".
    """
    source_document = example["document"]
    reference = example["summary"]

    def _is_usable(text):
        # Only non-blank strings can be summarized.
        return isinstance(text, str) and bool(text.strip())

    summary_text = ""
    if _is_usable(source_document) and _is_usable(reference):
        summary_text = combined_extractive_summary(
            source_document,
            reference,
            num_sentences=NUM_COMBINED_SENTENCES,
            weight_rouge=WEIGHT_ROUGE,
            weight_centroid=WEIGHT_CENTROID
        )

    return {
        "original_text": source_document,
        "extractive_summary_combined": summary_text,
        "abstractive_summary": reference
    }

# --- 4. Main Dataset Processing Loop ---
# For each split: build the combined extractive summaries, wrap them in a
# Dataset, show a sample, and write the split to GENERATED_DATASET_DIR.
processed_data = DatasetDict()

for split_name in ["train", "validation", "test"]:
    print(f"\nProcessing '{split_name}' split to generate combined extractive summaries...")

    # Examples are processed one by one so tqdm can report progress.
    current_split_data = [
        generate_combined_extractive_summary_for_example(example)
        for example in tqdm(govreport_raw_data[split_name], desc=f"Generating {split_name} combined summaries")
    ]

    # Convert the list of dictionaries to a Dataset object
    processed_split = Dataset.from_list(current_split_data)
    processed_data[split_name] = processed_split

    print(f"'{split_name}' split processed. New features: {processed_split.column_names}")
    # Indexing an empty split would raise, so only show a sample when one exists.
    if len(processed_split) > 0:
        print(f"Sample from '{split_name}' split (first entry):\n{processed_split[0]}")
    else:
        print(f"'{split_name}' split is empty; no sample to show.")

    # Save the processed split to disk in JSON format
    save_path = os.path.join(GENERATED_DATASET_DIR, f"{split_name}.json")
    processed_split.to_json(save_path) # Save as JSONL by default for Datasets.to_json()
    print(f"'{split_name}' split saved to: {save_path}")

print(f"\nAll splits processed and saved to: {GENERATED_DATASET_DIR}")
print("\nSummary of final dataset structure:")
print(processed_data)

# The reload hints name each split explicitly in data_files: a single unnamed
# file is exposed as the 'train' split, so asking for split='validation' or
# split='test' on it would fail.
print("\nTo load this generated dataset later, you can use:")
print("from datasets import load_dataset")
print(f"train_data = load_dataset('json', data_files={{'train': '{GENERATED_DATASET_DIR}/train.json'}}, split='train')")
print(f"validation_data = load_dataset('json', data_files={{'validation': '{GENERATED_DATASET_DIR}/validation.json'}}, split='validation')")
print(f"test_data = load_dataset('json', data_files={{'test': '{GENERATED_DATASET_DIR}/test.json'}}, split='test')")
