In [None]:
import numpy as np
import nltk
import os
import re
import string
import pyrouge
from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from scipy.stats import entropy  # KL Divergence

# Tokenizer data needed by sent_tokenize / word_tokenize. Recent NLTK releases
# load the 'punkt_tab' resource instead of the pickled 'punkt' models, so both
# are fetched to keep the script working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# ------------------------ Preprocessing Functions ------------------------

def preprocess_text(text):
    """Normalise raw text and split it into word tokens.

    The text is lower-cased, every run of digits is deleted, all characters in
    ``string.punctuation`` are stripped, and what remains is tokenised with
    NLTK's ``word_tokenize``. The token sequence is returned as-is.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    cleaned = without_digits.translate(punctuation_table)
    return word_tokenize(cleaned)

def get_word_distribution(text):
    """Return the relative frequency of every token in ``text``.

    Tokens are produced by ``preprocess_text``. The result maps each distinct
    token to its count divided by the total number of tokens; text that yields
    no tokens produces an empty dict.
    """
    frequencies = Counter(preprocess_text(text))
    token_total = sum(frequencies.values())
    distribution = {}
    for token, occurrences in frequencies.items():
        distribution[token] = occurrences / token_total
    return distribution

# ------------------------ KL-Sum (Word-Based) ------------------------

def kl_divergence(p, q):
    """Compute the KL divergence D(p || q) between two word distributions.

    Args:
        p: Mapping of word -> probability (or weight) for the first
            distribution.
        q: Mapping of word -> probability (or weight) for the reference
            distribution.

    Returns:
        The divergence as a float. ``scipy.stats.entropy`` normalises both
        vectors, so the inputs do not have to sum to exactly one. If ``p`` is
        empty the result is 0.0 when ``q`` is empty as well and ``inf``
        otherwise, because an empty distribution cannot be normalised.
    """
    if not p:
        return 0.0 if not q else float('inf')

    # Both mappings must be laid out over the SAME vocabulary in the SAME
    # order; taking .values() of each dict independently pairs up unrelated
    # words (or yields arrays of different lengths).
    vocabulary = list(dict.fromkeys([*q, *p]))
    p_probs = np.array([p.get(word, 0.0) for word in vocabulary], dtype=float)
    q_probs = np.array([q.get(word, 0.0) for word in vocabulary], dtype=float)

    # Smoothing: words absent from q get a tiny mass to avoid log(0).
    q_probs = np.where(q_probs == 0, 1e-10, q_probs)
    return entropy(p_probs, q_probs)

def kl_sum_word_based(document, summary_length):
    """Build an extractive summary with greedy, word-level KL-Sum.

    On each round the sentence whose addition gives the summary word
    distribution the smallest KL divergence from the document word
    distribution is appended to the summary and removed from the candidates.

    Args:
        document: Full text to summarise.
        summary_length: Maximum number of sentences to select.

    Returns:
        The selected sentences, in selection order, joined by single spaces.
        Fewer than ``summary_length`` sentences are returned when the document
        is shorter or when no remaining sentence can be scored.
    """
    sentences = sent_tokenize(document)
    PD = get_word_distribution(document)  # Fixed document distribution

    # Tokenise each sentence once up front instead of on every greedy round.
    candidates = [
        (sentence, Counter(preprocess_text(sentence))) for sentence in sentences
    ]
    summary = []
    PS = Counter()  # Raw word counts of the summary built so far

    while len(summary) < summary_length and candidates:
        best_index = None
        min_kl_value = float('inf')

        for index, (_, sentence_counts) in enumerate(candidates):
            temp_PS = PS.copy()
            temp_PS.update(sentence_counts)
            temp_total = sum(temp_PS.values())
            temp_dist = {word: count / temp_total for word, count in temp_PS.items()}

            kl_value = kl_divergence(temp_dist, PD)
            # Strict '<' keeps the earliest sentence on ties.
            if kl_value < min_kl_value:
                min_kl_value = kl_value
                best_index = index

        if best_index is None:
            # No candidate produced a divergence below infinity (e.g. NaN or
            # inf scores). Nothing would be removed, so stop instead of
            # looping forever.
            break

        chosen_sentence, chosen_counts = candidates.pop(best_index)
        summary.append(chosen_sentence)
        PS.update(chosen_counts)

    return ' '.join(summary)

# ------------------------ KL-Sum (Topic-Based with LDA) ------------------------

def get_lda_topic_distribution(text, lda_model, vectorizer):
    """Return the topic vector that ``lda_model`` assigns to a single text.

    ``text`` is wrapped in a one-element batch, vectorised with
    ``vectorizer.transform`` and passed to ``lda_model.transform``; the first
    (and only) row of that result is returned.
    """
    single_doc_batch = [text]
    topic_rows = lda_model.transform(vectorizer.transform(single_doc_batch))
    return topic_rows[0]

def kl_sum_topic_based(document, summary_length, lda_model, vectorizer):
    """Build an extractive summary with greedy KL-Sum over LDA topic vectors.

    On each round the sentence whose topic vector, added to those already
    selected, gives the smallest KL divergence from the document's topic
    vector is appended to the summary and removed from the candidates.

    Args:
        document: Full text to summarise.
        summary_length: Maximum number of sentences to select.
        lda_model: Fitted topic model exposing ``transform``.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        The selected sentences, in selection order, joined by single spaces.
        Fewer than ``summary_length`` sentences are returned when the document
        is shorter or when no remaining sentence can be scored.
    """
    sentences = sent_tokenize(document)
    PD = get_lda_topic_distribution(document, lda_model, vectorizer)

    # Infer each sentence's topic vector once instead of on every greedy round.
    candidates = [
        (sentence, get_lda_topic_distribution(sentence, lda_model, vectorizer))
        for sentence in sentences
    ]
    summary = []
    # Running, UNNORMALISED sum of the selected sentences' topic vectors
    # (all zeros while the summary is empty). Normalising this accumulator
    # after every pick would give the newest sentence half of the total
    # weight; keeping the raw sum weights every selected sentence equally.
    PS = np.zeros(len(PD))

    while len(summary) < summary_length and candidates:
        best_index = None
        min_kl_value = float('inf')

        for index, (_, sentence_topics) in enumerate(candidates):
            temp_PS = PS + sentence_topics
            temp_PS = temp_PS / temp_PS.sum()  # Normalize

            kl_value = entropy(temp_PS, PD)
            # Strict '<' keeps the earliest sentence on ties.
            if kl_value < min_kl_value:
                min_kl_value = kl_value
                best_index = index

        if best_index is None:
            # No candidate produced a divergence below infinity (e.g. NaN or
            # inf scores). Nothing would be removed, so stop instead of
            # looping forever.
            break

        chosen_sentence, chosen_topics = candidates.pop(best_index)
        summary.append(chosen_sentence)
        PS += chosen_topics

    return ' '.join(summary)

# ------------------------ ROUGE Evaluation ------------------------

def evaluate_with_rouge(pred_summaries, gold_summaries_dir):
    """Evaluate generated summaries against gold-standard summaries using ROUGE.

    Args:
        pred_summaries: Directory containing the generated summary files. Any
            value that is not a non-empty path falls back to
            ``"generated_summaries/"``.
        gold_summaries_dir: Directory containing the reference summaries,
            named ``gold_summary.<ID>.txt``.

    Returns:
        The output of ``Rouge155.convert_and_evaluate()``, which is also
        printed.
    """
    rouge = pyrouge.Rouge155()
    # Use the directory the caller passed in rather than silently ignoring it.
    if isinstance(pred_summaries, (str, os.PathLike)) and pred_summaries:
        rouge.system_dir = pred_summaries
    else:
        rouge.system_dir = "generated_summaries/"
    rouge.model_dir = gold_summaries_dir
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    rouge.system_filename_pattern = r'summary.(\d+).txt'
    rouge.model_filename_pattern = 'gold_summary.#ID#.txt'

    rouge_results = rouge.convert_and_evaluate()
    print(rouge_results)
    return rouge_results

# ------------------------ Running on Datasets ------------------------

# Load DUC Dataset
duc_data_dir = "DUC_Dataset/Documents/"
duc_gold_dir = "DUC_Dataset/Summaries/"
generated_dir = "generated_summaries/"

# Summaries are written into this directory; create it up front so that
# open(..., 'w') does not fail when it does not exist yet.
os.makedirs(generated_dir, exist_ok=True)

# NOTE(review): evaluate_with_rouge matches system files against
# 'summary.(\d+).txt', so only documents whose file name starts with digits
# will be picked up by ROUGE -- confirm against the dataset's naming scheme.
for file in sorted(os.listdir(duc_data_dir)):
    doc_path = os.path.join(duc_data_dir, file)
    if not os.path.isfile(doc_path):
        continue  # os.listdir also returns sub-directories; open() would fail on them

    with open(doc_path, 'r', encoding='utf-8') as f:
        document = f.read()

    # KL-Sum (Word-Based)
    summary_word = kl_sum_word_based(document, summary_length=5)
    with open(os.path.join(generated_dir, f"summary.{file}.txt"), 'w', encoding='utf-8') as f:
        f.write(summary_word)

    # KL-Sum (Topic-Based)
    # The vectorizer and LDA model are fitted on this single document only.
    vectorizer = CountVectorizer(stop_words='english')
    try:
        doc_term_matrix = vectorizer.fit_transform([document])
    except ValueError:
        # CountVectorizer raises ValueError when no vocabulary remains (empty
        # file or stop words only). Skip this document instead of aborting.
        print(f"Skipping topic-based summary for {file}: empty vocabulary")
        continue
    lda = LatentDirichletAllocation(n_components=10, random_state=42)
    lda.fit(doc_term_matrix)

    summary_topic = kl_sum_topic_based(document, summary_length=5, lda_model=lda, vectorizer=vectorizer)
    with open(os.path.join(generated_dir, f"summary_topic.{file}.txt"), 'w', encoding='utf-8') as f:
        f.write(summary_topic)

# Evaluate with ROUGE
evaluate_with_rouge(generated_dir, duc_gold_dir)
