In [2]:
import os
import re
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk import download
from scipy.stats import entropy
from rouge_score import rouge_scorer

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jenishkothari/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jenishkothari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def load_documents(duc_path):
    docs_path = os.path.join(duc_path, "docs")
    summaries_path = os.path.join(duc_path, "Summaries")
    
    documents = {}
    gold_summaries = {}

    for filename in os.listdir(docs_path):
        filepath = os.path.join(docs_path, filename)
        if os.path.isfile(filepath):
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                documents[filename] = f.read()
    
    for filename in os.listdir(summaries_path):
        if filename.endswith(".txt"):
            filepath = os.path.join(summaries_path, filename)
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
                match = re.search(r'Abstract:(.*?)(?:Introduction:|$)', content, re.DOTALL)
                if match:
                    doc_key = filename.replace(".txt", "")
                    gold_summaries[doc_key] = match.group(1).strip()
    
    return documents, gold_summaries


In [4]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
    sentences = sent_tokenize(text)
    preprocessed_sentences = []
    
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        filtered_words = [stemmer.stem(w) for w in words if w.isalnum() and w not in stop_words]
        preprocessed_sentences.append(' '.join(filtered_words))
    
    return sentences, preprocessed_sentences

In [5]:
def train_lda(documents, n_topics=30):
    all_cleaned_docs = []

    for doc in documents.values():
        _, cleaned = preprocess(doc)
        all_cleaned_docs.append(' '.join(cleaned))  # full doc as one string
    
    vectorizer = CountVectorizer()
    doc_term_matrix = vectorizer.fit_transform(all_cleaned_docs)
    
    lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42, max_iter=20)
    lda_model.fit(doc_term_matrix)
    
    return lda_model, vectorizer


In [None]:
def get_topic_distribution(text_list, vectorizer, lda_model, n_topics):
    text = ' '.join(text_list)
    X = vectorizer.transform([text])
    topic_distribution = lda_model.transform(X)[0]
    topic_distribution = np.where(topic_distribution == 0, 1e-10, topic_distribution)
    return topic_distribution


In [7]:
def kl_sum_topic_based(original_sentences, cleaned_sentences, vectorizer, lda_model, PD, n_topics=30, max_sentences=5):
    selected = []
    used_indices = set()

    while len(selected) < max_sentences:
        min_kl = float('inf')
        best_idx = -1

        for i, cleaned_sentence in enumerate(cleaned_sentences):
            if i in used_indices:
                continue

            # Build candidate summary
            summary_text = [cleaned_sentences[j] for j in selected] + [cleaned_sentence]
            PS = get_topic_distribution(summary_text, vectorizer, lda_model, n_topics)
            kl_div = entropy(PD, PS)

            if kl_div < min_kl:
                min_kl = kl_div
                best_idx = i

        if best_idx != -1:
            used_indices.add(best_idx)
            selected.append(best_idx)
        else:
            break

    return ' '.join([original_sentences[i] for i in selected])


In [10]:
duc_path = "DUC2001"
documents, gold_summaries = load_documents(duc_path)
lda_model, vectorizer = train_lda(documents, n_topics=30)

# Pick one doc
doc_id = list(documents.keys())[3]
sentences, cleaned = preprocess(documents[doc_id])
PD = get_topic_distribution(cleaned, vectorizer, lda_model, n_topics=30)

# Generate summary
generated_summary = kl_sum_topic_based(sentences, cleaned, vectorizer, lda_model, PD, n_topics=30, max_sentences=5)

print("📃 Generated Summary:")
print(generated_summary)
doc_id = doc_id.lower()
if doc_id in gold_summaries:
    print("\n🟨 Gold Summary:")
    print(gold_summaries[doc_id])


📃 Generated Summary:

<DOC>
<DOCNO> AP900629-0260 </DOCNO>
<FILEID>AP-NR-06-29-90 0653EDT</FILEID>
<FIRST>a f BC-Chunnel Adv02   06-29 1083</FIRST>
<SECOND>BC-Chunnel, Adv 02,1122</SECOND>
<NOTE>$adv02</NOTE>
<NOTE>For release Monday July 2</NOTE>
<HEAD>England-France Tunnel Halfway There Despite Problems</HEAD>
<HEAD>LaserPhoto LON23 of June 26; Graphic</HEAD>
<BYLINE>By COTTEN TIMBERLAKE</BYLINE>
<BYLINE>Associated Press Writer</BYLINE>
<DATELINE>LONDON (AP) </DATELINE>
<TEXT>
   It's been described as the largest current civil
engineering project, a multibillion dollar link that will help
revolutionize Europe's economy and physically end Britain's
historic isolation, a dream born in Napoleon's day. In October, concern about the rising pricetag drove Eurotunnel's
banking syndicate to freeze funds for three months until the
company reached a truce with Trans-Manche Link, the consortium of
10 British and French contractors doing the construction, over
responsibility for $1.7 billion in