In [2]:
import os
import re
import numpy as np
import nltk
from collections import Counter, defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk import download
from scipy.stats import entropy

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jenishkothari/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jenishkothari/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def load_documents(duc_path):
    docs_path = os.path.join(duc_path, "Docs")
    summaries_path = os.path.join(duc_path, "Summaries")
    
    documents = {}
    gold_summaries = {}

    for filename in os.listdir(docs_path):
        filepath = os.path.join(docs_path, filename)
        if os.path.isfile(filepath):
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                documents[filename] = f.read()

    for filename in os.listdir(summaries_path):
        if filename.endswith(".txt"):
            filepath = os.path.join(summaries_path, filename)
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
                match = re.search(r'Abstract:(.*?)(?:Introduction:|$)', content, re.DOTALL)
                if match:
                    doc_key = filename.replace(".txt", "")
                    gold_summaries[doc_key] = match.group(1).strip()
    
    return documents, gold_summaries


In [None]:
duc_path = "DUC2001"
documents, gold_summaries = load_documents(duc_path)

for i, (doc_id, text) in enumerate(documents.items()):
    print(f"\n📄 Document ID: {doc_id}\n{text[:300]}...")
    doc_id = doc_id.lower()
    if doc_id in gold_summaries:
        print(f"\n🟨 Matching Gold Summary:\n{gold_summaries[doc_id]}")
    else:
        print("\n❌ No matching summary found.")
    
    if i >= 2:
        break



📄 Document ID: FBIS-41815
<DOC>
<DOCNO> FBIS3-41815 </DOCNO>
<HT>    "jptep001__l94053" </HT>


<HEADER>
<AU>   JPRS-TEP-94-001-L </AU>
Document Type:JPRS 
Document Title:Epidemiology 

<DATE1>  25 February 1994 </DATE1>

</HEADER>

<F P=100> WEST EUROPE </F>
<F P=101> UNITED KINGDOM </F>
<H3> <TI>   Concern Over Transmissio...

🟨 Matching Gold Summary:


📄 Document ID: AP881017-0235

<DOC>
<DOCNO> AP881017-0235 </DOCNO>
<FILEID>AP-NR-10-17-88 2351EDT</FILEID>
<FIRST>a i BC-ChannelTunnel ADV30   10-17 0995</FIRST>
<SECOND>BC-Channel Tunnel, ADV 30,1025</SECOND>
<NOTE>$Adv30</NOTE>
<NOTE>For Release Sunday, Oct. 30, or Thereafter</NOTE>
<HEAD>Gigantic Tunnel Project Inches Toward...

🟨 Matching Gold Summary:
Construction of a 31-mile tunnel, 24 miles of it underwater, is underway between Dover, England to the coast of France. Scheduled to be completed in 1993, it will cut the London-Paris journey from six hours to three. It will enable freight to travel on a single train instead of bei

In [30]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
    sentences = sent_tokenize(text)
    preprocessed_sentences = []
    
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        filtered_words = [stemmer.stem(w) for w in words if w.isalnum() and w not in stop_words]
        preprocessed_sentences.append(filtered_words)
    
    return sentences, preprocessed_sentences


In [31]:
sample_doc_id = list(documents.keys())[0]
sentences, tokenized = preprocess(documents[sample_doc_id])

print("🔹Original sentence example:")
print(sentences[0])

print("\n🔹Tokenized and preprocessed:")
print(tokenized[0])


🔹Original sentence example:
<DOC>
<DOCNO> FBIS3-41815 </DOCNO>
<HT>    "jptep001__l94053" </HT>


<HEADER>
<AU>   JPRS-TEP-94-001-L </AU>
Document Type:JPRS 
Document Title:Epidemiology 

<DATE1>  25 February 1994 </DATE1>

</HEADER>

<F P=100> WEST EUROPE </F>
<F P=101> UNITED KINGDOM </F>
<H3> <TI>   Concern Over Transmission of Spongiform Encephalopathy </TI></H3>
<F P=102>  94WE0069A London THE TIMES in English 13 Oct 93 p 12 -- FOR 
OFFICIAL USE ONLY </F>

<F P=103> 94WE0069A </F>
<F P=104>  London THE TIMES </F>


<TEXT>
Language: <F P=105> English </F>
Article Type:CSO 

<F P=106> [Article by Nigel Hawkes, Science Editor: "Zoo Antelope </F>
Catch Mad Cow Disease"] 
  [Text] Scientists at London zoo have discovered that a 
strain of "mad cow disease" affecting a type of antelope can be 
transmitted much more easily than was thought.

🔹Tokenized and preprocessed:
['doc', 'docno', 'ht', 'header', 'au', 'document', 'type', 'jpr', 'document', 'titl', 'epidemiolog', 'date1', '25', 'fe

In [32]:
def compute_distribution(word_lists):
    counter = Counter()
    for word_list in word_lists:
        counter.update(word_list)
    
    total = sum(counter.values())
    distribution = {word: count / total for word, count in counter.items()}
    return distribution


In [None]:
def kl_sum_word_based(original_sentences, tokenized_sentences, PD, max_sentences=5):
    selected = []
    used_indices = set()

    while len(selected) < max_sentences:
        min_kl = float('inf')
        best_idx = -1
        for i, sentence_tokens in enumerate(tokenized_sentences):
            if i in used_indices:
                continue
            summary_tokens = []
            for j in selected:
                summary_tokens.extend(tokenized_sentences[j])
            summary_tokens.extend(sentence_tokens)
            
            PS = compute_distribution([summary_tokens])
            
            all_words = set(PD.keys()).union(set(PS.keys()))
            pd_vec = np.array([PD.get(w, 1e-10) for w in all_words])
            ps_vec = np.array([PS.get(w, 1e-10) for w in all_words])
            
            kl_div = entropy(pd_vec, ps_vec)
            
            if kl_div < min_kl:
                min_kl = kl_div
                best_idx = i
        
        if best_idx != -1:
            used_indices.add(best_idx)
            selected.append(best_idx)
        else:
            break

    return ' '.join([original_sentences[i] for i in selected])


In [36]:

for doc_id, doc_text in list(documents.items())[:3]:  
    doc_id = doc_id.lower()
    sentences, tokenized = preprocess(doc_text)
    PD = compute_distribution(tokenized)
    generated_summary = kl_sum_word_based(sentences, tokenized, PD, max_sentences=5)

    print(f"\nDocument: {doc_id}")
    print("\n---Generated Summary---")
    print(generated_summary)
    
    if doc_id in gold_summaries:
        print("\n---Gold Summary (Abstract)---")
        print(gold_summaries[doc_id])



Document: fbis-41815

---Generated Summary---
<DOC>
<DOCNO> FBIS3-41815 </DOCNO>
<HT>    "jptep001__l94053" </HT>


<HEADER>
<AU>   JPRS-TEP-94-001-L </AU>
Document Type:JPRS 
Document Title:Epidemiology 

<DATE1>  25 February 1994 </DATE1>

</HEADER>

<F P=100> WEST EUROPE </F>
<F P=101> UNITED KINGDOM </F>
<H3> <TI>   Concern Over Transmission of Spongiform Encephalopathy </TI></H3>
<F P=102>  94WE0069A London THE TIMES in English 13 Oct 93 p 12 -- FOR 
OFFICIAL USE ONLY </F>

<F P=103> 94WE0069A </F>
<F P=104>  London THE TIMES </F>


<TEXT>
Language: <F P=105> English </F>
Article Type:CSO 

<F P=106> [Article by Nigel Hawkes, Science Editor: "Zoo Antelope </F>
Catch Mad Cow Disease"] 
  [Text] Scientists at London zoo have discovered that a 
strain of "mad cow disease" affecting a type of antelope can be 
transmitted much more easily than was thought. Another danger taken seriously by the zoo, a world centre 
for 
breeding rare and endangered species, is that animals bred in 
cap

In [38]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(gold_summaries[doc_id], generated_summary)
print("\n---ROUGE Scores---")
print(scores)



---ROUGE Scores---
{'rouge1': Score(precision=0.3399014778325123, recall=0.6509433962264151, fmeasure=0.4466019417475728), 'rouge2': Score(precision=0.09900990099009901, recall=0.19047619047619047, fmeasure=0.13029315960912055), 'rougeL': Score(precision=0.1625615763546798, recall=0.3113207547169811, fmeasure=0.21359223300970875)}
