In [None]:
# All imports are grouped first so the notebook's dependencies are visible at a glance;
# the side-effecting resource download follows them.
import nltk
import numpy as np
from scipy.stats import entropy
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the Punkt data needed by nltk.sent_tokenize.
# NOTE(review): newer NLTK releases appear to load sentence-tokenizer data from
# 'punkt_tab' instead — confirm which resource the installed version requires.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jenishkothari/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Text to summarise.
document = """The moon mission launched today. NASA scientists are hopeful. The rocket is powered by new fuel. SpaceX contributed to the design. The mission is expected to orbit the moon twice. Many space enthusiasts are excited about the project."""

# Break the text into individual sentences with NLTK's sentence tokenizer.
sentences = nltk.sent_tokenize(document)

# List the sentences, numbered from 1.
print("Sentences:")
for number, sentence in enumerate(sentences, start=1):
    print(f"{number}: {sentence}")

Sentences:
1: The moon mission launched today.
2: NASA scientists are hopeful.
3: The rocket is powered by new fuel.
4: SpaceX contributed to the design.
5: The mission is expected to orbit the moon twice.
6: Many space enthusiasts are excited about the project.


In [12]:
# Learn the vocabulary from the individual sentences, ignoring English stop words.
vectorizer = CountVectorizer(stop_words='english').fit(sentences)

# Count every vocabulary word over the whole document.
doc_vector = vectorizer.transform([document])

# PD: document word distribution. A tiny constant is added to every count so
# no entry is exactly zero, then the vector is scaled to sum to 1.
PD = np.asarray(doc_vector.sum(axis=0)).ravel() + 1e-12
PD /= PD.sum()

In [13]:
def compute_ps(sent_list):
    """Return the word distribution of a candidate summary.

    Word counts are taken with the module-level ``vectorizer`` and summed
    over all sentences in ``sent_list``.

    Args:
        sent_list: sentences (strings) that make up the candidate summary.

    Returns:
        A 1-D array of the summed counts divided by their total. When
        ``sent_list`` is empty, a vector of ``len(PD)`` entries all equal to
        1e-12 is returned; when the counts sum to zero, a vector shaped like
        the counts with every entry equal to 1e-12 is returned.
    """
    if sent_list:
        counts = np.asarray(vectorizer.transform(sent_list).sum(axis=0)).flatten()
        total = counts.sum()
        if total > 0:
            return counts / total
        # None of the words were counted: fall back to a flat tiny-valued vector.
        return np.ones_like(counts) * 1e-12
    # Nothing to count: flat tiny-valued vector matching PD's length.
    return np.ones(len(PD)) * 1e-12

In [16]:
# Greedy KL-Sum selection: on each pass, add the remaining sentence whose
# inclusion gives the lowest KL divergence between the document distribution
# (PD) and the trial summary distribution (PS).
summary = []
remaining = sentences.copy()
summary_length = 3

for _ in range(summary_length):
    if not remaining:
        # Fewer sentences than requested: nothing left to pick.
        break
    best_sentence, best_kl = None, float('inf')
    for sent in remaining:
        trial_summary = summary + [sent]
        PS = compute_ps(trial_summary)
        # Keep every entry strictly positive so the divergence stays finite.
        PS = PS + 1e-12
        # With two arguments, scipy's entropy computes the KL divergence of PD from PS.
        kl = entropy(PD, PS)
        if kl < best_kl:
            best_sentence, best_kl = sent, kl
    # Compare against None explicitly: an empty-string sentence is falsy, and a
    # truthiness test would leave it in `remaining` to be re-picked on every pass.
    if best_sentence is None:
        # No candidate scored below infinity; further passes would not change that.
        break
    summary.append(best_sentence)
    remaining.remove(best_sentence)


In [17]:
# Print the selected sentences in the order they were chosen, one bullet each.
print("\nFinal KL-Sum Extractive Summary (Word-based):")
for chosen in summary:
    print("-", chosen)


Final KL-Sum Extractive Summary (Word-based):
- The mission is expected to orbit the moon twice.
- The rocket is powered by new fuel.
- Many space enthusiasts are excited about the project.
