In [None]:
# Sample article used as the summarization input for the rest of the notebook.
# It is written as adjacent string literals so the exact content, including the
# space that precedes each line break, is visible in the source.

text = (
    "\n"
    "Natural Language Processing (NLP) is a subfield of artificial intelligence that focuses on the interaction \n"
    "between humans and computers using natural language. NLP techniques are widely used in various applications \n"
    "such as chatbots, translation, text summarization, and sentiment analysis. Text summarization is a crucial \n"
    "application of NLP that allows users to condense lengthy documents into shorter, coherent versions while \n"
    "retaining the essential information. Summarization techniques can be categorized into two types: extractive \n"
    "and abstractive. Extractive summarization involves selecting the most important sentences from the original \n"
    "text, whereas abstractive summarization generates new sentences that convey the core meaning. Extractive \n"
    "methods are easier to implement as they rely on scoring and ranking sentences based on their importance.\n"
)

# Show the untouched input ahead of any processing.
print("Original Text:", text, sep="\n")

In [None]:
# Preprocess the text: tokenize sentences and words, and remove stopwords.

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import string

# Download the NLTK resources used below. quiet=True keeps re-runs from
# filling the cell output with download progress logs.
#   - 'punkt'     : tokenizer models looked up by older NLTK releases.
#   - 'punkt_tab' : tokenizer tables looked up by NLTK 3.9 and later; without
#                   them sent_tokenize/word_tokenize raise LookupError there.
#   - 'stopwords' : the English stopword list.
# With its default settings nltk.download() reports a failed or unknown
# package and returns False rather than raising, so requesting both tokenizer
# ids is harmless on either NLTK generation.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource, quiet=True)

# Tokenize text into sentences
sentences = sent_tokenize(text)

# Tokenize and clean words
stop_words = set(stopwords.words("english"))
# Individual punctuation characters, as a set, for per-character membership tests.
punctuation_chars = set(string.punctuation)


def preprocess_text(text):
    """Lowercase and tokenize ``text``, dropping stopwords and punctuation.

    Args:
        text: The string to tokenize (a single sentence or a whole document).

    Returns:
        list[str]: The lowercased tokens in their original order, excluding
        English stopwords and any token made up solely of punctuation
        characters.
    """
    tokens = word_tokenize(text.lower())
    # A token counts as punctuation when *every* character is a punctuation
    # mark. Checking `token in string.punctuation` instead would be a substring
    # search inside one long string, which lets multi-character tokens such as
    # "``", "''", "..." or "--" slip through the filter.
    return [
        token
        for token in tokens
        if token not in stop_words
        and not all(ch in punctuation_chars for ch in token)
    ]


# Cleaned words
cleaned_words = preprocess_text(text)
print("\nCleaned Words Sample:", cleaned_words[:20])

In [None]:
# Compute the frequency of each word in the cleaned text.

from collections import Counter

# Word frequency: raw occurrence count of every cleaned token.
word_frequencies = Counter(cleaned_words)

# Normalize word frequencies so the most frequent word gets weight 1.0.
# default=1 keeps this cell from failing with
# "ValueError: max() arg is an empty sequence" when the text yields no tokens.
max_freq = max(word_frequencies.values(), default=1)
# Only existing values are reassigned (no keys are added or removed), so
# updating the Counter while iterating over it is safe.
for word in word_frequencies:
    word_frequencies[word] = word_frequencies[word] / max_freq

print("\nNormalized Word Frequencies:")
print(dict(list(word_frequencies.items())[:10]))

In [None]:
# Score sentences based on the sum of word frequencies of words they contain.

sentence_scores = {}
for sentence in sentences:
    # Normalized weights of the sentence's cleaned tokens that have a
    # frequency entry.
    weights = [
        word_frequencies[word]
        for word in preprocess_text(sentence)
        if word in word_frequencies
    ]
    # Sentences without any scored word stay out of the dict. The score is
    # assigned rather than accumulated into an existing key, so a sentence
    # that occurs more than once in the text is not credited with a multiple
    # of its real score.
    if weights:
        sentence_scores[sentence] = sum(weights)

# Print top scored sentences
print("\nTop Scored Sentences:")
print(sorted(sentence_scores.items(), key=lambda x: x[1], reverse=True)[:3])

In [None]:
# Extract the top N scored sentences to form the summary.

def extract_summary(sentence_scores, num_sentences=3, keep_document_order=False):
    """Build a summary from the highest-scoring sentences.

    Args:
        sentence_scores: Mapping of sentence text to its numeric score.
        num_sentences: Maximum number of sentences to include. Values below
            zero are treated as zero.
        keep_document_order: When False (the default), the selected sentences
            appear from highest to lowest score. When True, they are emitted
            in the order their keys were inserted into ``sentence_scores``.

    Returns:
        str: The selected sentences joined by single spaces; an empty string
        when nothing is selected.
    """
    # A negative slice bound would mean "everything except the last n", which
    # is never what a sentence count is meant to express.
    limit = max(num_sentences, 0)
    ranked = sorted(sentence_scores.items(), key=lambda item: item[1], reverse=True)
    chosen = [sentence for sentence, _ in ranked[:limit]]
    if keep_document_order:
        # dicts preserve insertion order, so the key position gives the order
        # in which sentences were scored.
        position = {sentence: index for index, sentence in enumerate(sentence_scores)}
        chosen.sort(key=position.__getitem__)
    return " ".join(chosen)

# Build the three-sentence summary and show it under its heading.
summary = extract_summary(sentence_scores, num_sentences=3)
print("\nGenerated Summary:", summary, sep="\n")

In [None]:
# Compare the summary with the original text.
# Both lengths are whitespace-separated word counts, not character counts.

print("\nOriginal Text Length:", len(text.split()))
print("Summary Length:", len(summary.split()))
# sep="\n" puts the summary directly on the line after the heading. Embedding
# the newline in the heading string would leave print()'s default single-space
# separator in front of the summary text.
print("\nSummary:", summary, sep="\n")
