Simple Extractive Text Summarization Model

In [1]:
#1)Installing scikit-learn, nltk & ipywidgets packages
# %pip (rather than !pip) installs into the environment of the running kernel;
# --quiet trims pip's progress output.
%pip install scikit-learn nltk ipywidgets --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
#2)Importing required libraries
import nltk 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import heapq

In [3]:
#3)Downloading required nltk datasets
# 'punkt'     - sentence/word tokenizer models used by older NLTK releases.
# 'punkt_tab' - the tokenizer tables that newer NLTK releases look up instead of
#               'punkt'; without it sent_tokenize/word_tokenize raise LookupError
#               on a fresh install. On older releases this download simply
#               reports the package as unknown and returns False.
# 'stopwords' - the English stop-word list used by preprocess_text.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\karth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\karth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
#4)Text Preprocessing function
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize,word_tokenize

def preprocess_text(text):
    """Split text into sentences and build a cleaned copy of each one.

    A cleaned sentence is the lower-cased sentence with every token removed
    that is not purely alphanumeric or that is an English stop word; the
    surviving tokens are joined with single spaces.

    Args:
        text: The raw text to process.

    Returns:
        A tuple ``(clean_sentences, sentences)`` of two lists with the same
        length and order: the cleaned sentences and the untouched originals.
    """
    ignored = set(stopwords.words('english'))
    original = sent_tokenize(text)

    def _normalize(raw_sentence):
        # Lower-case first so the stop-word comparison is case-insensitive.
        kept = (
            token
            for token in word_tokenize(raw_sentence.lower())
            if token.isalnum() and token not in ignored
        )
        return " ".join(kept)

    cleaned = [_normalize(raw_sentence) for raw_sentence in original]
    return cleaned, original


In [None]:
#5)TF-IDF Vectorization & Sentence Similarity Calculation
def summarize_text(text,num_sentences=3):
    """Build an extractive summary from the most "central" sentences of text.

    Each sentence is turned into a TF-IDF vector; a sentence's score is the
    sum of its cosine similarities to every sentence (itself included). The
    highest-scoring sentences are returned in their original order.

    Args:
        text: The raw text to summarize.
        num_sentences: Maximum number of sentences to keep (default 3).

    Returns:
        The selected original sentences joined by single spaces. An empty
        string when the text has no sentences or ``num_sentences`` is not
        positive. The whole text (sentence by sentence) when ``num_sentences``
        is at least the number of sentences.
    """
    clean_sentences, original_sentences = preprocess_text(text)

    # Nothing to rank, or nothing requested.
    if not original_sentences or num_sentences <= 0:
        return ""

    # Every sentence would be selected anyway, so skip the scoring.
    if num_sentences >= len(original_sentences):
        return " ".join(original_sentences)

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(clean_sentences)
    except ValueError:
        # TfidfVectorizer raises ValueError when no usable term is left after
        # cleaning (e.g. only stop words or punctuation). Without scores, fall
        # back to the leading sentences rather than crashing.
        return " ".join(original_sentences[:num_sentences])

    cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)
    sentence_scores = cosine_similarities.sum(axis=1)

    top_sentence_indices = heapq.nlargest(
        num_sentences,
        range(len(sentence_scores)),
        key=sentence_scores.take,
    )

    # Sort the chosen indices so the summary keeps the source order.
    return " ".join(original_sentences[i] for i in sorted(top_sentence_indices))


In [6]:
#6)This section allows users to input text and interactively generate summaries
import ipywidgets as widgets

# Area that receives the printed summary.
output = widgets.Output()

# Free-text box for the document to summarize.
text_input = widgets.Textarea(
    value='',
    placeholder='Type something...',
    description='Input text:',
    disabled=False,
    layout=widgets.Layout(width='500px', height='200px')
)

# Number of sentences to keep in the summary.
# Starts at 3 (the default of summarize_text) instead of 0, so a first click on
# Submit produces a summary rather than an empty result.
# description_width='initial' lets the label take the width it needs instead of
# being clipped to the fixed default label column.
num_sentences_input = widgets.IntText(
    value=3,
    description='Total sentences:',
    disabled=False,
    style={'description_width': 'initial'}
)

# Button that triggers summarization.
submit_btn = widgets.Button(
    description='Submit',
    button_style='info'
)


In [7]:
#7)Handling User Input
def process_input(_):
    """Summarize the current widget values and print the result into output.

    Reads the text from ``text_input`` and the sentence count from
    ``num_sentences_input``, clears whatever ``output`` showed before, and
    prints the new summary there.

    Args:
        _: Argument supplied by the caller; it is not used.
    """
    requested_text, requested_count = text_input.value, num_sentences_input.value

    with output:
        # Remove the previous summary before printing the new one.
        output.clear_output()
        summary = summarize_text(requested_text, num_sentences=requested_count)
        print(summary)

# Register process_input as the click handler of the Submit button.
submit_btn.on_click(process_input)


In [None]:
#8)Display the Interactive Layout
# Vertical stack, top to bottom: text box, sentence count, Submit button, summary area.
layout = widgets.VBox(
    children=[text_input, num_sentences_input, submit_btn, output]
)

# A bare expression on the last line makes the notebook render the widget.
layout
