<a href="https://colab.research.google.com/github/krishnadixit05/AI/blob/main/AI_lab_10(Doc_Summ_).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## TextRank for Document Summarization

In [None]:
import nltk
import numpy as np
import networkx as nx
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

In [None]:
# Download every NLTK resource the notebook needs in one setup cell, so a
# fresh kernel has them before any tokenizer call:
#   punkt      - sentence tokenizer models
#   punkt_tab  - tabular tokenizer data that recent NLTK releases load for
#                sent_tokenize / word_tokenize (without it they raise
#                LookupError on those releases)
#   stopwords  - stop-word lists read by stopwords.words("english")
# nltk.download() skips packages that are already up to date, so re-running
# this cell is cheap.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def preprocess_text(text):
    """Split ``text`` into sentences and count the content words of each one.

    Each sentence is lowercased and word-tokenized. Tokens that are not purely
    alphanumeric, or that appear in NLTK's English stop-word list, are dropped
    before counting.

    Args:
        text: The document to process, as a single string.

    Returns:
        A ``(sentences, word_frequencies)`` tuple: the sentence strings as
        produced by ``sent_tokenize`` and, in the same order, one ``Counter``
        of the retained tokens per sentence.
    """
    english_stops = set(stopwords.words("english"))
    sentences = sent_tokenize(text)

    def is_content_word(token):
        # Keep alphanumeric tokens only (drops punctuation) and skip stop words.
        return token.isalnum() and token not in english_stops

    word_frequencies = [
        Counter(filter(is_content_word, word_tokenize(sentence.lower())))
        for sentence in sentences
    ]
    return sentences, word_frequencies

In [None]:
def build_similarity_matrix(word_frequencies):
    """Compute pairwise cosine similarity between per-sentence word counts.

    All sentences are embedded in one shared bag-of-words space (one column
    per distinct word), each row is scaled to unit length, and the similarity
    matrix is the product of the scaled rows with themselves.

    Args:
        word_frequencies: Sequence of word -> count mappings (e.g.
            ``collections.Counter``), one per sentence.

    Returns:
        A symmetric ``(n, n)`` float ndarray where entry ``[i][j]`` is the
        cosine similarity between sentences ``i`` and ``j``. The diagonal is
        zero, and a sentence with no counted words has similarity 0 to every
        other sentence (including other empty ones) instead of raising.
    """
    size = len(word_frequencies)

    # Assign each distinct word a column; sorting keeps the layout deterministic.
    vocabulary = {
        word: column
        for column, word in enumerate(sorted(set().union(*word_frequencies)))
    }
    if not vocabulary:
        # No sentences at all, or none with any counted word: nothing to compare.
        return np.zeros((size, size))

    term_matrix = np.zeros((size, len(vocabulary)))
    for row, counts in enumerate(word_frequencies):
        for word, count in counts.items():
            term_matrix[row, vocabulary[word]] = count

    # Scale rows to unit length; all-zero rows keep a divisor of 1 so they
    # stay zero rather than producing NaN.
    norms = np.linalg.norm(term_matrix, axis=1)
    norms[norms == 0] = 1.0
    unit_rows = term_matrix / norms[:, np.newaxis]

    similarity_matrix = unit_rows @ unit_rows.T
    # A sentence is not linked to itself in the similarity graph.
    np.fill_diagonal(similarity_matrix, 0.0)
    return similarity_matrix

In [None]:
def textrank_summarization(text, num_sentences=3, preserve_order=False):
    """Produce an extractive summary of ``text`` by ranking its sentences.

    The text is split into sentences, a sentence-to-sentence similarity
    matrix is built from their word counts, and PageRank is run on the graph
    derived from that matrix. The highest-scoring sentences form the summary.

    Args:
        text: The document to summarize.
        num_sentences: Maximum number of sentences to include. Values below
            zero are treated as zero (a negative slice bound would otherwise
            return all but the last few ranked sentences).
        preserve_order: If ``False`` (default), the selected sentences are
            joined from highest to lowest score. If ``True``, they are joined
            in the order they appear in ``text``, which usually reads more
            naturally.

    Returns:
        The selected sentences joined by single spaces; an empty string when
        there is nothing to select.
    """
    sentences, word_frequencies = preprocess_text(text)
    similarity_matrix = build_similarity_matrix(word_frequencies)

    # Build graph and rank sentences using PageRank
    graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(graph)

    # Rank sentence indices by (score, sentence text), best first; keeping
    # indices (not just strings) lets us restore document order afterwards.
    ranked_indices = sorted(
        range(len(sentences)),
        key=lambda idx: (scores[idx], sentences[idx]),
        reverse=True,
    )
    selected = ranked_indices[:max(num_sentences, 0)]
    if preserve_order:
        selected.sort()

    return " ".join(sentences[idx] for idx in selected)

In [None]:
# Fetch the "punkt_tab" tokenizer data as well -- presumably required by
# sent_tokenize on recent NLTK releases (verify against the installed version).
nltk.download("punkt_tab")

# Short sample passage used to demonstrate the summarizer.
text = """TextRank is an unsupervised algorithm for keyword extraction and text summarization.
It is based on PageRank, which is used by Google to rank web pages in search results.
TextRank builds a graph of sentences, where edges represent similarity between them.
By running the PageRank algorithm on this graph, we can extract the most important sentences
for summarization. This technique is widely used in NLP applications."""

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Summarize the sample passage down to its two top-ranked sentences and
# print the result under a "Summary:" heading.
summary = textrank_summarization(text, num_sentences=2)
print("Summary:", summary, sep="\n")

Summary:
By running the PageRank algorithm on this graph, we can extract the most important sentences
for summarization. TextRank is an unsupervised algorithm for keyword extraction and text summarization.
