<a href="https://colab.research.google.com/github/khemsu/TextSumarization/blob/main/textRank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK resources used below; packages that are already present are
# reported as up-to-date rather than downloaded again.
nltk.download('punkt')
# Newer NLTK releases load the sentence tokenizer from 'punkt_tab' instead of
# the pickled 'punkt' models, so request it as well to keep sent_tokenize()
# working across versions.
nltk.download('punkt_tab')
nltk.download('stopwords')


def build_similarity_matrix(sentences, stop_words):
    """Return the pairwise TF-IDF cosine similarity between sentences.

    Each sentence is split on whitespace, lower-cased, and stripped of any
    token found in ``stop_words`` before being vectorized.

    Args:
        sentences: sequence of sentence strings.
        stop_words: container of lower-case words to drop.

    Returns:
        Square similarity matrix with one row/column per sentence; the
        diagonal is zeroed so a sentence never counts as similar to itself.
    """
    normalized = []
    for sentence in sentences:
        lowered = (token.lower() for token in sentence.split())
        kept = [token for token in lowered if token not in stop_words]
        normalized.append(' '.join(kept))

    # Vectorize the cleaned sentences and compare every pair of them.
    tfidf = TfidfVectorizer().fit_transform(normalized)
    similarity = cosine_similarity(tfidf)

    # Remove self-similarity so it does not influence the ranking.
    np.fill_diagonal(similarity, 0)
    return similarity


def pagerank(M, eps=0.0001, d=0.85, max_iter=1000):
    """Compute PageRank scores by power iteration.

    Args:
        M: square array-like of non-negative edge weights (adjacency or
            similarity matrix). Rows do not need to be normalised; this
            function converts them to transition probabilities itself.
        eps: convergence threshold on the Euclidean norm of the change
            between two successive rank vectors.
        d: damping factor.
        max_iter: upper bound on the number of iterations, so the loop
            always terminates even if the threshold is never reached. The
            last iterate is returned in that case.

    Returns:
        1-D numpy array with one score per node. The scores sum to 1.
    """
    weights = np.asarray(M, dtype=float)
    N = weights.shape[0]
    if N == 0:
        # No nodes: nothing to rank (and 1 / N would be undefined).
        return np.zeros(0)

    row_sums = weights.sum(axis=1, keepdims=True)
    # Nodes without any outgoing weight ("dangling" nodes).
    dangling = row_sums.ravel() == 0
    # Normalise rows exactly; dangling rows are divided by 1 and stay zero,
    # their rank is redistributed uniformly inside the loop instead.
    transition = weights / np.where(row_sums == 0, 1.0, row_sums)

    v = np.full(N, 1.0 / N)  # Initial rank vector (uniform)
    for _ in range(max_iter):
        last_v = v
        # Rank held by dangling nodes would otherwise leak out of the
        # system; spread it evenly so the scores keep summing to 1.
        dangling_mass = last_v[dangling].sum()
        v = (1 - d) / N + d * (transition.T.dot(last_v) + dangling_mass / N)
        if np.linalg.norm(v - last_v) <= eps:
            break

    return v


def textrank_summarize_manual_pagerank(text, top_n=3):
    """Build an extractive summary of ``text`` with TextRank.

    The text is split into sentences, a TF-IDF cosine similarity graph is
    built over them, and the sentences are scored with PageRank.

    Args:
        text: the document to summarise.
        top_n: number of highest-scoring sentences to keep.

    Returns:
        The selected sentences joined by single spaces, ordered from the
        highest score to the lowest (equal scores are ordered by sentence
        text, descending). An empty string when ``text`` contains no
        sentences.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        # Nothing to rank; the TF-IDF step cannot fit an empty corpus and
        # would raise ValueError.
        return ''

    stop_words = set(stopwords.words('english'))

    sim_matrix = build_similarity_matrix(sentences, stop_words)
    scores = pagerank(sim_matrix)

    # Rank sentences by score, best first.
    ranked_sentences = sorted(zip(scores, sentences), reverse=True)

    # Keep the top_n sentences as the summary.
    return ' '.join(sentence for _, sentence in ranked_sentences[:top_n])


# Example usage: summarise a short passage about TextRank by printing its
# five highest-scoring sentences.
text = """
In the field of natural language processing, an extractive summarization task can
be described as the selection of the most important sentences in a document.
Using different levels of compression, a summarized version of the document of
arbitrary length can be obtained.
TextRank is a graph-based extractive summarization algorithm. It is domain
and language independent since it does not require deep linguistic knowledge,
nor domain or language specific annotated corpora [16]. These features makes the
algorithm widely used: it performs well summarizing structured text like news
articles, but it has also shown good results in other usages such as summarizing
meeting transcriptions [8] and assessing web content credibility [1].
In this article we present different proposals for the construction of the graph
and report the results obtained with them.
The first sections of this article describe previous work in the area and an
overview of the TextRank algorithm. Then we present the variations and describe
the different metrics and dataset used for the evaluation. Finally we report the
results obtained using the proposed changes.
"""

# The selected sentences come back in descending score order (not document
# order), joined by single spaces.
summary = textrank_summarize_manual_pagerank(text, top_n=5)
print(summary)

In this article we present different proposals for the construction of the graph
and report the results obtained with them. TextRank is a graph-based extractive summarization algorithm. Finally we report the
results obtained using the proposed changes. Using different levels of compression, a summarized version of the document of
arbitrary length can be obtained. 
In the field of natural language processing, an extractive summarization task can
be described as the selection of the most important sentences in a document.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
