<a href="https://colab.research.google.com/github/jasim-1863/SDCProjects/blob/main/textSummarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

class TextSummarizer:
    def __init__(self):
        # Download necessary NLTK data
        nltk.download('punkt')
        nltk.download('stopwords')
        nltk.download('punkt_tab')  # Adding this additional download

        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        """Clean and normalize text"""
        # Convert to lowercase and remove special characters
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s]', '', text)
        return text.lower()

    def sentence_similarity(self, sent1, sent2):
        """Calculate cosine similarity between two sentences using word frequency vectors"""
        # Create word vectors
        all_words = list(set(sent1 + sent2))
        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)

        # Build the vectors
        for word in sent1:
            if word in all_words:
                vector1[all_words.index(word)] += 1

        for word in sent2:
            if word in all_words:
                vector2[all_words.index(word)] += 1

        # Prevent division by zero
        if sum(vector1) == 0 or sum(vector2) == 0:
            return 0.0

        # Calculate cosine similarity
        return cosine_similarity([vector1], [vector2])[0][0]

    def build_similarity_matrix(self, sentences, filtered_words):
        """Build a similarity matrix between sentences"""
        # Initialize similarity matrix
        similarity_matrix = np.zeros((len(sentences), len(sentences)))

        # Calculate similarity between pairs of sentences
        for i in range(len(sentences)):
            for j in range(len(sentences)):
                if i != j:
                    similarity_matrix[i][j] = self.sentence_similarity(
                        filtered_words[i], filtered_words[j]
                    )

        return similarity_matrix

    def summarize(self, text, num_sentences=3):
        """Generate a summary of the text"""
        # Tokenize text into sentences
        try:
            sentences = sent_tokenize(text)
        except LookupError:
            # Fallback to a simpler sentence tokenization approach if NLTK fails
            sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]

        # Return the original text if it has fewer sentences than requested
        if len(sentences) <= num_sentences:
            return text

        # Clean and tokenize each sentence
        cleaned_sentences = [self.clean_text(sentence) for sentence in sentences]

        # Tokenize each sentence into words and remove stop words
        word_tokenized = []
        for sentence in cleaned_sentences:
            try:
                word_tokenized.append(word_tokenize(sentence))
            except LookupError:
                # Fallback to simple word tokenization
                word_tokenized.append(sentence.split())

        filtered_words = [
            [word for word in words if word not in self.stop_words]
            for words in word_tokenized
        ]

        # Build the similarity matrix
        similarity_matrix = self.build_similarity_matrix(sentences, filtered_words)

        # Use NetworkX to rank sentences based on similarity
        nx_graph = nx.from_numpy_array(similarity_matrix)
        scores = nx.pagerank(nx_graph)

        # Rank sentences by score
        ranked_sentences = sorted(
            ((scores[i], sentence) for i, sentence in enumerate(sentences)),
            reverse=True
        )

        # Select top N sentences for the summary (preserve original order)
        top_indices = [sentences.index(ranked_sentences[i][1]) for i in range(min(num_sentences, len(sentences)))]
        top_indices.sort()

        # Combine sentences to create the final summary
        summary = ' '.join([sentences[i] for i in top_indices])

        return summary


# Alternative implementation that doesn't rely on NLTK's punkt_tab
class SimpleTextSummarizer:
    def __init__(self):
        # Download only stopwords
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')

        self.stop_words = set(stopwords.words('english'))

    def preprocess_text(self, text):
        # Simple sentence splitting
        sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]

        # Process each sentence
        processed_sentences = []
        for sentence in sentences:
            # Clean and tokenize
            cleaned = re.sub(r'[^\w\s]', '', sentence.lower())
            words = cleaned.split()
            # Remove stopwords
            filtered = [word for word in words if word not in self.stop_words]
            processed_sentences.append(filtered)

        return sentences, processed_sentences

    def build_similarity_matrix(self, processed_sentences):
        # Create similarity matrix
        n = len(processed_sentences)
        similarity_matrix = np.zeros((n, n))

        for i in range(n):
            for j in range(n):
                if i != j:
                    # Calculate similarity using word overlap (Jaccard similarity)
                    set1 = set(processed_sentences[i])
                    set2 = set(processed_sentences[j])

                    if not set1 or not set2:
                        similarity_matrix[i][j] = 0
                    else:
                        intersection = len(set1.intersection(set2))
                        union = len(set1.union(set2))
                        similarity_matrix[i][j] = intersection / union if union > 0 else 0

        return similarity_matrix

    def summarize(self, text, num_sentences=3):
        # Extract and process sentences
        sentences, processed_sentences = self.preprocess_text(text)

        # Return original text if too short
        if len(sentences) <= num_sentences:
            return text

        # Build similarity matrix
        similarity_matrix = self.build_similarity_matrix(processed_sentences)

        # Rank sentences using PageRank
        nx_graph = nx.from_numpy_array(similarity_matrix)
        scores = nx.pagerank(nx_graph)

        # Sort sentences by score
        ranked_sentences = sorted(
            ((scores[i], sentence) for i, sentence in enumerate(sentences)),
            reverse=True
        )

        # Get top sentences (preserving original order)
        top_indices = [sentences.index(ranked_sentences[i][1]) for i in range(min(num_sentences, len(sentences)))]
        top_indices.sort()

        # Create summary
        summary = ' '.join([sentences[i] for i in top_indices])

        return summary


# Example usage
if __name__ == "__main__":
    sample_text = """
    Machine learning is a field of inquiry devoted to understanding and building methods that 'learn',
    that is, methods that leverage data to improve performance on some set of tasks. It is seen as a
    part of artificial intelligence. Machine learning algorithms build a model based on sample data,
    known as training data, in order to make predictions or decisions without being explicitly
    programmed to do so. Machine learning algorithms are used in a wide variety of applications, such
    as in medicine, email filtering, speech recognition, agriculture, and computer vision, where it is
    difficult or unfeasible to develop conventional algorithms to perform the needed tasks.

    A subset of machine learning is closely related to computational statistics, which focuses on making
    predictions using computers, but not all machine learning is statistical learning. The study of
    mathematical optimization delivers methods, theory and application domains to the field of machine learning.

    Some implementations of machine learning use data and neural networks in a way that mimics the working
    of a biological brain. In its application across business problems, machine learning is also referred
    to as predictive analytics.
    """

    # Try the original implementation with fallbacks
    try:
        summarizer = TextSummarizer()
        summary = summarizer.summarize(sample_text, num_sentences=2)

        print("Original Text Length:", len(sample_text.split()))
        print("Summary Length:", len(summary.split()))
        print("\nSummary:")
        print(summary)
    except Exception as e:
        print(f"First implementation failed: {e}")
        print("\nTrying alternative implementation...")

        # Use the simpler implementation if the first one fails
        simple_summarizer = SimpleTextSummarizer()
        summary = simple_summarizer.summarize(sample_text, num_sentences=2)

        print("Original Text Length:", len(sample_text.split()))
        print("Summary Length:", len(summary.split()))
        print("\nSummary:")
        print(summary)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Original Text Length: 183
Summary Length: 43

Summary:
A subset of machine learning is closely related to computational statistics, which focuses on making 
    predictions using computers, but not all machine learning is statistical learning. The study of 
    mathematical optimization delivers methods, theory and application domains to the field of machine learning.
