# In [None]:
import nltk
import pdfplumber
import PyPDF2
import pytesseract
import spacy
import networkx as nx
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from collections import Counter

# Fetch the NLTK data packages this file asks for (tokenizer data and the
# stopword lists). The loop variable is removed afterwards so no extra name
# is left at module level.
for _nltk_package in ('punkt', 'stopwords'):
    nltk.download(_nltk_package)
del _nltk_package

# spaCy English pipeline; ner_summary reads entities from the docs it produces.
nlp = spacy.load("en_core_web_sm")

# Pre-trained Sentence-BERT encoder shared by the embedding-based summarizers.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')


# === PDF Reading Functions ===

def read_pdf_text(path):
    """
    Extracts the text layer of a PDF, trying pdfplumber first and PyPDF2 second.

    PyPDF2 is consulted whenever pdfplumber produced no text, whether because
    it raised or because every page came back empty. In both paths the page
    texts are joined with a single space so words at page boundaries do not
    run together.

    Args:
        path (str): Location of the PDF file.

    Returns:
        str: The extracted text with surrounding whitespace stripped, or ""
            when neither library produced any text. Failures are reported
            with print() rather than raised.
    """
    text = ""
    try:
        with pdfplumber.open(path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # extract_text() can yield nothing for a page; skip those pages.
        text = " ".join(content for content in page_texts if content)
    except Exception as e:
        # Best effort: report the problem and still give PyPDF2 a chance.
        print(f"Error reading PDF with pdfplumber: {e}")

    if not text.strip():  # Fallback to PyPDF2 if pdfplumber found nothing
        try:
            with open(path, "rb") as file:
                reader = PyPDF2.PdfReader(file)
                page_texts = [page.extract_text() for page in reader.pages]
            text = " ".join(content for content in page_texts if content)
        except Exception as e:
            print(f"Error reading PDF with PyPDF2: {e}")
            return ""  # Return empty string on error
    return text.strip()

def read_scanned_pdf_text(path):
    """
    Runs OCR over every page of a PDF and returns the recognised text.

    Each page is rendered through pdfplumber and the resulting image is
    handed to pytesseract; the per-page results are joined with spaces.

    Args:
        path (str): Location of the PDF file.

    Returns:
        str: The stripped OCR text, or "" if any step raised (the error is
            printed, not propagated).
    """
    try:
        with pdfplumber.open(path) as pdf:
            ocr_chunks = [
                pytesseract.image_to_string(page.to_image().original)
                for page in pdf.pages
            ]
    except Exception as e:
        print(f"Error performing OCR with pytesseract: {e}")
        return ""  # Return empty string on error
    return " ".join(ocr_chunks).strip()

# === Preprocessing ===

def preprocess_text(text):
    """
    Splits text into sentences and builds a cleaned copy of each one.

    The cleaned form keeps only alphanumeric tokens that are not English
    stopwords, lower-cased and re-joined with single spaces.

    Args:
        text (str): The input text.

    Returns:
        list: The sentences as produced by sent_tokenize.
        list: The cleaned sentences, index-aligned with the first list.
    """
    raw_sentences = sent_tokenize(text)
    english_stopwords = set(stopwords.words("english"))

    cleaned_sentences = []
    for raw in raw_sentences:
        kept_tokens = []
        for token in word_tokenize(raw):
            lowered = token.lower()
            if token.isalnum() and lowered not in english_stopwords:
                kept_tokens.append(lowered)
        cleaned_sentences.append(" ".join(kept_tokens))

    return raw_sentences, cleaned_sentences

# ---
## Summarization Techniques

### Term Frequency Summarization

def tf_summary(text, num_sentences=5):
    """
    Summarizes text by picking the sentences with the highest term-frequency score.

    A sentence's score is the sum, over the tokens of its cleaned form, of
    how often each token occurs across all cleaned sentences.

    Args:
        text (str): The input text.
        num_sentences (int): The number of sentences to include in the summary.

    Returns:
        str: The selected sentences joined with spaces, highest score first.
        list: A list of tuples (score, sentence_text) for the selected sentences.
    """
    sentences, processed = preprocess_text(text)

    # Tokenize every cleaned sentence once and reuse the tokens for both
    # the frequency table and the per-sentence scores.
    tokens_per_sentence = [word_tokenize(cleaned) for cleaned in processed]
    word_freq = Counter(
        token for tokens in tokens_per_sentence for token in tokens
    )

    scored = [
        (sum(word_freq[token] for token in tokens), original)
        for tokens, original in zip(tokens_per_sentence, sentences)
    ]
    scored.sort(reverse=True)

    top = scored[:num_sentences]
    return " ".join(sentence for _, sentence in top), top

### TextRank Summarization

def textrank_summary(text, num_sentences=5):
    """
    Summarizes text with TextRank: PageRank over a sentence-similarity graph.

    Sentences are vectorized with TF-IDF, the pairwise products of those
    vectors become edge weights, and PageRank scores the sentences.

    Args:
        text (str): The input text.
        num_sentences (int): The number of sentences to include in the summary.

    Returns:
        str: The selected sentences joined with spaces, highest score first.
        list: A list of tuples (score, sentence_text) for the selected
            sentences. When the text has no more than num_sentences
            sentences, all of them are returned with a score of 1.0.
    """
    sentences, _ = preprocess_text(text)
    if not sentences:  # Handle empty sentences case
        return "", []
    if len(sentences) <= num_sentences:
        return " ".join(sentences), [(1.0, s) for s in sentences]

    tfidf = TfidfVectorizer().fit_transform(sentences)
    sim_matrix = (tfidf * tfidf.T).toarray()

    graph = nx.from_numpy_array(sim_matrix)
    # The diagonal of sim_matrix is each sentence's similarity with itself,
    # which from_numpy_array turns into self-loops. A self-loop lets a
    # sentence hand rank back to itself, so sentences unrelated to the rest
    # of the text would keep their rank; drop those edges before ranking.
    graph.remove_edges_from(list(nx.selfloop_edges(graph)))
    scores = nx.pagerank(graph)

    ranked = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    top = ranked[:num_sentences]
    summary = " ".join([s for _, s in top])
    return summary, top

### NER-Based Summarization

# Score contributed by each entity label; labels that are not listed add 0.
_DEFAULT_ENTITY_WEIGHTS = {
    "PERSON": 3,
    "ORG": 2,
    "GPE": 2,
    "LAW": 3,
    "DATE": 1,
    "MONEY": 2,
    "LOC": 1,
    "EVENT": 2,
}


def ner_summary(text, num_sentences=5, entity_weights=None):
    """
    Summarizes text by picking the sentences richest in weighted named entities.

    Every entity found in a sentence adds the weight of its label to that
    sentence's score.

    Args:
        text (str): The input text.
        num_sentences (int): The number of sentences to include in the summary.
        entity_weights (dict): Optional mapping from entity label to weight.
            Defaults to _DEFAULT_ENTITY_WEIGHTS; labels missing from the
            mapping count as 0.

    Returns:
        str: The selected sentences joined with spaces, highest score first.
        list: A list of tuples (score, sentence_text) for the selected sentences.
    """
    # Resolve the weight table once instead of rebuilding it per sentence.
    weights = _DEFAULT_ENTITY_WEIGHTS if entity_weights is None else entity_weights

    sentence_scores = [
        (sum(weights.get(ent.label_, 0) for ent in nlp(sent).ents), sent)
        for sent in sent_tokenize(text)
    ]

    ranked = sorted(sentence_scores, reverse=True)
    top = ranked[:num_sentences]
    summary = " ".join([s for _, s in top])
    return summary, top

### BERT Extractive Summarization

def bert_extractive_summary(text, num_sentences=5):
    """
    Performs extractive summarization using BERT sentence embeddings.
    Sentences are ranked by their cosine similarity to the document's overall
    embedding, taken as the mean of all sentence embeddings.

    Args:
        text (str): The input text.
        num_sentences (int): The number of sentences to include in the summary.

    Returns:
        str: The generated summary, with the selected sentences in their
            original document order.
        list: A list of tuples (similarity_score, sentence_text) for the top
            sentences, highest similarity first.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        return "", []

    # One embedding per sentence, as a (num_sentences, dim) NumPy array.
    sentence_embeddings = bert_model.encode(sentences, convert_to_tensor=True).cpu().numpy()

    # Represent the document by the centroid of its sentence embeddings.
    # Encoding the whole text in a single call is cut off at the encoder's
    # maximum sequence length, so for anything longer than a few paragraphs
    # it would only describe the beginning of the document.
    document_embedding = sentence_embeddings.mean(axis=0, keepdims=True)

    # Similarity of every sentence to the document, computed in one call.
    similarities = cosine_similarity(sentence_embeddings, document_embedding)[:, 0]

    # Rank sentence positions by similarity, best first (ties keep document order).
    ranked_indices = sorted(range(len(sentences)), key=lambda i: similarities[i], reverse=True)
    top_indices = ranked_indices[:num_sentences]
    top_summary_sentences_with_scores = [(similarities[i], sentences[i]) for i in top_indices]

    # Emit the summary in document order for readability. Working with
    # positions (not a sentence -> index lookup) keeps repeated sentences
    # at their own places.
    summary = " ".join(sentences[i] for i in sorted(top_indices))

    return summary, top_summary_sentences_with_scores

# === Evaluation Metrics ===

def calculate_accuracy(full_text, summary):
    """
    Scores a summary by its TF-IDF cosine similarity to the full text.

    Both strings are vectorized with one TfidfVectorizer fitted on the pair,
    so the result reflects how much weighted vocabulary they share.

    Args:
        full_text (str): The original document text.
        summary (str): The summary to evaluate.

    Returns:
        The cosine similarity between the two TF-IDF vectors.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([full_text, summary])
    full_vector, summary_vector = tfidf_matrix[0], tfidf_matrix[1]
    return cosine_similarity(full_vector, summary_vector)[0][0]

# ---
## Main Execution

def generate_overall_summary(text, num_sentences=5):
    """
    Generates an overall summary by combining the results of different summarization methods.

    The sentences chosen by the TF, TextRank, NER and BERT summarizers are
    pooled, de-duplicated, re-ranked by cosine similarity to the document
    embedding, and the best num_sentences are kept.

    Args:
        text (str): The input text.
        num_sentences (int): The desired number of sentences in the final summary.

    Returns:
        str: The overall summary, with sentences in original document order.
        list: A list of tuples (similarity_score, sentence_text) for the
            selected sentences, in the same document order as the summary.
    """
    # Pool the sentences straight from each method's scored list. Joining a
    # summary into one string and splitting it again with sent_tokenize is
    # not guaranteed to give back the same sentences (for example when a
    # chosen sentence has no closing punctuation), and sentences altered
    # that way can no longer be located in the document.
    candidate_sentences = []
    for summarize in (tf_summary, textrank_summary, ner_summary, bert_extractive_summary):
        _, scored = summarize(text, num_sentences=num_sentences)
        candidate_sentences.extend(sentence for _, sentence in scored)

    # Remove duplicate sentences while keeping first-seen order
    unique_sentences = list(dict.fromkeys(candidate_sentences))
    if not unique_sentences:
        # Nothing to rank (e.g. empty text); avoid encoding an empty batch.
        return "", []

    # Document embedding = centroid of all sentence embeddings. Encoding the
    # whole text in one call is cut off at the encoder's maximum sequence
    # length and would only describe the beginning of a long document.
    original_sentences = sent_tokenize(text)
    all_embeddings = bert_model.encode(original_sentences, convert_to_tensor=True).cpu().numpy()
    document_embedding = all_embeddings.mean(axis=0, keepdims=True)

    # Rank the unique sentences based on BERT similarity to the document
    candidate_embeddings = bert_model.encode(unique_sentences, convert_to_tensor=True).cpu().numpy()
    similarities = cosine_similarity(candidate_embeddings, document_embedding)[:, 0]
    sentence_scores = list(zip(similarities, unique_sentences))

    ranked_sentences = sorted(sentence_scores, key=lambda x: x[0], reverse=True)
    top_sentences_with_scores = ranked_sentences[:num_sentences]  # Keep scores

    # Sort the selected sentences by their original order in the document
    original_sentence_map = {sentence: idx for idx, sentence in enumerate(original_sentences)}
    final_summary_sentences = sorted(
        top_sentences_with_scores,
        key=lambda x: original_sentence_map.get(x[1], float('inf')),
    )

    # Combine the selected sentences into the summary text
    overall_summary = " ".join(sentence for _, sentence in final_summary_sentences)

    return overall_summary, final_summary_sentences  # Return the sentences with scores

def count_sentence_occurrences(text, num_sentences=5):
    """
    Counts the occurrences of sentences in the summaries generated by different methods.

    The TF, TextRank, NER and BERT summarizers are each run once, and every
    sentence they select is tallied.

    Args:
        text (str): The input text.
        num_sentences (int): The number of sentences to include in each summary.

    Returns:
        Counter: A Counter object containing the sentences and their occurrences.
    """
    sentence_counts = Counter()
    for summarize in (tf_summary, textrank_summary, ner_summary, bert_extractive_summary):
        # Count from the scored list, not from the joined summary text:
        # re-splitting the joined string with sent_tokenize may merge or cut
        # sentences differently, so the same sentence could be tallied under
        # different keys.
        _, scored = summarize(text, num_sentences=num_sentences)
        sentence_counts.update(sentence for _, sentence in scored)
    return sentence_counts


# Input document. NOTE: this is a machine-specific absolute path; point it at
# the PDF to summarize before running.
file_path = r"C:\Users\japal\OneDrive\Documents\laxmi_narasimha.pdf"

# Load PDF: try the embedded text layer first, then fall back to OCR.
document_text = read_pdf_text(file_path)
if not document_text.strip():
    print("PDF is empty or could not be read. Attempting OCR for scanned PDF...")
    document_text = read_scanned_pdf_text(file_path)

if not document_text.strip():
    print("Could not extract text from PDF. Please ensure the PDF is valid and readable.")
else:
    # Generate Summaries (each call returns the summary text and its scored sentences)
    tf_summary_text, tf_scored = tf_summary(document_text, num_sentences=5)
    textrank_summary_text, textrank_scored = textrank_summary(document_text, num_sentences=5)
    ner_summary_text, ner_scored = ner_summary(document_text, num_sentences=5)
    bert_summary_text, bert_scored = bert_extractive_summary(document_text, num_sentences=5)

    # Calculate Accuracy Scores (TF-IDF cosine similarity between document and summary)
    tf_acc = calculate_accuracy(document_text, tf_summary_text)
    tr_acc = calculate_accuracy(document_text, textrank_summary_text)
    ner_acc = calculate_accuracy(document_text, ner_summary_text)
    bert_acc = calculate_accuracy(document_text, bert_summary_text)

    # ---
    ## Output Results

    print("\n" + "="*10 + " Term Frequency Summary with Scores " + "="*10 + "\n")
    for score, sent in tf_scored:
        print(f"[Score: {score}] {sent.strip()}")
    print("\n*TF Summary:*\n", tf_summary_text)
    print(f"\n*TF Cosine Similarity:* {tf_acc:.4f} ({tf_acc * 100:.2f}%)")

    print("\n" + "="*10 + " TextRank Summary with Scores " + "="*10 + "\n")
    for score, sent in textrank_scored:
        print(f"[Score: {score:.4f}] {sent.strip()}")
    print("\n*TextRank Summary:*\n", textrank_summary_text)
    print(f"\n*TextRank Cosine Similarity:* {tr_acc:.4f} ({tr_acc * 100:.2f}%)")

    print("\n" + "="*10 + " NER Summary with Scores " + "="*10 + "\n")
    for score, sent in ner_scored:
        print(f"[Score: {score}] {sent.strip()}")
    print("\n*NER Summary:*\n", ner_summary_text)
    print(f"\n*NER Cosine Similarity:* {ner_acc:.4f} ({ner_acc * 100:.2f}%)")

    print("\n" + "="*10 + " BERT Extractive Summary with Scores " + "="*10 + "\n")
    for score, sent in bert_scored:
        print(f"[BERT Similarity: {score:.4f}] {sent.strip()}")
    print("\n*BERT Summary:*\n", bert_summary_text)
    print(f"\n*BERT Cosine Similarity (TF-IDF based):* {bert_acc:.4f} ({bert_acc * 100:.2f}%)")

    # Generate and print the overall summary
    overall_summary, important_sentences = generate_overall_summary(document_text, num_sentences=5)
    print("\n" + "="*10 + " Overall Summary " + "="*10 + "\n")
    print(overall_summary)

    print("\n" + "="*10 + " Important Sentences with Scores " + "="*10 + "\n")
    for score, sent in important_sentences:
        print(f"[BERT Similarity: {score:.4f}] {sent.strip()}")

    # Count sentence occurrences
    sentence_counts = count_sentence_occurrences(document_text, num_sentences=5)
    most_common_sentences = sentence_counts.most_common()

    print("\n" + "="*10 + " Most Frequently Occurring Sentences " + "="*10 + "\n")
    for sentence, count in most_common_sentences:
        print(f"[Count: {count}] {sentence.strip()}")

    # Find sentences in the overall summary that are also among the most frequent.
    # The sentences come from important_sentences rather than from splitting
    # overall_summary again: re-tokenizing the joined text may merge or cut
    # sentences differently, and the membership test below would then miss them.
    overall_summary_sentences = [sent for _, sent in important_sentences]
    most_frequent_in_summary = [
        (sentence, count) for sentence, count in most_common_sentences if sentence in overall_summary_sentences
    ]

    print("\n" + "=" * 10 + " Important and Most Frequent Sentences from All Methods " + "=" * 10 + "\n")
    if most_frequent_in_summary:
        for sentence, count in most_frequent_in_summary:
            print(f"[Count: {count}] {sentence.strip()}")
    else:
        print("No sentences were found to be both in the overall summary and among the most frequent.")
