In [10]:
pip install PyMuPDF

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install transformers torch datasets



Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=8855f2430e8dec4bd5c60a46204e19671a0b0ef94cfdcd4079b56a1afd918642
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
# Block 1: Summarization with Enhanced Section Extraction and Pre-processing

# Imports
import os
import re
import fitz  # PyMuPDF for PDF handling
import nltk
from nltk.corpus import stopwords
from transformers import BartForConditionalGeneration, BartTokenizer
import spacy

# NLTK and spaCy downloads
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nlp = spacy.load("en_core_web_sm")

# High-level keywords for important sentence extraction
high_level_keywords = ["objective", "goal", "approach", "propose", "demonstrate", "result", "outcome",
                       "find", "discover", "introduce", "overview", "conclude", "describe", "present", "model"]

# Text cleaning and sentence filtering function
def clean_text(text):
    # Basic text cleaning
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\([A-Za-z,0-9\s&;]*\)', '', text)
    text = re.sub(r'\[[0-9,; ]+\]', '', text)
    text = re.sub(r'(Figure|Table|Equation|Eq|Fig|Exhibit)\s*\d+', '', text)
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    # Segment into sentences and filter by length and keywords
    doc = nlp(text)
    sentences = [
        sent.text.strip() for sent in doc.sents
        if len(sent.text.split()) > 5 and any(keyword in sent.text.lower() for keyword in high_level_keywords)
    ]

    # Limit to a concise number of sentences
    return ' '.join(sentences[:5])

# Enhanced Section extraction with improved regex flexibility
def extract_sections_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = "".join([doc.load_page(i).get_text("text") for i in range(doc.page_count)])
        doc.close()

        # Regex for Introduction and Conclusion, allowing variations
        intro_pattern = r"(?:^|\n)\s*(Introduction|Objective|Background)\s*[:.\-]?\s*(.*?)(?=\n\s*(Conclusion|Summary|Results|References|Acknowledgments|Appendix|$))"
        concl_pattern = r"(?:^|\n)\s*(Conclusion|Summary|Closing Remarks|DISCUSSION & CONCLUSIONS|Final Thoughts)\s*[:.\-]?\s*(.*?)(?=\n\s*(References|Acknowledgments|Appendix|$))"

        # Get matches within relevant parts of text
        intro_match = re.search(intro_pattern, text[:int(len(text) * 0.3)], re.S | re.I)
        concl_match = re.search(concl_pattern, text[-int(len(text) * 0.3):], re.S | re.I)

        # Clean and filter based on matches or fallback to a default
        introduction_text = clean_text(intro_match.group(2)) if intro_match else clean_text(text[:int(len(text) * 0.3)])
        conclusion_text = clean_text(concl_match.group(2)) if concl_match else clean_text(text[-int(len(text) * 0.3):])

        # Print debug information for extracted sections
        print(f"\n[DEBUG] Introduction for {os.path.basename(pdf_path)}:\n{introduction_text}\n{'-'*80}")
        print(f"\n[DEBUG] Conclusion for {os.path.basename(pdf_path)}:\n{conclusion_text}\n{'-'*80}")

        return introduction_text, conclusion_text
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return None, None

# Generate summary using BART model, aiming for specified word length
def generate_summary(text, model, tokenizer, min_words=200, max_words=300):
    # Set min and max length in tokens, estimating 1.3 tokens per word on average
    min_length = int(min_words * 1.3)
    max_length = int(max_words * 1.3)

    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    outputs = model.generate(
        inputs["input_ids"],
        max_length=max_length,
        min_length=min_length,
        num_beams=5,
        length_penalty=1.5,  # Balanced to prevent overly terse or verbose summaries
        no_repeat_ngram_size=3,  # Avoids repetitive phrases for readability
        early_stopping=True
    )
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

# Filter out repetitive or incomplete sentences in summary
def filter_summary(summary_text):
    sentences = summary_text.split('. ')
    unique_sentences = []
    for sentence in sentences:
        if sentence not in unique_sentences:
            unique_sentences.append(sentence)
    return '. '.join(unique_sentences)

# Abstract extraction function with regex pattern
def extract_abstract_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = "".join([doc.load_page(i).get_text("text") for i in range(min(2, doc.page_count))])  # Use only the first 2 pages for Abstract
        doc.close()

        # Regex pattern for Abstract section, allowing some variation
        abstract_pattern = r"(?:^|\n)\s*(Abstract)\s*[:.\-]?\s*(.*?)(?=\n\s*(Introduction|Background|Conclusion|References|$))"

        # Extract abstract based on pattern match
        abstract_match = re.search(abstract_pattern, text, re.S | re.I)
        abstract_text = clean_text(abstract_match.group(2)) if abstract_match else "Abstract not found."

        # Print debug information for extracted abstract
        print(f"\n[DEBUG] Abstract for {os.path.basename(pdf_path)}:\n{abstract_text}\n{'-'*80}")

        return abstract_text
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return None


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Summarize PDFs in a directory and display abstracts
def extract_abstracts_from_pdfs(pdf_directory):
    for filename in os.listdir(pdf_directory):
        if filename.endswith(".pdf"):
            abstract = extract_abstract_from_pdf(os.path.join(pdf_directory, filename))
            if abstract:
                print(f"\n[DEBUG] Abstract for {filename}:\n{abstract}\n")

# Relevance comparison between abstract and enhanced summary
def compare_abstract_with_summary(abstract, summary):
    if not abstract or abstract == "Abstract not found.":
        return "Abstract not available for comparison."

    # Calculate basic similarity by overlapping words
    abstract_words = set(abstract.lower().split())
    summary_words = set(summary.lower().split())
    common_words = abstract_words.intersection(summary_words)
    relevance_score = len(common_words) / len(abstract_words) if abstract_words else 0
    relevance_percentage = relevance_score * 100

    return f"Relevance Score: {relevance_percentage:.2f}% - Common words: {len(common_words)} / {len(abstract_words)}"

# Generate enhanced summary and compare with abstract
def extract_sections_and_summarize(pdf_path, model, tokenizer):
    intro, concl = extract_sections_from_pdf(pdf_path)
    if intro and concl:
        combined_text = f"Introduction: {intro} Conclusion: {concl}"
        summary = generate_summary(combined_text, model, tokenizer)
        return filter_summary(summary)
    return None

# Summarize and compare PDFs in a directory
def summarize_and_compare_pdfs(pdf_directory):
    model_name = "facebook/bart-large-cnn"
    model = BartForConditionalGeneration.from_pretrained(model_name)
    tokenizer = BartTokenizer.from_pretrained(model_name)

    for filename in os.listdir(pdf_directory):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_directory, filename)

            # Extract abstract
            abstract = extract_abstract_from_pdf(pdf_path)
            print(f"\n[DEBUG] Abstract for {filename}:\n{abstract}\n{'-'*80}")

            # Generate enhanced summary
            enhanced_summary = extract_sections_and_summarize(pdf_path, model, tokenizer)
            print(f"\n[DEBUG] Enhanced Summary for {filename}:\n{enhanced_summary}\n{'-'*80}")

            # Compare abstract and enhanced summary
            if enhanced_summary:
                comparison_result = compare_abstract_with_summary(abstract, enhanced_summary)
                print(f"[DEBUG] Comparison for {filename}:\n{comparison_result}\n{'-'*80}")

# Run the summarization and comparison block
pdf_directory = "/content/drive/My Drive/10"
summarize_and_compare_pdfs(pdf_directory)



[DEBUG] Abstract for Rock_mechanics_modeling_based_on_soft_granulation_theory.pdf:
This paper describes application of information granulation theory on the design of rock engineering flowcharts Firstly an overall flowchart based on information granulation theory has been highlighted Information granulation theory in crisp nonfuzzy or fuzzy format can take into account engineering experiences especially in fuzzy shapeincomplete information or superfluous or engineering judgments in each step of designing procedure while the suitable instruments modeling are employed In this manner and to extension of soft modeling instruments using three combinations of Self Organizing Map  NeuroFuzzy Inference System  and Rough Set Theory  crisp and fuzzy granules from monitored data sets are obtained The main underlined core of our algorithms are balancing of crisprough or nonfuzzy granules and sub fuzzy granules within non fuzzy information  upon the openclose iterations Using different criteria on