In [1]:
import fitz  # PyMuPDF for PDF text extraction
import re
import nltk
from nltk.tokenize import sent_tokenize

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Sumiran
[nltk_data]     Grover\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
import fitz
import re

def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file while removing unwanted metadata."""
    doc = fitz.open(pdf_path)
    text = ""

    for page in doc:
        page_text = page.get_text("text")
        
        # Remove figure numbers, table numbers, and references
        page_text = re.sub(r'Fig\s*\d+|Table\s*\d+', '', page_text)
        page_text = re.sub(r'References|Bibliography', '', page_text, flags=re.IGNORECASE)
        
        text += page_text + "\n"

    # Remove multiple spaces and newlines
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()


In [4]:
def clean_text(text):
    """Preprocesses extracted text by removing journal metadata, references, and unnecessary special characters."""
    
    # Remove journal metadata (e.g., ISSN, impact factor, volume, issue)
    text = re.sub(r'eISSN:\s*\d+|pISSN:\s*\d+|Impact Factor.*?\d+\.\d+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'Volume:\s*\d+\s*Issue:\s*\d+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'ISO\s*\d{4,}', '', text, flags=re.IGNORECASE)

    # Remove in-text citations like [1], (Smith et al., 2020)
    text = re.sub(r'\[\d+\]', '', text)
    text = re.sub(r'\(.*?\d{4}.*?\)', '', text)

    # Remove extra special characters but keep basic punctuation
    text = re.sub(r'[^a-zA-Z0-9.,!?;:()\s]', '', text)

    return text.strip()


In [5]:
from nltk.tokenize import sent_tokenize

def split_into_sections(text):
    """Splits research paper text into logical sections using common headers."""
    
    # More robust section names to detect variations
    section_headers = {
        "Title": ["title"],
        "Abstract": ["abstract"],
        "Introduction": ["introduction"],
        "Methods": ["methods", "methodology", "materials & methods"],
        "Results": ["results", "findings"],
        "Discussion": ["discussion", "analysis"],
        "Conclusion": ["conclusion", "summary", "final thoughts"]
    }

    # Initialize empty sections
    sections = {key: "" for key in section_headers}

    sentences = sent_tokenize(text)  # Tokenize into sentences

    current_section = "Title"
    
    for sentence in sentences:
        for section, keywords in section_headers.items():
            if any(keyword in sentence.lower() for keyword in keywords):
                current_section = section
                break  # Switch to new section

        # Append sentence to detected section
        sections[current_section] += sentence + " "

    return sections


In [6]:
pdf_path = "A_Comprehensive_Review_of_Unimodal_and_Multimodal_Fingerprint_Biometric_Authentication_Systems_Fusion_Attacks_and_Template_Protection.pdf"  # Change to your file path
raw_text = extract_text_from_pdf(pdf_path)
cleaned_text = clean_text(raw_text)
sections = split_into_sections(cleaned_text)

In [7]:
for section, content in sections.items():
    print(f"\n=== {section} ===\n{content[:500]}...\n") 


=== Title ===
Received 7 March 2024, accepted 25 April 2024, date of publication 30 April 2024, date of current version 13 May 2024. Digital Object Identifier 10.1109ACCESS.2024.3395417 A Comprehensive Review of Unimodal and Multimodal Fingerprint Biometric Authentication Systems: Fusion, Attacks, and Template Protection U. SUMALATHA 1, K. KRISHNA PRAKASHA 1, , Manipal Academy of Higher Education , Manipal Academy of Higher Education  and Srikanth Prabhu  Universality: Anyone must be able to use the applica t...


=== Abstract ===
...


=== Introduction ===
A. K. Jain, A. Ross, and S. Prabhakar, An introduction to biometric recognition, IEEE Trans. Circuits Syst. Video Technol., vol. 14, no. 1, pp. 420, Jan. 2004. A. K. Jain, A. Ross, and K. Nandakumar, Introduction to Biometrics. Cham, Switzerland: Springer, 2016. K. ElMaleh and W. ElHajj, Voice biometrics: Security, forensics, and healthcare, J. Med. Syst., vol. 43, no. 9, p. 306, 2019. K. Banerjee, J. P. Singh, and R. Kaur, Human r

In [8]:
import os
import PyPDF2
import spacy
import nltk
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine

In [9]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

class HybridSummarizer:
    def __init__(self, extractive_model='en_core_web_sm', abstractive_model='facebook/bart-large-cnn'):
        # Extractive model setup
        self.nlp = spacy.load(extractive_model)
        
        # Abstractive model setup
        self.tokenizer = AutoTokenizer.from_pretrained(abstractive_model)
        self.abstractive_model = AutoModelForSeq2SeqLM.from_pretrained(abstractive_model)
        
        # Section summarization strategy
        self.summarization_strategy = {
            'Methods': self.extractive_summary,
            'Results': self.extractive_summary,
            'Introduction': self.abstractive_summary,
            'Discussion': self.abstractive_summary,
            'Conclusion': self.abstractive_summary
        }

    def extractive_summary(self, text, max_sentences=3):
        """Generate extractive summary using TF-IDF"""
        sentences = [sent.text for sent in self.nlp(text).sents]
        
        vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = vectorizer.fit_transform(sentences)
        
        sentence_scores = tfidf_matrix.sum(axis=1)
        top_sentence_indices = sorted(
            range(len(sentence_scores)), 
            key=lambda i: sentence_scores[i, 0],  # Fix indexing issue
            reverse=True
        )[:max_sentences]
        
        return ' '.join([sentences[i] for i in sorted(top_sentence_indices)])

    def abstractive_summary(self, text, max_length=150, min_length=50):
        """Generate abstractive summary using BART"""
        inputs = self.tokenizer(
            text, 
            max_length=1024, 
            return_tensors='pt', 
            truncation=True
        )
        
        summary_ids = self.abstractive_model.generate(
            inputs['input_ids'], 
            num_beams=4, 
            max_length=max_length, 
            min_length=min_length,
            early_stopping=True
        )
        
        return self.tokenizer.decode(
            summary_ids[0], 
            skip_special_tokens=True
        )

    def summarize_sections(self, sections):
        """Generate summaries for different sections using appropriate strategy"""
        summaries = {}
        for section, content in sections.items():
            if content.strip():
                # Select summarization method based on section
                summarizer = self.summarization_strategy.get(
                    section, 
                    self.extractive_summary  # Default fallback
                )
                summaries[section] = summarizer(content)
        
        return summaries


In [10]:
def process_research_paper(pdf_path):
    """Process and summarize a research paper"""
    # Existing text extraction and cleaning functions
    raw_text = extract_text_from_pdf(pdf_path)
    cleaned_text = clean_text(raw_text)
    sections = split_into_sections(cleaned_text)
    
    # Initialize hybrid summarizer
    summarizer = HybridSummarizer()
    
    # Generate section summaries
    section_summaries = summarizer.summarize_sections(sections)
    
    # Print summaries
    for section, summary in section_summaries.items():
        print(f"{section} Summary:")
        print(summary)
        print("-" * 50)
    
    return section_summaries


In [None]:
pdf_path = "A_Comprehensive_Review_of_Unimodal_and_Multimodal_Fingerprint_Biometric_Authentication_Systems_Fusion_Attacks_and_Template_Protection.pdf"  
raw_text = extract_text_from_pdf(pdf_path)
summaries = process_research_paper(pdf_path)
