In [None]:
import re
import spacy
from PyPDF2 import PdfReader

# Load the spaCy "en_core_web_sm" pipeline once at module level;
# extract_entities() calls it to obtain the named entities of a text.
nlp = spacy.load("en_core_web_sm")

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Value passed unchanged to ``PdfReader``.

    Returns:
        The extracted page texts concatenated in page order, with no
        separator inserted between pages.
    """
    reader = PdfReader(pdf_path)
    # Defensive: should extraction yield None (or "") for a page, treat it
    # as empty text instead of failing on ``str + None``.
    return "".join(page.extract_text() or "" for page in reader.pages)

def clean_text(text):
    """Strip extraction noise from *text* and collapse it to single spaces.

    The substitutions run in a fixed order, because later ones depend on
    earlier ones:

    1. every run of non-ASCII characters becomes one space;
    2. bracketed numeric references such as ``[1]`` are deleted;
    3. figure/table mentions (``Figure 3``, ``Table 2``, ``Fig. 4``,
       any letter case) are deleted;
    4. every run of whitespace becomes one space.

    Returns:
        The cleaned text without leading or trailing whitespace.
    """
    substitutions = (
        (r'[^\x00-\x7F]+', ' ', 0),
        (r'\[\d+\]', '', 0),
        (r'(Figure|Table|Fig\.) \d+', '', re.IGNORECASE),
        (r'\s+', ' ', 0),
    )
    cleaned = text
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()


def extract_text_before_article_info(text):
    """Return everything that precedes the "ARTICLE INFO" heading.

    The heading is matched case-insensitively and tolerates optional
    whitespace between its letters (e.g. ``A R T I C L E I N F O``).

    Returns:
        The stripped text before the first heading occurrence, or an empty
        string when the heading is not present.
    """
    found = re.search(
        r'(.*?)A\s*R\s*T\s*I\s*C\s*L\s*E\s*I\s*N\s*F\s*O',
        text,
        re.DOTALL | re.IGNORECASE,
    )
    if found is None:
        return ""
    return found.group(1).strip()

def _first_group(pattern, text):
    """Return the stripped first capture group of *pattern* in *text*, or ""."""
    match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
    return match.group(1).strip() if match else ""


def segment_text(text):
    """Split article text into keyword, abstract, introduction and method parts.

    Every section is located with a case-insensitive regular expression
    anchored on the surrounding headings; a section whose headings are not
    found is returned as an empty string.

    Args:
        text: Article text to segment.

    Returns:
        A dict with the keys ``"keywords"``, ``"abstract"``,
        ``"introduction"`` and ``"methodology"``:

        * keywords: text between ``Keywords:`` and the ABSTRACT heading;
        * abstract: text between the ABSTRACT heading and ``1. Introduction``;
        * introduction: text between ``1. Introduction`` and the next
          ``1.``-followed-by-whitespace marker;
        * methodology: text between ``2. Preliminaries`` and ``CRediT``
          (the section title is hard-coded).
    """
    # The heading may be letter-spaced ("A B S T R A C T"), hence the \s*.
    abstract_heading = r'A\s*B\s*S\s*T\s*R\s*A\s*C\s*T'
    intro_heading = r'1\.\s*Introduction'

    return {
        "keywords": _first_group(
            r'Keywords:\s*(.*?)' + abstract_heading, text
        ),
        "abstract": _first_group(
            abstract_heading + r'\s*(.*?)' + intro_heading, text
        ),
        # (?<!\d) stops the terminator from matching the tail of a longer
        # number such as the year in "... in 2021. ", which would otherwise
        # cut the introduction off in the middle of a sentence.
        "introduction": _first_group(
            intro_heading + r'\s*(.*?)(?<!\d)1\.\s', text
        ),
        "methodology": _first_group(
            r'2\.\s*Preliminaries\s*(.*?)\s*CRediT', text
        ),
    }

def extract_entities(text):
    """Collect person and organisation names found by the spaCy pipeline.

    Entities labelled ``PERSON`` are reported as authors; entities labelled
    ``ORG`` or ``FAC`` are reported as institutions.

    Args:
        text: Text to run through the module-level ``nlp`` pipeline.

    Returns:
        A dict with the keys ``"authors"`` and ``"institutions"``, each a
        list of unique, stripped entity strings in order of first
        appearance in *text*.
    """
    doc = nlp(text)
    # Dicts are used as ordered sets: they de-duplicate like a set but keep
    # insertion order, so the result is stable from run to run.
    authors, institutions = {}, {}

    for ent in doc.ents:
        name = ent.text.strip()
        if ent.label_ == "PERSON":
            authors.setdefault(name, None)
        elif ent.label_ in ("ORG", "FAC"):
            institutions.setdefault(name, None)

    return {
        "authors": list(authors),
        "institutions": list(institutions),
    }

# Main Pipeline
def extract_information(pdf_path):
    """Run the full extraction pipeline on one PDF.

    Steps: extract the raw text, clean it, segment it into sections, and run
    entity extraction on the part that precedes the "ARTICLE INFO" heading.

    Args:
        pdf_path: Value forwarded to ``extract_text_from_pdf``.

    Returns:
        A dict with the keys ``"authors"``, ``"institutions"``,
        ``"introduction"``, ``"methodology"``, ``"keywords"`` (a list of
        strings, empty when no keywords were found) and ``"abstract"``.
    """
    raw_text = extract_text_from_pdf(pdf_path)
    cleaned_text = clean_text(raw_text)

    # Entities are looked up only in the text before "ARTICLE INFO".
    text_before = extract_text_before_article_info(cleaned_text)
    sections = segment_text(cleaned_text)
    entities = extract_entities(text_before)

    # Split on commas and trim each item; dropping empty items means a
    # missing keyword section gives [] rather than [""].
    keywords = [
        keyword.strip()
        for keyword in sections.get("keywords", "").split(",")
        if keyword.strip()
    ]

    return {
        "authors": entities["authors"],
        "institutions": entities["institutions"],
        "introduction": sections.get("introduction", ""),
        "methodology": sections.get("methodology", ""),
        "keywords": keywords,
        "abstract": sections.get("abstract", ""),
    }

if __name__ == "__main__":
    # Script entry point: extract one PDF and print each field, with a
    # divider line between consecutive fields.
    pdf_path = "Guesbaya.pdf"  # Replace with your file path
    extracted_info = extract_information(pdf_path)

    divider = "-----------------------------------------------------------------"
    report_fields = (
        ("Authors:", "authors"),
        ("Institutions:", "institutions"),
        ("Introduction:", "introduction"),
        ("Methodology:", "methodology"),
        ("Keywords:", "keywords"),
        ("Abstract:", "abstract"),
    )
    for position, (label, key) in enumerate(report_fields):
        if position:  # no divider before the first field
            print(divider)
        print(label, extracted_info[key])


Authors: ['Ioannis Lambadarisa', 'Howard Schwartza', 'Michel Barbeaub']
-----------------------------------------------------------------
Institutions: ['ScienceDirect Neurocomputing', 'Mohammad Tayefe Ramezanloua', 'Computer Engineering', 'Carleton University']
-----------------------------------------------------------------
Introduction: The consensus problem in flying multi-agent systems, commonly called the flocking or swarming challenge, is fundamental to aerial robotics. It involves coordinating and controlling multiple agents to en- sure cooperative behavior, avoid collisions, and align towards a shared goal. Such coordination is crucial for applications ranging from co- ordinated surveillance to communication, logistics, and infrastructure monitoring. Formation control within these systems, especially cost-constrained communication, faces significant challenges. These include managing limited resources, overcoming communication constraints, and ensur- ing system scalability an