In [70]:
import fitz  # PyMuPDF
import re
import os
import json
from bson import ObjectId
from datetime import datetime

# Function to extract title and journal name from an APA reference
def extract_title_and_journal_from_reference(reference):
    """Split a reference string into its title and journal name.

    The reference is cut at the first two ". " separators (a period followed
    by whitespace). The second segment is taken as the title; the journal
    name is the text of the third segment up to its first period.

    Args:
        reference: One bibliographic reference as a single string.

    Returns:
        A ``(title, journal_name)`` tuple. Both are ``None`` when the
        reference has fewer than three segments; ``journal_name`` alone is
        ``None`` when the third segment contains no period.
    """
    segments = re.split(r'\.\s', reference, maxsplit=2)
    if len(segments) < 3:
        return None, None

    _, raw_title, tail = segments
    found = re.match(r'([^.]+)\.', tail)
    journal_name = None if found is None else found.group(1).strip()
    return raw_title.strip(), journal_name

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A single string made of ``page.get_text()`` for each page, with
        nothing inserted between pages.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)

# Function to normalize text for easier comparison
def normalize_text(text):
    """Reduce text to a lowercase run of word characters for comparison.

    Leading/trailing whitespace is stripped, the text is lowercased, and
    every run of non-word characters (anything not matched by ``\\w``) is
    removed. Underscores and digits are kept because ``\\w`` matches them.
    """
    lowered = text.strip().lower()
    non_word_runs = re.compile(r'\W+')
    return non_word_runs.sub('', lowered)

# Function to map positions from normalized text back to original text
def map_normalized_to_original(normalized_text, original_text, target_position):
    """Translate an index in the normalized text to an index in the original.

    Word characters of ``original_text`` are counted from zero; the index of
    the one whose count equals ``target_position`` is returned.

    Args:
        normalized_text: Not used by the computation; kept for the existing
            call signature.
        original_text: The un-normalized text to index into.
        target_position: Zero-based rank of the wanted word character.

    Returns:
        The index in ``original_text``, or ``-1`` when the text has no word
        character with that rank (including any negative rank).
    """
    word_char_indices = (
        idx
        for idx, ch in enumerate(original_text)
        if re.match(r'\w', ch.lower())
    )
    for rank, idx in enumerate(word_char_indices):
        if rank == target_position:
            return idx
    return -1

# Function to search for the title in the normalized PDF text and determine where it ends
def find_title_end_position_in_pdf(normalized_text, normalized_title, original_text):
    """Locate where a title ends inside the original PDF text.

    The title is searched for in the normalized text, and the position just
    past it is mapped back to ``original_text`` with
    ``map_normalized_to_original``.

    Args:
        normalized_text: Normalized form of the PDF text.
        normalized_title: Normalized form of the title to look for.
        original_text: The un-normalized PDF text.

    Returns:
        The index in ``original_text`` of the first word character after the
        title, ``len(original_text)`` when the title is the last word content
        of the text, or ``-1`` when the title is empty or not found.
    """
    # An empty needle is "found" at offset 0 of any text, which would report
    # a match for every PDF.
    if not normalized_title:
        return -1

    title_start = normalized_text.find(normalized_title)
    if title_start == -1:
        return -1

    title_end = title_start + len(normalized_title)
    mapped = map_normalized_to_original(normalized_text, original_text, title_end)
    if mapped == -1 and title_end == len(normalized_text):
        # The title runs to the end of the word content, so there is no
        # following word character to map to; -1 here would be misread by
        # callers as "title not found".
        return len(original_text)
    return mapped

# Function to extract surnames from reference authors
def extract_surnames_from_reference(reference_authors):
    """Extract one surname per comma-separated author entry.

    Leading digits and whitespace (for example a reference number) are
    removed first. Each comma-separated entry then contributes its first
    space-delimited word. Entries that are empty after stripping (trailing
    or doubled commas, empty input) are skipped, so the result never
    contains empty strings.

    Args:
        reference_authors: The author portion of a reference string.

    Returns:
        A list of surname strings in the order they appear.
    """
    without_numbering = re.sub(r'^[\d\s]+', '', reference_authors)
    entries = (entry.strip() for entry in without_numbering.split(','))
    return [entry.split(' ')[0] for entry in entries if entry]

# Function to match authors in the PDF text starting after the title
#TODO working but can be improved, especially with many authors
def match_authors_in_pdf(text, reference_authors, start_pos, search_range):
    """Find author names in the PDF text that follows the title.

    The search window is ``text[start_pos:start_pos + search_range]``,
    lowercased and with decimal digits removed. Two passes are made:

    1. For each surname taken from ``reference_authors``, look for
       "<given names> <surname>" in the window.
    2. If fewer names than surnames were found, collect every remaining
       name-like run of lowercase words. A run that spans a newline is
       treated as "<last author>\\n<start of the following text>": the part
       before the newline is recorded as an author and the scan stops.

    Args:
        text: Full original PDF text.
        reference_authors: Author portion of the reference, passed to
            ``extract_surnames_from_reference``.
        start_pos: Index in ``text`` where the search starts.
        search_range: Number of characters of ``text`` to search.

    Returns:
        A ``(authors, position)`` tuple. ``authors`` is a list of title-cased
        names. ``position`` is an index into ``text``: the end of the most
        recently recorded name, or, when pass 2 stopped at a newline, the
        start of the text after that newline. It is ``None`` when nothing
        was recorded.
    """
    surnames = [
        surname.lower()
        for surname in extract_surnames_from_reference(reference_authors)
        if surname  # an empty surname would make the pattern match any word
    ]
    matched_authors = []
    original_positions = []

    # Build the search text and, in parallel, the index in `text` that each
    # kept character came from. Removing digits shifts offsets, so positions
    # can only be reported correctly through this map.
    search_chars = []
    offsets = []
    window = text[start_pos:start_pos + search_range]
    for index, char in enumerate(window, start=start_pos):
        if char.isdecimal():  # same character set as regex \d
            continue
        lowered = char.lower()
        search_chars.append(lowered)
        offsets.extend([index] * len(lowered))
    normalized_text = ''.join(search_chars)

    def position_after(normalized_end):
        # Index in `text` just past the character at normalized_end - 1.
        return offsets[normalized_end - 1] + 1

    # Pass 1: given names (optionally with a dotted initial) followed by a
    # known surname.
    for surname in surnames:
        pattern = (
            r'\b([a-z]+(?:\s[a-z]\.)?(?:\s[a-z]+)*)\s'
            + re.escape(surname)
            + r'\b'
        )
        for match in re.finditer(pattern, normalized_text):
            full_name = match.group(0).strip().title()
            if full_name in matched_authors:  # compare in the stored casing
                continue
            matched_authors.append(full_name)
            original_positions.append(position_after(match.end()))

    # Pass 2: any remaining name-like runs within the same window.
    if len(matched_authors) < len(surnames):
        name_pattern = r'\b([a-z]+(?:\s[a-z]\.)?(?:\s[a-z]+)*)\b'
        for match in re.finditer(name_pattern, normalized_text):
            raw_name = match.group(0)
            full_name = raw_name.strip().title()
            # After title-casing, a word made only of capitals (e.g. a
            # single-letter word) disqualifies the run.
            if full_name in matched_authors or re.search(r'\b[A-Z]+\b', full_name):
                continue

            if '\n' in raw_name:
                newline_at = raw_name.index('\n')
                last_author = raw_name[:newline_at].strip().title()
                if last_author and last_author not in matched_authors:
                    matched_authors.append(last_author)

                # Report where the text after the newline starts, located
                # inside the window rather than by searching the whole
                # document (which could hit an earlier occurrence or fail).
                remainder = raw_name[newline_at + 1:]
                leading_ws = len(remainder) - len(remainder.lstrip())
                following_start = match.start() + newline_at + 1 + leading_ws
                original_positions.append(offsets[following_start])
                break

            matched_authors.append(full_name)
            original_positions.append(position_after(match.end()))

    return matched_authors, original_positions[-1] if original_positions else None

# Function to extract the abstract from the PDF content
#TODO somewhat working; the PDF text is sometimes messy, which makes the end of the abstract difficult to find.
#The start can usually be found fairly accurately.
def extract_abstract(text, start_pos):
    """Extract the abstract that follows ``start_pos`` in the PDF text.

    The text from ``start_pos`` onward is searched case-insensitively. The
    abstract starts right after the first occurrence of "abstract" and ends
    at the first "introduction" found at least 50 characters after the start
    of that keyword. When that fails, the span from ``start_pos`` to the
    first "introduction" is used instead.

    Progress information is printed to stdout.

    Args:
        text: Full original PDF text.
        start_pos: Index in ``text`` to start from; ``None`` is treated as 0.

    Returns:
        ``(abstract, end_index)`` where ``end_index`` is the index in
        ``text`` at which "introduction" begins, or ``(None, None)`` when no
        end marker is found.
    """
    if start_pos is None:
        start_pos = 0

    # Lowercased tail of the document used for keyword searches
    normalized_text = text[start_pos:].lower()

    abstract_start = normalized_text.find("abstract")
    print(f"Abstract found at: {abstract_start}")

    if abstract_start != -1:
        abstract_start_index = start_pos + abstract_start + len("abstract")
        # Skip the first 50 characters so an "introduction" immediately next
        # to the keyword is not taken as the end marker. Searching with a
        # start offset keeps the result relative to normalized_text, so it
        # can be converted to an index in `text` by adding start_pos alone.
        abstract_end = normalized_text.find("introduction", abstract_start + 50)
        print(f"Introduction found at: {abstract_end}")
        # Debug preview of the searched text from the keyword up to offset 1000
        print(normalized_text[abstract_start:1000])

        if abstract_end != -1:
            abstract_end_index = start_pos + abstract_end
            abstract = text[abstract_start_index:abstract_end_index].strip()
            return abstract, abstract_end_index

    # Fallback: no usable 'abstract' keyword, so start at start_pos and stop
    # at the first 'introduction'.
    abstract_start_index = start_pos
    abstract_end = normalized_text.find("introduction")
    if abstract_end != -1:
        abstract_end_index = start_pos + abstract_end
        abstract = text[abstract_start_index:abstract_end_index].strip()
        return abstract, abstract_end_index

    return None, None


# Function to extract the body from the PDF content
#TODO mockup, need to properly implement it
def extract_body(text, start_index):
    """Return the body text between ``start_index`` and the reference list.

    The body ends at the first whole-word, case-insensitive occurrence of
    "bibliography" or "references" after ``start_index``.

    Args:
        text: Full original PDF text.
        start_index: Index in ``text`` where the body starts, or ``None``.

    Returns:
        ``(body, end_index)``. ``end_index`` is where the marker word begins,
        or ``len(text)`` when no marker is found. ``("", 0)`` is returned
        when ``start_index`` is ``None``.
    """
    if start_index is None:
        return "", 0

    remainder = text[start_index:]
    marker = re.search(r'\b(bibliography|references)\b', remainder, re.IGNORECASE)
    if marker is None:
        return remainder.strip(), len(text)

    cut = start_index + marker.start()
    return text[start_index:cut].strip(), cut

# Function to extract the bibliography from the PDF content
def extract_bibliography(text, start_index):
    """Return everything from ``start_index`` to the end of ``text``, stripped."""
    tail = text[start_index:]
    return tail.strip()

# Function to extract the publication date from the PDF content
def extract_publication_date(text):
    """Return an ISO date for the first usable four-digit number in ``text``.

    Each standalone four-digit token is tried in order of appearance and
    interpreted as a year; the date reported is 1 January of that year.
    A token equal to year 0 ("0000") is skipped because it is not a valid
    ``datetime`` year and would otherwise raise ``ValueError``.

    Args:
        text: Text to scan.

    Returns:
        A string of the form ``"YYYY-01-01"``, or ``None`` when no usable
        four-digit token exists.
    """
    for candidate in re.finditer(r'\b(\d{4})\b', text):
        year = int(candidate.group(1))
        if year < 1:  # datetime supports years 1..9999 only
            continue
        return datetime(year, 1, 1).date().isoformat()
    return None

# Function to extract the DOI from the PDF content
def extract_doi(text):
    """Return the first DOI-shaped token in ``text``, or ``None``.

    A DOI is recognised as "10." followed by 4-9 digits, a slash, and a run
    of letters, digits, and ``-._;()/:`` characters, matched
    case-insensitively and ending on a word boundary.
    """
    found = re.search(r'\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b', text, re.IGNORECASE)
    return found.group(0) if found else None

# Function to extract keywords from the PDF content
def extract_keywords(text):
    """Return up to ten of the most frequent words in ``text``.

    Words are lowercase runs of at least four word characters. They are
    ranked by number of occurrences; words with equal counts keep the order
    in which they first appear, so the result is the same on every run
    (taking a slice of a ``set`` is not, because string hashing is
    randomised per process).

    Args:
        text: Text to scan.

    Returns:
        A list of at most ten distinct lowercase words, most frequent first.
    """
    from collections import Counter

    words = re.findall(r'\b\w{4,}\b', text.lower())
    return [word for word, _count in Counter(words).most_common(10)]

# Main function to process PDFs and match with references
def main():
    """Match PDFs against the reference list and extract their contents.

    Reads one reference per line from ``materials/papers/article_references``
    and keeps those for which both a title and a journal name can be
    extracted. Every ``.pdf`` file in ``materials/papers/PDF`` is then
    checked against the references in order; for the first reference whose
    normalized title occurs in the normalized PDF text, the authors,
    abstract, body, bibliography, publication date, DOI and keywords are
    extracted into a record.

    Progress information is printed to stdout, and all records are written
    to ``extracted_papers.json`` in the current working directory.

    Returns:
        The list of extracted record dictionaries.
    """
    references_path = "materials/papers/article_references"
    pdf_directory = "materials/papers/PDF"
    extracted_data = []

    # Load and process references
    references = []
    with open(references_path, 'r', encoding='utf-8') as file:
        for line in file:
            reference = line.strip()
            if not reference:
                continue
            title, journal_name = extract_title_and_journal_from_reference(reference)
            authors = reference.split('(')[0].strip()  # Part before the first '(' is taken as authors
            if title and journal_name:
                references.append({"title": title, "journal_name": journal_name, "authors": authors})

    # Process each PDF file; sorted so the output order is reproducible
    for pdf_file in sorted(os.listdir(pdf_directory)):
        if not pdf_file.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(pdf_directory, pdf_file)
        text = extract_text_from_pdf(pdf_path)
        normalized_text = normalize_text(text)

        # Check for matches with references
        for ref in references:
            normalized_title = normalize_text(ref["title"])

            title_end_position_in_original = find_title_end_position_in_pdf(normalized_text, normalized_title, text)
            if title_end_position_in_original == -1:
                continue

            print(f"Match found for PDF: {pdf_file}")
            print(f"Title: {ref['title']}")
            print(f"Journal Name: {ref['journal_name']}")
            print(f"Title ends at position: {title_end_position_in_original}")
            next_200_chars = text[title_end_position_in_original:title_end_position_in_original + 200]
            print(f"Next 200 characters in original text: {next_200_chars}")
            print("-" * 80)

            # Extract additional information
            authors, last_author_pos_in_original = match_authors_in_pdf(text, ref["authors"], title_end_position_in_original, 1000)

            if last_author_pos_in_original is not None and last_author_pos_in_original >= 0:
                next_200_chars_after_author = text[last_author_pos_in_original:last_author_pos_in_original + 200]
                print(f"Next 200 characters after last author in original text: {next_200_chars_after_author}")
                print("-" * 80)
                abstract_search_start = last_author_pos_in_original
            else:
                # No usable author position: start the abstract search right
                # after the title instead of passing None or -1 downstream.
                abstract_search_start = title_end_position_in_original

            abstract, abstract_end_index = extract_abstract(text, abstract_search_start)
            body, body_end_index = extract_body(text, abstract_end_index)
            bibliography = extract_bibliography(text, body_end_index)
            publication_date = extract_publication_date(text)
            doi = extract_doi(text)
            keywords = extract_keywords(text)

            data = {
                "_id": str(ObjectId()),
                "journal_name": ref["journal_name"],
                "issue": "Unknown",  # Extract issue number if required, from PDF or references
                "title": ref["title"],
                "authors": authors,
                "abstract": abstract,
                "body": body,
                "bibliography": bibliography,
                "keywords": keywords,
                "publication_date": publication_date,
                "DOI": doi,
                "language": "EN",  # Assuming English for now
                "raw_text": text,
                "targets": ["People", "Trainers", "Public Administration"]  # Example targets
            }

            extracted_data.append(data)
            break  # Stop after finding the first match for this PDF

    # Save extracted data to a JSON file
    with open('extracted_papers.json', 'w', encoding='utf-8') as json_file:
        json.dump(extracted_data, json_file, indent=4)

    return extracted_data

# Run the extraction when this code is executed directly. The result is bound
# to a module-level name so it can be inspected after the run.
if __name__ == "__main__":
    extracted_data = main()


Match found for PDF: main.pdf
Title: COVID-19 mental health impact and responses in low-income and middle-income countries: reimagining global mental health
Journal Name: Lancet Psychiatry
Title ends at position: 5311
Next 200 characters in original text: Lola Kola, Brandon A Kohrt, Charlotte Hanlon, John A Naslund, Siham Sikander, Madhumitha Balaji, Corina Benjet, Eliza Yee Lai Cheung, 
Julian Eaton, Pattie Gonsalves, Maji Hailemariam, Nagendra P Luit
--------------------------------------------------------------------------------
Next 200 characters after last author in original text: Most of the global population live in low-income and middle-income countries (LMICs), which have historically 
received a small fraction of global resources for mental health. The COVID-19 pandemic ha
--------------------------------------------------------------------------------
Abstract found at: -1
Match found for PDF: prior2016.pdf
Title: Substance use disorders comorbid with mood and anxiety disor

In [72]:
# Show the "abstract" field of the first extracted record.
extracted_data[0]["abstract"]



In [3]:
# Show the "authors" field of the first extracted record.
extracted_data[0]["authors"]

['Lola Kola',
 'Brandon A Kohrt',
 'Charlotte Hanlon',
 'John A Naslund',
 'Siham Sikander',
 'Madhumitha Balaji',
 'Corina Benjet',
 'Eliza Yee Lai Cheung',
 'Julian Eaton',
 'Pattie Gonsalves',
 'Maji Hailemariam',
 'Eleni Misganaw',
 'Olayinka Omigbodun',
 'Tessa Roberts',
 'Tatiana Taylor Salisbury',
 'Rahul Shidhaye',
 'Charlene Sunkel',
 'Victor Ugo',
 'Janse Van Rensburg',
 'Oye Gureje',
 'Soumitra Pathare',
 'Shekhar Saxena',
 'Graham Thornicroft',
 'Vikram Patel']

In [None]:
###Continue searching for additional names, starting from the beginning
    # in 50-character increments, checking for new lines
    if matched_positions:
        last_match_position = start_pos
        while True:
            additional_text = text[last_match_position:last_match_position + 50]
            additional_text = additional_text.replace('\n', ' ')  # Handle new lines by replacing them with spaces
            
            # Split the text to handle new lines separately
            additional_authors_pattern = r'\b([A-Z][a-z]+(?:\s[A-Z](?:\.|\b))?(?:\s[A-Z][a-z]+)*)\b'
            additional_matches = list(re.finditer(additional_authors_pattern, text_to_search))

            if not additional_matches:
                return matched_authors, last_match_position  # Stop if no matches are found

            for match in additional_matches:
                full_name = match.group(0).strip()
                # Refined inclusion criteria: Only add if it's not in all caps and not an organization name
                if full_name not in matched_authors and not re.search(r'\b[A-Z]+\b', full_name):
                    matched_authors.append(full_name)
                    matched_positions.append(start_pos + match.end())
                    last_match_position += match.end()

    return matched_authors, max(matched_positions) if matched_positions else None