In [168]:
import fitz  # PyMuPDF
import nltk
import re
import string
import json
import requests
from dateutil import parser
from bson import ObjectId
from datetime import datetime

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/mcgregor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mcgregor/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [174]:
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of the PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output, with no
        separator inserted between pages.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)

#WORKING

def extract_text_with_font_sizes_and_colors(pdf_path):
    """Collect every text span of the PDF together with its styling attributes.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list of dicts, one per span in document order, with the keys
        ``text``, ``font_size``, ``font``, ``bold``, ``italic`` and ``color``
        (an ``(r, g, b)`` tuple unpacked from the span's integer color).
    """
    spans_out = []
    with fitz.open(pdf_path) as document:
        for page_index in range(len(document)):
            page_blocks = document.load_page(page_index).get_text("dict")["blocks"]

            for block in page_blocks:
                # Only blocks carrying a "lines" key are walked; others are skipped.
                if "lines" not in block:
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        # The span color is a packed 0xRRGGBB integer.
                        packed = span["color"]
                        flags = span["flags"]
                        spans_out.append({
                            "text": span["text"],
                            "font_size": span["size"],
                            "font": span["font"],
                            # NOTE(review): PyMuPDF appears to document bit value 2 as
                            # italic and 16 as bold (1 being superscript). The other
                            # extractors in this notebook are tuned to the values
                            # produced here, so confirm before changing the masks.
                            "bold": bool(flags & 2),
                            "italic": bool(flags & 1),
                            "color": ((packed >> 16) & 0xFF, (packed >> 8) & 0xFF, packed & 0xFF),
                        })
    return spans_out

def find_journal_name(text_info, journals_file):
    """Return the first known journal name that occurs in the paper's text spans.

    Args:
        text_info: List of span dicts; only the ``text`` key is read.
        journals_file: Path of a text file holding one journal name per line.

    Returns:
        The journal name exactly as written in ``journals_file`` for the first
        span that contains it (case-insensitive substring match), or ``None``
        when no span mentions any listed journal.
    """
    # Blank lines are dropped: an empty name is a substring of every text and
    # would otherwise be returned as a bogus match on the very first span.
    with open(journals_file, 'r') as file:
        journal_names = [name for name in (line.strip() for line in file) if name]

    # Lowercase each journal once instead of once per span.
    candidates = [(journal, journal.lower()) for journal in journal_names]

    for entry in text_info:
        haystack = entry['text'].strip().lower()
        for journal, needle in candidates:
            if needle in haystack:
                return journal

    return None

#WORKING

def extract_issue(text_info, journal_name="CADMUS"):
    """Return the text that follows the journal name in the first span mentioning it.

    Args:
        text_info: List of span dicts; only the ``text`` key is read.
        journal_name: Journal name to look for (case-insensitive). ``None`` or
            an empty string means "journal unknown".

    Returns:
        The remainder of the first matching span after the journal name, with
        leading punctuation/whitespace skipped and surrounding whitespace
        stripped. ``None`` when the journal name is missing or never found.
    """
    # Without a journal name there is nothing to anchor on; previously None
    # raised AttributeError and "" matched every span.
    if not journal_name:
        return None

    needle = journal_name.lower()
    separators = string.whitespace + string.punctuation

    for entry in text_info:
        cleaned_text = entry['text'].strip()
        position = cleaned_text.lower().find(needle)
        if position == -1:
            continue

        # Skip punctuation or whitespace right after the journal name.
        issue_start = position + len(journal_name)
        while issue_start < len(cleaned_text) and cleaned_text[issue_start] in separators:
            issue_start += 1

        # Only the first occurrence is used.
        return cleaned_text[issue_start:].strip()

    return None

#WORKING

def extract_title(text_info):
    """Return the paper title assembled from the first run of title-styled spans.

    A span counts as title text when it is 15.0 pt 'TimesNewRomanPS-BoldMT',
    has neither the ``bold`` nor the ``italic`` flag set, and is colored
    (127, 19, 39).

    Args:
        text_info: List of span dicts with the keys ``text``, ``font_size``,
            ``font``, ``bold``, ``italic`` and ``color``.

    Returns:
        The non-empty title spans joined with single spaces, or an empty
        string when no span matches.
    """
    title_lines = []
    title_found = False

    for entry in text_info:
        is_title_span = (
            entry['font_size'] == 15.0 and
            entry['font'] == 'TimesNewRomanPS-BoldMT' and
            not entry['bold'] and
            not entry['italic'] and
            entry['color'] == (127, 19, 39)
        )

        if not is_title_span:
            # The first differently styled span after the title ends it. The
            # original carried an extra "still matches" branch here that could
            # never run.
            if title_found:
                break
            continue

        title_found = True
        cleaned_text = entry['text'].strip()
        if cleaned_text:  # Whitespace-only spans contribute nothing.
            title_lines.append(cleaned_text)

    return " ".join(title_lines)

#WORKING

def extract_authors(text_info):
    """Return the author names found between the title and the abstract.

    Scanning starts only after the first title-styled span (15.0 pt
    'TimesNewRomanPS-BoldMT', no bold/italic flag, color (127, 19, 39)) and
    stops at the first later span whose text contains "abstract".
    Author spans are 10.0 pt 'TimesNewRomanPS-BoldMT', no bold/italic flag,
    color (209, 34, 41).

    Args:
        text_info: List of span dicts with the keys ``text``, ``font_size``,
            ``font``, ``bold``, ``italic`` and ``color``.

    Returns:
        List of stripped, non-empty author strings in document order.
    """
    names = []
    seen_title = False

    for span in text_info:
        if not seen_title:
            # Nothing is collected until a title-styled span has been seen.
            if (span['font_size'] == 15.0 and
                    span['font'] == 'TimesNewRomanPS-BoldMT' and
                    not span['bold'] and
                    not span['italic'] and
                    span['color'] == (127, 19, 39)):
                seen_title = True
            continue

        # Any span mentioning "abstract" closes the author section.
        if 'abstract' in span['text'].lower():
            break

        looks_like_author = (
            span['font_size'] == 10.0 and
            span['font'] == 'TimesNewRomanPS-BoldMT' and
            not span['bold'] and
            not span['italic'] and
            span['color'] == (209, 34, 41)
        )
        if looks_like_author:
            name = span['text'].strip()
            if name:
                names.append(name)

    return names

#WORKING

def extract_abstract(text_info, title, journal_name, issue, authors):
    """Extract the abstract text and the index of the first span after it.

    The abstract is located by a 12.0 pt span containing "abstract". Collection
    starts at the first following 'TimesNewRomanPS-ItalicMT' span whose
    ``bold`` flag is set and continues while the font stays
    'TimesNewRomanPS-ItalicMT'. Spans colored (36, 63, 142) are skipped
    together with the span right after them.

    Args:
        text_info: List of span dicts with the keys ``text``, ``font_size``,
            ``font``, ``bold`` and ``color``.
        title: Paper title; spans equal to it (case-insensitive) are skipped.
        journal_name: Journal name to skip the same way; may be ``None``.
        issue: Issue string to skip the same way; may be ``None``.
        authors: Author names to skip the same way; may be ``None`` or empty.

    Returns:
        ``(abstract_text, abstract_end_index)``. ``abstract_end_index`` is the
        index of the span that ended the abstract, ``len(text_info)`` when the
        abstract runs to the last span, and 0 when no abstract was collected.
    """
    # Build the lookup once. Missing metadata (None/"") is ignored instead of
    # raising AttributeError on ``.lower()``.
    skip_texts = {value.lower() for value in (title, journal_name, issue) if value}
    skip_texts.update(author.lower() for author in (authors or []) if author)

    abstract_found = False
    abstract_started = False
    skip_next_entry = False
    abstract_lines = []
    abstract_end_index = 0

    for i, entry in enumerate(text_info):
        # Detect the "Abstract" heading itself and move past it.
        if not abstract_found and 'abstract' in entry['text'].lower() and entry['font_size'] == 12.0:
            abstract_found = True
            continue

        # Second half of a skipped pair (the span after a (36, 63, 142) span).
        if skip_next_entry:
            skip_next_entry = False
            continue

        # A (36, 63, 142) colored span is dropped along with its successor.
        if abstract_found and entry['color'] == (36, 63, 142):
            skip_next_entry = True
            continue

        if not abstract_found:
            continue

        cleaned_text = entry['text'].strip()

        if not abstract_started:
            # Wait for the first italic-font span with the bold flag set; that
            # opening span is kept without the metadata filter.
            if entry['font'] == 'TimesNewRomanPS-ItalicMT' and entry['bold']:
                abstract_started = True
                if cleaned_text:
                    abstract_lines.append(cleaned_text)
            continue

        # A font change ends the abstract.
        if entry['font'] != 'TimesNewRomanPS-ItalicMT':
            abstract_end_index = i
            break

        # Drop empty spans and exact repeats of title/journal/issue/authors.
        if cleaned_text and cleaned_text.lower() not in skip_texts:
            abstract_lines.append(cleaned_text)
    else:
        # The abstract reached the end of the document: report that position
        # rather than 0, so a caller slicing from it does not restart at the top.
        if abstract_started:
            abstract_end_index = len(text_info)

    abstract_text = " ".join(abstract_lines)
    return abstract_text, abstract_end_index


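# Example usage (assumes text_with_font_sizes, title, journal_name, issue and authors are already defined):
#abstract, abstract_end_index = extract_abstract(text_with_font_sizes, title, journal_name, issue, authors)
#print(f"Extracted Abstract: {abstract}")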

def extract_body(text_info, abstract_end_index, title, journal_name, issue, authors):
    """Extract the body text lying between the abstract and the bibliography.

    Scanning starts at ``abstract_end_index`` and stops at a span whose text is
    exactly "bibliography" (case-insensitive) styled as 12.0 pt
    'TimesNewRomanPS-BoldMT', no bold/italic flag, color (34, 31, 31).
    Spans colored (36, 63, 142) are skipped together with the span right after
    them.

    Args:
        text_info: List of span dicts with the keys ``text``, ``font_size``,
            ``font``, ``bold``, ``italic`` and ``color``.
        abstract_end_index: Index of the first span to consider.
        title: Paper title; spans equal to it (case-insensitive) are skipped.
        journal_name: Journal name to skip the same way; may be ``None``.
        issue: Issue string to skip the same way; may be ``None``.
        authors: Author names to skip the same way; may be ``None`` or empty.

    Returns:
        ``(body_text, body_end_index)``. ``body_end_index`` is the index right
        after the "Bibliography" heading, or 0 when no such heading was found.
    """
    # Build the lookup once. Missing metadata (None/"") is ignored instead of
    # raising AttributeError on ``.lower()``.
    skip_texts = {value.lower() for value in (title, journal_name, issue) if value}
    skip_texts.update(author.lower() for author in (authors or []) if author)

    body_lines = []
    skip_next_entry = False
    body_end_index = 0

    for i, entry in enumerate(text_info[abstract_end_index:], start=abstract_end_index):
        cleaned_text = entry['text'].strip()

        # The bibliography heading ends the body. It is checked before the
        # skip logic so a pending skip can never hide it.
        if (cleaned_text.lower() == "bibliography" and
            entry['font_size'] == 12.0 and
            entry['font'] == 'TimesNewRomanPS-BoldMT' and
            not entry['bold'] and
            not entry['italic'] and
            entry['color'] == (34, 31, 31)):
            body_end_index = i + 1
            break

        # Second half of a skipped pair (the span after a (36, 63, 142) span).
        if skip_next_entry:
            skip_next_entry = False
            continue

        # A (36, 63, 142) colored span is dropped along with its successor.
        if entry['color'] == (36, 63, 142):
            skip_next_entry = True
            continue

        # Drop empty spans and exact repeats of title/journal/issue/authors.
        if cleaned_text and cleaned_text.lower() not in skip_texts:
            body_lines.append(cleaned_text)

    body_text = " ".join(body_lines)
    return body_text, body_end_index

#WORKING 

def extract_bibliography(text_info):
    """Split the bibliography section into numbered reference strings.

    The section starts after a span whose text is exactly "bibliography"
    (case-insensitive) styled as 12.0 pt 'TimesNewRomanPS-BoldMT', no
    bold/italic flag, color (34, 31, 31). A new reference starts at a span that
    begins with the next expected number followed by a dot ("1.", "2.", ...)
    and is styled as 7.0 pt 'TimesNewRomanPSMT', no bold/italic flag, color
    (34, 31, 31). Every other span is appended to the reference in progress.

    Args:
        text_info: List of span dicts with the keys ``text``, ``font_size``,
            ``font``, ``bold``, ``italic`` and ``color``.

    Returns:
        List of strings of the form ``"<n>. <reference text>"``.
    """
    bibliography = []
    current_parts = []
    expected_number = 1
    bibliography_started = False

    for entry in text_info:
        cleaned_text = entry['text'].strip()

        # The heading switches collection on; the heading itself is not kept.
        if (cleaned_text.lower() == "bibliography" and
            entry['font_size'] == 12.0 and
            entry['font'] == 'TimesNewRomanPS-BoldMT' and
            not entry['bold'] and
            not entry['italic'] and
            entry['color'] == (34, 31, 31)):
            bibliography_started = True
            continue

        if not bibliography_started:
            continue

        marker = f"{expected_number}."
        starts_new_reference = (
            cleaned_text.startswith(marker) and
            entry['font_size'] == 7.0 and
            entry['font'] == 'TimesNewRomanPSMT' and
            not entry['bold'] and
            not entry['italic'] and
            entry['color'] == (34, 31, 31)
        )

        if starts_new_reference:
            # Flush the reference collected so far under the previous number.
            if current_parts:
                bibliography.append(f"{expected_number - 1}. " + " ".join(current_parts))
            current_parts = []
            expected_number += 1
            cleaned_text = cleaned_text[len(marker):].strip()  # Text after the number.

        # Empty spans are dropped: joining them in produced doubled spaces
        # inside the references.
        if cleaned_text:
            current_parts.append(cleaned_text)

    # Flush the last reference.
    if current_parts:
        bibliography.append(f"{expected_number - 1}. " + " ".join(current_parts))

    return bibliography



def extract_keywords(text):
    """Return up to ten distinct nouns from ``text`` as naive keywords.

    The text is tokenized with ``nltk.word_tokenize``, non-alphabetic tokens
    are dropped, and the remaining tokens are POS-tagged; tokens tagged NN,
    NNS, NNP or NNPS are kept.

    Args:
        text: Raw document text.

    Returns:
        The first ten distinct nouns in order of first appearance.
    """
    words = [word for word in nltk.word_tokenize(text) if word.isalpha()]
    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}
    nouns = (word for word, pos in nltk.pos_tag(words) if pos in noun_tags)
    # dict.fromkeys de-duplicates while keeping first-seen order. Slicing
    # list(set(...)) returned an arbitrary subset that could change from one
    # run to the next.
    return list(dict.fromkeys(nouns))[:10]

def extract_publication_date(text):
    """Return the first valid YYYY-MM-DD date in ``text`` as an ISO-8601 string.

    Args:
        text: Raw document text.

    Returns:
        ``"YYYY-MM-DDT00:00:00"`` for the first date-shaped token that is a
        real calendar date. When there is none, the current UTC time as a
        naive ISO-8601 string (no offset suffix).
    """
    from datetime import timezone  # Local import: only needed for the fallback.

    # The pattern also matches impossible dates such as 2023-99-99; those are
    # skipped so a later valid date can be used instead of raising.
    for date_match in re.finditer(r'\b\d{4}-\d{2}-\d{2}\b', text):
        try:
            return datetime.strptime(date_match.group(0), '%Y-%m-%d').isoformat()
        except ValueError:
            continue

    # Same naive-UTC value datetime.utcnow() produced, without the deprecated call.
    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

def extract_doi(text):
    """Return the first DOI found in ``text``, or an empty string.

    Args:
        text: Raw document text.

    Returns:
        The DOI (``10.<registrant>/<suffix>``) without trailing sentence
        punctuation, or ``""`` when nothing DOI-shaped is present.
    """
    # The lookbehind stops a match from starting in the middle of a longer
    # number such as "110.1234/..." or "3.10.1234/...".
    doi_match = re.search(r'(?<![\d.])10\.\d{4,9}/[-._;()/:A-Z0-9]+', text, re.IGNORECASE)
    if not doi_match:
        return ""

    doi = doi_match.group(0)
    # The suffix character class also swallows punctuation that merely follows
    # the DOI in running text ("... czl003." or "(doi.org/10.1000/182)").
    # Trim it, keeping closing parentheses that are balanced within the DOI.
    while doi:
        last = doi[-1]
        if last in ".,;:" or (last == ")" and doi.count(")") > doi.count("(")):
            doi = doi[:-1]
        else:
            break
    return doi

def main(pdf_path, journals_file=None, output_path='output.json'):
    """Parse one paper PDF into a metadata dict and write it out as JSON.

    Args:
        pdf_path: Path of the PDF to parse.
        journals_file: Path of the text file listing known journal names, one
            per line. When omitted, a module-level ``journals_file`` variable
            is used, which is how this function was originally called.
        output_path: Where the JSON document is written.

    Returns:
        ``(data, text, text_with_font_sizes)``: the metadata dict, the raw text
        of the PDF, and the list of styled text spans.

    Raises:
        ValueError: If no journals file is given and no module-level
            ``journals_file`` is defined.
    """
    if journals_file is None:
        # Backward compatibility with callers that set a notebook global.
        journals_file = globals().get('journals_file')
        if journals_file is None:
            raise ValueError("journals_file must be passed to main() or defined as a global")

    text = extract_text_from_pdf(pdf_path)
    text_with_font_sizes = extract_text_with_font_sizes_and_colors(pdf_path)

    # First step: go through the journal list to find which journal this paper is from.
    journal_name = find_journal_name(text_with_font_sizes, journals_file)
    # Without a detected journal there is no anchor for the issue string.
    issue = extract_issue(text_with_font_sizes, journal_name) if journal_name else None
    title = extract_title(text_with_font_sizes)
    authors = extract_authors(text_with_font_sizes)
    # Missing metadata is passed as "" so the exact-match filters simply never
    # match it instead of failing on None.
    abstract, abstract_end_index = extract_abstract(
        text_with_font_sizes, title, journal_name or "", issue or "", authors)
    body, body_end_index = extract_body(
        text_with_font_sizes, abstract_end_index, title, journal_name or "", issue or "", authors)
    bibliography = extract_bibliography(text_with_font_sizes)

    data = {
        "_id": str(ObjectId()),
        "journal_name": journal_name,
        "issue": issue,
        "title": title,
        "authors": authors,
        "abstract": abstract,
        "body": body,
        "bibliography": bibliography,
        "keywords": extract_keywords(text),  # Needs improvement; can be done once all the data are in place.
        "publication_date": extract_publication_date(text),
        "DOI": extract_doi(text),  # TODO not working
        "language": "EN",  # TODO langdetect could auto-detect the language if needed
        "raw_text": text,
        "targets": ["People", "Trainers", "Public Administration"]  # TODO this can be improved later
    }

    with open(output_path, 'w') as json_file:
        json.dump(data, json_file, indent=4)
    return data, text, text_with_font_sizes
    

In [181]:

if __name__ == "__main__":
    # Candidate input papers; keep exactly one pdf_path assignment active.
    # (The first path used to be assigned and then immediately overwritten.)
    #pdf_path = '/home/mcgregor/Desktop/IACP/TICBot/materials/papers/Zucconi&Rolle_Health&Economic_Burdens_from_Human_Security_Destruction_2023.pdf'
    pdf_path = '/home/mcgregor/Desktop/IACP/TICBot/materials/papers/prova.pdf'
    #pdf_path = '/home/mcgregor/Desktop/IACP/TICBot/materials/papers/czl003.pdf'
    # main() picks up journals_file from the notebook globals, so it has to be
    # defined before the call.
    journals_file='/home/mcgregor/Desktop/IACP/TICBot/materials/papers/list_of_journals'
    data,text, text_with_font_sizes = main(pdf_path)


In [182]:
# Show the journal name stored in the parsed result.
data["journal_name"]

'CADMUS'

In [157]:
# Print the extracted bibliography, one reference string per line.
for entry in data["bibliography"]:
    print(entry)

1. Anderson, T. W. (1990). Reality isn’t what it used to be . San Francisco: Harper & Row.
2. Anderson, T. W. (1997). The future of the self . New York: Tarcher/Putnam.
3. Anderson, T. W. (2016). We the Planet: Evolutionary Governance and Biophilia in the Anthropocene . The Meridian International Institute, Carlsbad, CA. ISBN 9780692793848. The Need for Person-Centered Education Alberto Zucconi 23
4. Anyanwu, U. S.;  & Iwuamadi, N. F. (2015). Student-centered Teaching and Learning in Higher Education: Transition from Theory to Practice in Nigeria. International Journal of Education and Research Vol. 3 No. 8 August 2015. pp. 349-358.
5. Armstrong, J.S (2012). Natural Learning in Higher Education. Encyclopedia of the Sciences of Learning . Heidelberg: Springer.
6. Aspy, D. and Roebuck, F. (1977). Kids Don’t Learn from People They Don’t Like . Amherst, MA: Human Resources Development Press.
7. Aspy, D., & Roebuck, F. N. (1983). Researching Person-Centered Issues in Education. Freedom to L

In [167]:
# Spot-check a single extracted reference by its zero-based list index.
data["bibliography"][105]

'106. Zucconi, A. (2016). What kind of Education is Needed to Navigate the Fourth Industrial Revolution? Book of Proceedings of the international conference on “Technology and  Society, what kind of  Future”, held by the Montenegrin Academy of Sciences and Arts, in cooperation with WAAS, EASA, GRT and ALLEA, in Podgorica, on 19\xad -20 May, 2016.'

In [21]:
def search_keywords_with_attributes(text_info, keywords, context_window=1):
    """Find spans containing any of the keywords and report their styling.

    Args:
        text_info: List of span dicts with the keys ``text``, ``font_size``,
            ``font``, ``bold``, ``italic`` and ``color``.
        keywords: Keywords to look for (case-insensitive substring match).
        context_window: Number of neighbouring spans to include on each side.

    Returns:
        One dict per (span, keyword) hit, ordered by span and then by keyword,
        holding the keyword, the span's position and attributes, and a
        ``context`` slice of ``text_info`` around the span.
    """
    hits = []
    total = len(text_info)

    for position, span in enumerate(text_info):
        for keyword in keywords:
            haystack = span['text'].lower()
            needle = keyword.lower()
            if needle not in haystack:
                continue

            # Clamp the context slice to the bounds of the span list.
            first = max(0, position - context_window)
            last = min(total, position + context_window + 1)

            hit = {'keyword': keyword, 'position': position}
            for attribute in ('text', 'font_size', 'font', 'bold', 'italic', 'color'):
                hit[attribute] = span[attribute]
            hit['context'] = text_info[first:last]
            hits.append(hit)

    return hits