In [1]:
import fitz
import re
import pandas as pd

# Show the full contents of every cell instead of truncating long strings;
# the extracted 'content' column holds whole paragraphs.
pd.options.display.max_colwidth = None
# pd.options.display.max_rows = None  # uncomment to also lift the row limit

from unidecode import unidecode 
import numpy as np

In [2]:
class PDFTextExtractor:
    """Split a PDF into (heading, content) pairs using font-based heuristics.

    The stages are meant to be called in this order, each one consuming the
    attribute filled in by the previous stage:

    1. ``extract_blocks``               -> ``self.block_dict``
    2. ``create_span_dataframe``        -> ``self.span_df``
    3. ``calculate_span_scores``        -> ``self.span_df['tag']``
    4. ``extract_headings_and_content`` -> ``self.text_df``

    The instance owns the open document; call ``close()`` when done or use
    the instance as a context manager (``with PDFTextExtractor(path) as ex:``).
    """

    def __init__(self, pdf_path):
        """Open ``pdf_path`` with ``fitz.open`` and initialise empty results.

        Args:
            pdf_path: Location of the PDF, passed unchanged to ``fitz.open``.
        """
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)
        self.block_dict = {}   # page number (1-based) -> list of block dicts
        self.span_df = None    # one row per non-blank text span
        self.text_df = None    # one row per detected heading

    def close(self):
        """Close the underlying document. Safe to call more than once.

        After closing, ``self.doc`` is ``None``; ``extract_blocks`` can no
        longer be used, but already extracted data stays available.
        """
        doc = getattr(self, 'doc', None)
        if doc is not None:
            doc.close()
            self.doc = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never swallow exceptions raised inside the with-body

    def extract_blocks(self):
        """Store every page's ``'blocks'`` list in ``self.block_dict``.

        Keys are 1-based page numbers in document order; values are the
        ``'blocks'`` entry of ``page.get_text('dict')``.
        """
        for page_num, page in enumerate(self.doc, start=1):
            self.block_dict[page_num] = page.get_text('dict')['blocks']

    def create_span_dataframe(self):
        """Flatten ``self.block_dict`` into ``self.span_df`` (one row per span).

        Only blocks with ``type == 0`` are read (the ones that carry
        ``'lines'``). Spans whose text is empty or consists only of spaces
        are skipped. Columns:

        * ``xmin, ymin, xmax, ymax`` - the span's bounding box
        * ``text``      - span text transliterated with ``unidecode``
        * ``is_upper``  - text, ignoring any (...) or [...] parts, is upper-case
        * ``is_bold``   - the font name contains "bold" (case-insensitive)
        * ``span_font`` - font name
        * ``font_size`` - font size as reported by the span
        """
        rows = []

        for blocks in self.block_dict.values():
            for block in blocks:
                if block['type'] != 0:
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        text = unidecode(span['text'])
                        if text.replace(" ", "") == "":
                            continue

                        xmin, ymin, xmax, ymax = list(span['bbox'])
                        span_font = span['font']
                        # Bracketed fragments such as "(r)" or "[1]" are
                        # removed first so they cannot hide an otherwise
                        # upper-case heading.
                        is_upper = re.sub(r"[\(\[].*?[\)\]]", "", text).isupper()
                        is_bold = "bold" in span_font.lower()

                        rows.append((xmin, ymin, xmax, ymax, text, is_upper,
                                     is_bold, span_font, span['size']))

        self.span_df = pd.DataFrame(
            rows,
            columns=['xmin', 'ymin', 'xmax', 'ymax', 'text', 'is_upper',
                     'is_bold', 'span_font', 'font_size'],
        )

    def calculate_span_scores(self):
        """Add a ``'tag'`` column to ``self.span_df`` ranking each span's style.

        A span's score is its rounded font size, plus 1 if it is bold and
        plus 1 if it is upper-case; the bonuses are skipped when the text
        contains any of ``( _ : / , # % = @ )``. The most frequent score is
        taken as body text and tagged ``'p'``. Larger scores are tagged
        ``'h1'``, ``'h2'``, ... from the largest down, smaller scores
        ``'s1'``, ``'s2'``, ... from the largest of them down.

        With no spans at all an empty ``'tag'`` column is added.
        """
        span_scores = []

        columns = zip(self.span_df['font_size'], self.span_df['text'],
                      self.span_df['is_bold'], self.span_df['is_upper'])
        for font_size, text, is_bold, is_upper in columns:
            score = round(font_size)

            if not re.search(r'[(_:/,#%\=@)]', text):
                if is_bold:
                    score += 1
                if is_upper:
                    score += 1

            span_scores.append(score)

        if not span_scores:
            # Nothing to rank; max() below would raise on an empty mapping.
            self.span_df['tag'] = []
            return

        values, counts = np.unique(span_scores, return_counts=True)
        style_dict = dict(zip(values, counts))

        # Body-text score = the most common one (ties go to the smallest
        # score because np.unique returns values in ascending order).
        p_size = max(style_dict, key=style_dict.get)

        rank = 0
        tag = {}

        for size in sorted(values, reverse=True):
            rank += 1
            if size == p_size:
                rank = 0  # restart numbering for the sub-paragraph styles
                tag[size] = 'p'
            elif size > p_size:
                tag[size] = f'h{rank}'
            else:
                tag[size] = f's{rank}'

        self.span_df['tag'] = [tag[score] for score in span_scores]

    def extract_headings_and_content(self):
        """Build ``self.text_df`` with one ``heading``/``content`` row per heading.

        Every span tagged ``h*`` starts a new row; the spans that follow, up
        to the next heading, are joined with newlines as its content. Text
        that appears before the first heading is discarded.
        """
        headings_list = []
        text_list = []
        pending = []

        for text, tag in zip(self.span_df['text'], self.span_df['tag']):
            if tag.startswith('h'):
                headings_list.append(text)
                # Flush what was collected under the previous heading.
                text_list.append('\n'.join(pending))
                pending = []
            else:
                pending.append(text)

        text_list.append('\n'.join(pending))
        # The first flushed chunk precedes any heading; dropping it lines
        # text_list[i] up with headings_list[i].
        text_list = text_list[1:]
        self.text_df = pd.DataFrame(list(zip(headings_list, text_list)),
                                    columns=['heading', 'content'])

In [3]:
# Location of the PDF to parse, kept in one named constant so the notebook
# can be pointed at another file without touching the calls below.
# NOTE(review): this is a user-specific absolute path - adjust before re-running.
PDF_PATH = "/Users/L051360/Downloads/ljad096.pdf"

# Run the stages in order: each one reads what the previous one stored on the
# extractor (block_dict -> span_df -> span_df['tag'] -> text_df).
pdf_extractor = PDFTextExtractor(PDF_PATH)
pdf_extractor.extract_blocks()
pdf_extractor.create_span_dataframe()
pdf_extractor.calculate_span_scores()
pdf_extractor.extract_headings_and_content()

In [4]:
# Rows 26-39 by position: the slice shown below covers the paper's abstract.
pdf_extractor.text_df.iloc[26:40]

Unnamed: 0,heading,content
26,"University of California San Diego and Rady Children's Hospital, San Diego, CA, USA",Correspondence: Antonio Torrelo. Email: \natorrelo@aedv.es
27,Abstract,
28,Background,"Baricitinib, an oral selective Janus kinase (JAK)1/JAK2 inhibitor, is approved in many countries for moderate-to-severe atopic \ndermatitis (AD) in adults who are candidates for systemic therapy."
29,Objectives,To evaluate the efficacy and safety of three doses of baricitinib in combination with low-to-moderate potency topical corticoste-\nroids in paediatric patients with moderate-to-severe AD.
30,Methods,"Patients (aged 2 to \n<\n 18 years) were randomized (1 : 1 : 1 : 1) to once-daily baricitinib low dose (1 mg equivalent), medium dose \n(2 mg equivalent), high dose (4 mg equivalent) or placebo for 16 weeks. The primary endpoint was the proportion of patients achieving a vali-\ndated Investigator Global Assessment\n(r)\n (vIGA-AD) of 0/1 with a \n>=\n 2-point improvement at week 16. Key secondary endpoints included the pro-\nportions of patients achieving \n>=\n 75% and \n>=\n 90% improvement in the Eczema Area and Severity Index (EASI-75 and EASI-90, respectively), \n>=\n 75% improvement in the SCORing Atopic Dermatitis (SCORAD 75), mean change from baseline in EASI score and proportion of patients \nachieving a 4-point improvement in the Itch Numeric Rating scale (NRS) for patients aged \n>=\n 10 years. Primary and key secondary efficacy \nanalyses were conducted on the intent-to-treat population and adjusted for multiplicity. Safety analyses included all randomized patients who \nreceived \n>=\n 1 dose of study treatment."
31,Results,A total of 483 patients were randomized (mean age 12 years). The baricitinib 4 mg equivalent achieved a statistically significant \n(
32,P,"<\n 0.05) improvement vs. placebo on all 16-week endpoints (vIGA 0/1 with \n>=\n 2-point improvement, EASI-75, EASI-90, SCORAD 75, mean \nchange in EASI score and Itch NRS 4-point improvement for patients aged \n>=\n 10 years). Improvement ("
33,P,"<\n 0.05, non-multiplicity adjusted) was \nalso observed for baricitinib 4 mg equivalent vs. placebo in the ability to fall asleep and in reduction of topical corticosteroid use. Few patients \ndiscontinued due to adverse events (1.6% for placebo and 0.6% for those treated with baricitinib). There were no deaths, venous thrombo-\nembolic events, arterial thrombotic events, major adverse cardiovascular events, malignancies, gastrointestinal perforations or opportunistic \ninfections seen."
34,Conclusions,The results indicate that baricitinib offers a potential therapeutic option with a favourable benefit-risk profile for paediatric pa-\ntients with moderate-to-severe AD who are candidates for systemic therapies.
35,What is already known about this topic?,"* Baricitinib, an oral selective Janus kinase (JAK)1 and JAK2 inhibitor, is approved for the treatment of moderate-to-severe atopic \ndermatitis (AD) in adults.\nDownloaded from https://academic.oup.com/bjd/article/189/1/23/7097622 by guest on 02 August 2023"
