In [79]:
import fitz
from operator import itemgetter
def fonts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :rtype: [(font_size, count), (font_size, count}], dict
    :return: most used fonts sorted by count, font style information
    """
    styles = {}
    font_counts = {}

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # block contains text
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if granularity:
                            identifier = "{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
                            styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
                                                  'color': s['color']}
                        else:
                            identifier = "{0}".format(s['size'])
                            styles[identifier] = {'size': s['size'], 'font': s['font']}

                        font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles

In [80]:
pdf_path = "C:/Users/wloon/Desktop/NTU/Hackathon/research1.pdf"
pdf_doc = fitz.open(pdf_path)
fonts(pdf_doc)

([('9.800000190734863', 645),
  ('7.5', 246),
  ('8.0', 236),
  ('10.0', 55),
  ('9.803299903869629', 34),
  ('7.0', 32),
  ('7.450399875640869', 29),
  ('5.25', 19),
  ('11.0', 18),
  ('9.199999809265137', 16),
  ('7.699999809265137', 14),
  ('6.860000133514404', 13),
  ('10.300000190734863', 12),
  ('4.900000095367432', 11),
  ('9.0', 10),
  ('5.599999904632568', 6),
  ('6.300000190734863', 5),
  ('24.0', 4),
  ('13.0', 2),
  ('12.430000305175781', 1)],
 {'8.0': {'size': 8.0, 'font': 'MyriadPro-Regular3'},
  '13.0': {'size': 13.0, 'font': 'MyriadPro-Bold'},
  '7.0': {'size': 7.0, 'font': 'MyriadPro-Regular'},
  '12.430000305175781': {'size': 12.430000305175781,
   'font': 'MyriadPro-Regular2'},
  '24.0': {'size': 24.0, 'font': 'MyriadPro-Regular'},
  '11.0': {'size': 11.0, 'font': 'MyriadPro-Light'},
  '7.699999809265137': {'size': 7.699999809265137, 'font': 'MyriadPro-Light'},
  '10.300000190734863': {'size': 10.300000190734863, 'font': 'MyriadPro-Bold'},
  '10.0': {'size': 10.0, 'f

In [81]:
def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.
    :param font_counts: (font_size, count) for all fonts occuring in document
    :type font_counts: list
    :param styles: all styles found in the document
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size

    # sorting the font sizes high to low, so that we can append the right integer to each tag 
    font_sizes = []
    for (font_size, count) in font_counts:
        font_sizes.append(float(font_size))
    font_sizes.sort(reverse=True)

    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0
            size_tag[size] = '<p>'
        if size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag

In [82]:
font_counts, styles = fonts(pdf_doc, granularity=False)
size_tag = font_tags(font_counts, styles)

In [83]:
def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: list
    :return: texts with pre-prended element tags
    """
    header_para = []  # list with headers and paragraphs
    first = True  # boolean operator for first header
    previous_s = {}  # previous span

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # this block contains text

                # REMEMBER: multiple fonts and sizes are possible IN one block

                block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if s['text'].strip():  # removing whitespaces:
                            if first:
                                previous_s = s
                                first = False
                                block_string = size_tag[s['size']] + s['text']
                            else:
                                if s['size'] == previous_s['size']:

                                    if block_string and all((c == "|") for c in block_string):
                                        # block_string only contains pipes
                                        block_string = size_tag[s['size']] + s['text']
                                    if block_string == "":
                                        # new block has started, so append size tag
                                        block_string = size_tag[s['size']] + s['text']
                                    else:  # in the same block, so concatenate strings
                                        block_string += " " + s['text']

                                else:
                                    header_para.append(block_string)
                                    block_string = size_tag[s['size']] + s['text']

                                previous_s = s

                    # new block started, indicating with a pipe
                    block_string += "|"

                header_para.append(block_string)

    return header_para

In [107]:
textList1 = headers_para(pdf_doc, size_tag)
print(textList1[8])
text = "\n".join(textList1)
#print(text)

# split the text into lines and remove the '\n' characters
lines = [line.replace('\n', '') for line in text.split('\n')]

# remove '|' characters
lines = [line.replace('|', '') for line in lines]

text = "\n".join(lines)

<h3>BMC Health Services Research|


In [85]:
import time
from tqdm import tqdm
import csv
import json
import numpy as np
import argparse
import pandas as pd
import requests
from bs4 import BeautifulSoup
import concurrent.futures
from csv import reader
import timeout_decorator
import datetime
from pandas.io.json import json_normalize
import nltk
import re
import datefinder
from tqdm import tqdm
from urllib.request import urlopen, Request

In [150]:
# Create an empty dataframe with columns for titles and paragraphs
df = pd.DataFrame(columns=["title", "paragraph"])

#textList2 = textList1
# Initialize variables to store title and paragraph
title = None
paragraph = None
# Loop through each line in the file
for line in textList1:
    try:
        # Check if the line starts with a title tag
        if line.startswith("<h1>") or line.startswith("<h2>") or line.startswith("<h3>") or line.startswith("<h4>") or line.startswith("<h5>"): #or line.startswith("<h6>"):
            # If the previous title and paragraph are not None, add them to the dataframe
            if title is not None and paragraph is not None:
                df = df.append({"title": title, "paragraph": paragraph}, ignore_index=True)
            # Set the current line as the title
            title = line.strip()
            # Reset the paragraph variable
            paragraph = ""
        else:
            # If the paragraph variable is not empty, add a line break before the new paragraph
            if paragraph:
                paragraph += "\n"
            # Add the line to the paragraph variable
            paragraph += line.strip()
    except:
        continue
# Add the last title and paragraph to the dataframe
df = df.append({"title": title, "paragraph": paragraph}, ignore_index=True)

df

Unnamed: 0,title,paragraph
0,<h2>RESEARCH|,<s7>© The Author(s) 2023. Open Access This a...
1,<h2>Open Access|,
2,<h3>BMC Health Services Research|,
3,<h1>Change in glycaemic control with structure...,
4,<h4>Roberta Lamptey,"<s4>1,2,3,19*"
5,"<h4>, Mary Amoakoh‑Coleman","<s4>3,4"
6,"<h4>, Mary Moffett Barker","<s4>5,6"
7,"<h4>, Samuel Iddi",<s4>7
8,"<h4>, | Michelle Hadjiconstantinou","<s4>5,6"
9,"<h4>, Melanie Davies","<s4>5,6,8,9"


In [151]:
df = df.replace('<h\d+>', '', regex=True)
df = df.replace('<s\d+>', '', regex=True)
df = df.replace('<p>', '', regex=True)
df['title'] = df['title'].str.replace(',', '')
df['title'] = df['title'].str.replace(', |', '')
df['title'] = df['title'].str.replace('|', '')
df = df.replace("• ", "")
df

  df['title'] = df['title'].str.replace(', |', '')
  df['title'] = df['title'].str.replace('|', '')


Unnamed: 0,title,paragraph
0,RESEARCH,© The Author(s) 2023. Open Access This artic...
1,Open Access,
2,BMC Health Services Research,
3,Change in glycaemic control with structured d...,
4,Roberta Lamptey,"1,2,3,19*"
5,Mary Amoakoh‑Coleman,34
6,Mary Moffett Barker,56
7,Samuel Iddi,7
8,Michelle Hadjiconstantinou,56
9,Melanie Davies,5689
