# Requirements

In [1]:
import glob
import json
from operator import itemgetter
import pathlib
import warnings

# Article structure

The PDF documents have been processed by OCR software resulting in a JSON file per document.  The JSON object is a list of pages.  Each page has a list of lines, and each line has a text element.

In [2]:
# Load one OCR result as an example.  JSON text is UTF-8, so the encoding is
# stated explicitly instead of relying on the platform's locale default.
with open('data/1-papers-processed/5513.json', encoding='utf-8') as json_file:
    article_json = json.load(json_file)

In [3]:
# Text of the line at index 5 on the first page (index 0).
article_json[0]['lines'][5]['text']

'Pest categorisation of Pseudopityophthorus minutissimus'

# Article parser

We define a class that represents an article.  Its constructor reads a JSON file, and properties are defined for all the relevant metadata: the title, authors, abstract, panel (if any) and keywords (if any).

In [4]:
class Article:
    """Metadata parser for one OCR-processed article.

    The JSON file holds a list of pages; each page has a 'lines' list and
    each line has a 'text' element.  All metadata is looked up on the first
    page only, by searching for lines that start with known marker strings.
    """

    # Prefixes of the panel/organisation line.  That line terminates the
    # title and precedes the author list, so both properties share the list.
    _PANEL_LINE_STARTS = (
        'EFSA Panel',
        'European Food Safety Authority',
        'EFSA (European Food',
        'EFSA Food',
        'EFSA Scientific',
        'EFSA BIOHAZ',
    )

    def __init__(self, file_name):
        """Read the JSON document stored in `file_name`."""
        self._file_name = file_name
        # JSON text is UTF-8; do not depend on the platform's locale default.
        with open(file_name, encoding='utf-8') as json_file:
            self._article = json.load(json_file)

    @property
    def _first_page_lines(self):
        """List of line objects on the first page."""
        return self._article[0]['lines']

    def _line_nr_bracket(self, previous_line_start, next_line_start):
        """Locate the lines enclosed by a start marker and an end marker.

        Parameters
        ----------
        previous_line_start : sequence of str
            Prefixes of the line directly preceding the wanted lines.
        next_line_start : sequence of str
            Prefixes of the line directly following the wanted lines.

        Returns
        -------
        tuple of (int, int)
            Half-open range (start, end) of line indices on the first page.

        Raises
        ------
        ValueError
            If no end marker is found after a start marker.
        """
        start_markers = tuple(previous_line_start)
        end_markers = tuple(next_line_start)
        start_line_nr = None
        for line_nr, line in enumerate(self._first_page_lines):
            text = line['text']
            # A repeated start marker moves the start forward: the last
            # occurrence before the end marker wins.
            if text.startswith(start_markers):
                start_line_nr = line_nr + 1
            # End markers only count once a start marker has been seen.
            if start_line_nr is not None and text.startswith(end_markers):
                return start_line_nr, line_nr
        raise ValueError(f'object not found, markers: "{previous_line_start}" -> "{next_line_start}"')

    def _joined_text(self, start_line_nr, end_line_nr):
        """Join the text of first-page lines [start, end) with single spaces."""
        selected = self._first_page_lines[start_line_nr:end_line_nr]
        return ' '.join(line['text'] for line in selected)

    @property
    def title(self):
        """Title: the lines between the 'doi' line and the panel line.

        Raises ValueError('no title found') when the markers are missing.
        """
        try:
            start_line_nr, end_line_nr = self._line_nr_bracket(
                ['doi'], list(self._PANEL_LINE_STARTS))
        except ValueError as error:
            raise ValueError('no title found') from error
        return self._joined_text(start_line_nr, end_line_nr)

    @property
    def panel(self):
        """Text of the first line starting with 'EFSA Panel', or None."""
        for line in self._first_page_lines:
            if line['text'].startswith('EFSA Panel'):
                return line['text']
        return None

    @property
    def authors(self):
        """List of author names found between the panel line and 'Abstract'.

        Raises ValueError('no authors found') when the markers are missing.
        """
        try:
            start_line_nr, end_line_nr = self._line_nr_bracket(
                list(self._PANEL_LINE_STARTS), ['Abstract'])
        except ValueError as error:
            raise ValueError('no authors found') from error
        author_str = self._joined_text(start_line_nr, end_line_nr)
        # The last author is introduced by ' and ' rather than by a comma.
        return author_str.replace(' and ', ', ').split(', ')

    @property
    def abstract(self):
        """Abstract: the lines between 'Abstract' and 'Keywords:'/'Requestor'.

        Raises ValueError('no abstract found') when the markers are missing.
        """
        try:
            start_line_nr, end_line_nr = self._line_nr_bracket(
                ['Abstract'], ['Keywords:', 'Requestor'])
        except ValueError as error:
            raise ValueError('no abstract found') from error
        return self._joined_text(start_line_nr, end_line_nr)

    @property
    def keywords(self):
        """List of keywords; empty (with a warning) when none are found."""
        try:
            start_line_nr, end_line_nr = self._line_nr_bracket(
                ['Keywords:'], ['Requestor', '*', 'www.efsa'])
        except ValueError:
            warnings.warn(f'no keywords found for {self._file_name}')
            return []
        # The keywords start on the marker line itself, so include it.
        keyword_str = self._joined_text(start_line_nr - 1, end_line_nr)
        return keyword_str.replace('Keywords: ', '').split(', ')

Below is an example for one of the articles.

In [5]:
# Parse the example file with the Article class defined above.
article = Article('data/1-papers-processed/5513.json')

In [6]:
# The title is assembled from consecutive OCR lines joined with single spaces.
article.title

'Pest categorisation of Pseudopityophthorus minutissimus and P. pruinosus'

In [7]:
# The matching line is returned verbatim, so trailing punctuation is kept.
article.panel

'EFSA Panel on Plant Health (EFSA PLH Panel),'

In [8]:
# Authors are split on ', ' and ' and '; the names are otherwise left as the OCR produced them.
article.authors

['Claude Bragard',
 'Katharina Dehnen-Schmutz',
 'Francesco Di Serio',
 'Paolo Gonthier',
 'Marie-Agnes Jacques',
 'Josep Anton Jaques Miret',
 'Annemarie Fejer Justesen',
 'Alan MacLeod',
 'Christer Sven Magnusson',
 'Juan A Navas-Cortes',
 'Stephen Parnell',
 'Roel Potting',
 'Philippe Lucien Reignault',
 'Hans-Hermann Thulke',
 'Wopke Van der Werf',
 'Antonio Vicent Civera',
 'Jonathan Yuen',
 'Lucia Zappala',
 'Jean-Claude Gr egoire',
 'Vir ag Kert esz',
 'Panagiotis Milonas']

In [9]:
# Lines between the 'Abstract' line and the 'Keywords:'/'Requestor' line, joined with spaces.
article.abstract

'The Panel on Plant Health performed a pest categorisation of Pseudopityophthorus minutissimus and Pseudopityophthorus pruinosus, two well-defined insect species in the family Curculionidae, subfamily Scolytinae (Insecta: Coleoptera). They can be identified using taxonomic keys. P. minutissimus is present in parts of Canada and the USA, and P. pruinosus is present in parts of the USA, Guatemala, Honduras and Mexico. The main host plants of the two species are Quercus spp., but they also attack several other genera. The two species mostly colonise weakened or dead branches but can also attack the stems. They are mostly secondary pests but they vector the oak wilt fungus, Bretziella fagacearum, which causes heavy damage in American Quercus spp. populations. The fungus is mainly transmitted by the young adults during their maturation feeding on twigs, leaf petioles and young acorn stems. The beetles are polygamous and have two generations per year in most of their range. The main pathways

In [10]:
# Keywords are split on ', ' after dropping the 'Keywords: ' prefix.
article.keywords

['Bretziella fagacearum',
 'European Union',
 'oak bark beetle',
 'pest risk',
 'plant health',
 'plant pest',
 'quarantine']

We can now define a function that takes a file name as an argument, and returns a JSON representation of the metadata.

In [11]:
def extract_metadata(file_name):
    """Parse the article in `file_name` and serialise its metadata.

    Returns a JSON string, indented by two spaces, with the keys 'title',
    'authors', 'panel', 'abstract' and 'keywords' (in that order).  Any
    exception raised while parsing the article propagates to the caller.
    """
    parsed = Article(file_name)
    # The field names double as the attribute names on Article.
    fields = ('title', 'authors', 'panel', 'abstract', 'keywords')
    metadata = {field: getattr(parsed, field) for field in fields}
    return json.dumps(metadata, indent=2)

For example:

In [15]:
# Print the JSON string with the metadata extracted from another article.
print(extract_metadata('data/1-papers-processed/5865.json'))

{
  "title": "Safety assessment of the process \u2018POLY RECYCLING PET DIRECT IV+\u2019, used to recycle post-consumer PET into food contact materials",
  "authors": [
    "Vittorio Silano",
    "Jose Manuel Barat Baviera",
    "Claudia Bolognesi",
    "Andrew Chesson",
    "Pier Sandro Cocconcelli",
    "Riccardo Crebelli",
    "David Michael Gott",
    "Konrad Grob",
    "Alicja Mortensen",
    "Gilles Riviere",
    "Inger-Lise Steffensen",
    "Christina Tlustos",
    "Henk Van Loveren",
    "Laurence Vernis",
    "Holger Zorn",
    "Vincent Dudler",
    "Maria Rosaria Milana",
    "Constantine Papaspyrides",
    "Maria de Fatima Tavares Poc as",
    "Cristina Croera",
    "Evgenia Lampi"
  ],
  "panel": "EFSA Panel on Food Contact Materials, Enzymes and Processing Aids (CEP),",
  "abstract": "The EFSA Panel on Food Contact Materials, Enzymes and Processing Aids (CEP Panel) assessed the recycling process POLY RECYCLING PET direct IV+ (EU register number RECYC161). The input is hot 

# Processing

We can now create the JSON files containing the metadata for each of the articles.

In [13]:
# Output directory for the per-article metadata files.  exist_ok keeps the
# cell re-runnable (Restart & Run All); parents covers a missing 'data' dir.
metadata_dir = pathlib.Path('data/3-papers-metadata')
metadata_dir.mkdir(parents=True, exist_ok=True)

In [14]:
# Convert every OCR result into a metadata file.  A failing article is
# reported and tallied in `count` instead of aborting the whole run.
count = 0
input_files = sorted(glob.glob('data/1-papers-processed/*.json'))
for file_name in input_files:
    # The metadata file keeps the base name of its source file.
    output_name = pathlib.Path(file_name).name
    try:
        metadata_json = extract_metadata(file_name)
        with open(metadata_dir / output_name, 'w') as metadata_file:
            metadata_file.write(metadata_json + '\n')
    except Exception as error:
        print(f'{file_name}: {error}')
        count = count + 1
print(f'{count} problems')



data/1-papers-processed/6041.json: no title found
1 problems
