In [33]:
import glob
import html
import itertools as it
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

### Some experiments with pandas

In [43]:
# Location of the metadata table, relative to the notebook's working directory
# (the project root) -- the same convention as the dataset directories used
# further down -- so the notebook is not tied to one machine's absolute path.
metadata_path = './CORD-19-research-challenge/metadata.csv'

meta_df = pd.read_csv(metadata_path)

In [44]:
# (rows, columns) of the metadata table as loaded
meta_df.shape

(44220, 15)

In [45]:
# preview the first rows to see which columns are available
meta_df.head()

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file
0,,Elsevier,Intrauterine virus infections and congenital h...,10.1016/0002-8703(72)90077-4,,4361535.0,els-covid,Abstract The etiologic basis for the vast majo...,1972-12-31,"Overall, James C.",American Heart Journal,,,False,custom_license
1,,Elsevier,Coronaviruses in Balkan nephritis,10.1016/0002-8703(80)90355-5,,6243850.0,els-covid,,1980-03-31,"Georgescu, Leonida; Diosi, Peter; Buţiu, Ioan;...",American Heart Journal,,,False,custom_license
2,,Elsevier,Cigarette smoking and coronary heart disease: ...,10.1016/0002-8703(80)90356-7,,7355701.0,els-covid,,1980-03-31,"Friedman, Gary D",American Heart Journal,,,False,custom_license
3,aecbc613ebdab36753235197ffb4f35734b5ca63,Elsevier,Clinical and immunologic studies in identical ...,10.1016/0002-9343(73)90176-9,,4579077.0,els-covid,"Abstract Middle-aged female identical twins, o...",1973-08-31,"Brunner, Carolyn M.; Horwitz, David A.; Shann,...",The American Journal of Medicine,,,True,custom_license
4,,Elsevier,Epidemiology of community-acquired respiratory...,10.1016/0002-9343(85)90361-4,,4014285.0,els-covid,Abstract Upper respiratory tract infections ar...,1985-06-28,"Garibaldi, Richard A.",The American Journal of Medicine,,,False,custom_license


In [37]:
# Keep only the rows that have a `sha` value. The result is reassigned instead
# of using inplace=True, so the frame object displayed by earlier cells is not
# mutated; the name `meta_df` is kept because later cells read it.
meta_df = meta_df.dropna(subset=['sha'])
meta_df.shape

(28462, 15)

In [38]:
# Disabled check: would list the rows whose `sha` repeats that of an earlier row.
# meta_df[meta_df.duplicated(subset=['sha'])]
# Distinct values found in the `full_text_file` column.
meta_df.full_text_file.unique()

array(['custom_license', 'noncomm_use_subset', 'comm_use_subset',
       'biorxiv_medrxiv'], dtype=object)

### And now for the serious business: loading the documents :D

In [39]:
class Document:
    """One paper of the collection: its id, abstract and body text.

    Attributes:
        paper_id: value of the ``paper_id`` field of the source JSON.
        abstract: abstract paragraphs joined with newlines ('' if there are none).
        body_text: body paragraphs joined with newlines.
    """

    def __init__(self, paper_id, abstract, body_text):
        self.paper_id = paper_id
        self.abstract = abstract
        self.body_text = body_text

    @staticmethod
    def _join_paragraphs(records):
        """Join the ``text`` entry of every record with newlines."""
        return '\n'.join(record['text'] for record in records)

    @classmethod
    def from_json(cls, path):
        """Build a Document from a JSON file.

        ``abstract`` and ``body_text`` are expected to be lists of records,
        each carrying a ``text`` entry. ``abstract`` is optional: when the key
        is missing (or null) the abstract is the empty string.

        Args:
            path: path of the JSON file to read.

        Returns:
            A new ``Document``.

        Raises:
            KeyError: if ``paper_id``, ``body_text`` or a record's ``text``
                entry is missing.
        """
        # Be explicit about the encoding: without it open() falls back to the
        # platform default, which is not UTF-8 on every system (e.g. Windows)
        # and then breaks or garbles non-ASCII text.
        with open(path, 'r', encoding='utf-8') as fd:
            data = json.load(fd)

        paper_id = data['paper_id']
        abstract = cls._join_paragraphs(data.get('abstract') or [])
        body_text = cls._join_paragraphs(data['body_text'])
        return cls(paper_id, abstract, body_text)

    def __repr__(self):
        # Only the first 200 characters of each part, to keep the output short.
        return f'{self.paper_id}: {self.abstract[:200]} ... {self.body_text[:200]} ...'

    def _repr_html_(self):
        """HTML rendering: one ``<p>`` per paragraph under each heading.

        All document text is HTML-escaped so that characters such as ``<`` or
        ``&`` are displayed literally instead of being parsed as markup.
        """
        paper_html = f'<b>Paper ID:</b> {html.escape(str(self.paper_id))}'
        abstract_html = ['<p>' + html.escape(record) + '</p>'
                         for record in self.abstract.split('\n')]
        abstract_html = '<h3>' + 'Abstract' + '</h3>' + ''.join(abstract_html)
        body_text_html = ['<p>' + html.escape(record) + '</p>'
                          for record in self.body_text.split('\n')]
        body_text_html = '<h3>' + 'Body text' + '</h3>' + ''.join(body_text_html)
        return paper_html + abstract_html + body_text_html

In [40]:
class CollectionLoader:
    """Iterable over the documents stored as JSON files under several directories.

    The file list is built once, at construction time; the files themselves
    are only read and parsed while iterating.

    Args:
        dirs: sequence of directory paths; each one is searched recursively
            for ``*.json`` files.
        spec: optional ':'-separated string with one entry per directory, in
            the same order as ``dirs``. A numeric entry is the maximum number
            of files taken from that directory (``0`` takes none); any other
            entry, including an empty one, means "no limit". An empty ``spec``
            means no limit for every directory.

    Raises:
        ValueError: if ``spec`` is non-empty and its number of entries differs
            from the number of directories.
    """

    def __init__(self, dirs, spec=''):
        limits = self._parse_spec(spec, dirs)

        docfiles = []
        for dirname, limit in zip(dirs, limits):
            # glob returns matches in an unspecified, OS-dependent order; sort
            # them so that "the first N files" is the same set on every run.
            dirfiles = sorted(glob.glob(f'{dirname}/**/*.json', recursive=True))
            # Slicing with None keeps everything, while 0 keeps nothing; an
            # `or` fallback here would wrongly turn a limit of 0 into "all".
            docfiles.extend(dirfiles[:limit])

        self.docfiles = docfiles

    @staticmethod
    def _parse_spec(spec, dirs):
        """Translate ``spec`` into one limit (int or None) per directory."""
        if not spec:
            return [None] * len(dirs)

        # Surrounding whitespace is ignored so that '2 : 1' behaves like '2:1'.
        # isdecimal() accepts exactly the digit characters int() can convert.
        entries = [entry.strip() for entry in spec.split(':')]
        limits = [int(entry) if entry.isdecimal() else None
                  for entry in entries]

        if len(dirs) != len(limits):
            raise ValueError('length of dirs does not match length of spec')

        return limits

    def __iter__(self):
        # Parse lazily, one file at a time.
        for fname in self.docfiles:
            yield Document.from_json(fname)

#### Basic usage 



In [41]:
# Directories to read documents from. Every directory is searched recursively,
# so the json files may sit directly inside it or in any of its subdirectories.

# Alternative set of directories (disabled):
# dirs = ('./CORD-19-research-challenge/custom_license',
#         './CORD-19-research-challenge/noncomm_use_subset',
#         './CORD-19-research-challenge/comm_use_subset',
#         './CORD-19-research-challenge/biorxiv_medrxiv')

# Directories currently in use (the names suggest reduced samples -- confirm).
dirs = ('./dataset/noncomm_use100', 
        './dataset/comm_use100',
        './dataset/biorxiv_medrxiv100')

# Pass the directories together with a spec string: one ':'-separated entry per
# directory, in the same order as `dirs`. A numeric entry is the number of json
# files taken from the corresponding directory.
collection_loader = CollectionLoader(dirs, spec='2:1:3')
# list() iterates the loader, which reads and parses every selected file now.
collection = list(collection_loader)

# sanity check: print one document and the total number of documents loaded
print(collection[3])
print('number of documents:', len(collection))

00d16927588fb04d4be0e6b269fc02f0d3c2aa7b: Infectious bronchitis (IB) causes significant economic losses in the global poultry industry. Control of infectious bronchitis is hindered by the genetic diversity of the causative agent, infectious b ... Infectious bronchitis (IB), which is caused by infectious bronchitis virus (IBV), is one of the most important diseases of poultry, causing severe economic losses worldwide. 8 Clinical signs of diseas ...
number of documents: 6


In [42]:
# Rich output (only available in Jupyter): as the last expression of a cell the
# document is rendered through its `_repr_html_` method, whereas `print` uses
# the plain-text `__repr__`.
collection[3]

### TODO: Preprocessing pipeline