# Load Data

In [2]:
import numpy as np
import pandas as pd
import glob
import json

import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [5]:
# meta_df = pd.read_csv("metadata.csv")

In [8]:
# if you were to do this individually
# meta_df['pubmed_id'] = meta_df['pubmed_id'].astype(str)


In [12]:
meta_df = pd.read_csv('metadata.csv', dtype ={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str,
    'doi' : str
})

## Fetch All of JSON File Path

In [23]:
ls

COVID.DATA.LIC.AGMT.pdf  [34mcomm_use_subset[m[m/         metadata.readme
README.md                [34mcustom_license[m[m/          [34mnoncomm_use_subset[m[m/
Untitled.ipynb           json_schema.txt
[34mbiorxiv_medrxiv[m[m/         metadata.csv


In [27]:
pwd

'/Users/johnrick/Desktop/DataScienceLearn/blog_covid19'

In [34]:
all_json = glob.glob(f'{"/Users/johnrick/Desktop/DataScienceLearn/blog_covid19"}/**/*.json', recursive=True)

In [35]:
len(all_json)

33375

# Create Helper Functions

In [37]:
class FileReader:
    def __init__(self, file_path):
        with open(file_path) as file:
            content = json.load(file)
            self.paper_id = content['paper_id']
            self.abstract = []
            self.body_text = []
            #abstract
            for entry in content['abstract']:
                self.abstract.append(entry['text'])
            #body text
            for entry in content['body_text']:
                self.body_text.append(entry['text'])
            self.abstract = '\n'.join(self.abstract)
            self.body_text = '\n'.join(self.body_text)
    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'
first_row=FileReader(all_json[0])
print(first_row)

86a998617c077f4fe2ab26214995a3548fbc0fc5: The recent emergence of the Middle East respiratory syndrome (MERS)-CoV, a close relative of the Severe Acute respiratory syndrome (SARS)-CoV, both of which caused a lethal respiratory infection in hu... While most CoVs cause the common cold in humans, infection with two recently emerged CoVs, SARS-CoV and MERS-CoV, resulted in more severe pulmonary disease with alarmingly high case fatality rates [1]...


In [38]:
def get_breaks(content, length):
    data = ""
    words = content.split(' ')
    total_chars = 0
    
    #add break every length characters
    for i in range(len(words)):
        total_chars += len(words[i])
        if total_chars > length:
            data = data + "<br>" + words[i]
            total_chars = 0
        else:
            data = data + " " + words[i]
    return data

## Load into DataFrame

In [39]:
dict_ = {'paper_id': [], 'abstract': [], 'body_text': [], 'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}
for idx, entry in enumerate(all_json):
    if idx % (len(all_json) // 10) == 0:
        print(f'Processing index: {idx} of {len(all_json)}')
    content = FileReader(entry)
    
    # get metadata information
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    # no metadata, skip this paper
    if len(meta_data) == 0:
        continue
    
    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)
    
    # also create a column for the summary of abstract to be used in a plot
    if len(content.abstract) == 0: 
        # no abstract provided
        dict_['abstract_summary'].append("Not provided.")
    elif len(content.abstract.split(' ')) > 100:
        # abstract provided is too long for plot, take first 300 words append with ...
        info = content.abstract.split(' ')[:100]
        summary = get_breaks(' '.join(info), 40)
        dict_['abstract_summary'].append(summary + "...")
    else:
        # abstract is short enough
        summary = get_breaks(content.abstract, 40)
        dict_['abstract_summary'].append(summary)
        
    # get metadata information
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    
    try:
        # if more than one author
        authors = meta_data['authors'].values[0].split(';')
        if len(authors) > 2:
            # more than 2 authors, may be problem when plotting, so take first 2 append with ...
            dict_['authors'].append(". ".join(authors[:2]) + "...")
        else:
            # authors will fit in plot
            dict_['authors'].append(". ".join(authors))
    except Exception as e:
        # if only one author - or Null valie
        dict_['authors'].append(meta_data['authors'].values[0])
    
    # add the title information, add breaks when needed
    try:
        title = get_breaks(meta_data['title'].values[0], 40)
        dict_['title'].append(title)
    # if title was not provided
    except Exception as e:
        dict_['title'].append(meta_data['title'].values[0])
    
    # add the journal information
    dict_['journal'].append(meta_data['journal'].values[0])
    
df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])
df_covid.head()

Processing index: 0 of 33375
Processing index: 3337 of 33375
Processing index: 6674 of 33375
Processing index: 10011 of 33375
Processing index: 13348 of 33375
Processing index: 16685 of 33375
Processing index: 20022 of 33375
Processing index: 23359 of 33375
Processing index: 26696 of 33375
Processing index: 30033 of 33375
Processing index: 33370 of 33375


Unnamed: 0,paper_id,abstract,body_text,authors,title,journal,abstract_summary
0,ab680d5dbc4f51252da3473109a7885dd6b5eb6f,,The evolutionary history of humans is characte...,"Scarpino, S.V.",Evolutionary Medicine IV. Evolution and<br>Em...,Encyclopedia of Evolutionary Biology,Not provided.
1,6599ebbef3d868afac9daa4f80fa075675cf03bc,"International aviation is growing rapidly, res...","Sixty years ago, civil aviation was an infant ...","Macintosh, Andrew. Wallace, Lailey",International aviation emissions to 2025: Can...,Energy Policy,"International aviation is growing rapidly,<br..."
2,eb5c7f3ff921ad6469b79cc8a3c122648204ece4,,Acute infections of the gastrointestinal tract...,"Booth, I.W.. McNeish, A.S.",Mechanisms of diarrhoea,Baillière's Clinical Gastroenterology,Not provided.
3,b87b790c96c75faa22a085cb560f7b3d8e018b24,,"There are three domains of life-Bacteria, Arch...","Louten, Jennifer",Chapter 3 Features of Host Cells Cellular and...,Essential Human Virology,Not provided.
4,68c0bb1989b6ca2b38da32a0d992027db39f80bc,Hong Kong's new Police Commissioner Chris Tang...,"It is also noteworthy that Tang, who was once ...","Hui, Victoria Tin-bor",Beijing's Hard and Soft Repression in Hong Kong,Orbis,Hong Kong's new Police Commissioner Chris Tan...
