In [1]:
import numpy as np
import pandas as pd
import json
import itertools

# COVID-19 Research Paper Data Cleaning

- Read in the metadata CSV and select only the papers with full texts available.
- Split the DataFrame into four DataFrames according to the license type, so it's easier to find the corresponding full text file.
- Drop rows where the "sha" corresponds to more than one full text file.


In [None]:
"""Split up metadata.csv into separate dfs based on licence type."""

# Load the metadata and keep only the papers flagged as having a full text;
# the flag column carries no information after filtering, so it is dropped.
df = pd.read_csv("CORD-19-research-challenge/metadata.csv")
df_with_full = df[df['has_full_text'] == True].drop(columns='has_full_text')

# One frame per license type, selected on the "full_text_file" column
# (str.match anchors the pattern at the start of each value).
full_text_source = df_with_full['full_text_file']
df_custom_license = df_with_full[full_text_source.str.match("custom_license")]
df_noncomm_use_subset = df_with_full[full_text_source.str.match("noncomm_use_subset")]
df_comm_use_subset = df_with_full[full_text_source.str.match("comm_use_subset")]
df_biorxiv_medrxiv = df_with_full[full_text_source.str.match("biorxiv_medrxiv")]

def _drop_multi_sha_rows(df_license):
    """Return df_license without the rows whose "sha" lists several hashes.

    A ";" inside "sha" means the row corresponds to more than one full text
    file, so it cannot be mapped to a single JSON file.

    Parameters
    ----------
    df_license : pandas.DataFrame
        One of the per-license frames; must contain a "sha" column.

    Returns
    -------
    pandas.DataFrame
        The rows of df_license whose "sha" contains no ";".
    """
    # astype(str) makes the check safe for missing values (NaN becomes "nan",
    # which contains no ";"), where `";" in value` would raise a TypeError.
    # regex=False matches the literal ";" character.
    has_multiple_shas = df_license['sha'].astype(str).str.contains(";", regex=False)
    return df_license.loc[~has_multiple_shas]


df_custom_license = _drop_multi_sha_rows(df_custom_license)
df_noncomm_use_subset = _drop_multi_sha_rows(df_noncomm_use_subset)
df_comm_use_subset = _drop_multi_sha_rows(df_comm_use_subset)
df_biorxiv_medrxiv = _drop_multi_sha_rows(df_biorxiv_medrxiv)

In [None]:
"""Method takes in the full text file of paper and returns all the text as a string."""

def json_to_body_string(body_json, skipped_types=None):
    """Concatenate the text of every body paragraph of one paper into a single string.

    Parameters
    ----------
    body_json : pandas.DataFrame or nested sequence
        Parsed paper file. ``body_json[0]`` is taken as the column of
        top-level values and its entry at position 3 as the list of body
        paragraphs, each a dict with a "text" key.
        NOTE(review): position 3 is presumably the "body_text" field --
        confirm against the JSON schema of the paper files.
    skipped_types : list, optional
        Receives the type of every paragraph text that is not a ``str``.
        When omitted, the notebook-level ``not_strings`` list is used if it
        has been defined; otherwise non-string texts are skipped silently.

    Returns
    -------
    str
        The paragraph texts joined in order, with no separator.
    """
    if skipped_types is None:
        # Backward compatible with the notebook's global diagnostics list,
        # without failing when that cell has not been run yet.
        skipped_types = globals().get("not_strings")

    top_level = body_json[0]
    # Integer keys on a label-indexed Series rely on a deprecated positional
    # fallback, so positional access is made explicit for pandas objects.
    body_text = top_level.iloc[3] if isinstance(top_level, pd.Series) else top_level[3]

    # Read the paragraphs directly instead of round-tripping through
    # json.dumps / pd.read_json, which can re-infer the dtype of "text".
    pieces = []
    for paragraph in body_text:
        text = paragraph.get("text")
        if isinstance(text, str):
            pieces.append(text)
        elif skipped_types is not None:
            skipped_types.append(type(text))
    return "".join(pieces)

In [None]:
"""Method takes in df_license (one of the four license dfs) and license_type (its license type as a string),
and adds the contents of each paper into full_texts_dict.
"""

def compile_full_texts(df_license, license_type):
    """Read the full text file of every paper in df_license into full_texts_dict.

    df_license is one of the per-license frames and license_type is the name
    of its directory. Each "sha" value is used both as the JSON file name and
    as the key under which the paper's body string is stored.
    """
    for sha in df_license["sha"]:
        # Files live at CORD-19-research-challenge/<license>/<license>/<sha>.json
        json_path = ("CORD-19-research-challenge/" + license_type +
                     "/" + license_type + "/" + sha + ".json")
        paper_json = pd.read_json(json_path, orient="index")
        full_texts_dict[sha] = json_to_body_string(paper_json)

In [None]:
# full_texts_dict = {}
# not_strings = []
# for i in df_custom_license["sha"]:
#     temp_json = pd.read_json("CORD-19-research-challenge/custom_license/custom_license/" + i + ".json", orient="index")
#     full_text_string = json_to_body_string(temp_json)
#     full_texts_dict[i] = full_text_string

In [None]:
"""Compiles the full texts into the dictionary full_texts_dict, which contains each paper "sha" mapped to its
contents as a string.
"""

# Start from empty containers so re-running this cell rebuilds everything.
full_texts_dict = {}
not_strings = []

# Each per-license frame paired with the name of the directory holding its files.
license_frames = (
    (df_custom_license, "custom_license"),
    (df_noncomm_use_subset, "noncomm_use_subset"),
    (df_comm_use_subset, "comm_use_subset"),
    (df_biorxiv_medrxiv, "biorxiv_medrxiv"),
)
for license_df, license_name in license_frames:
    compile_full_texts(license_df, license_name)

In [None]:
# Number of entries (one per paper "sha") collected in full_texts_dict.
len(full_texts_dict)