# Data Cleaning

Create and export a DataFrame containing each paper ID (sha) and its corresponding full text as a string. This will be used in the COVID-19 Research Papers Text Extraction notebook to obtain the most "important" sentences in the relevant papers.


In [1]:
import numpy as np
import pandas as pd
import json
import itertools

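Split up metadata.csv into separate dfs based on license type. Only papers that have a full text are kept, and rows whose `sha` field contains a `;` (i.e. more than one hash) are dropped.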

In [2]:
# Load the metadata and keep only the papers that ship with a full-text file.
df = pd.read_csv("data/metadata.csv")
df_with_full = df.loc[df['has_full_text'] == True].drop(columns='has_full_text')


def _select_license_papers(df_papers, license_type):
    """Return the rows of one license subset that have a single usable sha.

    Parameters
    ----------
    df_papers : pandas.DataFrame
        Metadata rows with (at least) the columns ``full_text_file`` and ``sha``.
    license_type : str
        Pattern matched against the *start* of ``full_text_file``
        (``Series.str.match`` anchors at the beginning, so ``"comm_use_subset"``
        does not pick up ``"noncomm_use_subset"``).

    Returns
    -------
    pandas.DataFrame
        The matching rows, minus those whose ``sha`` contains ``";"`` or is
        missing. Such a value cannot be turned into a single
        ``<sha>.json`` file name later on.
    """
    # na=False: a missing full_text_file simply does not belong to this license.
    in_license = df_papers['full_text_file'].str.match(license_type, na=False)
    # na=True: treat a missing sha as invalid instead of raising a TypeError,
    # which is what the former `";" in row['sha']` check did on NaN.
    unusable_sha = df_papers['sha'].str.contains(";", regex=False, na=True)
    return df_papers.loc[in_license & ~unusable_sha]


df_custom_license = _select_license_papers(df_with_full, "custom_license")
df_noncomm_use_subset = _select_license_papers(df_with_full, "noncomm_use_subset")
df_comm_use_subset = _select_license_papers(df_with_full, "comm_use_subset")
df_biorxiv_medrxiv = _select_license_papers(df_with_full, "biorxiv_medrxiv")

Define `json_to_body_string(body_json)`, which takes in the full-text file of a paper (its JSON, as loaded by `pd.read_json`) and returns all of its body text as a single string.

In [42]:
def json_to_body_string(body_json):
    """Concatenate the body paragraphs of one parsed paper into a single string.

    Parameters
    ----------
    body_json : pandas.DataFrame
        A paper's JSON file as loaded by ``pd.read_json(..., orient="index")``:
        one row per top-level JSON key, with the values held in column ``0``.

    Returns
    -------
    str
        Every paragraph's ``"text"`` value, each preceded by one space (so a
        non-empty result starts with a space); ``""`` when there are no
        paragraphs.

    Notes
    -----
    Paragraphs whose ``"text"`` is missing or is not a ``str`` are skipped and
    the offending type is appended to the notebook-level list ``not_strings``,
    which therefore has to exist before such a paragraph is encountered.

    The paragraphs are read directly from the parsed structure. Round-tripping
    them through ``json.dumps`` / ``pd.read_json`` needed an un-imported
    ``StringIO`` and let pandas coerce numeric-looking text to numbers.
    """
    paper_fields = body_json[0]
    # Look the paragraphs up by label when possible. Assumes the paragraph list
    # is stored under the "body_text" key -- TODO confirm against the data files.
    # Otherwise fall back to the 4th entry, which is what the previous
    # positional lookup `body_json[0][3]` selected.
    if "body_text" in paper_fields.index:
        paragraphs = paper_fields["body_text"]
    else:
        paragraphs = paper_fields.iloc[3]

    texts = []
    for paragraph in paragraphs:
        text = paragraph.get("text")
        if isinstance(text, str):
            texts.append(text)
        else:
            # Keep a record of what was skipped, for later inspection.
            not_strings.append(type(text))

    # Single join instead of repeated string concatenation; same output format.
    return "".join(" " + text for text in texts)

Define `compile_full_texts(df_license, license_type)`, which takes in
- df_license (one of the four license dfs) and
- license_type (its license type as a string)  

and adds the contents of each paper into full_texts_dict.

In [43]:
def compile_full_texts(df_license, license_type):
    """Read every paper of one license subset and store its body text.

    Parameters
    ----------
    df_license : pandas.DataFrame
        One of the per-license frames; its ``sha`` column supplies the file names.
    license_type : str
        Name of the license folder. Files are read from
        ``data/<license_type>/<license_type>/<sha>.json``.

    Returns
    -------
    None
        Results are written into the notebook-level dict ``full_texts_dict``
        (sha -> body text), which must already exist.
    """
    for sha in df_license["sha"]:
        # The license folder is nested inside a folder of the same name.
        json_path = "/".join(["data", license_type, license_type, sha + ".json"])
        paper = pd.read_json(json_path, orient="index")
        full_texts_dict[sha] = json_to_body_string(paper)

In [5]:
# NOTE: dead code, kept for reference only. This is an inline version of the loop
# that compile_full_texts runs, limited to df_custom_license and reading from a
# "CORD-19-research-challenge/" folder instead of "data/".
# full_texts_dict = {}
# not_strings = []
# for i in df_custom_license["sha"]:
#     temp_json = pd.read_json("CORD-19-research-challenge/custom_license/custom_license/" + i + ".json", orient="index")
#     full_text_string = json_to_body_string(temp_json)
#     full_texts_dict[i] = full_text_string

Compile the full texts into the dictionary `full_texts_dict`, which maps each paper's sha to its
contents as a string.

In [44]:
# Shared accumulators used by the helper functions:
#   full_texts_dict -> paper sha mapped to its body text
#   not_strings     -> types of the paragraph texts skipped by json_to_body_string
full_texts_dict = {}
not_strings = []

# Same four subsets, processed in the same order, one folder name per frame.
for license_df, license_type in (
    (df_custom_license, "custom_license"),
    (df_noncomm_use_subset, "noncomm_use_subset"),
    (df_comm_use_subset, "comm_use_subset"),
    (df_biorxiv_medrxiv, "biorxiv_medrxiv"),
):
    compile_full_texts(license_df, license_type)

In [45]:
# Sanity check: number of distinct shas whose text was compiled.
len(full_texts_dict)

27678

In [46]:
# One row per paper: the sha becomes the index and the text lands in a single
# column that pandas labels 0 (no column name is supplied).
dict_df = pd.DataFrame.from_dict(full_texts_dict, orient='index')
dict_df.head()

Unnamed: 0,0
aecbc613ebdab36753235197ffb4f35734b5ca63,"The patient (Fo, ) was a 58 year old mentally..."
212e990b378e8d267042753d5f9d4a64ea5e9869,Pathogenesis and Risk Factors J. ROBERT CANTE...
bf5d344243153d58be692ceb26f52c08e2bd2d2f,"In the pathogenesis of rheumatoid arthritis, ..."
ddd2ecf42ec86ad66072962081e1ce4594431f9c,Respiratory Tract Infections JERROLD J. ELLNE...
a55cb4e724091ced46b5e55b982a14525eea1c7e,"A cute bronchitis, an illness frequently enco..."


In [47]:
# Export with the sha index included; the path is relative to the working directory.
dict_df.to_csv("full_texts.csv")