# Part 1: Remove those articles from CORD-19 which are not in CORD-19-on-FHIR.

## Inputs and outputs

In [150]:
# Input and output locations used by both parts of this notebook.
DATA_PATH = "./dataset/metadata.csv" # input: CORD-19 metadata (source: https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge?select=metadata.csv)
FHIR_DATA_PATH = "./dataset/PUBMED/" # input: folder with the .ttl files; unzip "PUBMED.zip" here (source: https://github.com/fhircat/CORD-19-on-FHIR)
FHIR_SUBSET_PATH = './dataset/metadata_fhir_subset.csv'  # output (intermediate result); read back at the start of Part 2
METADATA_WITH_OPENCITATIONS = './dataset/metadata_with_opencitations.csv' # output: metadata with the OpenCitations results, written at the end of Part 2

### CORD-19

* `metadata.csv`: you can download the full metadata file from https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge?select=metadata.csv
* New permalink for replication: https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_2020-10-12.tar.gz (results differ slightly)

### CORD-on-FHIR

1. You can download the complete PUBMED data set (.ttl files) from the CORD-19-on-FHIR GitHub repository.

2. Download all the PubTator abstract files in the "CORD-19-Abstracts" folder (https://github.com/fhircat/CORD-19-on-FHIR/tree/master/datasets/Pubtator_RDF/CORD-19-Abstracts) and copy all the .ttl files into a single folder (example: PUBMED).

Folder name (GitHub): CORD-19-Abstracts
Zip file names:
1. PubTator_Abstracts_comm_use_subset_2020-04-09.zip
2. PubTator_Abstracts_custom_license_2020-04-09.zip
3. PubTator_Abstracts_noncomm_use_subset_2020-04-09.zip
4. PubTator_Abstracts_unknown_license_2020-04-09.zip



In [151]:
import pandas as pd
import os
from rdflib import Graph

## Reading _all_ documents - NEW VERSION

In [152]:
FHIR_SUBSET_PATH_ALL = './dataset/metadata_fhir_subset_ALL.csv'  # output (intermediate result): paired subset computed from the whole metadata file

In [153]:
# Load the complete CORD-19 metadata file. dtype=object stops pandas from
# inferring numeric types; presumably this keeps `pubmed_id` as text so it can
# be compared with the .ttl file names later on — confirm.
documents_all = pd.read_csv(DATA_PATH, dtype=object)


In [154]:
len(documents_all)

302449

In [155]:
import glob
# Identifiers of the documents available in CORD-19-on-FHIR: the base name of
# every .ttl file in FHIR_DATA_PATH with its extension removed. These names
# are what the `pubmed_id` column is matched against.
# os.path.join works whether or not FHIR_DATA_PATH ends with a separator, and
# os.path.splitext strips only the final ".ttl" suffix rather than every
# occurrence of ".ttl" inside the file name.
fhir = [
    os.path.splitext(os.path.basename(ttl_path))[0]
    for ttl_path in glob.glob(os.path.join(FHIR_DATA_PATH, "*.ttl"))
]

In [156]:
# Keep only the metadata rows whose `pubmed_id` matches one of the FHIR
# file names collected in `fhir`.
documents_all_2 = documents_all.loc[
    lambda frame: frame['pubmed_id'].isin(fhir)
]

### The number of paired documents

In [157]:
len(documents_all_2)

31761

In [158]:
# Persist the paired subset as UTF-8 CSV without the index column.
# Missing values are replaced by 0 before writing.
documents_all_2.fillna(0).to_csv(FHIR_SUBSET_PATH_ALL,encoding='utf-8',index=False)   # metadata_fhir_subset (all documents)

## Replication of V1 code - only first 10% of documents from CORD19 were used in the pairing process

In [159]:
# Output (intermediate result): paired subset computed from the first tenth of the metadata rows only.
FHIR_SUBSET_PATH_V1_new_code = './dataset/metadata_fhir_subset_new_code.csv'

In [202]:
# Number of rows in the first tenth of the metadata file, rounded down.
maxRows = len(documents_all) // 10

In [203]:
maxRows

30244

In [204]:
# Take the first `maxRows` rows of the metadata by position.
# NOTE(review): the original trailing note "#13308" does not match the
# maxRows value shown in this notebook — presumably left over from an older
# metadata release; confirm.
documents_all_first = documents_all.iloc[:maxRows]

In [205]:
# Same pairing as for the full file, restricted to the first-tenth slice:
# keep rows whose `pubmed_id` matches one of the FHIR file names.
documents_all_3 = documents_all_first.loc[
    lambda frame: frame['pubmed_id'].isin(fhir)
]


### The number of paired documents

In [206]:
len(documents_all_3)

5153

In [163]:
# Persist the first-tenth paired subset as UTF-8 CSV without the index column.
# Missing values are replaced by 0 before writing.
documents_all_3.fillna(0).to_csv(FHIR_SUBSET_PATH_V1_new_code,encoding='utf-8',index=False)   # metadata_fhir_subset (V1 replication)

# Part 2: Citation extraction 

## Inputs

In [174]:
import pandas as pd
from Opencitations.OpenCitations import retrieveOpenCitationFromAPI #This extracts citations through the Opencitations API and stores them locally in the Opencitations/data/metadata.json file. To extract new citation count every time from the API, remove the docs folder inside the citation_Extraction_and_Count/Opencitations folder, then run it again.
from Opencitations.OpenCitationExtraction import Open_Citation_Extraction #This file tries to match the DOI from the metadata file to the citation list in the cached JSON file with OpenCitations responses

ModuleNotFoundError: No module named 'Opencitations'

In [None]:
# Load the paired metadata subset, skipping malformed CSV lines.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; its
# replacement is `on_bad_lines="skip"`. An unknown keyword raises TypeError,
# so fall back to the old spelling on pandas versions that predate the new one.
# NOTE(review): Part 1 of this notebook writes FHIR_SUBSET_PATH_ALL and
# FHIR_SUBSET_PATH_V1_new_code but not FHIR_SUBSET_PATH — confirm that this
# file exists and is the subset intended here.
try:
    research_metadata_df = pd.read_csv(FHIR_SUBSET_PATH, on_bad_lines="skip")
except TypeError:
    research_metadata_df = pd.read_csv(FHIR_SUBSET_PATH, error_bad_lines=False)

In [None]:
# Remove every row whose DOI occurs more than once (keep=False drops all
# copies, not just the repeats).
# drop_duplicates returns a new frame; the original statement discarded the
# result, so the duplicates were never actually removed.
research_metadata_df = research_metadata_df.drop_duplicates(subset=['doi'], keep=False)

In [None]:
# Discard every row that has a missing value in any column.
research_metadata_df = research_metadata_df.dropna(axis=0, how='any')

In [None]:
# Keep only rows whose abstract is at least 3 characters long.
# The original called DataFrame.drop without assigning the result, so no rows
# were removed; the filtered frame is now stored back.
# astype(str) guards against non-string cells, for which len() would fail.
# NOTE(review): this presumably also removes the 0 placeholders introduced by
# fillna(0) when the subset CSV was written — confirm.
research_metadata_df = research_metadata_df[
    research_metadata_df['abstract'].astype(str).str.len() >= 3
]

In [None]:
# To clear the cache manually, remove the docs folder inside the Opencitations/docs folder.
#output_path = "Opencitations/data"
# NOTE(review): according to the comment on its import, this helper fetches
# citations through the OpenCitations API and stores them locally;
# clearCache=True presumably forces a fresh download — confirm in the helper.
retrieveOpenCitationFromAPI(clearCache=True)

In [None]:
# Location of the locally cached OpenCitations responses (JSON).
json_path = './Opencitations/docs/Opencitations/data/metadata.json'
# According to the comment on its import, the helper matches the DOIs of the
# metadata frame against the citation list in the cached JSON file.
# NOTE(review): how the helper uses the third argument (the subset CSV path)
# is not visible here — confirm.
Opencitation_df = Open_Citation_Extraction(json_path, research_metadata_df, FHIR_SUBSET_PATH)

In [None]:
FHIR_SUBSET_PATH

In [None]:
Opencitation_df

In [None]:
# Count missing entries per column. isin(['NaN']) only matches the literal
# string 'NaN' and never real missing values (NaN/None), so the original
# check could report 0 while the frame still had gaps. Count both kinds.
(Opencitation_df.isna() | Opencitation_df.isin(['NaN'])).sum()

In [None]:
# Write the final result as UTF-8 CSV without the index column.
# Missing values are replaced by 0 before writing.
Opencitation_df.fillna(0).to_csv(METADATA_WITH_OPENCITATIONS,encoding='utf-8',index=False) # metadata_with_opencitations