# Extract semantic relations from SemRep files

In [None]:
import os
import json
import pandas as pd
import numpy as np
import logging
import urllib, sys
import urllib.parse

from tqdm import tqdm
from pathlib import Path
from semrep import SemRep
from utils import *

from lxml import etree, objectify
from datetime import datetime
from urllib.request import urlopen

# Emit every log record at DEBUG level and above, prefixed with its severity.
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG)

# Directory whose entries are each handed to SemRep by the parsing step.
SEMREP_DIR = './data/CORD-19/2020-09-28/CORD-19.SemRep'
# Directory that receives one JSON file per SemRep input file.
JSON_DIR = './data/json'
# CSV providing the cord_uid, pubmed_id and publish_time columns.
# NOTE(review): this path uses lowercase 'cord-19' and the date 2020-09-25,
# unlike the other CORD-19 paths (2020-09-28) -- confirm this is intentional.
METADATA_FILE = './data/cord-19/2020-09-25/metadata2.csv'
# JSON file whose 'generic_concepts' entry is passed to is_novel().
CONSTRAINTS = './conf/constraints.json'
# Destination of the final gzip-compressed, header-less TSV.
OUTPUT_FILE = './data/CORD-19/2020-09-28/sub_rel_obj_pyear_edat_pmid_sent_id_sent.tsv.gz'

## Process files

In [None]:
logging.info('Parse SemRep files...')

# Convert every entry of SEMREP_DIR into a JSON file of the same base name
# inside JSON_DIR (created on demand).
input_dir = SEMREP_DIR
output_dir = JSON_DIR
Path(output_dir).mkdir(parents=True, exist_ok=True)
files = os.listdir(input_dir)

for infile in tqdm(files):
    input_file = os.path.join(input_dir, infile)
    # Everything after the first extension separator is replaced by '.json'.
    output_file = os.path.join(output_dir, infile.split(os.extsep, 1)[0] + '.json')

    # Run the extraction and serialisation BEFORE opening the output file:
    # opening in 'w' mode truncates it, so a failure in either step would
    # otherwise leave an empty JSON file behind for the loading step.
    semrep = SemRep(input_file)
    payload = json.dumps(semrep.extraction(), indent=4)

    with open(output_file, 'w') as file:
        file.write(payload)

In [None]:
logging.info('Read JSON files...')

input_dir = JSON_DIR
files = os.listdir(input_dir)
all_relations = []


def _relation_tuple(rel):
    """Flatten one relation dict into an 11-item tuple.

    The predicate is upper-cased and prefixed with 'NEG_' whenever the
    relation's 'negation' field is not the empty string.
    """
    predicate = rel['predicate'].upper()
    if rel['negation'] != '':
        predicate = 'NEG_' + predicate
    return (rel['subject_cui'], rel['subject_label'], rel['subject_sem_type'],
            rel['subject_geneid'], predicate,
            rel['object_cui'], rel['object_label'], rel['object_sem_type'],
            rel['object_geneid'], rel['sent_id'], rel['sent_text'])


for infile in tqdm(files):
    input_file = os.path.join(input_dir, infile)
    with open(input_file) as file:
        doc = json.load(file)

    # Append once per document key. Appending after the inner loop would
    # keep only the last document of each file and would fail (or reuse
    # stale values) for a file that contains no documents at all.
    for key, value in doc.items():
        tuple_relations = [_relation_tuple(rel) for rel in value]
        all_relations.append({'cord_uid': key, 'relations': tuple_relations})

In [None]:
logging.info('Create relations...')

# Load the constraints file; its 'generic_concepts' entry is handed to
# is_novel() for both the subject and the object CUI.
with open(CONSTRAINTS) as fh:
    data = json.load(fh)

allow_lst = data['generic_concepts']

# Flatten the per-document relation lists into one list of 14-item rows:
# cord_uid, the four subject fields + subject novelty, the predicate,
# the four object fields + object novelty, then sentence id and text.
relations = [
    (
        entry['cord_uid'],
        *rel[:4], is_novel(rel[0], allow_lst),
        rel[4],
        *rel[5:9], is_novel(rel[5], allow_lst),
        rel[9], rel[10],
    )
    for entry in all_relations
    for rel in entry['relations']
]

In [None]:
logging.info('Create data frame...')

# Column names, in the same order as the fields of each relation row.
relation_columns = [
    'cord_uid',
    'subject_cui', 'subject_label', 'subject_sem_type', 'subject_geneid', 'subject_novelty',
    'predicate',
    'object_cui', 'object_label', 'object_sem_type', 'object_geneid', 'object_novelty',
    'sent_id', 'sent_text',
]

# One data frame row per relation.
df = pd.DataFrame(relations, columns=relation_columns)

In [None]:
logging.info('Read and merge metadata file...')

# Load only the three columns needed, keep the first row seen for each
# cord_uid, and have pandas parse publish_time as a date.
metadata = pd.read_csv(
    METADATA_FILE,
    usecols=['cord_uid', 'pubmed_id', 'publish_time'],
    dtype={'cord_uid': 'str', 'pubmed_id': 'str', 'publish_time': 'str'},
    parse_dates=['publish_time'],
    low_memory=False,
).drop_duplicates(subset=['cord_uid'])

# Left join: every relation row is kept, with metadata attached where the
# cord_uid is known.
df = df.merge(metadata, how='left', on='cord_uid')

In [None]:
logging.info('Modify data frame...')

# Treat empty strings as missing values.
df.replace('', np.nan, inplace=True)

# Parse the publication date once and derive both textual forms from it.
publish_dates = pd.to_datetime(df['publish_time'], format='%Y-%m-%d')
df['publish_year'] = publish_dates.dt.strftime('%Y')
df['publish_time'] = publish_dates.dt.strftime('%Y-%m-%d')

# Concatenate CUI and gene IDs into one '|'-separated identifier string:
# missing parts contribute nothing, stray leading/trailing separators are
# stripped, and commas are turned into '|'.
for role in ('subject', 'object'):
    cui_col = role + '_cui'
    df[cui_col] = (
        df[cui_col]
        .str.cat(df[role + '_geneid'], na_rep='', sep='|')
        .str.strip('|')
        .str.replace(',', '|', regex=False)
    )

# Filter by concept's novelty: both ends of the relation must be flagged 1.
df = df[(df['subject_novelty'] == 1) & (df['object_novelty'] == 1)]

# Select the output fields. The explicit copy detaches the result from the
# filtered frame so that later column assignments operate on an independent
# object instead of a slice (avoids SettingWithCopyWarning).
df = df[['subject_cui', 'subject_label', 'subject_sem_type', 'predicate', 'object_cui', 'object_label',
         'object_sem_type', 'publish_year', 'publish_time', 'pubmed_id', 'sent_id', 'sent_text']].copy()

In [None]:
# Collect the distinct, non-missing PubMed IDs as strings; they are used to
# look up the EDATs.
unique_pmids = df['pubmed_id'].dropna().unique().tolist()
pmids = list(map(str, unique_pmids))

In [None]:
logging.info('Retrieve data from NCBI server...')


def chunker(seq, size):
    """Return a generator over consecutive slices of `seq`, each holding at most `size` items."""
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))


# Retrieve EDATs from NCBI server
# Store in file for potential later use (one "<pmid>\t<edat>" line per record).
# The `with` blocks guarantee that the file and every HTTP response are
# closed even when a request or a write fails.
with open('pmids.txt', 'w') as fh:
    for group in tqdm(chunker(pmids, 300)):
        efetch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?&db=pubmed&retmode=xml&id=%s" % (','.join(group))
        # Entities are not resolved while parsing the remote XML.
        p = etree.XMLParser(remove_blank_text=True, resolve_entities=False)
        with urlopen(efetch) as handle:
            root = etree.parse(handle, p).getroot()

        for record in root:
            try:
                pmid = record.xpath(".//PMID/text()")[0]
                date = record.xpath(".//PubMedPubDate[@PubStatus='pubmed']")[0]
                year = date.xpath('./Year/text()')[0]
                month = date.xpath('./Month/text()')[0]
                day = date.xpath('./Day/text()')[0]
                edat = datetime.strptime('-'.join([year, month, day]), '%Y-%m-%d').strftime('%Y-%m-%d')
            except (IndexError, ValueError) as err:
                # Best effort: a record lacking one of the expected nodes
                # (IndexError) or carrying an unparsable date (ValueError)
                # is skipped, but no longer silently.
                logging.debug('Skipping record without usable EDAT: %s', err)
                continue
            fh.write(pmid + '\t' + edat + '\n')

In [None]:
# Load the stored tab-separated (pmid, edat) pairs, both kept as strings,
# and turn them into a pmid -> edat lookup table.
pmid_df = pd.read_csv(
    './pmids.txt',
    sep='\t',
    header=None,
    names=['pmid', 'edat'],
    dtype={'pmid': 'str', 'edat': 'str'},
)
pmid2edat = dict(zip(pmid_df['pmid'], pmid_df['edat']))

In [None]:
logging.info('Update data frame with EDAT data...')


# Update data frame with proper dates
def convert(pmid, edat):
    """Return the EDAT stored for `pmid` in `pmid2edat`, or `edat` unchanged when there is none."""
    return pmid2edat.get(pmid, edat)


# Build the new column value by value instead of with a row-wise
# DataFrame.apply: the result is the same for every row, it avoids creating
# one Series per row, and it also works when the frame has no rows (where
# apply(axis=1) hands back a DataFrame rather than a Series).
df['publish_time'] = [
    convert(pmid, edat)
    for pmid, edat in zip(df['pubmed_id'], df['publish_time'])
]

# Write the result as a gzip-compressed TSV without header or index column.
df.to_csv(OUTPUT_FILE, sep='\t', header=False, index=False, compression='gzip')