# Extract semantic relations from SemRep files

In [29]:
import os
import json
import pandas as pd
import numpy as np

from tqdm import tqdm
from pathlib import Path
from semrep import SemRep
from utils import *

# Input/output locations used by the cells below.
# Directory listed by the first processing cell; every entry in it is parsed
# with SemRep and converted to one JSON file.
SEMREP_DIR = './data/CORD-19/2020-09-09/CORD-19.SemRep'
# Directory the per-file JSON extractions are written to and later re-read from.
JSON_DIR = './data/json'
# CSV providing the cord_uid, pubmed_id and publish_time columns merged into the relations.
# NOTE(review): directory case ('cord-19') and snapshot date differ from
# SEMREP_DIR -- confirm both point at the intended snapshots.
METADATA_FILE = './data/cord-19/2020-08-29/metadata.csv'
# Destination of the final tab-separated relations table.
OUTPUT_FILE = './data/semrep_relations_cord_19.tsv'
# JSON file whose 'generic_concepts' entry is read before combining relations.
# NOTE(review): absolute, machine-specific path -- it will not resolve elsewhere.
CONSTRAINTS = '/home/andrej/Documents/git/lbd-covid/preprocessing/constraints.json'

## Process files

In [27]:
# Convert every file in SEMREP_DIR into a JSON file of the same base name
# (everything before the first extension separator) inside JSON_DIR.
input_dir = SEMREP_DIR
output_dir = JSON_DIR
Path(output_dir).mkdir(parents=True, exist_ok=True)
files = os.listdir(input_dir)

for infile in tqdm(files):
    input_file = os.path.join(input_dir, infile)
    output_file = os.path.join(output_dir, infile.split(os.extsep, 1)[0] + '.json')
    # Parse and serialize BEFORE opening the output file: opening with 'w'
    # truncates immediately, so a failure inside SemRep would otherwise leave
    # an empty .json behind that the JSON-loading step cannot decode.
    semrep = SemRep(input_file)
    serialized = json.dumps(semrep.extraction(), indent=4)
    with open(output_file, 'w') as file:
        file.write(serialized)

100%|██████████| 227712/227712 [52:20<00:00, 72.50it/s]  


In [28]:
# Load every JSON file in JSON_DIR and flatten its relations into tuples.
# Result: all_relations is a list of {'cord_uid': <key>, 'relations': [tuple, ...]}
# with one entry per top-level key of each JSON document.
input_dir = JSON_DIR
files = os.listdir(input_dir)
all_relations = []

for infile in tqdm(files):
    input_file = os.path.join(input_dir, infile)
    with open(input_file) as file:
        doc = json.load(file)

    for key, value in doc.items():
        tuple_relations = []
        for rel in value:
            predicate = rel['predicate'].upper()
            # A non-empty 'negation' field marks the predicate as negated.
            if rel['negation'] != '':
                predicate = 'NEG_' + predicate
            tuple_relations.append((
                rel['subject_cui'], rel['subject_label'], rel['subject_sem_type'], rel['subject_geneid'],
                predicate,
                rel['object_cui'], rel['object_label'], rel['object_sem_type'], rel['object_geneid'],
                rel['sent_id'], rel['sent_text'],
            ))
        # Record the entry inside the per-key loop. Appending after the loop
        # kept only the last key of a document and, for an empty document,
        # reused the key/relations left over from the previous file.
        all_relations.append({'cord_uid': key, 'relations': tuple_relations})

100%|██████████| 227712/227712 [29:20<00:00, 129.32it/s] 


In [30]:
# Load the 'generic_concepts' list from the constraints file
with open(CONSTRAINTS) as fh:
    data = json.load(fh)

allow_lst = data['generic_concepts']

# Flatten the per-document entries into one list of rows. Each row is the
# cord_uid followed by the relation fields, with an is_novel() flag inserted
# after the subject fields and another after the object fields.
relations = [
    (
        entry['cord_uid'],
        subj_cui, subj_label, subj_sem_type, subj_geneid, is_novel(subj_cui, allow_lst),
        predicate,
        obj_cui, obj_label, obj_sem_type, obj_geneid, is_novel(obj_cui, allow_lst),
        sent_id, sent_text,
    )
    for entry in all_relations
    for (subj_cui, subj_label, subj_sem_type, subj_geneid, predicate,
         obj_cui, obj_label, obj_sem_type, obj_geneid, sent_id, sent_text) in entry['relations']
]

In [31]:
# Turn the flat list of relation rows into a data frame; the column order
# mirrors the field order of each row tuple.
column_names = [
    'cord_uid',
    'subject_cui', 'subject_label', 'subject_sem_type', 'subject_geneid', 'subject_novelty',
    'predicate',
    'object_cui', 'object_label', 'object_sem_type', 'object_geneid', 'object_novelty',
    'sent_id', 'sent_text',
]
df = pd.DataFrame(data=relations, columns=column_names)

In [32]:
# Load the metadata table and keep only the first row for each cord_uid
metadata = pd.read_csv(METADATA_FILE).drop_duplicates(subset=['cord_uid'])

# Left-join on cord_uid so every relation row is kept, gaining the
# pubmed_id and publish_time columns where a metadata row exists
metadata_columns = ['cord_uid', 'pubmed_id', 'publish_time']
df = df.merge(metadata[metadata_columns], how='left', on='cord_uid')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [33]:
# Treat empty strings as missing values in every column
df.replace('', np.nan, inplace=True)

# Modify PMID and Pub year
# Go through Float64, not Float32: a 32-bit float has a 24-bit significand, so
# integers above 2**24 (16,777,216) are rounded and PubMed IDs in the tens of
# millions would be silently altered (e.g. 32000001 -> 32000000). Float64
# represents every Int32 value exactly; the resulting dtype is still nullable Int32.
df['pubmed_id'] = df['pubmed_id'].astype("Float64").astype("Int32")
# Keep only the four-digit year of the publication date.
# NOTE(review): values that are not in YYYY-MM-DD form are not handled
# specially here -- confirm the metadata contains none.
df['publish_time'] = pd.to_datetime(df['publish_time'], format='%Y-%m-%d').dt.strftime('%Y')

# Concatenate CUI and gene IDs into one '|'-separated identifier string:
# join with '|', drop a leading/trailing '|' left by a missing part, and turn
# comma separators into '|' as well.
df['subject_cui'] = df['subject_cui'].str.cat(df['subject_geneid'], na_rep='', sep='|').str.strip('|').str.replace(',', '|', regex=False)
df['object_cui'] = df['object_cui'].str.cat(df['object_geneid'], na_rep='', sep='|').str.strip('|').str.replace(',', '|', regex=False)

# Filter by concept's novelty: keep rows where both flags equal 1
df = df[(df['subject_novelty'] == 1) & (df['object_novelty'] == 1)]

# Select fields (this is also the column order of the exported file)
df = df[['subject_cui', 'subject_label', 'subject_sem_type', 'predicate', 'object_cui', 'object_label', \
    'object_sem_type', 'publish_time', 'pubmed_id', 'sent_id', 'sent_text']]

# Export to tsv file, without a header row and without the index
df.to_csv(OUTPUT_FILE, sep='\t', header=False, index=False)