In [1]:
# Generation of updated vocabulary files for minimap

In [25]:
import tqdm
import csv
import spacy
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc
import pandas as pd
import tqdm
from collections import defaultdict
import pickle

In [3]:
# Load spaCy's small English model; `nlp` is used further down to tokenise
# each normalised term string and read the tokens' lemmas.
nlp = spacy.load("en_core_web_sm")

In [6]:
# regular expressions and text processing functions

import re

# Word list (one entry per line) of prepositions/conjunctions; syn_uninv uses
# the compiled pattern to decide whether a "head, modifier" term is left as is.
with open('../robotreviewer/data/minimap/prepositions_conjunctions.txt', 'r') as f:
    # Drop blank lines: an empty alternative in the pattern below would match
    # the empty string at every word boundary, so every term would look as if
    # it contained a preposition and nothing would ever be uninverted.
    prep_conj = [word for word in (l.strip() for l in f) if word]

if prep_conj:
    # Escape each entry so it is matched literally, not as regex syntax.
    prep_conj_re = re.compile(r'\b({})\b'.format('|'.join(re.escape(word) for word in prep_conj)))
else:
    # No words to look for: use a pattern that can never match.
    prep_conj_re = re.compile(r'(?!)')
# Standalone upper-case "NOS" token. The pattern is case-sensitive, so it has
# to be applied BEFORE any lowercasing of the text.
nos_ignore = re.compile(r'\bNOS\b')
# An apostrophe, optionally followed by "s", that comes directly after a word
# character and ends on a word boundary (e.g. the "'s" in "Crohn's disease").
pos_ignore = re.compile(r"(?<=\w)(\'s?)\b")
# One of a fixed set of bracketed codes ([X], [V], [D], [M], [EDTA], [SO], [Q])
# at the very start of the string.
left_paren = re.compile(r"^\[(X|V|D|M|EDTA|SO|Q)\]")
# A single run of word characters wrapped in round or square brackets
# (the opening and closing bracket types are not required to match).
paren = re.compile(r"[\(\[]\w+[\)\]]")
# Any run of one or more whitespace characters.
strip_space = re.compile(r"\s+")

def remove_nos(text):
    """Return `text` with every match of `nos_ignore` replaced by a single space."""
    without_nos = re.sub(nos_ignore, ' ', text)
    return without_nos

def remove_pos(text):
    """Return `text` with every match of `pos_ignore` (possessive marker) deleted."""
    without_possessives = re.sub(pos_ignore, '', text)
    return without_possessives

def syn_uninv(text):
    """Undo a "head, modifier" inversion around the first ', ' in `text`.

    The text is returned unchanged when it contains no ', ', when the first
    ', ' sits at the very end of the string, or when the part after it
    matches `prep_conj_re`. Otherwise the part after the first ', ' is moved
    to the front, giving "<after> <before>".
    """
    head, separator, tail = text.partition(', ')

    # No ', ' at all, or nothing following it: nothing to swap.
    if not separator or not tail:
        return text

    # A preposition/conjunction in the trailing part blocks uninversion.
    if prep_conj_re.search(tail) is not None:
        return text

    return tail + " " + head
    
def ne_parentheticals(text_str):
    """Strip bracketed fragments from `text_str`.

    First deletes a leading match of `left_paren`, then deletes every match
    of `paren` from what remains.
    """
    without_leading_code = left_paren.sub('', text_str)
    return paren.sub('', without_leading_code)


In [7]:
# pipelines

def minimap(text_str, chunks=False):
    """Normalise `text_str` with `pipeline` (umls_mode=False) and hand it to `matcher`.

    `chunks` is forwarded to `matcher` unchanged. `matcher` is not defined in
    the cells visible here; it has to exist in the kernel namespace already
    when this function is called.
    """
    normalised = pipeline(text_str, umls_mode=False)
    return matcher(normalised, chunks=chunks)


def pipeline(text_str, umls_mode=True):
    """Normalise a term or free-text string and return the cleaned string.

    Steps, in order (those marked "UMLS only" run only when `umls_mode` is true):

    1. remove bracketed fragments via `ne_parentheticals` (UMLS only)
    2. turn hyphens into spaces
    3. syntactic uninversion via `syn_uninv` (UMLS only)
    4. strip possessives via `remove_pos`
    5. blank out "NOS" via `remove_nos` (UMLS only)
    6. collapse whitespace runs to one space and trim both ends

    Lowercasing is deliberately not done here; callers that need it apply it
    to the returned string.
    """
    cleaned = ne_parentheticals(text_str) if umls_mode else text_str

    cleaned = cleaned.replace('-', ' ')

    if umls_mode:
        cleaned = syn_uninv(cleaned)

    cleaned = remove_pos(cleaned)

    if umls_mode:
        cleaned = remove_nos(cleaned)

    # Earlier steps can leave doubled or leading/trailing spaces behind.
    return strip_space.sub(' ', cleaned).strip()

In [None]:
# first generate str to CUI map
# Maps each normalised, lowercased, lemmatised term string to the list of CUIs
# it was seen with (repeats are kept at this stage).
str_to_cui_full = defaultdict(list)

# Source vocabularies to keep: just those which are in the Cochrane vocabs.
# Built once as a set instead of re-creating a list for every row.
keep_vocabs = {'MSH', 'SNOMEDCT_US', 'MDR', "ATC", "RXNORM", "ICD10"}

# newline='' hands newline handling to the csv module, as its docs require
# for files passed to csv readers.
with open('umls_full_index.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t')
    for row in tqdm.tqdm(reader):
        if row['sab'] not in keep_vocabs:
            continue
        # Normalise in UMLS mode, lowercase, then join the spaCy lemmas to
        # form the lookup key.
        doc = nlp(pipeline(row['str'], umls_mode=True).lower())
        str_to_cui_full[' '.join(t.lemma_ for t in doc)].append(row['cui'])


In [None]:
# Collapse repeated CUIs for each term. dict.fromkeys keeps first-seen order,
# so the output is identical on every run; list(set(...)) would give an order
# that changes with string hashing between interpreter sessions.
str_to_cui = {
    term: list(dict.fromkeys(cuis))
    for term, cuis in str_to_cui_full.items()
}

In [None]:
import pickle

# Persist the de-duplicated `str_to_cui` map rather than the raw
# `str_to_cui_full` defaultdict, whose lists still contain repeated CUIs.
# NOTE(review): the pickled object is now a plain dict, so a missing key raises
# KeyError instead of silently inserting an empty list — confirm readers of
# this file use `in` / `.get()`.
with open('str_to_cui.pck', 'wb') as f:
    pickle.dump(str_to_cui, f)

In [9]:
# now CUI to preferred term map

# Tab-separated table of concept names; later cells read its 'cui', 'sab'
# and 'str' columns.
# NOTE(review): default NA parsing turns cell values such as "NA" or "null"
# into NaN — confirm no term strings in this file are affected.
df = pd.read_table('cui_str.csv')

In [None]:


# For every CUI, collect its string in each source vocabulary:
# cui_to_pstr[cui][sab] = str. A later row overwrites an earlier one for the
# same (cui, sab) pair.
cui_to_pstr = defaultdict(dict)
# Walk the three columns in lockstep instead of df.iterrows(), which builds a
# Series object per row; passing total lets tqdm show real progress.
for cui, sab, term in tqdm.tqdm(zip(df['cui'], df['sab'], df['str']), total=len(df)):
    cui_to_pstr[cui][sab] = term
# Vocabulary priority, highest first, used when a single string per CUI is chosen.
order = ["RXNORM", "MSH", "SNOMEDCT_US", "ICD10", "MDR", "ATC"]

In [None]:
# Choose one string per CUI: the entry from the first vocabulary in `order`
# that the CUI has a string for. CUIs with no string in any listed vocabulary
# are left out of the map.
cui_to_str = {}

for cui, names_by_vocab in cui_to_pstr.items():
    available = [vocab for vocab in order if vocab in names_by_vocab]
    if available:
        cui_to_str[cui] = names_by_vocab[available[0]]
                
            

In [None]:
# Write the CUI -> string map to disk as a pickle.
with open('cui_to_str.pck', 'wb') as out_file:
    pickle.Pickler(out_file).dump(cui_to_str)

In [12]:
import networkx as nx

In [None]:
# Scratch expression removed: it referenced `graph_data` before the cell that
# loads it, so running the notebook top to bottom on a fresh kernel stopped
# here with a NameError. It produced no value that any other cell uses.

In [None]:
# Load the tab-separated CUI relation table and build a directed graph with
# one edge cui2 -> cui1 per row.
graph_data = pd.read_csv('cui_graph.csv', sep='\t')
G = nx.DiGraph()
# Pair the two columns directly instead of using iterrows(), which builds a
# Series object per row; passing total lets tqdm show real progress.
G.add_edges_from(
    tqdm.tqdm(zip(graph_data['cui2'], graph_data['cui1']), total=len(graph_data))
)


In [26]:
# Write the directed CUI graph `G` to disk as a pickle.
with open('cui_subtrees.pck', 'wb') as out_file:
    pickle.Pickler(out_file).dump(G)