In [91]:
import pandas as pd
import csv

import json

#-------------------------------------
# Load the supersense dictionary (JSON)
#-------------------------------------

# Bind the name first so it exists even when the file cannot be read.
supersense_dictionary = {}
with open('SS_dictionary.json') as json_file:
    supersense_dictionary = json.load(json_file)

#-------------------------------------
# Load the radiology term dictionary: entity -> category
#-------------------------------------

rad_csv = pd.read_csv('radiology_dictionary.csv', names=['entity', 'category'])
rad_dict = dict(zip(rad_csv['entity'], rad_csv['category']))

#-------------------------------------
# Load the supersense -> relation mapping
#-------------------------------------

rel_ss_df = pd.read_csv("SS_rel_mapping.csv", names=['SS', 'rel'])
dict_from_csv = dict(zip(rel_ss_df['SS'], rel_ss_df['rel']))

#-------------------------------------
# Flat list of every radiology dictionary entity (used for phrase matching)
#-------------------------------------

dict_list = rad_csv['entity'].tolist()

#dict_list = [_ for i in range(len(dict_list)) for _ in dict_list[i]]

#print(dict_list[0:20])
#--------------------------------------------------------------
#  find max substring
#--------------------------------------------------------------       
def substringSieve(string_list):
  """Return the strings that are not contained in any other, different string.

  Exact duplicates are collapsed to their first occurrence, so an entry that
  is listed twice in the input is reported once. The relative order of the
  surviving strings follows their first appearance in ``string_list``.

  Args:
    string_list: iterable of strings to filter.

  Returns:
    A new list holding only the "maximal" strings.
  """
  # dict.fromkeys de-duplicates while preserving insertion order.
  unique = list(dict.fromkeys(string_list))
  return [
      s for s in unique
      if not any(s in other for other in unique if other != s)
  ]

def get_pos(word, tokens,pos_list):
  """Return the POS tag aligned with ``word`` in ``tokens``.

  ``pos_list`` is indexed with the position of the first occurrence of
  ``word`` in ``tokens``; when ``word`` does not occur, position 0 is used.
  """
  position = tokens.index(word) if word in tokens else 0
  return pos_list[position]


def get_property_relation(ent_1,ent_2, single_entity):
  """Choose a relation label and direction for two entities without a verb.

  Each entity is looked up in ``rad_dict``; an entity that is not listed is
  treated as category ``'property'``.

  Args:
    ent_1: first entity (key into ``rad_dict``).
    ent_2: second entity (key into ``rad_dict``).
    single_entity: only consulted for the anatomy/anatomy pair, where a
      truthy value flips the direction.

  Returns:
    ``(rel, seq)``. ``seq == 1`` means the triple reads ent_1 -> ent_2,
    ``seq == 2`` means ent_2 -> ent_1 (this is how the callers print it).
  """
  first_cat = rad_dict.get(ent_1, 'property')
  second_cat = rad_dict.get(ent_2, 'property')

  # anatomy/anatomy is the only pair whose direction depends on the caller.
  # depend on adp : todo
  if first_cat == 'anatomy' and second_cat == 'anatomy':
    return 'PartOf', (2 if single_entity else 1)

  # (category of ent_1, category of ent_2) -> (relation, direction)
  relation_by_categories = {
    ('finding', 'anatomy'): ('FoundIn', 1),
    ('descriptor', 'anatomy'): ('PropertyOf', 1),
    ('property', 'finding'): ('PropertyOf', 1),
    ('property', 'anatomy'): ('PropertyOf', 1),
    ('descriptor', 'finding'): ('PropertyOf', 1),
    ('anatomy', 'descriptor'): ('DescriptorOf', 2),
    ('finding', 'property'): ('PropertyOf', 2),
    ('anatomy', 'property'): ('PropertyOf', 2),
    ('finding', 'descriptor'): ('PropertyOf', 2),
  }
  # Any pair not listed falls back to the generic forward relation.
  return relation_by_categories.get((first_cat, second_cat), ('PropertyOf', 1))

def process_noun_chunk(ent_1, token_SS_dict):
  """Emit the intra-chunk relations of one noun chunk and return its root.

  The chunk's lemma is matched against the radiology dictionary
  (``dict_list``); matched phrases other than the root are related to the
  root via ``get_property_relation`` and printed. Chunk words that are not
  covered by any matched phrase are written to the module-level ``writer``
  as ``PropertyOf`` the root, and printed as well.

  Args:
    ent_1: key into ``token_SS_dict``; must expose ``.lemma_``.
    token_SS_dict: mapping whose values are dicts with the keys
      ``'token_list'``, ``'root_list'`` and ``'pos_list'``.

  Returns:
    The root element of the chunk: the first matched dictionary phrase that
    contains the chunk root, or the chunk root itself when none does.
  """
  chunk_info = token_SS_dict[ent_1]
  token_list = chunk_info['token_list']
  pos_list = chunk_info['pos_list']
  root_word = str(chunk_info['root_list'])

  # Whole-word matching: pad both sides with spaces so that a dictionary
  # phrase only matches on word boundaries of the chunk lemma.
  padded_lemma = " " + ent_1.lemma_.strip() + " "
  dictinary_match_list = substringSieve(
      [s for s in dict_list if (" " + s.strip() + " ") in padded_lemma])

  # Words of the chunk that no matched phrase accounts for. They are kept in
  # token order (de-duplicated) so that the CSV rows and prints come out in
  # the same order on every run; iterating a set of strings would not.
  if dictinary_match_list:
    matched_words = set(' '.join(dictinary_match_list).split())
    unmatched_entities = [
        w for w in dict.fromkeys(token_list) if w not in matched_words]
  else:
    unmatched_entities = token_list

  root_element = next(
      (s for s in dictinary_match_list if root_word in s), root_word)

  # NOTE(review): 'PNOUN' is not a universal POS tag (spaCy uses 'PROPN');
  # confirm against the tag set used in the supersense dictionary.
  related_pos = ('PNOUN', 'NOUN', 'ADJ')

  for s in dictinary_match_list:
    if s != root_element:
      s_pos = get_pos(s, token_list, pos_list)
      if s_pos in related_pos:
        rel, seq = get_property_relation(s, root_element, True)
        # These triples are only printed, not written to the CSV.
        if seq == 1:
          print('ent1: ', s  , 'rel: ', rel ,'ent2: ',  root_element)
        else:
          print('ent1: ', root_element  , 'rel: ', rel ,'ent2: ',  s)

  for word in unmatched_entities:
    s_pos = get_pos(word, token_list, pos_list)
    if word != root_element and s_pos in related_pos:
      writer.writerow({'ent1': word , 'rel' : 'PropertyOf' ,'ent2': root_element })
      print('ent1: ', word , 'rel: PropertyOf' ,'ent2: ',  root_element)

  return root_element


def get_relation(ent_1, verb, ent_2, token_SS_dict):
  """Print the relation between two chunks and return it.

  Both chunks are first passed through ``process_noun_chunk`` (which emits
  their internal relations) to obtain their root elements.

  * ``verb is None``: the relation comes from ``get_property_relation``.
  * otherwise, if either root is missing from ``rad_dict``: the relation is
    looked up in ``dict_from_csv`` using the first supersense of ``verb``.
    When ``verb`` has no supersense, or the supersense has no mapping, the
    default ``'PropertyOf'`` is kept instead of raising.
  * otherwise: the relation is chosen from the two root categories.

  Args:
    ent_1: first chunk (key into ``token_SS_dict``).
    verb: connecting token (key into ``token_SS_dict``) or ``None``.
    ent_2: second chunk (key into ``token_SS_dict``).
    token_SS_dict: per-token metadata; the entry for ``verb`` must provide
      ``'ss_list'``.

  Returns:
    ``(rel, seq)``; ``seq == 2`` means the printed triple is reversed
    (root of ``ent_2`` first).
  """
  print('Input : ',ent_1, verb, ent_2)

  root_element1 = process_noun_chunk(ent_1, token_SS_dict)
  root_element2 = process_noun_chunk(ent_2, token_SS_dict)

  # Defaults used whenever no more specific rule applies.
  seq = 1
  rel = 'PropertyOf'

  if verb is None:
    rel, seq = get_property_relation(root_element1, root_element2, False)
    if seq == 1:
      print('ent1: ', root_element1  , 'rel: ', rel ,'ent2: ',  root_element2)
    else:
      print('ent1: ', root_element2  , 'rel: ', rel ,'ent2: ',  root_element1)
    return rel, seq

  cat1 = rad_dict.get(root_element1, 'NotFound')
  cat2 = rad_dict.get(root_element2, 'NotFound')

  if cat1 == 'NotFound' or cat2 == 'NotFound':
    # The supersense slice for a token can be empty when the supersense
    # annotation is shorter than the tokenisation; keep the default then.
    verb_SS_list = token_SS_dict[verb]['ss_list']
    if verb_SS_list:
      rel = dict_from_csv.get(verb_SS_list[0], rel)
    print('ent1:', root_element1, 'rel:', rel, 'ent2:', root_element2)
    return rel, seq

  # (category of root 1, category of root 2) -> (relation, direction)
  # depend on adp : todo (anatomy/anatomy)
  relation_by_categories = {
    ('anatomy', 'anatomy'): ('PartOf', 1),
    ('finding', 'anatomy'): ('FoundIn', 1),
    ('descriptor', 'anatomy'): ('PropertyOf', 1),
    ('property', 'finding'): ('PropertyOf', 1),
    ('property', 'anatomy'): ('PropertyOf', 1),
    ('anatomy', 'descriptor'): ('DescriptorOf', 2),
    ('finding', 'property'): ('FoundIn', 1),
    ('anatomy', 'property'): ('PropertyOf', 2),
  }
  rel, seq = relation_by_categories.get((cat1, cat2), (rel, seq))

  if seq == 1:
    print('ent1:', root_element1, 'rel:', rel, 'ent2:', root_element2)
  else:
    print('ent1:', root_element2, 'rel:', rel, 'ent2:', root_element1)

  return rel, seq

In [99]:
import spacy
nlp = spacy.load("en_core_web_sm")
#nlp.add_pipe("merge_entities")

#-------------------------------------
# Read cleaned text data
#------------------------------------ 

# Read every input sentence into memory; the context manager closes the
# handle as soon as the lines are loaded.
with open('Data.txt','r') as df:
  lines = df.readlines()

# NOTE: debugFile and csvfile are deliberately left open here because the
# processing code below writes through `writer`; close both once processing
# has finished.
debugFile = open('Debug.txt', 'w')

#-------------------------------------
# Open file to write triplets
#------------------------------------ 

csvfile = open('NewKG.csv', 'w', newline='\n')
fieldnames = ['ent1', 'rel', 'ent2']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()

#-------------------------------------
# For each line find chunk dependencies
#------------------------------------ 

# Main extraction loop: for every input line, parse it with spaCy, walk from
# each leaf of the dependency tree up to the sentence root, and call
# get_relation() for the (entity, connector, entity) combinations found on
# the way. error_cnt counts lines that have no supersense annotation.
error_cnt = 0
for line in lines:
  # Lines containing '/' are skipped.
  if '/' in line:
    continue
  print(line)
  ent1 = line.replace('\n','').lower()

  #-------------------------------------
  #get supersenses
  #-------------------------------------
  try:
    value1 = supersense_dictionary[ent1]
  except KeyError:
    error_cnt += 1
    continue
  ent1_val = pd.DataFrame(value1).T
  ent1_val.columns = ['token', 'pos', 'supersense']
  supersense = ent1_val['supersense'].tolist()
  pos_list = ent1_val['pos'].tolist()

  # First parse (chunks not merged): remember the root lemma of every
  # multi-token noun chunk, in document order.
  doc = nlp(ent1)
  root_list = [chunk.root.lemma_ for chunk in doc.noun_chunks if len(chunk) > 1]

  # Second parse with noun chunks merged into single tokens. The pipe is
  # removed right after the parse (nothing below calls nlp again), so an
  # exception later in this iteration cannot leave it registered.
  # NOTE(review): this is the spaCy v2 call style; spaCy v3 expects
  # nlp.add_pipe('merge_noun_chunks') - confirm the installed version.
  nlp.add_pipe(nlp.create_pipe('merge_noun_chunks'))
  try:
    doc = nlp(ent1)
  finally:
    nlp.remove_pipe('merge_noun_chunks')

  sentences = list(doc.sents)
  if not sentences:
    continue
  # Only the first sentence of the line is analysed.
  sent = sentences[0]
  root_token = sent.root

  leaf_nodes = []
  token_SS_dict = dict()
  cnt = 0  # next unused entry of root_list
  i = 0    # running offset into supersense / pos_list

  for token in sent:
    if (token.n_lefts + token.n_rights) == 0:
      leaf_nodes.append(token)

    # A merged noun chunk has a multi-word lemma; its root was recorded in
    # root_list during the first parse.
    tokens = token.lemma_.split(' ')
    if len(tokens) > 1:
      root = root_list[cnt]
      cnt = cnt + 1
    else:
      root = token.lemma_

    # Slice the per-word annotations that belong to this (merged) token.
    token_SS_dict[token] = {
      'token_list': tokens,
      'index_list': list(range(i, i + len(tokens))),
      'pos_list': pos_list[i:i + len(tokens)],
      'ss_list': supersense[i:i + len(tokens)],
      'root_list': root,
    }
    i = i + len(tokens)

  print(leaf_nodes)

  object_to_process = []
  verb_to_process = []
  adp_to_process = []
  subj_list = []
  path_aux = []

  #foreach leaf node find path upto root
  for leaf_token in leaf_nodes:
    path_obj = []
    path_adp = []
    temp_path_subj = []
    token = leaf_token

    while True:
      print(token)
      # NOTE(review): `and` binds tighter than `or`, so every PROPN token
      # satisfies the next two conditions regardless of its dependency
      # label. The parentheses only make the existing evaluation explicit;
      # confirm whether `dep and (NOUN or PROPN)` was intended.
      if (token.dep_ == 'attr' and token.pos_ == 'NOUN') or token.pos_ == 'PROPN':
        path_obj.append(token)
      if (token.dep_ == 'appos' and token.pos_ == 'NOUN') or token.pos_ == 'PROPN':
        if path_obj:
          prev_obj = path_obj.pop()
          if path_adp:
            prev_adp = path_adp.pop()
            rel, seq = get_relation( prev_obj, prev_adp, token,token_SS_dict)

      if token.dep_ == 'xcomp':
        verb_to_process.append(token)

      if token.pos_ == 'AUX':
        path_aux.append(token)

      if token.dep_ == 'prep':
        path_adp.append(token)

      if token.dep_ == 'nsubj' or token.dep_ == 'nsubjpass':
        subj_list.append(token)
        # Conjuncts collected on this path belong to the subject, so move
        # them out of the pending objects.
        subj_list = subj_list + temp_path_subj
        object_to_process = [o for o in object_to_process if o not in temp_path_subj]
        if path_obj:
          prev_obj = path_obj.pop()
          if path_adp:
            prev_adp = path_adp.pop()
            rel, seq = get_relation( prev_obj, prev_adp, token,token_SS_dict)

      #-----------------------process conj
      if token.dep_ == 'conj':
        if token.pos_ == 'NOUN' or token.pos_ == 'PROPN':
          if path_obj:
            prev_obj = path_obj.pop()
            if path_adp:
              prev_adp = path_adp.pop()
              rel, seq = get_relation( prev_obj, prev_adp, token,token_SS_dict)
        else:
          verb_processed = False
          # Only the first right-hand child is inspected.
          child = next(iter(token.rights), None)
          if child is not None:
            if child.dep_ == 'pobj' or child.dep_ == 'dobj':
              verb_processed = True
              rel, seq = get_relation( token, None, child, token_SS_dict)

          if token.pos_ == 'VERB' and not verb_processed:
            verb_to_process.append(token)
          elif not verb_processed:
            object_to_process.append(token)
            temp_path_subj.append(token)

      if token.dep_ == 'pobj' or token.dep_ == 'dobj':
        if path_obj:
          prev_obj = path_obj.pop()
          if path_adp:
            prev_adp = path_adp.pop()
            rel, seq = get_relation( token, prev_adp, prev_obj,token_SS_dict)
        path_obj.append(token)

      if token.pos_ == 'ADJ':
        if path_obj:
          # The adjective is related to the first object on the path, which
          # stays on the stack.
          prev_obj = path_obj[0]
          if path_adp:
            prev_adp = path_adp.pop()
            rel, seq = get_relation( token, prev_adp, prev_obj,token_SS_dict)

      # Parent in the dependency tree. token.head is the first ancestor for
      # every non-root token and the token itself for the root, so a leaf
      # that is also the root ends the walk below instead of reusing a
      # parent left over from an earlier iteration.
      ans = token.head

      if token.pos_ == 'AUX' and ans.pos_ == 'VERB':
        verb_to_process.append(ans)

      if ans == root_token:
        if token.pos_ == 'VERB':
          print(token)
          verb_to_process.append(token)

        if ans.dep_ == 'ROOT' and (ans.pos_ == 'NOUN' or ans.pos_ == 'PROPN'):
          if path_obj:
            prev_obj = path_obj.pop()
            if path_adp:
              prev_adp = path_adp.pop()
              rel, seq = get_relation( ans, prev_adp, prev_obj,token_SS_dict)

        # Whatever is still pending on this path is resolved after all
        # leaves have been walked.
        object_to_process = object_to_process + path_obj
        adp_to_process = adp_to_process + path_adp
        break
      else:
        token = ans

  # De-duplicate while keeping first-seen order, so the relations below are
  # produced in a reproducible order.
  subj_list = list(dict.fromkeys(subj_list))
  object_to_process = list(dict.fromkeys(object_to_process))
  verb_to_process = list(dict.fromkeys(verb_to_process))

  if subj_list and object_to_process:
    for sub in subj_list:
      for obj in object_to_process:
        # Use a pending adposition as connector while any remain, otherwise
        # the sentence root.
        connector = adp_to_process.pop() if adp_to_process else root_token
        rel, seq = get_relation( sub, connector, obj,token_SS_dict)

  if subj_list and verb_to_process:
    for sub in subj_list:
      for obj in verb_to_process:
        rel, seq = get_relation( obj, None, sub,token_SS_dict)

# Push buffered rows to disk; the file handle itself is left open.
csvfile.flush()
  


small calculus seen in middle 1.1 mm and lower 11 mm calyceal group of right kidney.
[middle, 1.1 mm, and, right kidney, .]
middle
in
seen
seen
Input :  small calculus in middle
ent1:  small rel:  PropertyOf ent2:  calculus
ent1:  small rel:  PropertyOf ent2:  calculus
ent1: calculus rel: FoundIn ent2: middle
1.1 mm
seen
seen
and
seen
seen
right kidney
of
11 mm calyceal group
Input :  11 mm calyceal group of right kidney
ent1:  calyceal rel: PropertyOf ent2:  group
ent1: group rel: PartOf ent2: right kidney
lower
Input :  lower None 11 mm calyceal group
ent1:  calyceal rel: PropertyOf ent2:  group
ent1:  low rel:  PropertyOf ent2:  group
seen
seen
.
