# Metamap output Prototyping

A notebook devoted to wrangling output from MetaMap. Details on the JSON format can be found here: https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/JSON.pdf

In [1]:
import functools
import json
import os
import re

import pandas as pd

from concept import Concept, dict_to_concept

In [2]:
# Location of the MetaMap JSON output for one mtsamples report.
JSON_SAMPLE = "/home/xc383@drexel.edu/text2graph/experiments/metamap/data/mtsamples-type-70-sample-154.txt.json"

# Only the second line of the file (index 1) is parsed as JSON
# (presumably the first line is not part of the JSON payload -- TODO confirm).
with open(JSON_SAMPLE, "r") as file:
    json_line = file.readlines()[1]
data = json.loads(json_line)

n_documents = len(data["AllDocuments"])
print("There are %d documents" % n_documents)

There are 9 documents


In [3]:
def failproof(func, default=None, verbose:bool=False):
    """Return a version of ``func`` that never raises an ``Exception``.

    The returned callable forwards all positional and keyword arguments to
    ``func``. If ``func`` raises an ``Exception`` subclass, the error is
    swallowed and ``default`` is returned instead.

    Args:
        func: The callable to protect.
        default: Value returned when ``func`` raises. Defaults to ``None``.
        verbose: When true, the caught exception is printed before
            ``default`` is returned.

    Returns:
        A wrapper around ``func`` that carries the same ``__name__``,
        ``__doc__`` and other metadata as ``func``.
    """
    # functools.wraps keeps the wrapped function's identity, so tracebacks,
    # help() and logging show the real function instead of "wrap".
    @functools.wraps(func)
    def wrap(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        # Deliberately broad: this helper is a best-effort guard. Only
        # Exception subclasses are caught, so KeyboardInterrupt/SystemExit
        # still propagate.
        except Exception as e:
            if verbose:
                print(e)
            return default
    return wrap

In [4]:
# Helpers from the `metaparse` module; each one is called in this cell or a later one.
from metaparse import calc_relative_positions, remove_multinewline, pull_concepts, conflate_instances, process_metamap_json
# Position records derived from the parsed MetaMap JSON. They are turned into
# the `dp` DataFrame further down (presumably they carry the per-document
# `document_start` offset seen in the merged rows -- TODO confirm).
positions = calc_relative_positions(data)

In [5]:
# Display what `process_metamap_json` returns for the parsed JSON. The output
# below is a list of flat dicts keyed by document/utterance/phrase/mapping/
# candidate indices plus the candidate's attributes.
process_metamap_json(data)

[{'document_n': 0,
  'utterance_n': 0,
  'phrase_n': 0,
  'mapping_n': 0,
  'canidate_n': 0,
  'CandidateCUI': 'C0370003',
  'SemTypes': ['sbst'],
  'Sources': ['SNOMEDCT_US'],
  'CandidateScore': -645,
  'CandidateMatched': 'Specimen',
  'CandidatePreferred': 'Specimen',
  'IsHead': 'no',
  'Negated': False,
  'StartPos': 0,
  'EndPos': 6,
  'document_start': 0},
 {'document_n': 0,
  'utterance_n': 0,
  'phrase_n': 0,
  'mapping_n': 0,
  'canidate_n': 1,
  'CandidateCUI': 'C0332307',
  'SemTypes': ['qlco'],
  'Sources': ['SNOMEDCT_US'],
  'CandidateScore': -645,
  'CandidateMatched': 'Type - attribute',
  'CandidatePreferred': 'Type - attribute',
  'IsHead': 'no',
  'Negated': False,
  'StartPos': 7,
  'EndPos': 11,
  'document_start': 0},
 {'document_n': 0,
  'utterance_n': 0,
  'phrase_n': 0,
  'mapping_n': 0,
  'canidate_n': 2,
  'CandidateCUI': 'C0037778',
  'SemTypes': ['bmod'],
  'Sources': ['SNOMEDCT_US'],
  'CandidateScore': -861,
  'CandidateMatched': 'Medical speciality',
  

In [5]:
# Names of the index columns. The "canidate_n" spelling is intentional here:
# it matches the key used in the records shown above.
# NOTE(review): only index_names[0] ("document_n") is used below, as the merge
# key, and "word_n" does not appear in the sample records shown above --
# confirm whether the remaining entries are still needed.
index_names = ["document_n", "utterance_n", "phrase_n", "mapping_n", "canidate_n", "word_n"]

In [6]:
# dp: the position records as a DataFrame.
dp = pd.DataFrame(positions)
# df: concepts pulled from the MetaMap JSON and passed through
# `conflate_instances` (presumably a DataFrame, since it is merged with `dp`
# in the next cell -- TODO confirm the exact return type).
df = conflate_instances(pull_concepts(data))

In [7]:
# Join the concept rows with the position rows on index_names[0], i.e.
# "document_n". pd.merge defaults to an inner join, so rows without a
# matching document_n on both sides are dropped.
dfull = pd.merge(df, dp, on=index_names[0])

In [8]:
# Read the raw report that the MetaMap JSON above was produced from
# (same sample name as JSON_SAMPLE), then pass it through
# `remove_multinewline` before it is indexed by character position below.
with open("/home/xc383@drexel.edu/text2graph/data/mtsamples/raw/mtsamples-type-70-sample-154.txt", "r") as fi:
    raw_text = fi.read()
text = remove_multinewline(raw_text)

In [9]:
# Show the text returned by `remove_multinewline`; the character offsets
# computed further down index into this string.
print(text)

Sample Type / Medical Specialty:  Cosmetic / Plastic Surgery
Sample Name: Lipectomy - Abdomen/Thighs 
Description: Suction-assisted lipectomy - lipodystrophy of the abdomen and thighs.
(Medical Transcription Sample Report)
-----
PREOPERATIVE DIAGNOSIS:  Lipodystrophy of the abdomen and thighs.
POSTOPERATIVE DIAGNOSIS:  Lipodystrophy of the abdomen and thighs.
OPERATION:  Suction-assisted lipectomy.
ANESTHESIA:  General.
FINDINGS AND PROCEDURE:  With the patient under satisfactory general endotracheal anesthesia, the entire abdomen, flanks, perineum, and thighs to the knees were prepped and draped circumferentially in sterile fashion.  After this had been completed, a #15 blade was used to make small stab wounds in the lateral hips, the pubic area, and upper edge of the umbilicus.  Through these small incisions, a cannula was used to infiltrate lactated Ringers with 1000 cc was infiltrated initially into the abdomen.  A 3 and 4-mm cannulas were then used to carry out the liposuction of 

In [10]:
# Take one merged row (positional index 110, an arbitrary example) as a plain
# dict so its fields can be inspected and reused in the cells below.
doc = dict(dfull.iloc[110])
doc

{'document_n': 8,
 'utterance_n': 7,
 'phrase_n': 0,
 'mapping_n': 0,
 'canidate_n': 0,
 'CandidateCUI': 'C0030705',
 'SemTypes': ['podg'],
 'Sources': ['SNOMEDCT_US'],
 'CandidateScore': -1000,
 'CandidateMatched': 'Patient',
 'CandidatePreferred': 'Patients',
 'IsHead': 'yes',
 'Negated': True,
 'StartPos': 1010,
 'EndPos': 1017,
 'document_start': 416}

In [11]:
# Translate the candidate's StartPos/EndPos into offsets within `text`.
# Both ends share the same base: the document's start offset plus its index.
# NOTE(review): the extra `document_n` term presumably compensates for one
# character per preceding document -- confirm against `remove_multinewline`
# and `calc_relative_positions`.
base = doc["document_start"] + doc["document_n"]
start = base + doc["StartPos"]
end = base + doc["EndPos"]
print(start, "-", end)
text[start:end]

1434 - 1441


'patient'

In [12]:
# Convert the example row into a concept object (presumably a `Concept` --
# TODO confirm). "abc.json" is a placeholder file name; it shows up as
# `file_name` in the dict displayed below.
c = dict_to_concept(doc, "abc.json")

In [14]:
# Inspect the converted object as a dict to check the field mapping
# (e.g. StartPos -> start_pos, document_start -> document_offset).
c.asdict()

{'file_name': 'abc.json',
 'document_id': 8,
 'utterance_id': 7,
 'phrase_id': 0,
 'mapping_id': 0,
 'canidate_id': 0,
 'cui': 'C0030705',
 'start_pos': 1010,
 'end_pos': 1017,
 'document_offset': 416,
 'semantic_types': ['podg'],
 'sources': ['SNOMEDCT_US'],
 'score': -1000,
 'matched': 'Patient',
 'preferred': 'Patients',
 'is_head': 'yes',
 'negated': True}