# MetaMap Output Prototyping

A notebook devoted to wrangling output from MetaMap. Details on the JSON format can be found here: https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/JSON.pdf

In [1]:
import os
import json

In [2]:
# Location of the MetaMap JSON output file used as the working sample.
JSON_SAMPLE = "/home/xc383@drexel.edu/text2graph/experiments/metamap/data/mtsamples-type-70-sample-154.txt.json"

# The JSON payload is taken from the second line of the file (index 1);
# the first line is skipped.
with open(JSON_SAMPLE, "r") as file:
    payload_line = file.readlines()[1]

data = json.loads(payload_line)
print("There are %d documents" % len(data["AllDocuments"]))

There are 9 documents


In [2]:
from typing import List, Optional
from dataclasses import dataclass, asdict, astuple

@dataclass
class Mapping:
    """One MetaMap mapping candidate, flattened into typed fields.

    The fields correspond to the keys that ``map_to_uterance`` reads from a
    candidate dict. ``is_head`` and ``negated`` default to ``None``, meaning
    "not provided".
    """

    cui: str                   # from "CandidateCUI"
    sources: List[str]         # from "Sources"
    score: int                 # from "CandidateScore", converted with int()
    matched: str               # from "CandidateMatched"
    preferred: str             # from "CandidatePreferred"
    matched_words: List[str]   # from "MatchedWords"
    semantic_types: List[str]  # from "SemTypes"

    is_head: Optional[bool] = None  # from "IsHead" ("yes"/"no")
    negated: Optional[bool] = None  # from "Negated" ("0" means not negated)

    def asdict(self) -> dict:
        """Return the fields as a plain ``dict`` (via ``dataclasses.asdict``)."""
        # Inside a method body the bare name resolves to the module-level
        # dataclasses.asdict, not to this method, so there is no recursion.
        return asdict(self)

    def astuple(self) -> tuple:
        """Return the field values as a ``tuple`` in declaration order."""
        return astuple(self)

def map_to_uterance(mapping: dict) -> Mapping:
    """Convert one MetaMap mapping-candidate dict into a ``Mapping``.

    Args:
        mapping: A candidate dict with the keys "CandidateCUI", "Sources",
            "CandidateScore", "CandidateMatched", "MatchedWords",
            "CandidatePreferred", "SemTypes" and, optionally, "IsHead" and
            "Negated".

    Returns:
        A ``Mapping`` whose ``is_head`` is True/False for "yes"/"no" (None
        otherwise) and whose ``negated`` is False for "0", True for any other
        present value, and None when "Negated" is absent.

    Raises:
        TypeError: If "CandidateScore" is missing (``int(None)``).
        ValueError: If "CandidateScore" is not an integer string.
    """
    # "IsHead" is encoded as the strings "yes"/"no"; any other value (or a
    # missing key) leaves the flag unknown.
    head_flag = mapping.get("IsHead")
    if head_flag == "yes":
        is_head = True
    elif head_flag == "no":
        is_head = False
    else:
        is_head = None

    # A missing "Negated" key must not be reported as negated: only a value
    # that is present and different from "0" counts as a negation.
    negated_flag = mapping.get("Negated")
    negated = None if negated_flag is None else negated_flag != "0"

    return Mapping(
        cui=mapping.get("CandidateCUI"),
        sources=mapping.get("Sources"),
        score=int(mapping.get("CandidateScore")),
        matched=mapping.get("CandidateMatched"),
        matched_words=mapping.get("MatchedWords"),
        preferred=mapping.get("CandidatePreferred"),
        semantic_types=mapping.get("SemTypes"),
        is_head=is_head,
        negated=negated,
    )

In [109]:
def failproof(func, default=None):
    """Wrap ``func`` so that a raised exception yields ``default`` instead.

    Args:
        func: The callable to protect.
        default: Value returned by the wrapper when ``func`` raises.

    Returns:
        A callable with the same call signature as ``func``. It returns
        ``func``'s result, or ``default`` if ``func`` raised an ``Exception``.
        ``KeyboardInterrupt`` and ``SystemExit`` are not intercepted, so a
        long-running cell can still be interrupted.
    """
    # Function-scope import so this cell does not depend on another cell's imports.
    from functools import wraps

    @wraps(func)
    def wrap(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            # Deliberately best-effort: any ordinary failure maps to `default`.
            return default
    return wrap


In [133]:
def crawl(mapping, keyset, func=lambda x: x):
    """Follow a path of keys through nested dicts/lists and collect the leaves.

    Args:
        mapping: Object with a ``.get`` method (a dict) to start from.
        keyset: Non-empty sequence of keys; the first is looked up in
            ``mapping`` and the rest are followed recursively.
        func: Applied to every leaf value before it is collected.

    Returns:
        A flat list of ``func(leaf)`` results, in traversal order. At an
        interior key, a list fans out over its items and a dict is descended
        into; any other value ends that branch. At the last key, a list
        contributes one result per item and any other truthy value
        contributes a single result; falsy values contribute nothing.
    """
    head, rest = keyset[0], keyset[1:]
    node = mapping.get(head)

    if rest:
        # Interior key: normalise to a sequence of children to descend into.
        if isinstance(node, list):
            children = node
        elif isinstance(node, dict):
            children = [node]
        else:
            children = []

        collected = []
        for child in children:
            collected.extend(crawl(child, rest, func))
        return collected

    # Last key: apply func to the value(s) found here.
    if isinstance(node, list):
        return [func(entry) for entry in node]
    if node:
        return [func(node)]
    return []


In [117]:
# Minimal nested fixture for crawl(): key "a" holds a list of dicts, each
# carrying a leaf value under key "b".
test = {
    "a": [
        {"b": 1},
        {"b": 2}
    ]
}

In [118]:
# Sanity check: walking "a" -> "b" over the fixture collects the leaf values
# in list order.
crawl(test, ["a", "b"], lambda x: x)

[1, 2]

In [134]:
# Key path that crawl() follows from the top-level key down to the list of
# mapping candidates.
path = ["AllDocuments", "Document", "Utterances", "Phrases", "Mappings", "MappingCandidates"]
# NOTE: failproof returns its default ([]) whenever map_to_uterance raises, and
# crawl appends whatever func returns. A candidate that fails to convert
# therefore shows up in `x` as an empty list rather than being dropped.
func = failproof(map_to_uterance, [])

x = crawl(data, path, func)

In [6]:
from dataclasses import fields


['cui',
 'sources',
 'score',
 'matched',
 'preferred',
 'matched_words',
 'semantic_types',
 'is_head',
 'negated']

In [23]:
def parse_mapping_canidates(mapping):
    """Convert every candidate listed under "MappingCandidates".

    Args:
        mapping: Dict that may contain a "MappingCandidates" list of
            candidate dicts.

    Returns:
        A list with one ``map_to_uterance`` result per candidate; empty when
        the key is missing, null or an empty list.
    """
    # `or []` covers both a missing key and an explicit null, which would
    # otherwise raise TypeError when iterated.
    candidates = mapping.get("MappingCandidates") or []
    return [map_to_uterance(canidate) for canidate in candidates]

def parse_phrase_mapping(phrase):
    """Collect the concepts from every mapping listed under "Mappings".

    Args:
        phrase: Dict that may contain a "Mappings" list of mapping dicts.

    Returns:
        A flat list of the results of ``parse_mapping_canidates`` for each
        mapping, in order; empty when the key is missing, null or empty.
    """
    concepts = []
    # `or []` keeps a phrase without a usable "Mappings" value from raising
    # TypeError on iteration.
    for mapping in phrase.get("Mappings") or []:
        concepts.extend(parse_mapping_canidates(mapping))
    return concepts

def parse_utterance_phrase(utterance):
    """Collect the concepts from every phrase listed under "Phrases".

    Args:
        utterance: Dict that may contain a "Phrases" list of phrase dicts.

    Returns:
        A flat list of the results of ``parse_phrase_mapping`` for each
        phrase, in order; empty when the key is missing, null or empty.
    """
    concepts = []
    # `or []` keeps an utterance without a usable "Phrases" value from raising
    # TypeError on iteration.
    for phrase in utterance.get("Phrases") or []:
        concepts.extend(parse_phrase_mapping(phrase))
    return concepts

def parse_document_utterances(document):
    """Collect the concepts from every utterance listed under "Utterances".

    Args:
        document: Dict that may contain an "Utterances" list of utterance
            dicts.

    Returns:
        A flat list of the results of ``parse_utterance_phrase`` for each
        utterance, in order; empty when the key is missing, null or empty.
    """
    concepts = []
    # `or []` keeps a document without a usable "Utterances" value from
    # raising TypeError on iteration.
    for utterance in document.get("Utterances") or []:
        concepts.extend(parse_utterance_phrase(utterance))
    return concepts

['AllDocuments',
 'Document',
 'Utterances',
 'Phrases',
 'Mappings',
 'MappingCandidates']

In [114]:
# Parse the document at index 8 of data["AllDocuments"] -- the last one, given
# the 9 documents reported when the sample was loaded.
parse_document_utterances(data["AllDocuments"][8]["Document"])

[Mapping(cui='C0037088', sources=['SNOMEDCT_US'], score=-966, matched='Clinical finding', preferred='Signs and Symptoms', matched_words=['finding'], semantic_types=['sosy'], is_head=True, negated=False),
 Mapping(cui='C0184661', sources=['SNOMEDCT_US'], score=-1000, matched='Procedure, NOS', preferred='Interventional procedure', matched_words=['procedure'], semantic_types=['topp'], is_head=True, negated=False),
 Mapping(cui='C0332287', sources=['SNOMEDCT_US'], score=-578, matched='In addition to', preferred='In addition to', matched_words=['with'], semantic_types=['ftcn'], is_head=False, negated=False),
 Mapping(cui='C0030702', sources=['SNOMEDCT_US'], score=-731, matched='Client satisfaction', preferred='Client satisfaction', matched_words=['patient', 'satisfaction'], semantic_types=['qlco'], is_head=True, negated=False),
 Mapping(cui='C0002915', sources=['SNOMEDCT_US'], score=-600, matched='General anesthesia, NOS', preferred='General Anesthesia', matched_words=['general', 'anaesthes

In [7]:
# Depends on `data` from the loading cell. The recorded NameError means this
# cell was executed in a kernel where that cell had not been run.
# NOTE(review): "AAs" is presumably the document's acronym/abbreviation list --
# confirm against the MetaMap JSON documentation.
data["AllDocuments"][8]["Document"]["AAs"]

NameError: name 'data' is not defined