# cTakes Prototyping

To be used in developing the following cTakes components:
1. Data class
2. KG construction

In [1]:
from lxml import etree
import os

In [2]:
# Resolve the home directory portably: expanduser("~") honours $HOME when it is
# set and otherwise falls back to a platform lookup, whereas os.getenv("HOME")
# returns None when the variable is unset and makes os.path.join raise TypeError.
USER_HOME = os.path.expanduser("~")

# Sample cTakes clinical-pipeline output used by every cell in this notebook.
SAMPLE_XMI = os.path.join(USER_HOME, "text2graph/experiments/ctakes/sample/mtsamples-type-3-sample-343.txt.xmi")

SAMPLE_XMI

'/home/xc383@drexel.edu/text2graph/experiments/ctakes/sample/mtsamples-type-3-sample-343.txt.xmi'

In [38]:
import os
from lxml import etree

class Document:
    """A cTakes XMI output file parsed with lxml.

    The instance owns its own parsed tree, root element and namespace map, so
    it does not rely on any notebook-level ``tree``/``root``/``namespaces``
    variables being defined.

    Args:
        path: Location of the ``.xmi`` file to parse.

    Raises:
        FileNotFoundError: If ``path`` does not point to an existing file.
    """

    def __init__(self, path: str):
        self.__path = path
        self.__validate_path()

        # Parse the file this instance was given (not a module-level sample).
        self.__tree = etree.parse(self.__path)
        self.__root = self.__tree.getroot()

        # XPath has no concept of a default namespace and lxml rejects a None
        # prefix in the namespaces mapping, so keep only the prefixed entries.
        self.__namespaces = {
            prefix: uri
            for prefix, uri in self.__root.nsmap.items()
            if prefix is not None
        }

    def __str__(self):
        """Return the document ID, or the file name if the XMI has no DocumentID."""
        ids = self.get_xpath("/xmi:XMI/structured:DocumentID/@documentID")
        if ids:
            return str(ids[0])
        return os.path.basename(self.__path)

    def __validate_path(self):
        """Raise ``FileNotFoundError`` unless the stored path is an existing file."""
        if not os.path.isfile(self.__path):
            raise FileNotFoundError(f"XMI file not found: {self.__path!r}")

    @property
    def root(self):
        """The root element of the parsed XMI tree."""
        return self.__root

    def get_xpath(self, xpath: str):
        """Evaluate ``xpath`` against the root using the document's namespace prefixes."""
        return self.__root.xpath(xpath, namespaces=self.__namespaces)

    def get_id(self, id: int):
        """Return the first element whose ``xmi:id`` equals ``id``, or ``None`` if absent."""
        elems = self.__root.xpath(
            ".//*[@xmi:id='%d']" % id,
            namespaces=self.__namespaces,
        )
        if elems:
            return elems[0]
        return None


In [39]:
# Smoke-test the Document class on the sample file; str() goes through
# Document.__str__, which looks up the documentID attribute.
d = Document(SAMPLE_XMI)
str(d)

'mtsamples-type-3-sample-343'

## Data Class

### Dictionary

Below is a dictionary that contains the fields in the output of the cTakes clinical pipeline. Each field has a short description and the XPath expression that selects it, so the dictionary can be used to parse a given XMI file.

In [3]:
# Catalogue of the fields queried from a cTakes clinical-pipeline XMI file.
# Every leaf entry has the same two keys:
#   "description" - human-readable note about the field
#   "xpath"       - namespace-prefixed XPath expression selecting it from the root
clinpipe_dict = {
    "text": {
        "description": "The text in which all artifacts are extracted from. Text is stored under the value of 'sofaString'.",
        "xpath": "/xmi:XMI/cas:Sofa"
    },
    # Document-level attributes (these XPaths select attribute values, not elements).
    "metadata": {
        "document_id": {
            "description": "The name of document.",
            "xpath": "/xmi:XMI/structured:DocumentID/@documentID"
        },
        "document_path": {
            "description": "The origin of document.",
            "xpath": "/xmi:XMI/structured:DocumentPath/@documentPath"
        }
    },
    "index": {
        "syntax": {
            "sentence": {
                "description": "The start and stop index of a sentence in the document.",
                "xpath": "/xmi:XMI/textspan:Sentence"
            },
            "newline": {
                "description": "The index of a newlines in the document.",
                "xpath": "/xmi:XMI/syntax:NewlineToken"
            },
            "word": {
                "description": "The index of a word in the document.",
                "xpath": "/xmi:XMI/syntax:WordToken"
            },
            "punctuation": {
                "description": "The index of punctuation in the document.",
                "xpath": "/xmi:XMI/syntax:PunctuationToken"
            },
            "number": {
                # Previously carried the "punctuation" description by mistake.
                "description": "The index of a number in the document.",
                "xpath": "/xmi:XMI/syntax:NumToken"
            },
            "symbol": {
                "description": "The index of a symbol in the document.",
                "xpath": "/xmi:XMI/syntax:SymbolToken"
            },
            "chunk": {
                "description": "The index of a chunk in the document.",
                "xpath": "/xmi:XMI/syntax:Chunk"
            },
            "dependant": {
                "description": "The index of a dependancy node in the document.",
                "xpath": "/xmi:XMI/syntax:ConllDependencyNode"
            },
            "predicate": {
                "description": "The index of a predicate in the document.",
                "xpath": "/xmi:XMI/textsem:Predicate"
            }
        },
        "mention": {
            "numeral": {
                "description": "The index of a mentioned Roman numeral in the document.",
                "xpath": "/xmi:XMI/textsem:RomanNumeralAnnotation"
            },
            "measurement": {
                "description": "The index of a mentioned measurement in the document.",
                "xpath": "/xmi:XMI/textsem:MeasurementAnnotation"
            },
            "fraction": {
                "description": "The index of a mentioned fraction in the document.",
                "xpath": "/xmi:XMI/textsem:FractionAnnotation"
            },
            "medication": {
                "description": "The index of a mentioned medication in the document.",
                "xpath": "/xmi:XMI/textsem:MedicationMention"
            },
            "disease": {
                "description": "The index of a mentioned disease or disorder in the document.",
                "xpath": "/xmi:XMI/textsem:DiseaseDisorderMention"
            },
            "symptom": {
                "description": "The index of a mentioned symptom or sign in the document.",
                "xpath": "/xmi:XMI/textsem:SignSymptomMention"
            },
            "procedure": {
                "description": "The index of a mentioned procedure in the document.",
                "xpath": "/xmi:XMI/textsem:ProcedureMention"
            },
            "anatomy": {
                "description": "The index of a mentioned anatomical site in the document.",
                "xpath": "/xmi:XMI/textsem:AnatomicalSiteMention"
            }
        },
        "cas": {
            "fs_list": {
                "description": "A non-empty fs list.",
                "xpath": "/xmi:XMI/cas:NonEmptyFSList"
            },
            "empty_fs_list": {
                "description": "A empty fs list.",
                "xpath": "/xmi:XMI/cas:EmptyFSList"
            }
        }
    },
    "semantic_role": {
        "argument": {
            "description": "The index of a semantic argument in the document.",
            "xpath": "/xmi:XMI/textsem:SemanticArgument"
        },
        "relation": {
            "description": "The index of a semantic role in the document.",
            "xpath": "/xmi:XMI/textsem:SemanticRoleRelation"
        }
    },
    "concept": {
        "umls": {
            "description": "A reference to an UMLS concept.",
            "xpath": "/xmi:XMI/refsem:UmlsConcept"
        }
    }
}

### Parsing cTakes Clinical Pipeline Output

The construction of a data class for cTakes is a relatively simple task with complicated means. The output of the cTakes clinical pipeline is an XMI file with many fields for each step of the pipeline.

In [4]:
# Parse the sample XMI once and keep a handle on its root element; the cells
# below run their XPath queries against `root`.
tree = etree.parse(SAMPLE_XMI)
root = tree.getroot()

# Prefix -> URI map taken from the root element; required so that prefixed
# XPath steps such as "xmi:XMI" can be resolved.
namespaces = root.nsmap.copy()

print("XML Version:", tree.docinfo.xml_version)

XML Version: 1.0


In [5]:
def get_id(x):
    """Return the first element whose ``xmi:id`` attribute equals ``x``.

    Args:
        x: Integer id to look up (formatted with ``%d``).

    Raises:
        IndexError: If no element in the document carries that id.
    """
    matches = root.xpath(".//*[@xmi:id='%d']" % x, namespaces=namespaces)
    if not matches:
        # Same exception type as indexing the empty result, but it names the id.
        raise IndexError("no element with xmi:id=%d" % x)
    return matches[0]

get_id(1).items()

[('{http://www.omg.org/XMI}id', '1'),
 ('sofaNum', '1'),
 ('sofaID', '_InitialView'),
 ('mimeType', 'text'),
 ('sofaString',
  'Sample Type / Medical Specialty:  Allergy / Immunology\n\nSample Name: Followup on Asthma \nDescription: A female for a complete physical and follow up on asthma with allergic rhinitis.\n\n(Medical Transcription Sample Report)\n\n-----\n\n\nSUBJECTIVE:  This is a 42-year-old white female who comes in today for a complete physical and follow up on asthma.  She says her asthma has been worse over the last three months.  She has been using her inhaler daily.  Her allergies seem to be a little bit worse as well.  Her husband has been hauling corn and this seems to aggravate things.  She has not been taking Allegra daily but when she does take it, it seems to help somewhat.  She has not been taking her Flonase which has helped her in the past.  She also notes that in the past she was on Advair but she got some vaginal irritation with that.\n\nShe had been noticing 

In [6]:
def get_xpath(x):
    """Evaluate the XPath expression ``x`` against ``root`` and return the result.

    The notebook-level ``namespaces`` map is passed along so prefixed steps
    (e.g. ``xmi:XMI``) resolve.
    """
    return root.xpath(x, namespaces=namespaces)

get_xpath(".")

[<Element {http://www.omg.org/XMI}XMI at 0x7fec723da300>]

In [7]:
# Value of the documentPath attribute (first match).
document_path = get_xpath(clinpipe_dict["metadata"]["document_path"]["xpath"])[0]

# The document text is read from the sofaString attribute of the first Sofa element.
document_text = get_xpath(clinpipe_dict["text"]["xpath"])[0].get("sofaString")

# Preview: the first 255 characters of the text, then the source path.
print(document_text[:255], " . . . \n\nsource: ", document_path, sep="")

Sample Type / Medical Specialty:  Allergy / Immunology

Sample Name: Followup on Asthma 
Description: A female for a complete physical and follow up on asthma with allergic rhinitis.

(Medical Transcription Sample Report)

-----


SUBJECTIVE:  This is a 4 . . . 

source: /home/xc383@drexel.edu/text2graph/data/mtsamples/raw/mtsamples-type-3-sample-343.txt


In [24]:
# Inspect the attributes of the element whose xmi:id is 37682; its 'relations'
# attribute holds a space-separated list of other xmi:id values.
get_id(37682).items()

[('{http://www.omg.org/XMI}id', '37682'),
 ('sofa', '1'),
 ('begin', '139'),
 ('end', '145'),
 ('relations', '37713 37723 37733 37743'),
 ('frameSet', 'follow.03')]

In [23]:
# Follow the first id listed in the 'relations' attribute of element 37682.
get_id(37713).items()

[('{http://www.omg.org/XMI}id', '37713'),
 ('id', '0'),
 ('category', 'A1'),
 ('discoveryTechnique', '0'),
 ('confidence', '0.0'),
 ('polarity', '0'),
 ('uncertainty', '0'),
 ('conditional', 'false'),
 ('predicate', '37682'),
 ('argument', '37689')]

In [21]:
# Attributes of the first SemanticArgument element in the document.
get_xpath(clinpipe_dict["semantic_role"]["argument"]["xpath"])[0].items()

[('{http://www.omg.org/XMI}id', '37689'),
 ('sofa', '1'),
 ('begin', '104'),
 ('end', '110'),
 ('relation', '37713'),
 ('label', 'A1')]

In [19]:
# Attributes of the first UmlsConcept element in the document.
get_xpath(clinpipe_dict["concept"]["umls"]["xpath"])[0].items()

[('{http://www.omg.org/XMI}id', '14645'),
 ('codingScheme', 'sno_rx_16ab'),
 ('score', '0.0'),
 ('disambiguated', 'false'),
 ('cui', 'C0310367'),
 ('tui', 'T109'),
 ('preferredText', 'Today')]

In [8]:
# Take the third MedicationMention (index 2) as the worked example used by the
# cells that follow.
medi = get_xpath(clinpipe_dict["index"]["mention"]["medication"]["xpath"])[2]
medi.items()

[('{http://www.omg.org/XMI}id', '15016'),
 ('sofa', '1'),
 ('begin', '697'),
 ('end', '704'),
 ('id', '0'),
 ('ontologyConceptArr', '14992 15002'),
 ('typeID', '1'),
 ('discoveryTechnique', '1'),
 ('confidence', '0.0'),
 ('polarity', '1'),
 ('uncertainty', '0'),
 ('conditional', 'false'),
 ('generic', 'false'),
 ('subject', 'patient'),
 ('historyOf', '0')]

In [9]:

# Look up one of the ids listed in medi's 'ontologyConceptArr' attribute.
get_id(15002).items()

[('{http://www.omg.org/XMI}id', '15002'),
 ('codingScheme', 'RXNORM'),
 ('code', '83373'),
 ('score', '0.0'),
 ('disambiguated', 'false'),
 ('cui', 'C0286677'),
 ('tui', 'T121'),
 ('preferredText', 'Flonase')]

In [11]:
# Every UmlsConcept element in the document.
get_xpath(clinpipe_dict["concept"]["umls"]["xpath"])

[<Element {http:///org/apache/ctakes/typesystem/type/refsem.ecore}UmlsConcept at 0x7f1dadf4a6c0>,
 <Element {http:///org/apache/ctakes/typesystem/type/refsem.ecore}UmlsConcept at 0x7f1dbf664540>,
 <Element {http:///org/apache/ctakes/typesystem/type/refsem.ecore}UmlsConcept at 0x7f1dbf61c880>,
 <Element {http:///org/apache/ctakes/typesystem/type/refsem.ecore}UmlsConcept at 0x7f1dadfa8440>,
 <Element {http:///org/apache/ctakes/typesystem/type/refsem.ecore}UmlsConcept at 0x7f1dadfaafc0>,
 <Element {http:///org/apache/ctakes/typesystem/type/refsem.ecore}UmlsConcept at 0x7f1dadfabec0>,
 <Element {http:///org/apache/ctakes/typesystem/type/refsem.ecore}UmlsConcept at 0x7f1dadfabe80>,
 <Element {http:///org/apache/ctakes/typesystem/type/refsem.ecore}UmlsConcept at 0x7f1dadfab080>,
 <Element {http:///org/apache/ctakes/typesystem/type/refsem.ecore}UmlsConcept at 0x7f1dadfa92c0>,
 <Element {http:///org/apache/ctakes/typesystem/type/refsem.ecore}UmlsConcept at 0x7f1dadfabf00>,
 <Element {http:///o

In [35]:
# Space-separated xmi:id values; get_id() can resolve each one to its element.
medi.get("ontologyConceptArr")

'14992 15002'

In [30]:
# Slice the document text with the mention's begin/end attributes (stored as
# strings, hence int()) to recover the text the mention covers.
document_text[int(medi.get("begin")):int(medi.get("end"))]

'Flonase'