In [1]:
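# Fetch documents from the Azure Search index that have no medical entities yet, send their text to the entity-extraction service (globals.ta_base_url), and merge the resulting UMLS concepts back into the index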

In [2]:
import globals

import os
import pickle

import json
import requests
from pprint import pprint

from nltk.tokenize import sent_tokenize, word_tokenize

import time

from joblib import Parallel, delayed

In [3]:
# NOTE(review): presumably a per-document sentence cap, but nothing visible in this
# notebook reads it -- confirm it is still needed or remove it.
max_sentences = 25

In [8]:
def processFile(data):
    """POST ``data`` as JSON to ``globals.ta_base_url`` and return the decoded JSON reply.

    Any exception raised while sending the request or decoding the reply is
    swallowed and reported as ``{"error": [<message>]}``, so callers can test
    for an ``"error"`` key instead of handling exceptions themselves.
    """
    try:
        reply = requests.post(url=globals.ta_base_url, json=data)
        return reply.json()
    except Exception as failure:
        # Best-effort contract: never raise, always hand back a dict.
        return {"error": [str(failure)]}


# Entity category reported by the extraction service -> field name in the search
# document, listed in the order the fields appear in the upload payload.
_CATEGORY_TO_FIELD = {
    "TreatmentName": "treatmentName",
    "ExaminationName": "examinationName",
    "BodyStructure": "bodyStructure",
    "Diagnosis": "diagnosis",
    "ConditionQualifier": "conditionQualifier",
    "Direction": "direction",
    "ExaminationRelation": "examinationRelation",
    "FamilyRelation": "familyRelation",
    "Gender": "gender",
    "Gene": "gene",
    "MedicationClass": "medicationClass",
    "MedicationName": "medicationName",
    "RouteOrMode": "routeOrMode",
    "SymptomOrSign": "symptomOrSign",
    "Variant": "variant",
}


def createSearchUploadDoc(docID, json_content, concept_dict=None):
    """Turn an entity-extraction response into a "merge" document for the search index.

    Every entity that has both a ``category`` and ``links`` is inspected; each
    link whose string form contains ``UMLS`` and whose ``id`` is a key of the
    concept dictionary contributes the mapped concept name to the field that
    belongs to the entity's category. Entities with a category that is not in
    ``_CATEGORY_TO_FIELD`` are ignored.

    Args:
        docID: Key of the search document to merge into.
        json_content: Decoded response; must provide
            ``json_content['documents'][i]['entities']``.
        concept_dict: Optional mapping of UMLS concept id -> concept name.
            When omitted, the notebook-level ``umls_concept_dict`` is used.

    Returns:
        dict: On success, the merge action with ``appliedMedicalEntities`` set
        to True, one list per category field (duplicates removed, first-seen
        order kept so the payload is reproducible between runs) and
        ``taHealthJSON`` holding ``str(json_content)``. On any exception the
        error is printed and only ``{"@search.action": "merge", "docID": docID}``
        is returned.
    """
    try:
        if concept_dict is None:
            # Resolved lazily so the function can be defined before the dictionary is loaded.
            concept_dict = umls_concept_dict

        # Dict keys act as an insertion-ordered set: de-duplicates without the
        # arbitrary ordering that list(set(...)) would produce.
        concepts_by_field = {field: {} for field in _CATEGORY_TO_FIELD.values()}

        # NOTE(review): negated entities are not filtered out (the original
        # carried a "remove if negated" reminder at this point).
        for doc in json_content['documents']:
            for ent in doc['entities']:
                if 'links' not in ent or 'category' not in ent:
                    continue
                field = _CATEGORY_TO_FIELD.get(ent['category'])
                if field is None:
                    continue
                for link in ent['links']:
                    if 'UMLS' in str(link) and link['id'] in concept_dict:
                        concepts_by_field[field][concept_dict[link['id']]] = None

        upload_doc = {"@search.action": "merge", "docID": docID, "appliedMedicalEntities": True}
        for field, names in concepts_by_field.items():
            upload_doc[field] = list(names)
        # NOTE(review): str() of a dict is its Python repr, not JSON text --
        # confirm that whatever reads taHealthJSON expects that format.
        upload_doc["taHealthJSON"] = str(json_content)
        return upload_doc

    except Exception as ex:
        print ("Error:", ex)
        # NOTE(review): this fallback does not set appliedMedicalEntities; if
        # pending work is selected by that flag being null, the same document
        # will be selected again -- confirm that a retry is intended.
        return {"@search.action": "merge","docID": docID}


In [5]:
# Load the UMLS dictionary of concept id's to names.
# NOTE: unpickling can execute arbitrary code -- only load a pickle file you trust.
umls_pickle_path = os.path.join(globals.umls_dir, 'umls_concept_dict.pickle')
with open(umls_pickle_path, 'rb') as handle:
    umls_concept_dict = pickle.load(handle)


In [9]:
def createTAHealthDoc(title, abstract, body, max_body_chars=5000):
    """Build the entity-extraction request payload for one search document.

    The payload carries at most two entries in ``data["documents"]``:

    * the title, when it is longer than one character;
    * the abstract, when it is longer than one character after stripping
      whitespace -- otherwise the first ``max_body_chars`` characters of the body.

    Ids are the strings "1", "2", ... in the order the entries are added.

    Args:
        title: Document title; None is treated as an empty string.
        abstract: Document abstract; None is treated as an empty string.
        body: Full document text; None is treated as an empty string.
        max_body_chars: Number of leading body characters used when the
            abstract is missing. Only the body is truncated.

    Returns:
        dict: ``{"documents": [...]}`` where every entry has the keys
        "language", "id" and "text". If an unexpected exception occurs it is
        printed and whatever was collected so far is returned.
    """
    data = {"documents": []}

    try:
        # A None field used to raise here and silently drop the remaining,
        # usable fields; normalise to "" so each field is judged on its own.
        title = "" if title is None else title
        abstract = "" if abstract is None else abstract
        body = "" if body is None else body

        id_counter = 0
        if len(title) > 1:
            id_counter += 1
            data["documents"].append({"language": "en", "id": str(id_counter), "text": title})

        id_counter += 1
        # NOTE(review): the abstract is sent in full while the body is capped --
        # confirm the service accepts abstracts longer than max_body_chars.
        if len(abstract.strip()) > 1:
            text = abstract
        else:
            text = body[0:max_body_chars]
        data["documents"].append({"language": "en", "id": str(id_counter), "text": text})

    except Exception as ex:
        print ("Error:", ex)

    return data


In [10]:
print ("Uploading concepts to Azure Cognitive Search...")

# Both REST endpoints hang off the same index, so build them once up front.
index_root = globals.endpoint + "indexes/" + globals.indexName
# Next page (up to 20) of documents whose appliedMedicalEntities flag is still null.
pending_docs_url = index_root + "/docs" + globals.api_version + "&$count=true&$top=20&search=*&$select=docID,title,abstractContent,body&$filter=appliedMedicalEntities eq null"
# Endpoint that accepts the merge actions.
index_docs_url = index_root + "/docs/index" + globals.api_version

while True:
    # Get documents that have not been completed
    url = pending_docs_url
    response  = requests.get(url, headers=globals.headers)
    docs = response.json()
    if len(docs["value"]) == 0:
        break

    print ('[' +  str(docs['@odata.count']) + " left]...")
    for doc in docs["value"]:
        docID = doc['docID']

        ta_health_doc = createTAHealthDoc(doc["title"], doc["abstractContent"], doc["body"])
        print ("Processing docID:", docID, "[Len: " + str(len(str(ta_health_doc))) + "]")

        ta_health_resp = processFile(ta_health_doc)
        if 'error' in ta_health_resp:
            # Extraction failed: flag the document as handled with no concepts,
            # then pause briefly before the upload.
            print ('Error - Will Apply Empty Concepts: ', ta_health_resp['error'])
            search_upload = {"@search.action": "merge","docID": docID, "appliedMedicalEntities": True}
            time.sleep(3)
        else:
            search_upload = createSearchUploadDoc(docID, ta_health_resp)

        documents = {"value": [search_upload]}
        url = index_docs_url
        upload_response  = requests.post(url, headers=globals.headers, json=documents)

        # One action was sent, so only the first per-document result matters.
        upload_result = upload_response.json()['value'][0]
        if upload_result['status'] != True:
            print ("Error:", upload_result['errorMessage'])



Uploading concepts to Azure Cognitive Search...
[24364 left]...
Processing docID: 03069298205fe6b0be10fbd78556aa4b8c772ade [Len: 3184]
Processing docID: 4ddf5d95158203e4053fba41b1d3fe3948a5d9a3 [Len: 5238]
Processing docID: d682465041554fddc9e1dbc626bae7a0a512c631 [Len: 1152]
Processing docID: a7e361f72a101cc83a7e59d48f3378c404953fb7 [Len: 5058]
Processing docID: 96aa124328bbf7a4035a9bbeb0781fd88b1e0328 [Len: 5038]
Processing docID: f63f9fc68b5246d913e6d243e06f229729c917a5 [Len: 6026]
Processing docID: 063561d8ab083c6a70826e25d2ce7b72cb9ee5b4 [Len: 5058]
Processing docID: 7a7aa345486356561071e652c8433da5fe613afe [Len: 543]
Processing docID: 828bdb06f7e81b91792cd716ca34080c95c1a116 [Len: 1219]
Processing docID: f7026692f134a65a599ed94c02199f9735baf7dc [Len: 1105]
Processing docID: 5d724fd6b5bd4ce9a79bbd3d7a07d0f5c204ce80 [Len: 1749]
Processing docID: 1069688a257849848d15032b7e300835f7eb950c [Len: 5058]
Processing docID: 859fc5e0c7ae10e2bbe36a3f320ef554427fb3a3 [Len: 1217]
Processing doc

KeyboardInterrupt: 