In [4]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
import spacy
from transformers import pipeline


In [17]:
text="""An 81-year-old female with hypertension and gout was admitted to Taichung Veterans General Hospital due to abdominal pain and hematemesis.
She began to suffer from intermittent epigastralgia more than 10 years ago, and a 4 cm gastric tumor was found.
The abdominal pain got worse 2 years before admission, and she went to a local hospital where abdominal CT scan revealed a gastric tumor about 6 cm in length with well-circumscribed calcification.
Surgical intervention was suggested, but she declined.
About 10 days before admission, tarry stool passage was noted, and bloody vomitus was found 1 day later.
UGI scope revealed submucosal gastric tumor with central ulceration and she was then transferred to our hospital.
Physical examination showed upper abdominal tenderness with mild muscle guarding.
The plain radiography showed an irregular shape calcification over upper abdomen.
UGI scope revealed deep gastric ulcer with foreign body.
CT scan showed an irregularly shaped space-occupying lesion in front of the stomach with plate calcifications and localized free air.
Under the impression of perforated gastric tumor, emergent laparotomy was performed.
An infiltrative mass between the stomach and transverse colon was noted during operation.
A sharp, bone-like and thick calcified plate penetrating into the gastric mucosa and pericolic soft tissue was observed.
A submucosal tumor about 2.3 cm in size adherent to the calcified plate was also noted.
Distal subtotal gastrectomy and partial colectomy were performed.
The patient was discharged 13 days after operation uneventfully.
Microscopically, spindle-shaped tumor cells with low mitotic frequency were found.
Immunohistochemical staining of the tumor demonstrated diffusely strong positive reactivity for CD 117, positive reactivity for CD34, but negative reactivity for S100 protein and desmin.
The diagnosis of the tumor was established as GIST.
Due to the small size and the paucity of mitotic figures of the tumor located in the stomach, it was classified as very low risk[4].
Sporadic GIST was impressed due to no family history of GIST nor other GIST presented in this patient."""

"biomedical-ner-all" - model for NERs extraction

In [18]:
model_name = "d4data/biomedical-ner-all"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
pipe = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities = True )
ner_results = pipe(text)
ner_results



[{'entity_group': 'Age',
  'score': 0.997478,
  'word': '81 - year - old',
  'start': 3,
  'end': 14},
 {'entity_group': 'Sex',
  'score': 0.99952865,
  'word': 'female',
  'start': 15,
  'end': 21},
 {'entity_group': 'History',
  'score': 0.90855014,
  'word': 'hypertension',
  'start': 27,
  'end': 39},
 {'entity_group': 'History',
  'score': 0.9878803,
  'word': 'gout',
  'start': 44,
  'end': 48},
 {'entity_group': 'Clinical_event',
  'score': 0.9998809,
  'word': 'admitted',
  'start': 53,
  'end': 61},
 {'entity_group': 'Nonbiological_location',
  'score': 0.9921479,
  'word': 'taichung veterans general hospital',
  'start': 65,
  'end': 99},
 {'entity_group': 'Biological_structure',
  'score': 0.99772674,
  'word': 'abdominal',
  'start': 107,
  'end': 116},
 {'entity_group': 'Sign_symptom',
  'score': 0.99996626,
  'word': 'pain',
  'start': 117,
  'end': 121},
 {'entity_group': 'Sign_symptom',
  'score': 0.9998373,
  'word': 'hem',
  'start': 126,
  'end': 129},
 {'entity_grou

In [19]:
def extract_entities(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs)

    predictions = torch.argmax(outputs.logits, dim=-1)[0] 
    tokens = inputs["input_ids"][0]
    entities = []
    current_entity = ''
    for token, prediction in zip(tokens, predictions):
        token_str = tokenizer.decode([token], skip_special_tokens=True)
        if prediction.item() > 0:  
            if current_entity:
                current_entity += token_str.replace("##", "")
            else:
                current_entity = token_str
        else:
            if current_entity:
                entities.append(current_entity)
                current_entity = ''
    if current_entity:
        entities.append(current_entity)
    return entities

ner_result = extract_entities(text)
ner_result

['81-year-oldfemale',
 'hypertension',
 'gout',
 'admitted',
 'taichungveteransgeneralhospital',
 'abdominalpain',
 'hemate',
 'intermittentepigastralgia',
 'than10years',
 '4cmgas',
 'tumor',
 'abdominalpaingotworse2yearsbefore',
 'went',
 'hospital',
 'abdominalct',
 'gastrictumor',
 '6cm',
 'well',
 '##ums',
 'cal',
 'surgicalintervention',
 '10daysbefore',
 'tarrystool',
 'bloodyvomitus',
 '1day',
 'ugiscope',
 'submucos',
 'gastrictumor',
 'centralulceration',
 'transferred',
 'physicalexamination',
 'abdominaltenderness',
 'muscle',
 'plainradiography',
 'irregularshapecalcification',
 'ugiscope',
 'gas',
 'ulcer',
 'ct',
 'irregularlyshapedspace-occupyinglesion',
 'stomach',
 'cal',
 'emerge',
 'laparotom',
 'in',
 'mass',
 'stomach',
 'transversecolon',
 'sharp',
 'bone',
 'like',
 'thickcal',
 'gastric',
 '##cosa',
 'per',
 'tissue',
 'submucosaltumor',
 '2.3cm',
 'cal',
 'plate',
 'distal',
 'gastre',
 'partialcolectomy',
 'discharged13daysafter',
 'spindle-shapedtumor',
 'im

In [9]:
nlp = spacy.load("en_core_web_trf") 

In [10]:
def extract_relationships(doc, entities):
    triplets = []
    for ent1 in entities:
        for ent2 in entities:
            if ent1 != ent2:
                ent1_start = doc.text.find(ent1)
                ent1_end = ent1_start + len(ent1)
                ent2_start = doc.text.find(ent2)
                ent2_end = ent2_start + len(ent2)
                if ent1_start != -1 and ent2_start != -1:
                    ent1_span = doc.char_span(ent1_start, ent1_end)
                    ent2_span = doc.char_span(ent2_start, ent2_end)

                    if ent1_span and ent2_span:
                        if ent1_span.root.head == ent2_span.root:
                            triplets.append((ent1, ent1_span.root.head.lemma_, ent2))
                        elif ent2_span.root.head == ent1_span.root:
                            triplets.append((ent2, ent2_span.root.head.lemma_, ent1))
    return triplets


doc = nlp(text)
triplets = extract_relationships(doc, ner_result)
triplets

[('gout', 'hypertension', 'hypertension'),
 ('gout', 'hypertension', 'hypertension'),
 ('gastric', 'tumor', 'tumor'),
 ('gastric', 'tumor', 'tumor'),
 ('gastric', 'tumor', 'tumor'),
 ('gastric', 'tumor', 'tumor'),
 ('gastric', 'tumor', 'tumor'),
 ('gastric', 'tumor', 'tumor'),
 ('gastric', 'tumor', 'tumor'),
 ('gastric', 'tumor', 'tumor'),
 ('gastric', 'tumor', 'tumor'),
 ('gastric', 'tumor', 'tumor'),
 ('gastric', 'tumor', 'tumor'),
 ('gastric', 'tumor', 'tumor')]

Separating the entities given by "bioepidemic-ner-all" model

In [20]:
ss = []
for entity in ner_results:
    if entity["entity_group"] == "Sign_symptom":
        ss.append(entity["word"])
    elif entity["entity_group"] == "Sign_symptom" and ss:
        ss[-1] += f" {entity['word']}"

print(f"symptoms: {', '.join(ss)}")

symptoms: pain, hem, ##ate, ep, ##igastralgia, tumor, pain, tumor, cal, tar, stool, bloody vomitus, tumor, tenderness, cal, ##cification, ulcer, les, ##ion, cal, mass, cal, tumor, tumor


In [21]:
structs = []
for entity in ner_results:
    if entity["entity_group"] == "Biological_structure":
        structs.append(entity["word"])
    elif entity["entity_group"] == "Biological_structure" and structs:
        structs[-1] += f" {entity['word']}"

print(f"Biological Structures: {', '.join(structs)}")

Biological Structures: abdominal, gas, abdominal, abdominal, gastric, submu, ##cos, gastric, abdominal, muscle, gas, stomach, stomach, transverse colon, gastric, ##cosa, per, tissue, submucosal


In [22]:
ther = []
for entity in ner_results:
    if entity["entity_group"] == "Therapeutic_procedure":
        ther.append(entity["word"])
    elif entity["entity_group"] == "Therapeutic_procedure" and ther:
        ther[-1] += f" {entity['word']}"

print(f"Therapeutic Procedures: {', '.join(ther)}")

Therapeutic Procedures: surgical intervention, gastre, cole, ##ct, ##omy


In [23]:

diag = []
for entity in ner_results:
    if entity["entity_group"] == "Diagnostic_procedure":
        diag.append(entity["word"])
    elif entity["entity_group"] == "Diagnostic_procedure" and diag:
        diag[-1] += f" {entity['word']}"

print(f"Diagnostic Procedure: {', '.join(diag)}")

Diagnostic Procedure: ct, ugi scope, physical examination, radiography, ugi scope, ct, lap, ##arotom, im, ##mun, ##ohistochemical stain, cd, s


BERT

In [13]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

pipe = pipeline("ner", model=model, tokenizer=tokenizer,grouped_entities = True)

ner_results = pipe(text)
ner_results

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity_group': 'LOC',
  'score': 0.99404734,
  'word': 'Taichung Veterans General Hospital',
  'start': 65,
  'end': 99},
 {'entity_group': 'MISC',
  'score': 0.58302057,
  'word': 'CT',
  'start': 355,
  'end': 357},
 {'entity_group': 'ORG',
  'score': 0.9707205,
  'word': 'UGI',
  'start': 608,
  'end': 611},
 {'entity_group': 'ORG',
  'score': 0.9777513,
  'word': 'UGI',
  'start': 886,
  'end': 889}]

BioBERT

In [15]:
tokenizer = AutoTokenizer.from_pretrained("ugaray96/biobert_ncbi_disease_ner")
model = AutoModelForTokenClassification.from_pretrained("ugaray96/biobert_ncbi_disease_ner")

biobert_pipe = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities = True)
ner_results = biobert_pipe(text)
ner_results



[{'entity_group': 'No Disease',
  'score': 0.9999254,
  'word': 'An 81 - year - old female with',
  'start': 0,
  'end': 26},
 {'entity_group': 'Disease',
  'score': 0.9995314,
  'word': 'h',
  'start': 27,
  'end': 28},
 {'entity_group': 'Disease Continuation',
  'score': 0.86109966,
  'word': '##ypertension',
  'start': 28,
  'end': 39},
 {'entity_group': 'No Disease',
  'score': 0.9955342,
  'word': 'and',
  'start': 40,
  'end': 43},
 {'entity_group': 'Disease',
  'score': 0.9921278,
  'word': 'go',
  'start': 44,
  'end': 46},
 {'entity_group': 'Disease Continuation',
  'score': 0.99645233,
  'word': '##ut',
  'start': 46,
  'end': 48},
 {'entity_group': 'No Disease',
  'score': 0.9997368,
  'word': 'was admitted to Taichung Veterans General Hospital due to',
  'start': 49,
  'end': 106},
 {'entity_group': 'Disease',
  'score': 0.9617289,
  'word': 'abdominal',
  'start': 107,
  'end': 116},
 {'entity_group': 'Disease Continuation',
  'score': 0.97351295,
  'word': 'pain',
  'star

***For models like GPT, Claude and Copylot i have used chat feature. I am including the prompts i used for getting entities and relations along with the results.*** 

Claude - 3 sonnet

In [24]:
# These are the results given by Claude - 3 Sonnet via the chat feature.


# Prompt : "Text" give me all the named entities and relations from the text given.

# Results:
# (81-year-old female, had, hypertension)
# (81-year-old female, had, gout)
# (81-year-old female, was admitted to, Taichung Veterans General Hospital)
# (81-year-old female, suffered from, intermittent epigastralgia)
# (4 cm gastric tumor, was found, )
# (abdominal pain, got worse, 2 years before admission)
# (81-year-old female, went to, a local hospital)
# (abdominal CT scan, revealed, 6 cm gastric tumor)
# (Surgical intervention, was suggested, )
# (81-year-old female, declined, surgical intervention)
# (tarry stool passage, was noted, 10 days before admission)
# (bloody vomitus, was found, 1 day later)
# (UGI scope, revealed, submucosal gastric tumor)
# (submucosal gastric tumor, had, central ulceration)
# (81-year-old female, was transferred to, our hospital)
# (Physical examination, showed, upper abdominal tenderness)
# (Physical examination, showed, mild muscle guarding)
# (Plain radiography, showed, irregular shape calcification)
# (UGI scope, revealed, deep gastric ulcer)
# (UGI scope, revealed, foreign body)
# (CT scan, showed, irregularly shaped space-occupying lesion)
# (space-occupying lesion, was in front of, stomach)
# (space-occupying lesion, had, plate calcifications)
# (space-occupying lesion, had, localized free air)
# (Emergent laparotomy, was performed, )
# (infiltrative mass, was noted between, stomach and transverse colon)
# (sharp, bone-like thick calcified plate, was penetrating into, gastric mucosa and pericolic soft tissue)
# (2.3 cm submucosal tumor, was adherent to, calcified plate)
# (Distal subtotal gastrectomy, was performed, )
# (Partial colectomy, was performed, )
# (Spindle-shaped tumor cells, had, low mitotic frequency)
# (Tumor, demonstrated, strong positive reactivity for CD117)
# (Tumor, demonstrated, positive reactivity for CD34)
# (Tumor, demonstrated, negative reactivity for S100 protein and desmin)
# (Diagnosis of tumor, was established as, GIST)
# (GIST, was classified as, very low risk)
# (GIST case, was impressed as, sporadic)
# (No family history of GIST, was present, )

GPT-4

In [16]:
# These are the results given by GPT-4 via the chat feature.


# Prompt : "Text" give me all the named entities and relations from the text given.

# Results:
# 1. Patient Information Relations:
# (Patient, has_age, 81)
# (Patient, has_gender, Female)
# (Patient, has_condition, Hypertension)
# (Patient, has_condition, Gout)
# (Patient, admitted_to, Taichung Veterans General Hospital)
# 2. Symptom and Presentation Relations:
# (Patient, has_symptom, Abdominal pain)
# (Patient, has_symptom, Hematemesis)
# (Patient, has_symptom, Epigastralgia)
# (Patient, has_symptom, Tarry stool passage)
# (Patient, has_symptom, Bloody vomitus)
# Diagnostic Test and Findings Relations:
# (Gastric tumor, revealed_by, Abdominal CT scan)
# (Gastric tumor, size, 6 cm)
# (Gastric tumor, property, Well-circumscribed calcification)
# (Submucosal gastric tumor, revealed_by, UGI scope)
# (Submucosal gastric tumor, has_feature, Central ulceration)
# (Gastric ulcer, revealed_by, UGI scope)
# (Gastric ulcer, has_feature, Foreign body)
# (Space-occupying lesion, revealed_by, CT scan)
# (Space-occupying lesion, property, Plate calcifications)
# (Space-occupying lesion, property, Localized free air)
# 3. Medical Procedure Relations:
# (Patient, underwent_procedure, Surgical Intervention declined initially)
# (Patient, underwent_procedure, Emergent laparotomy)
# (Patient, underwent_procedure, Distal subtotal gastrectomy)
# (Patient, underwent_procedure, Partial colectomy)
# 4. Pathology and Diagnosis Relations:
# (Tumor cells, described_as, Spindle-shaped)
# (Tumor cells, mitotic_frequency, 4/50 HPF)
# (Tumor, tested_positive_for, CD 117)
# (Tumor, tested_positive_for, CD34)
# (Tumor, tested_negative_for, S100 protein)
# (Tumor, tested_negative_for, Desmin)
# (Patient, diagnosed_with, GIST)
# (GIST, risk_classification, Very low risk)
# 5. Family History and Outcome Relations:
# (Patient, family_history_of, No GIST)
# (Patient, discharge_status, Uneventfully discharged after 13 days)

Copilot

In [25]:
# These are the results given by GPT-4 via the chat feature.


# Prompt : "Text" give me all the named entities and relations from the text given as triplets in the form of subject-object-predicate.

#Result:
# 1. ("81-year-old female", "has", "hypertension")
# 2. ("81-year-old female", "has", "gout")
# 3. ("81-year-old female", "admitted to", "Taichung Veterans General Hospital")
# 4. ("81-year-old female", "has", "abdominal pain")
# 5. ("81-year-old female", "has", "hematemesis")
# 6. ("81-year-old female", "suffers from", "intermittent epigastralgia")
# 7. ("81-year-old female", "has", "4 cm gastric tumor")
# 8. ("81-year-old female", "went to", "local hospital")
# 9. ("abdominal CT scan", "revealed", "gastric tumor about 6 cm in length with well-circumscribed calcification")
# 10. ("81-year-old female", "declined", "surgical intervention")
# 11. ("81-year-old female", "had", "tarry stool passage")
# 12. ("81-year-old female", "had", "bloody vomitus")
# 13. ("UGI scope", "revealed", "submucosal gastric tumor with central ulceration")
# 14. ("81-year-old female", "transferred to", "our hospital")
# 15. ("Physical examination", "showed", "upper abdominal tenderness with mild muscle guarding")
# 16. ("plain radiography", "showed", "irregular shape calcification over upper abdomen")
# 17. ("UGI scope", "revealed", "deep gastric ulcer with foreign body")
# 18. ("CT scan", "showed", "irregularly shaped space-occupying lesion in front of the stomach with plate calcifications and localized free air")
# 19. ("laparotomy", "performed", "emergently")
# 20. ("operation", "noted", "infiltrative mass between the stomach and transverse colon")
# 21. ("operation", "observed", "sharp, bone-like and thick calcified plate penetrating into the gastric mucosa and pericolic soft tissue")
# 22. ("operation", "noted", "submucosal tumor about 2.3 cm in size adherent to the calcified plate")
# 23. ("Distal subtotal gastrectomy and partial colectomy", "performed", "on 81-year-old female")
# 24. ("81-year-old female", "discharged", "13 days after operation")
# 25. ("Microscopic examination", "found", "spindle-shaped tumor cells with low mitotic frequency (4/50 HPF)")
# 26. ("Immunohistochemical staining", "demonstrated", "diffusely strong positive reactivity for CD 117")
# 27. ("Immunohistochemical staining", "demonstrated", "positive reactivity for CD34")
# 28. ("Immunohistochemical staining", "demonstrated", "negative reactivity for S100 protein and desmin")
# 29. ("tumor", "diagnosed as", "GIST")
# 30. ("tumor", "classified as", "very low risk")
# 31. ("81-year-old female", "has", "Sporadic GIST")



Running the data with Cluade 3 opus

In [15]:
import os
import requests
import json

CLAUDE_API_ENDPOINT = "https://api.anthropic.com/v1/messages"

CLAUDE_API_KEY = "YOUR API KEY"

def extract_triplets(text):
    headers = {
        "Content-Type": "application/json",
        "X-API-Key": CLAUDE_API_KEY,
        "anthropic-version": "2023-06-01"  
    }

    data = {
        "model": "claude-3-opus-20240229",  
            "messages": [
            {"role": "user", "content": f"Here is a text:\n\n{text}\n\nExtract all the subject-predicate-object triplets from this text and format them as follows:\n\n(subject, predicate, object)\n(subject, predicate, object)\n... and dont give me any text just give me the triplets in the output"}
        ],        "max_tokens": 2048,
        "temperature": 0.2,
    }

    response = requests.post(CLAUDE_API_ENDPOINT, headers=headers, data=json.dumps(data))
    print(f"Response Status Code: {response.status_code}")
    print(f"Response Content: {response.content}")

    if response.status_code == 200:
        try:
            response_data = response.json()
            print(response_data["content"])
            if "content" in response_data:
                res = response_data["content"]
                text_data = res[0]['text']
                entries = text_data.split('\n')
                return entries
            else:
                print(f"Error: {response_data['error']}")
        except json.JSONDecodeError as e:
            print(f"JSONDecodeError: {e}")
    else:
        print(f"Error: {response.text}")

    return []

text_files_dir = "C:/Users/koush/Desktop/ml in cl project/data"

annotations_dir = "C:/Users/koush/Desktop/ml in cl project/data/results"
os.makedirs(annotations_dir, exist_ok=True)

for file_name in os.listdir(text_files_dir):
    if file_name.endswith(".txt"):
        file_path = os.path.join(text_files_dir, file_name)
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        triplets = extract_triplets(text)

        if triplets:
            annotation_file_name = os.path.splitext(file_name)[0] + "_annotations.txt"
            annotation_file_path = os.path.join(annotations_dir, annotation_file_name)

            with open(annotation_file_path, "w", encoding="utf-8") as annotation_file:
                for i in triplets:
                    annotation_file.write(i)#("\n".join(triplets))

            print(f"Annotations saved for {file_name} in {annotation_file_path}")
        else:
            print(f"No triplets extracted for {file_name}")


# DID NOT RUN ALL THE DATA CLAUDE WAS CHARGING MORE.

Response Status Code: 200
Response Content: b'{"id":"msg_01TtJ5qYpU5KwvEwR4L1mxBg","type":"message","role":"assistant","model":"claude-3-opus-20240229","stop_sequence":null,"usage":{"input_tokens":516,"output_tokens":467},"content":[{"type":"text","text":"(A 28-year-old previously healthy man, presented with, a 6-week history of palpitations)\\n(The symptoms, occurred during, rest)\\n(The symptoms, lasted, up to 30 minutes at a time)\\n(The symptoms, were associated with, dyspnea)\\n(physical examination, yielded, unremarkable findings)\\n(An electrocardiogram (ECG), revealed, normal sinus rhythm)\\n(An electrocardiogram (ECG), revealed, a Wolff\xe2\x80\x93 Parkinson\xe2\x80\x93 White pre-excitation pattern)\\n(a right-sided accessory pathway, produced, a Wolff\xe2\x80\x93 Parkinson\xe2\x80\x93 White pre-excitation pattern)\\n(Transthoracic echocardiography, demonstrated, the presence of Ebstein\'s anomaly of the tricuspid valve)\\n(Transthoracic echocardiography, demonstrated, apical 

KeyboardInterrupt: 

After getting all the triplets as subject-predicate-object we pass it to the graph module to create the Knowledge graph

***Creating the RDF Graph Database - Sparql***

In [None]:
# from rdflib import Graph, Literal, URIRef, Namespace
# from rdflib.namespace import RDF, RDFS

# # Define namespaces
# med = Namespace("http://medd.org/medical#")

# # Create an empty Graph
# g = Graph()

# # Define classes
# patient_class = med.Patient
# tumor_class = med.Tumor
# hospital_class = med.Hospital

# # Define properties
# had = med.had
# was_admitted_to = med.was_admitted_to
# suffered_from = med.suffered_from
# tumor_size = med.tumor_size
# tumor_was_found = med.tumor_was_found
# tumor_type = med.tumor_type
# tumor_risk_level = med.tumor_risk_level
# tumor_impression = med.tumor_impression
# abdominal_pain_got_worse = med.abdominal_pain_got_worse
# surgical_intervention_was_suggested = med.surgical_intervention_was_suggested
# surgical_intervention_was_declined = med.surgical_intervention_was_declined
# tarry_stool_passage_was_noted = med.tarry_stool_passage_was_noted
# bloody_vomitus_was_found = med.bloody_vomitus_was_found
# ugi_scope_revealed = med.ugi_scope_revealed
# physical_examination_showed = med.physical_examination_showed
# plain_radiography_showed = med.plain_radiography_showed
# ct_scan_showed = med.ct_scan_showed
# emergent_laparotomy_was_performed = med.emergent_laparotomy_was_performed
# infiltrative_mass_was_noted_between = med.infiltrative_mass_was_noted_between
# calcified_plate_was_penetrating_into = med.calcified_plate_was_penetrating_into
# submucosal_tumor_was_adherent_to = med.submucosal_tumor_was_adherent_to
# distal_subtotal_gastrectomy_was_performed = med.distal_subtotal_gastrectomy_was_performed
# partial_colectomy_was_performed = med.partial_colectomy_was_performed
# tumor_cells_had = med.tumor_cells_had
# tumor_demonstrated = med.tumor_demonstrated
# diagnosis_of_tumor_was_established_as = med.diagnosis_of_tumor_was_established_as
# no_family_history_of_gist_was_present = med.no_family_history_of_gist_was_present

# # Add classes to the Graph
# g.add((patient_class, RDF.type, RDFS.Class))
# g.add((tumor_class, RDF.type, RDFS.Class))
# g.add((hospital_class, RDF.type, RDFS.Class))

# # Add properties to the Graph
# g.add((had, RDF.type, RDF.Property))
# g.add((was_admitted_to, RDF.type, RDF.Property))
# g.add((suffered_from, RDF.type, RDF.Property))
# g.add((tumor_size, RDF.type, RDF.Property))
# g.add((tumor_was_found, RDF.type, RDF.Property))
# g.add((tumor_type, RDF.type, RDF.Property))
# g.add((tumor_risk_level, RDF.type, RDF.Property))
# g.add((tumor_impression, RDF.type, RDF.Property))
# g.add((abdominal_pain_got_worse, RDF.type, RDF.Property))
# g.add((surgical_intervention_was_suggested, RDF.type, RDF.Property))
# g.add((surgical_intervention_was_declined, RDF.type, RDF.Property))
# g.add((tarry_stool_passage_was_noted, RDF.type, RDF.Property))
# g.add((bloody_vomitus_was_found, RDF.type, RDF.Property))
# g.add((ugi_scope_revealed, RDF.type, RDF.Property))
# g.add((physical_examination_showed, RDF.type, RDF.Property))
# g.add((plain_radiography_showed, RDF.type, RDF.Property))
# g.add((ct_scan_showed, RDF.type, RDF.Property))
# g.add((emergent_laparotomy_was_performed, RDF.type, RDF.Property))
# g.add((infiltrative_mass_was_noted_between, RDF.type, RDF.Property))
# g.add((calcified_plate_was_penetrating_into, RDF.type, RDF.Property))
# g.add((submucosal_tumor_was_adherent_to, RDF.type, RDF.Property))
# g.add((distal_subtotal_gastrectomy_was_performed, RDF.type, RDF.Property))
# g.add((partial_colectomy_was_performed, RDF.type, RDF.Property))
# g.add((tumor_cells_had, RDF.type, RDF.Property))
# g.add((tumor_demonstrated, RDF.type, RDF.Property))
# g.add((diagnosis_of_tumor_was_established_as, RDF.type, RDF.Property))
# g.add((no_family_history_of_gist_was_present, RDF.type, RDF.Property))

# # Add instances to the Graph
# patient_1 = med.patient_1
# g.add((patient_1, RDF.type, patient_class))
# g.add((patient_1, had, med.hypertension))
# g.add((patient_1, had, med.gout))
# g.add((patient_1, was_admitted_to, med.Taichung_Veterans_General_Hospital))
# g.add((patient_1, suffered_from, med.intermittent_epigastralgia))

# gastric_tumor_1 = med.gastric_tumor_1
# g.add((gastric_tumor_1, RDF.type, tumor_class))
# g.add((gastric_tumor_1, tumor_size, Literal("4 cm")))
# g.add((gastric_tumor_1, tumor_was_found, Literal("")))

# g.add((patient_1, abdominal_pain_got_worse, Literal("2 years before admission")))
# g.add((patient_1, was_admitted_to, med.local_hospital))
# # g.add((med.abdominal_ct_scan, revealed, gastric_tumor_1))
# # g.add((,surgical_intervention_was_suggested, Literal("None")))
# g.add((patient_1, surgical_intervention_was_declined, Literal("None")))
# g.add((patient_1, tarry_stool_passage_was_noted, Literal("10 days before admission")))
# g.add((patient_1, bloody_vomitus_was_found, Literal("1 day later")))
# g.add((med.ugi_scope, ugi_scope_revealed, med.submucosal_gastric_tumor))
# g.add((med.submucosal_gastric_tumor, had, med.central_ulceration))
# # g.add((patient_1, was_transferred_to, med.our_hospital))
# g.add((med.physical_examination, physical_examination_showed, med.upper_abdominal_tenderness))
# g.add((med.physical_examination, physical_examination_showed, med.mild_muscle_guarding))
# g.add((med.plain_radiography, plain_radiography_showed, med.irregular_shape_calcification))
# g.add((med.ugi_scope, ugi_scope_revealed, med.deep_gastric_ulcer))
# g.add((med.ugi_scope, ugi_scope_revealed, med.foreign_body))
# g.add((med.ct_scan, ct_scan_showed, med.irregularly_shaped_space_occupying_lesion))
# # g.add((med.irregularly_shaped_space_occupying_lesion, was_in_front_of, med.stomach))
# g.add((med.irregularly_shaped_space_occupying_lesion, had, med.plate_calcifications))
# g.add((med.irregularly_shaped_space_occupying_lesion, had, med.localized_free_air))

# g.serialize(destination="madd.rdf", format="turtle")

In [26]:
import os
import rdflib

g = rdflib.Graph()

namespace = rdflib.Namespace("http://meddb.org/")

def format_uri(text):
    cleaned_text = text.strip().replace(" ", "_").replace("(", "").replace(")", "")
    return rdflib.URIRef(namespace + cleaned_text)

file_path = "C:/Users/koush/Desktop/ml in cl project/data/results"
os.makedirs(file_path, exist_ok=True)
for file_name in os.listdir(file_path):
    file_path = "C:/Users/koush/Desktop/ml in cl project/data/results"
    if file_name.endswith(".txt"):
        file_path = os.path.join(file_path, file_name)
        print(file_path)
        with open(file_path, 'r') as file:
            content = file.read().strip()
            triplets = content.split(')(')

        for triplet in triplets:
            parts = triplet.strip('()').split(', ')
            if len(parts) == 3:
                subj, pred, obj = parts
                subj = format_uri(subj)
                pred = format_uri(pred)
                obj = format_uri(obj)
                g.add((subj, pred, obj))


g.serialize(destination="med_db.rdf",format='turtle')

############ This code shoes some formating error but the results were achived


http://meddb.org/formation_of_an_"atrialized"_right_ventricle does not look like a valid URI, trying to serialize this will break.
http://meddb.org/abnormal_impulse_conduction_in_the_"atrialized"_right_ventricle does not look like a valid URI, trying to serialize this will break.


C:/Users/koush/Desktop/ml in cl project/data/results\15939911_annotations.txt
C:/Users/koush/Desktop/ml in cl project/data/results\16778410_annotations.txt
C:/Users/koush/Desktop/ml in cl project/data/results\17803823_annotations.txt
C:/Users/koush/Desktop/ml in cl project/data/results\18236639_annotations.txt
C:/Users/koush/Desktop/ml in cl project/data/results\18258107_annotations.txt
C:/Users/koush/Desktop/ml in cl project/data/results\18416479_annotations.txt
C:/Users/koush/Desktop/ml in cl project/data/results\18561524_annotations.txt


Exception: "http://meddb.org/formation_of_an_"atrialized"_right_ventricle" does not look like a valid URI, I cannot serialize this as N3/Turtle. Perhaps you wanted to urlencode it?

The triplets we retrived from the datasets needed much more pre processing to create a more meaningful knowledge graph. So to test the abity of llms generating Sparql queries i created a medical RDF database manually with details from the text and prompt GPT and CLAUDE chat models.

In [18]:
from rdflib import Graph, Literal, RDF, RDFS, URIRef, Namespace, XSD

g = Graph()

ex = Namespace("http://t4.org/")
g.bind("ex", ex)

Patient = ex.Patient
Doctor = ex.Doctor
Disease = ex.Disease
Medication = ex.Medication
Treatment = ex.Treatment
Hospital = ex.Hospital
BloodTest = ex.BloodTest
MedicalEvent = ex.MedicalEvent
FamilyHistory = ex.FamilyHistory
Cardiologist = ex.Cardiologist
SurgicalProcedure = ex.SurgicalProcedure

hasDoctor = ex.hasDoctor
hasDisease = ex.hasDisease
prescribesMedication = ex.prescribesMedication
undergoesTreatment = ex.undergoesTreatment
attends = ex.attends
undergoesTest = ex.undergoesTest
hasFamilyHistoryOf = ex.hasFamilyHistoryOf

entities = [
    Patient, Doctor, Disease, Medication, Treatment, Hospital,
    BloodTest, MedicalEvent, FamilyHistory, Cardiologist, SurgicalProcedure
]

properties = [
    (hasDoctor, Doctor), (hasDisease, Disease), (prescribesMedication, Medication),
    (undergoesTreatment, Treatment), (attends, Hospital), (undergoesTest, BloodTest),
    (hasFamilyHistoryOf, Disease)
]

for entity in entities:
    g.add((entity, RDF.type, RDFS.Class))

for prop, rng in properties:
    g.add((prop, RDF.type, RDF.Property))
    g.add((prop, RDFS.range, rng))

patients = [ex.john, ex.Mary, ex.Alice, ex.matt]
doctors = [ex.DrSmith, ex.DrJones, ex.Drdan, ex.Drjon]
hospitals = [ex.GeneralHospital, ex.CityClinic, ex.SpecialtyCenter, ex.iuhealth]
diseases = [ex.Diabetes, ex.Hypertension, ex.Asthma, ex.Sinus]
medications = [ex.Insulin, ex.Amlodipine, ex.Ventolin, ex.asprin]
treatments = [ex.InsulinTherapy, ex.BloodPressureManagement, ex.AsthmaInhaler, ex.surgery]

for patient, disease, medication, treatment, doctor, hospital in zip(
        patients, diseases, medications, treatments, doctors, hospitals):
    g.add((patient, RDF.type, Patient))
    g.add((patient, hasDisease, disease))
    g.add((patient, undergoesTreatment, treatment))
    g.add((patient, hasDoctor, doctor))
    g.add((patient, attends, hospital))
    g.add((doctor, RDF.type, Doctor))
    g.add((doctor, prescribesMedication, medication))

    g.add((hospital, RDF.type, Hospital))

g.add((ex.Mary, hasFamilyHistoryOf, ex.Diabetes))  
g.add((ex.Mary, RDF.type, Patient))
g.add((ex.DrSmith, RDF.type, Doctor))
g.add((ex.General_Hospital, RDF.type, Hospital))
g.add((ex.Diabetes, RDF.type, Disease))
g.add((ex.Hypertension, RDF.type, Disease))
g.add((ex.Insulin, RDF.type, Medication))

g.add((ex.john, hasDoctor, ex.DrSmith))
g.add((ex.john, hasDisease, ex.Diabetes))
g.add((ex.john, attends, ex.General_Hospital))
g.add((ex.john, hasDoctor, ex.Drdan))
g.add((ex.john, hasDisease, ex.Diabetes))
g.add((ex.john, attends, ex.General_Hospital))
g.add((ex.Drdan, prescribesMedication, ex.asprin))

g.add((ex.Mary, hasDisease, ex.Hypertension))
g.add((ex.Mary, hasFamilyHistoryOf, ex.Diabetes))
g.add((ex.DrSmith, prescribesMedication, ex.Insulin))
g.add((ex.Mary, hasDisease, ex.Hypertension))
g.add((ex.Mary, hasFamilyHistoryOf, ex.Diabetes))
g.add((ex.DrSmith, prescribesMedication, ex.Insulin))


g.add((ex.john, hasDoctor, ex.DrSmith))
g.add((ex.john, hasDisease, ex.Diabetes))
g.add((ex.john, attends, ex.General_Hospital))
g.add((ex.Mary, hasDisease, ex.Hypertension))
g.add((ex.Mary, hasFamilyHistoryOf, ex.Diabetes))
g.add((ex.DrSmith, prescribesMedication, ex.Insulin))

g.serialize(destination="t5.rdf", format="turtle")

<Graph identifier=N955a446b5d8c492f94ad1bff11e28f17 (<class 'rdflib.graph.Graph'>)>

In [19]:
from rdflib import Graph, Namespace, RDF, RDFS

g.parse("t5.rdf", format= "turtle")


def execute_query(graph, query):
    print("\nQuery Results:")
    for row in graph.query(query):
        print(row)

# SPARQL Query to list all classes in the RDF Graph
query_classes = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?class WHERE {
  ?class rdf:type rdfs:Class.
}
"""
print("Classes in the RDF Graph:")
execute_query(g, query_classes)

# SPARQL Query to list all properties with their domains and ranges
query_properties = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?property ?domain ?range WHERE {
  ?property rdf:type rdf:Property.
  OPTIONAL {?property rdfs:domain ?domain.}
  OPTIONAL {?property rdfs:range ?range.}
}
"""
print("\nProperties in the RDF Graph (with domains and ranges):")
execute_query(g, query_properties)


Classes in the RDF Graph:

Query Results:
(rdflib.term.URIRef('http://t4.org/Patient'),)
(rdflib.term.URIRef('http://t4.org/Doctor'),)
(rdflib.term.URIRef('http://t4.org/Disease'),)
(rdflib.term.URIRef('http://t4.org/Medication'),)
(rdflib.term.URIRef('http://t4.org/Treatment'),)
(rdflib.term.URIRef('http://t4.org/Hospital'),)
(rdflib.term.URIRef('http://t4.org/BloodTest'),)
(rdflib.term.URIRef('http://t4.org/MedicalEvent'),)
(rdflib.term.URIRef('http://t4.org/FamilyHistory'),)
(rdflib.term.URIRef('http://t4.org/Cardiologist'),)
(rdflib.term.URIRef('http://t4.org/SurgicalProcedure'),)

Properties in the RDF Graph (with domains and ranges):

Query Results:
(rdflib.term.URIRef('http://t4.org/hasDoctor'), None, rdflib.term.URIRef('http://t4.org/Doctor'))
(rdflib.term.URIRef('http://t4.org/hasDisease'), None, rdflib.term.URIRef('http://t4.org/Disease'))
(rdflib.term.URIRef('http://t4.org/prescribesMedication'), None, rdflib.term.URIRef('http://t4.org/Medication'))
(rdflib.term.URIRef('http

In [20]:
#Actual query for numer of doctors
query_classes = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX t4: <http://t4.org/>

SELECT (COUNT(?doctor) AS ?numberOfDoctors)
WHERE {
  ?doctor a t4:Doctor .}
"""
print("Classes in the RDF Graph:")
execute_query(g, query_classes)


################################## GPT-4 Generated Query
# PREFIX t4: <http://t4.org/>
# SELECT (COUNT(?doctor) AS ?numberOfDoctors)
# WHERE { ?doctor a t4:Doctor .}# -> same as actual query


################################## Claude Generated Query
# PREFIX t4: <http://t4.org/>
# SELECT (COUNT(?doctor) AS ?totalDoctors)
# WHERE { ?doctor a t4:Doctor .} -> Same as actual query




Classes in the RDF Graph:

Query Results:
(rdflib.term.Literal('4', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')),)


In [21]:
query_classes = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX t4: <http://t4.org/>

SELECT (COUNT(?patient) AS ?numberOfPatients)
WHERE {
  ?patient a t4:Patient.
}
"""
execute_query(g, query_classes)


Query Results:
(rdflib.term.Literal('4', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')),)


In [22]:

# g.parse("t5.rdf", format = 'turtle')

query_instances = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
SELECT ?instance
WHERE {
  ?instance rdf:type <http://t4.org/Patient> .
}
"""
instances = g.query(query_instances)

for instance in instances:
    describe_query = f"DESCRIBE <{instance[0]}>"
    describe_result = g.query(describe_query)
    for description in describe_result:
        print(description)


(rdflib.term.URIRef('http://t4.org/john'), rdflib.term.URIRef('http://t4.org/hasDoctor'), rdflib.term.URIRef('http://t4.org/DrSmith'))
(rdflib.term.URIRef('http://t4.org/john'), rdflib.term.URIRef('http://t4.org/attends'), rdflib.term.URIRef('http://t4.org/GeneralHospital'))
(rdflib.term.URIRef('http://t4.org/john'), rdflib.term.URIRef('http://t4.org/hasDisease'), rdflib.term.URIRef('http://t4.org/Diabetes'))
(rdflib.term.URIRef('http://t4.org/john'), rdflib.term.URIRef('http://t4.org/hasDoctor'), rdflib.term.URIRef('http://t4.org/Drdan'))
(rdflib.term.URIRef('http://t4.org/john'), rdflib.term.URIRef('http://t4.org/undergoesTreatment'), rdflib.term.URIRef('http://t4.org/InsulinTherapy'))
(rdflib.term.URIRef('http://t4.org/john'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://t4.org/Patient'))
(rdflib.term.URIRef('http://t4.org/john'), rdflib.term.URIRef('http://t4.org/attends'), rdflib.term.URIRef('http://t4.org/General_Hospital'))
(r

In [24]:
# Number of patients with diabetis
query_classes = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX t4: <http://t4.org/>

SELECT (COUNT(?patient) AS ?numPatientsWithDiabetes)
WHERE {
    ?patient t4:hasDisease t4:Diabetes .}
"""
execute_query(g, query_classes)                     #->correct solution


################################# GPT-4
# PREFIX t4: <http://t4.org/>

# SELECT (COUNT(DISTINCT ?patient) AS ?numberOfPatientsWithDiabetes)
# WHERE {
#   ?patient a t4:Patient ;
#            t4:hasDisease t4:Diabetes .
# }                                                 #->correct solution

################################# Claude-3 Sonnet
# PREFIX t4: <http://t4.org/>

# SELECT (COUNT(?patient) AS ?numPatientsWithDiabetes)
# WHERE {
#     ?patient t4:hasDisease t4:Diabetes .
#  }                                                #->correct solution



Query Results:
(rdflib.term.Literal('1', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')),)
