In [1]:
%pip install transformers
%pip install datasets
%pip install neo4j
%pip install langchain
%pip install python-dotenv
%pip install langchain_community
%pip install accelerate

Collecting transformers
  Downloading transformers-4.39.3-py3-none-any.whl.metadata (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m929.0 kB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting filelock (from transformers)
  Downloading filelock-3.13.4-py3-none-any.whl.metadata (2.8 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.22.2-py3-none-any.whl.metadata (12 kB)
Collecting numpy>=1.17 (from transformers)
  Downloading numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2023.12.25-cp311-cp311-macosx_10_9_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from datasets import load_dataset
import json
import numpy as np
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from configparser import ConfigParser


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dataset = load_dataset('dmacres/mimiciii-hospitalcourse-meta')
train_dataset = dataset['train']
test_dataset = dataset['test']

In [4]:
import pandas as pd
df = pd.DataFrame(train_dataset)
df.head(5)


Unnamed: 0,subject_id,hadm_id,target_text,extractive_notes_summ,n_notes,notes
0,19278,159960.0,Patient was admitted to the neurosurgery servi...,There remains some low attenuation right subdu...,6,"[{'category': 'Nursing/other', 'chartdate': '2..."
1,82177,176262.0,38 y/o female with history of autoimmune disea...,"Sinus arrhythmiaSince previous tracing of , th...",2,"[{'category': 'ECG', 'chartdate': '2151-03-22 ..."
2,22500,127625.0,The patient was admitted to the Trauma Team fo...,NEEDS ENCOURAGEMENT FOR FLD INTAKE.GU: VOIDING...,5,"[{'category': 'Nursing/other', 'chartdate': '2..."
3,66532,127137.0,"62 yo F PMH of afib, TIA, memory loss s/p hypo...","Q2-3 hrs;no desating,gets resp distress that...",180,"[{'category': 'Nursing', 'chartdate': '2103-03..."
4,95143,127296.0,The patient developed a moderate to large circ...,Thereis brief right atrial diastolic invaginat...,11,"[{'category': 'Radiology', 'chartdate': '2156-..."


In [5]:
unique_values = df['subject_id'].unique()
df_unique_values = df[df['subject_id'].isin(unique_values)]

In [6]:
df['subject_id'].unique()

array([19278, 82177, 22500, ..., 22217, 25557, 22734])

In [7]:
df['subject_id'].describe()

count    24993.000000
mean     35303.249510
std      28582.486807
min          9.000000
25%      12692.000000
50%      25526.000000
75%      56996.000000
max      99995.000000
Name: subject_id, dtype: float64

In [8]:
df['subject_id'].value_counts()

subject_id
109      26
13033    20
11861    19
5727     15
7809     14
         ..
11844     1
65390     1
64715     1
57215     1
22734     1
Name: count, Length: 21104, dtype: int64

In [9]:
for column in df.columns:
    null_values = df[column].isnull().sum()
    print(f"Column '{column}' has {null_values} null value(s).")



Column 'subject_id' has 0 null value(s).
Column 'hadm_id' has 0 null value(s).
Column 'target_text' has 0 null value(s).
Column 'extractive_notes_summ' has 0 null value(s).
Column 'n_notes' has 0 null value(s).
Column 'notes' has 0 null value(s).


In [10]:
for column in df.columns:
  word_count = df['target_text'].str.split().explode().value_counts()
  print(f"Column '{column}' has {word_count} word(s).")

Column 'subject_id' has target_text
was            343570
and            311222
to             260795
the            249322
of             198230
                ...  
walker/cane         1
cental              1
w/staples.          1
Epidermis           1
givers              1
Name: count, Length: 181168, dtype: int64 word(s).
Column 'hadm_id' has target_text
was            343570
and            311222
to             260795
the            249322
of             198230
                ...  
walker/cane         1
cental              1
w/staples.          1
Epidermis           1
givers              1
Name: count, Length: 181168, dtype: int64 word(s).
Column 'target_text' has target_text
was            343570
and            311222
to             260795
the            249322
of             198230
                ...  
walker/cane         1
cental              1
w/staples.          1
Epidermis           1
givers              1
Name: count, Length: 181168, dtype: int64 word(s).
Column 'extract

In [11]:
word_count = df['target_text'].str.split().explode().value_counts()
word_count

target_text
was            343570
and            311222
to             260795
the            249322
of             198230
                ...  
walker/cane         1
cental              1
w/staples.          1
Epidermis           1
givers              1
Name: count, Length: 181168, dtype: int64

In [12]:
short_target_data = df[['subject_id','target_text']].head(10)

short_target_dict = short_target_data.to_dict()

short_target_list = list(short_target_dict.items())
short_target_list

[('subject_id',
  {0: 19278,
   1: 82177,
   2: 22500,
   3: 66532,
   4: 95143,
   5: 86645,
   6: 26523,
   7: 5495,
   8: 4113,
   9: 69472}),
 ('target_text',
  {0: "Patient was admitted to the neurosurgery service and underwent an emergent R craniotomy with drainage of his subdural hemorrhage. Postoperatively he was transferred to the intensive care unit extubated and stable. He was maintained on dilantin for seizure prophylaxis, his blood pressure was controlled and he was monitored with close neuro checks. He was also maintained on prn ativan for prevention of alcohol withdrawal considering patient's significant drinking history. A postoperative CT scan demonstrated postoperative changes and an improvement in his subdural. He remained stable, his diet was advanced and he was awake and appropriate. He was found to have a simple urinary tract infection and was treated with ciprofloxacin. He was evaluated by both occupational and physical therapy and cleared for home. He was discha

In [13]:
pipe = pipeline("token-classification", model="Clinical-AI-Apollo/Medical-NER", aggregation_strategy='simple')


result_list = {}
for index, row in short_target_data.iterrows():
    subject_id = row['subject_id']
    target = row['target_text']

    result = pipe(target)
    scores = [res['score'] for res in result]
    median_score = np.median(scores)

    if target is not result_list:
      result_list[target] = []

    for res in result:
      if res['score'] >= median_score:
        entity_info = {
            'Entity': res['entity_group'],
            'Word': res['word'],
            'Score': str(res['score'])
        }
        result_list[target].append(entity_info)

entities_list = [{'Id': subject_id, 'Text': text, 'Entities': entities} for text, entities in result_list.items()]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [14]:
# for result in result_list:
#   print(result['entity_group'])
entities_list[0]

{'Id': 69472,
 'Text': "Patient was admitted to the neurosurgery service and underwent an emergent R craniotomy with drainage of his subdural hemorrhage. Postoperatively he was transferred to the intensive care unit extubated and stable. He was maintained on dilantin for seizure prophylaxis, his blood pressure was controlled and he was monitored with close neuro checks. He was also maintained on prn ativan for prevention of alcohol withdrawal considering patient's significant drinking history. A postoperative CT scan demonstrated postoperative changes and an improvement in his subdural. He remained stable, his diet was advanced and he was awake and appropriate. He was found to have a simple urinary tract infection and was treated with ciprofloxacin. He was evaluated by both occupational and physical therapy and cleared for home. He was discharged to home on POD2 in good condition and will follow up in Dr. clinic for wound  in approximately 10 days.",
 'Entities': [{'Entity': 'CLINICAL_

In [17]:
prompt_message =  "Based on the text and entities, generate relationships terms between each those entities for a knowledge graph. Add create a new key value paired with the key called RELATION and the value will be the generated reponse."


# Constructing the prompt variable with both text and prompt_message
prompt_template = [
    prompt_message,
    {
        "role": "system",
        "content": "text"
    }

]

In [19]:
with open("entity_extraction_results.json", "w") as outfile:
    json.dump(entities_list, outfile, indent=4)

In [20]:
config=ConfigParser()
config.read('.config')
api_token = config['DEFAULT']['HF_API_TOKEN']

model_name_or_path = "google/gemma-7b"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             low_cpu_mem_usage=True, token=api_token)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token=api_token)



Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 4/4 [02:31<00:00, 37.98s/it]


In [22]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=10
)

hf = HuggingFacePipeline(pipeline=pipe)


In [None]:
# prompt = "What are the symptoms of diabetes ?"
# prompt_template=f'''
# <|system|>: You are a helpful medical assistant created by M42 Health in the UAE.
# <|prompter|>:{prompt}
# <|assistant|>:
# '''

# response = hf.generate_text(prompt)
# print(response)

# # input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
# # output = hf.generate(inputs=input_ids, temperature=0.7, do_sample=True,eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id, max_new_tokens=512)
# # print(tokenizer.decode(output[0]))




In [26]:
prompt = "What are the symptoms of diabetes ?"
prompt_template = '''
Extract the following relationships between the provided entities using the text as context. Please follow the described format:
0. ALWAYS COMPLETE THE OUTPUT. Never send partial responses.
1. Generate each relationship as triples of head, relationship, and tail. The 'ID' is to be referred to by its value. Relationship properties should be mentioned within brackets as comma-separated. They should follow these relationship types below. You will have to generate as many relationships as needed, as defined below:
    Relationship types:
    ID|ADMITTED_TO|neurosurgery service
    ID|DIAGNOSED_WITH|subdural hemorrhage
2. The output should appear as:
{
    "relationships": ["ID|ADMITTED_TO|Entity"]
}

'''

# Fill in the prompt template with the prompt
filled_prompt = prompt_template.format(prompt)

# Generate a response using the pipeline object
response = hf.pipeline(filled_prompt, max_length=100)[0]["generated_text"]

print(response)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=10) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


KeyboardInterrupt: 

In [25]:
# llm_pipeline = pipeline("text-generation")
# dependency_results = llm_pipeline(prompt_message, max_length=256, max_new_tokens=512, truncation=True)

# print(dependency_results)
# dependency_results

No model was supplied, defaulted to openai-community/gpt2 and revision 6c0e608 (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=512) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'generated_text': 'Based on the text and entities, generate relationships terms between each those entities for a knowledge graph. Add create a new key value paired with the key called RELATION and the value will be the generated reponse. The value is sent to the client that includes the ROTC and the key will not be added to them. Use the following syntax to create a new reponse:\n\nCreate a new key value Pairing a value from multiple fields with the same key will create new reponse.\n\nIf you have the same key, the name will be added to the ROTC.\n\nIf you have a different key name, or you have different tokens, or you have two tokens, then create the same new "value" from the previous step (create new key for the one you have previously used and add your pair and you will get a new "listing for it" after step 3).\n\nIn both cases, the output will look like this:\n\nA new value will be added to the ROTC and the token will be added.\n\nIt is worth noting that when creating a new rela

[{'generated_text': 'Based on the text and entities, generate relationships terms between each those entities for a knowledge graph. Add create a new key value paired with the key called RELATION and the value will be the generated reponse. The value is sent to the client that includes the ROTC and the key will not be added to them. Use the following syntax to create a new reponse:\n\nCreate a new key value Pairing a value from multiple fields with the same key will create new reponse.\n\nIf you have the same key, the name will be added to the ROTC.\n\nIf you have a different key name, or you have different tokens, or you have two tokens, then create the same new "value" from the previous step (create new key for the one you have previously used and add your pair and you will get a new "listing for it" after step 3).\n\nIn both cases, the output will look like this:\n\nA new value will be added to the ROTC and the token will be added.\n\nIt is worth noting that when creating a new rela

In [None]:
# Extract relationships based on Named Entities and Dependency Parsing
relationships = []

for token in dependency_results:
    if token['Word'] in entities:
        head = token['head']
        head_word = dependency_results[head - 1]['word']
        if head_word in entities:
            relationships.append({
                "source": head_word,
                "target": token['word'],
                "relation": token['dep']
            })

# Output the relationships as JSON
print(json.dumps(relationships, indent=4))