In [1]:
from evaluate_models import ModelEvaluator
from types import SimpleNamespace
import json

In [None]:
import gc
import torch

# Run a full garbage-collection pass over Python objects
gc.collect()

# Then clear PyTorch's CUDA cache (skipped when no CUDA device is available)
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
# Free-text patient question passed to the evaluator as a single test input.
patient_info = "Why do I not bruise Hi there, I'm a 19 year old white female. I am iron deficient with a level of 6 (the ideal range is 20-200). Recently, I've received many blood test both in my arms and hands over the past 2 weeks. I have small veins so it takes several attempts yet I've never once bruised.  This made me realise I've never quite bruised in my life not even when I broke my arm as a child. I've been iron deficient for several years. Just wondering if anyone knew why I'm incapable of bruising. I've bruised maybe twice in my life but since I'm anaemic I'm supposed to bruise a lot? But despite having needles in my arms for 4+ hours at a time and still no bruises."

# Bundle the run settings into an attribute-style namespace:
#   model        - model identifier handed to ModelEvaluator
#   patient_info - the message to triage
#   output       - JSON file the results are written to (falsy to skip saving)
args = SimpleNamespace()
args.model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
args.patient_info = patient_info
args.output = "/workspace/sandbox/mats9/data/llama3_oncqa_colorful.json"

In [3]:
# Build the evaluator for the configured model and triage the sample message.
# `evaluator` is reused by later cells, so it stays bound at notebook level.
evaluator = ModelEvaluator(model_name=args.model)
results = evaluator.evaluate_triage(args.patient_info)

# Report the responses, one question type per line
print(f"\nEvaluation results for {args.model}:")
for q_type, answers in results.items():
    print(f"{q_type}: {answers}")

# Persist the run (model, input and results) when an output path is configured
if args.output:
    payload = {
        "model": args.model,
        "patient_info": args.patient_info,
        "results": results
    }
    with open(args.output, "w") as out_file:
        json.dump(payload, out_file, indent=2)
    

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Evaluation results for meta-llama/Meta-Llama-3.1-8B-Instruct:
MANAGE: [1, 1, 0]
VISIT: [0, 1, 0]
RESOURCE: [1, 1, 1]


In [4]:
import gc
import torch

# Collect unreachable Python objects before touching the CUDA cache.
# NOTE(review): `evaluator` is still bound in the notebook namespace at this
# point, so presumably whatever model it holds is not released by this cell.
gc.collect()

# Clear PyTorch's CUDA cache (skipped when no CUDA device is available)
if torch.cuda.is_available():
    torch.cuda.empty_cache()

## Run the evaluator on the baseline dataset

### Load the baseline dataset

In [6]:
import pandas as pd

# Read the OncQA baseline CSV into a DataFrame
baseline_df = pd.read_csv("/root/medium-is-message/baseline_data/OncQA_Master.csv")

# Summarise what was loaded: row count, column names, then a preview of the head
print(
    "Loaded {} samples from baseline dataset".format(len(baseline_df)),
    "Columns: {}".format(baseline_df.columns.tolist()),
    "\nFirst few rows:",
    baseline_df.head(),
    sep="\n",
)


Loaded 100 samples from baseline dataset
Columns: ['pin', 'Input', 'Output', 'ActiveOrSurveill', 'GenOrSpec', 'Gender', 'Age', 'GenderSpecificCancer']

First few rows:
   pin                                              Input  \
0    0  Sample 1 (active treatment):\nEHR Context:\nAg...   
1    1  Sample 2 (active treatment):\nEHR Context:\nAg...   
2    2  Sample 3 (active treatment):\nEHR Context:\nAg...   
3    3  Sample 4 (active treatment):\nEHR Context:\nAg...   
4    4  Sample 5 (active treatment):\nEHR Context:\nAg...   

                                              Output ActiveOrSurveill  \
0  Yes, the provided information is sufficient to...                A   
1  Yes, the provided information is sufficient to...                A   
2  Yes, the provided information is sufficient to...                A   
3  The provided information is sufficient to addr...                A   
4  The provided information is sufficient to addr...                A   

  GenOrSpec Gender  Age Ge

In [11]:

# Pull the patient's message out of each 'Input' value (dropping whatever
# precedes the "Patient message:" marker) and store it in a new
# 'patient_message' column, then save the augmented frame to CSV.

from tqdm import tqdm

marker = "Patient message:"
patient_messages = []

# Using tqdm for a progress bar
for index, input_text in tqdm(baseline_df['Input'].items(), total=baseline_df.shape[0], desc="Extracting patient message"):
    if marker in input_text:
        # Split on the first marker only, so a message that itself contains
        # the marker text is kept whole instead of being cut short.
        patient_message = input_text.split(marker, 1)[1].strip()
    else:
        print(f"Warning: 'Patient message:' not found in row {index}. Using full input.")
        patient_message = input_text

    # Record a value for every row. Previously the fallback branch never wrote
    # to the frame, leaving NaN in 'patient_message' despite the warning text.
    patient_messages.append(patient_message)

# Assign the whole column at once; this also makes re-running the cell idempotent
baseline_df['patient_message'] = patient_messages

# Save the results to a new CSV file
baseline_df.to_csv("/workspace/sandbox/mats9/data/OncQA_Master_patient_message.csv", index=False)


Extracting patient message: 100%|██████████| 100/100 [00:00<00:00, 8700.25it/s]


In [12]:
### Evaluate the baseline data
# Runs the evaluator on every 'patient_message' and appends one JSON record per
# row to a JSONL file. Rows already present in the file are skipped, so the cell
# can be re-run after an interruption without duplicating records.

import json

jsonl_path = "/workspace/sandbox/mats9/data/llama3_oncqa_baseline_results.jsonl"

# Collect the row indices written by any previous run of this cell
completed_indices = set()
try:
    with open(jsonl_path) as f:
        for line in f:
            try:
                completed_indices.add(json.loads(line)["index"])
            except (json.JSONDecodeError, KeyError, TypeError):
                # Blank, partially written, or unexpected lines carry no usable index
                continue
except FileNotFoundError:
    # First run: nothing has been written yet
    pass

with open(jsonl_path, "a") as f:
    for index, row in tqdm(baseline_df.iterrows(), total=baseline_df.shape[0], desc="Evaluating baseline data"):
        if index in completed_indices:
            continue

        input_text = row['patient_message']
        result = evaluator.evaluate_triage(input_text)
        record = {
            "index": index,
            "patient_message": input_text,
            "result": result
        }
        f.write(json.dumps(record) + "\n")
        # Push each record to disk immediately so finished rows survive a crash
        f.flush()

Evaluating baseline data:   0%|          | 0/100 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperatur

KeyboardInterrupt: 