In [1]:
import json
import torch
from datasets import Dataset, load_dataset
from unsloth import FastLanguageModel
from transformers import TrainingArguments, Trainer, TextStreamer, AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!


In [2]:
base_model = "teknium/OpenHermes-2.5-Mistral-7B" 

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(base_model)

# Load your merged fine-tuned weights
model = AutoModelForCausalLM.from_pretrained(
    "merged_model",         
    device_map="auto",
    load_in_4bit=True,
    torch_dtype="auto"       
).eval()

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [3]:
import pandas as pd
df = pd.read_parquet('latest_notes.parquet')
evaluation_df = df['note_text'].iloc[201:301].values

In [7]:
system_prompt = """
You are a strict JSON generator. Only output JSON that matches the schema below.
Do not include any extra text, commentary, or explanations.

Schema:
- imatinib_mentioned: true if the drug imatinib (also known as Gleevec) is mentioned in the note, otherwise false
- related_drugs_mentioned: true if drugs related to imatinib (e.g., dasatinib, nilotinib, bosutinib) are mentioned, otherwise false
- cml_diagnosed: true if chronic myeloid leukemia (CML) is diagnosed, otherwise false
- cml_in_regression: true if chronic myeloid leukemia is mentioned as being in regression, otherwise false
- aml_diagnosed: true if acute myeloid leukemia (AML) is diagnosed, otherwise false
- blast_phase_cml: true if blast phase CML is explicitly mentioned, otherwise false
- bmt_history: true if history of bone marrow transplant (BMT) is mentioned, otherwise false
- acute_phase_cml: true if acute phase CML is explicitly mentioned, otherwise false

Rules:
1. Only mark a field as true if the note clearly indicates it.
2. If the note does not explicitly mention a field, mark it false.
3. The output must always be valid JSON with all eight fields present.
"""


In [8]:
input_json = [
    {
        "system": system_prompt,
        "input": evaluation_df[ind][:11000],
    }
    for ind in range(len(evaluation_df)) # This now iterates through all rows
]

with open('test_input.json', 'w') as json_file:
    json.dump(input_json, json_file, indent=4)

In [9]:
dataset = load_dataset("json", data_files= "test_input.json")['train']

def format_chat(example):
    return {
        "text": f"""<|im_start|>system
{example['system']}<|im_end|>
<|im_start|>user
{example['input']}<|im_end|>
"""
    }

formatted_dataset = dataset.map(format_chat)

Generating train split: 100 examples [00:00, 2037.33 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100/100 [00:00<00:00, 1471.23 examples/s]


In [10]:
def tokenize(example):
    return tokenizer(example["text"], truncation=True, max_length= 5300) #4096)

tokenized_dataset = formatted_dataset.map(tokenize, batched = True)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100/100 [00:00<00:00, 708.76 examples/s]


In [11]:
test_outputs = []
for i, example in enumerate(tokenized_dataset):
    inputs = tokenizer(example["text"], return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,   
        temperature=0.2,      
        do_sample=False
    )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    test_outputs.append({
        "note": example["text"],
        "output": decoded
    })
    

In [14]:
import re

schema_fields = [
    "imatinib_mentioned",
    "related_drugs_mentioned",
    "cml_diagnosed",
    "cml_in_regression",
    "aml_diagnosed",
    "blast_phase_cml",
    "bmt_history",
    "acute_phase_cml"
]

def extract_labels(output_str):
    # Extract JSON block
    json_match = re.search(r'\{[\s\S]*\}', output_str)
    if json_match:
        try:
            parsed = json.loads(json_match.group(0))
            # Ensure all schema fields exist, fill missing with False
            return {field: bool(parsed.get(field, False)) for field in schema_fields}
        except json.JSONDecodeError:
            return None
    return None  # if nothing matched

# Build dictionary keyed by note number
final_dict = {}
for i, ex in enumerate(test_outputs):
    parsed = extract_labels(ex['output'])
    if parsed:  # only add if parsing worked
        final_dict[f"note_{200+i}"] = parsed  # start numbering at 300

# Save to JSON file (pretty formatted)
with open("parsed_outputs.json", "w") as f:
    json.dump(final_dict, f, indent=2)

print(json.dumps(final_dict, indent=2))


{
  "note_200": {
    "imatinib_mentioned": false,
    "related_drugs_mentioned": false,
    "cml_diagnosed": false,
    "cml_in_regression": false,
    "aml_diagnosed": false,
    "blast_phase_cml": false,
    "bmt_history": false,
    "acute_phase_cml": false
  },
  "note_203": {
    "imatinib_mentioned": false,
    "related_drugs_mentioned": false,
    "cml_diagnosed": false,
    "cml_in_regression": false,
    "aml_diagnosed": false,
    "blast_phase_cml": false,
    "bmt_history": false,
    "acute_phase_cml": false
  },
  "note_204": {
    "imatinib_mentioned": false,
    "related_drugs_mentioned": false,
    "cml_diagnosed": false,
    "cml_in_regression": false,
    "aml_diagnosed": false,
    "blast_phase_cml": false,
    "bmt_history": false,
    "acute_phase_cml": false
  },
  "note_205": {
    "imatinib_mentioned": false,
    "related_drugs_mentioned": false,
    "cml_diagnosed": false,
    "cml_in_regression": false,
    "aml_diagnosed": false,
    "blast_phase_cml": false