### Required
- Create conda environment
- pip install -r requirements.txt

In [1]:
import pandas as pd
import os
import re
import json
import numpy as np
from dosage_models import Summary
import torch

os.environ["HF_HOME"] = "/data2/model_weights/huggingface"
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from datasets import Dataset

In [2]:
model = "Qwen/Qwen2.5-7B-Instruct"

pipe = pipeline(
    "text-generation",
    model=model,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="cuda:0",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


In [3]:
# Plenty of directions to take this.

prompt = """
You are a helpful pharmaceutical assistant. Extract the total daily dosage from the following dosage instructions. Format the dosage as a json with strength and frequency fields. Explain how you calculated the daily_sumamry.

Hints:
- When the number of tablets is given rather than the mass, convert the strength to mass.

Example: 
1. 'Metformin 20mg tablets. take 0.25 tablet by mouth 2 (two) times daily' = {"events": [{"frequency": "twice daily","amount": {"value": 0.25,"unit": "tablet"},"strength": {"value": 5, "unit": "mg"}}],"daily_summary": {"total_amount": {"value": 0.5,"unit": "tablet"},"total_strength": {"value": 10, "unit": "mg"}}}
2. 'Tylenol 50mg tablets. take 1 tablet daily and 2 tablets nightly by mouth.' {"events": [{"frequency": "daily","amount": {"value": 1,"unit": "tablet"},"strength": {"value": 50,"unit": "mg"}},{"frequency": "nightly","amount": {"value": 2,"unit": "tablet"},"strength": {"value": 50,"unit": "mg"}}],"daily_summary": {"total_amount": {"value": 3,"unit": "tablets"},"total_strength": {"value": 150,"unit": "mg"}}}

Dosage Instruction:
"""

prompt_pydantic = f"""
You are a helpful pharmaceutical assistant. Extract the total daily dosage from the following dosage instructions. Format the dosage as a json with strength and frequency fields. Explain how you calculated the daily_sumamry.

Hints:
- When the number of tablets is given rather than the mass, convert the strength to mass.

Limit your response to a JSON object with the following fields\n{Summary.get_prompt_repr()}
"""

In [4]:
"""
    Can load your own dataset here. Just follow the example and 
        transform your column with dosage instructions with the prompt.
    
    You should modify the dosage_models description text. I just 
        threw some placeholder stuff in there. You could try to
        comment out Summary.events in the definition if you want to
        test without extracting the individual events.

    BATCH_SIZE can be adjusted and likely go pretty high given small
        input/output sizes. Try to keep it to a power of 2.
    MAX_NEW_TOKENS should be fine so long as there aren't any super 
        complicated instructions.

    num_workers probably doesn't need to be adjusted because that
        controls pre-processing steps.

    This tries to ouput the data as a Summary object, if that fails
        (missing {, model got lost, improper formatting, etc.) it
        will fallback to raw text.
"""
df = pd.DataFrame(
    {
        "dosage": np.random.choice(
            [
                "Albuterol 50mg. take 0.5 tablet by mouth 2 (two) times daily",
                "Metformin 20mg tablets. take 0.25 tablet by mouth 2 (two) times daily",
                "Tylenol 50mg tablets. take 1 tablet daily and 2 tablets nightly by mouth.",
            ],
            16,
        )
    }
)
df["text"] = df.dosage.apply(
    lambda x: [
        {"role": "system", "content": prompt_pydantic},
        {"role": "user", "content": x},
    ]
)
dataset = Dataset.from_pandas(df)

In [5]:
BATCH_SIZE =  64
MAX_NEW_TOKENS = 150

responses = []
for out in pipe(KeyDataset(dataset, "text"), batch_size=BATCH_SIZE, num_workers=8, max_new_tokens=MAX_NEW_TOKENS):
    out_text = out[0]["generated_text"][-1]["content"]
    try:
        str_resp = re.search(
            "({.*})", out_text, re.DOTALL
        )
        responses.append(Summary.model_validate(json.loads(str_resp.group(1))))
    except Exception:
        responses.append(out_text)


df["Summary"] = responses
df["total_amount"] = df.Summary.apply(lambda x: x.total_amount if isinstance(x, Summary) else None)
df["total_strength"] = df.Summary.apply(lambda x: x.total_strength  if isinstance(x, Summary) else None)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [6]:
df

Unnamed: 0,dosage,text,Summary,total_amount,total_strength
0,Tylenol 50mg tablets. take 1 tablet daily and ...,"[{'role': 'system', 'content': ' You are a hel...","events=[Event(frequency='daily', amount={'unit...","{'value': 3, 'unit': 'tablet'}","{'value': 150, 'unit': 'mg'}"
1,Tylenol 50mg tablets. take 1 tablet daily and ...,"[{'role': 'system', 'content': ' You are a hel...","events=[Event(frequency='daily', amount={'unit...","{'value': 3, 'unit': 'tablet'}","{'value': 150, 'unit': 'mg'}"
2,Metformin 20mg tablets. take 0.25 tablet by mo...,"[{'role': 'system', 'content': ' You are a hel...","events=[Event(frequency='twice daily', amount=...","{'value': 0.5, 'unit': 'tablet'}","{'value': 10, 'unit': 'mg'}"
3,Albuterol 50mg. take 0.5 tablet by mouth 2 (tw...,"[{'role': 'system', 'content': ' You are a hel...","events=[Event(frequency='twice daily', amount=...","{'value': 1.0, 'unit': 'tablet'}","{'value': 100, 'unit': 'mg'}"
4,Albuterol 50mg. take 0.5 tablet by mouth 2 (tw...,"[{'role': 'system', 'content': ' You are a hel...","events=[Event(frequency='twice daily', amount=...","{'value': 1.0, 'unit': 'tablet'}","{'value': 100, 'unit': 'mg'}"
5,Albuterol 50mg. take 0.5 tablet by mouth 2 (tw...,"[{'role': 'system', 'content': ' You are a hel...","events=[Event(frequency='twice daily', amount=...","{'value': 1.0, 'unit': 'tablet'}","{'value': 100, 'unit': 'mg'}"
6,Tylenol 50mg tablets. take 1 tablet daily and ...,"[{'role': 'system', 'content': ' You are a hel...","events=[Event(frequency='daily', amount={'unit...","{'value': 3, 'unit': 'tablet'}","{'value': 150, 'unit': 'mg'}"
7,Tylenol 50mg tablets. take 1 tablet daily and ...,"[{'role': 'system', 'content': ' You are a hel...","events=[Event(frequency='daily', amount={'unit...","{'value': 3, 'unit': 'tablet'}","{'value': 150, 'unit': 'mg'}"
8,Albuterol 50mg. take 0.5 tablet by mouth 2 (tw...,"[{'role': 'system', 'content': ' You are a hel...","events=[Event(frequency='twice daily', amount=...","{'value': 1.0, 'unit': 'tablet'}","{'value': 100, 'unit': 'mg'}"
9,Metformin 20mg tablets. take 0.25 tablet by mo...,"[{'role': 'system', 'content': ' You are a hel...","events=[Event(frequency='twice daily', amount=...","{'value': 0.5, 'unit': 'tablet'}","{'value': 10, 'unit': 'mg'}"
