## Import

In [1]:
import os

# Select which physical GPUs this process may see (alternative: "0,1,2,3").
# This cell runs ahead of the torch / vLLM imports further down.
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
# One comma-separated entry per device, so the device count is separators + 1.
num_gpus = os.environ["CUDA_VISIBLE_DEVICES"].count(",") + 1
for _label, _value in (
    ("num_gpus:", num_gpus),
    ("CUDA_VISIBLE_DEVICES:", os.environ["CUDA_VISIBLE_DEVICES"]),
):
    print(_label, _value)

num_gpus: 2
CUDA_VISIBLE_DEVICES: 2,3


In [2]:
import re
import sys
import json
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
# import matplotlib.pyplot as plt

In [3]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModel
from vllm import LLM, SamplingParams

  from .autonotebook import tqdm as notebook_tqdm


INFO 04-30 22:31:50 [__init__.py:243] No platform detected, vLLM is running on UnspecifiedPlatform


2025-04-30 22:31:52,432	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [4]:
from dataset.process import format_chat

In [5]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy, PyTorch (CPU and all CUDA devices) and
    ``transformers``, and configures cuDNN for deterministic behaviour.

    Args:
        seed: Seed value; anything accepted by ``int()`` (e.g. ``42`` or ``"42"``).
    """
    seed = int(seed)
    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so this only influences
    # Python processes launched after this point, not the running kernel.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every CUDA device, including the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes kernels and may pick different ones between
    # runs, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False
    transformers.set_seed(seed)
    print(f"seed everything: {seed}")

# Fixed seed for the run; the same `seed` value is later handed to vLLM's
# LLM(...) and SamplingParams(...) calls.
seed = 42
seed_everything(seed=seed)

seed everything: 42


## Task

## Tokenizer

In [6]:
# Load dict_model_path.json. The commented-out lookup in the next cell indexes
# it by model name to obtain a model directory.
with open("dict_model_path.json", "r") as f:
    dict_model_path = json.load(f)

In [7]:
# Short model identifier; passed to format_chat and used to pick the vLLM loading mode.
model_name = 'Deepseek8B'
# NOTE(review): the dictionary lookup is disabled in favour of a hard-coded
# absolute path, so this notebook only runs where that path exists — consider
# restoring the lookup or making the path configurable.
# path_dir_model = dict_model_path[model_name]
path_dir_model = "/n/holylfs06/LABS/kempner_undergrads/Lab/jasmineliu/DeepSeek-R1-Distill-Llama-8B"

In [8]:
# Load the tokenizer from the model directory, configured to pad on the left.
tokenizer = AutoTokenizer.from_pretrained(path_dir_model, padding_side='left')

## Task class

In [9]:
class EmptyArgs:
    """Bare attribute holder for the notebook's run settings.

    On construction it records a batch size, the GPU ids listed in
    ``CUDA_VISIBLE_DEVICES`` and the notebook-level model path and model name.
    """

    def __init__(self):
        visible_gpu_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
        # Insertion order matches the attribute order shown by ``args.__dict__``.
        initial_fields = {
            "batch_size": 32,
            "gpus": visible_gpu_ids,
            "model_path": path_dir_model,
            "model_name": model_name,
        }
        for field_name, field_value in initial_fields.items():
            setattr(self, field_name, field_value)

# Shared settings object; passed to load_config() and to the task class below.
args = EmptyArgs()

In [10]:
from model.init import load_config

# load_config's return value is ignored; it is called for its effect on `args`
# (the displayed dict gains worker and token-limit entries).
load_config(args)
# Display every attribute currently stored on `args`.
vars(args)

{'batch_size': 32,
 'gpus': ['2', '3'],
 'model_path': '/n/holylfs06/LABS/kempner_undergrads/Lab/jasmineliu/DeepSeek-R1-Distill-Llama-8B',
 'model_name': 'Deepseek8B',
 'num_workers': 4,
 'max_token_all': 102400,
 'max_token_output': 3072,
 'max_token_input': 99328}

In [11]:
# Task identifier; also used to build the dataset file name and passed to the task class.
task_name = "106.MIMIC-III Outcome.Diagnosis"

In [12]:
# Read the task's SFT JSON file and keep only the records of the "test" split.
path_file_data = f"dataset_raw/{task_name}.SFT.json"
print(f"Loading {path_file_data} ...")
with open(path_file_data, "r") as file:
    list_dict_data = json.load(file)
list_dict_data = list(
    filter(lambda record: record["split"] == "test", list_dict_data)
)
print(f"The number of data: {len(list_dict_data)}")

Loading dataset_raw/106.MIMIC-III Outcome.Diagnosis.SFT.json ...
The number of data: 1000


In [13]:
# Display the first test record to inspect its fields.
list_dict_data[0]

{'task': '106.MIMIC-III Outcome.Dignosis',
 'language': 'en',
 'type': 'clf',
 'id': 34242,
 'split': 'test',
 'instruction': 'Given a patient\'s information from admission notes, predict the patient\'s ICD-9 diagnosis codes, which are the codes used to classify the patient\'s diagnosis. Each ICD-9 diagnosis code is a 3-digit code that represents a specific diagnosis, and each patient may have multiple diagnosis codes.\nReturn your answer in the following format. DO NOT GIVE ANY EXPLANATION:\nICD-9 Diagnosis codes: code 1, code 2, ..., code n\nThe optional list for "code" is ICD-9 diagnosis codes.',
 'input': 'CHIEF COMPLAINT: Right foot ulcer getting worse\n\nPRESENT ILLNESS: 65 year old male with type 2 diabetes, HTN,\nHypercholesterolemia, CAD and history of remote stroke, current\nsmoker, who was brought to the [**Hospital1 882**] ER by his brother\nsecondary to concern about the state of his R foot.  They\ntransferred him to [**Hospital1 18**] as he gets all of his podiatric care\

In [14]:
# NOTE(review): the class is named after "Brain_MRI_AIS" while task_name is a
# MIMIC-III diagnosis task — confirm this is the intended task class.
from dataset.classification import Task_clf_Brain_MRI_AIS
task = Task_clf_Brain_MRI_AIS(args=args, task=task_name)

Load 106.MIMIC-III Outcome.Diagnosis data: train: 29928, val: 4314, test: 1000


In [15]:
# Prepare the task for prompting. The mode string is spelled "direct" to match
# the prompt_mode passed to task.get_pred / task.get_label in the Result section.
task.setup(tokenizer=tokenizer, prompt_mode='direct')

## Model

Choose one way to load the model and run only that subsection (each one rebinds `model` or builds its own pipeline):
- Huggingface model loading
- Huggingface pipeline
- vLLM

### Huggingface

In [None]:
# Load the checkpoint with its causal-LM head. The bare AutoModel backbone has
# no LM head and no generation config, so the generation settings applied in
# the following cell would fail on it.
model = transformers.AutoModelForCausalLM.from_pretrained(path_dir_model, torch_dtype=torch.bfloat16, device_map='auto')

In [None]:
# Inference only: switch the model to evaluation mode.
model.eval()
# Greedy decoding: disable sampling, then clear the sampling-only settings.
model.generation_config.do_sample = False
for _sampling_setting in ("temperature", "top_k", "top_p"):
    setattr(model.generation_config, _sampling_setting, None)

In [None]:
# Ensure both the tokenizer and the model's generation config have a pad token id.
if tokenizer.pad_token_id is None:
    # No dedicated pad token: fall back to the end-of-sequence token.
    tokenizer.pad_token_id = tokenizer.eos_token_id
    print("Tokenizer: Now pad_token_id is:", tokenizer.pad_token_id)
else:
    print("Tokenizer: pad_token_id is already set:", tokenizer.pad_token_id)
if model.generation_config.pad_token_id is None:
    # Mirror the tokenizer's pad id (resolved just above) rather than eos
    # directly, so the model and tokenizer cannot end up with different pad ids.
    model.generation_config.pad_token_id = tokenizer.pad_token_id
    print("Model: Now pad_token_id is:", model.generation_config.pad_token_id)
else:
    print("Model: pad_token_id is already set:", model.generation_config.pad_token_id)

### HF Pipeline

In [None]:
from transformers import pipeline

# Text-generation pipeline built from the local model directory, placed on CUDA.
pipe = pipeline(
    task="text-generation",
    model=path_dir_model,
    device="cuda",
)

In [None]:
# Smoke-test the pipeline on a short prompt, requesting a single returned sequence.
str_input = "The medical condition is characterized by"
generated_text = pipe(str_input, num_return_sequences=1)

### vLLM 

In [19]:
# Load the model with vLLM, sharded across all GPUs exposed to this process.
model = LLM(
    model=path_dir_model,
    # Derived from CUDA_VISIBLE_DEVICES instead of a literal, so the shard
    # count always matches the number of visible devices.
    tensor_parallel_size=num_gpus,
    dtype="bfloat16",
    # max_model_len=4096,
    gpu_memory_utilization=0.90,
    trust_remote_code=True,
    # Same seed as the rest of the notebook, for reproducible generation.
    seed=seed,
)

RuntimeError: Failed to infer device type, please set the environment variable `VLLM_LOGGING_LEVEL=DEBUG` to turn on verbose logging to help debug the issue.

In [None]:
# Names containing "mistral" (but not "biomistral") are loaded with vLLM's
# mistral tokenizer / weight / config formats; every other model takes the
# default path with eager execution on CUDA.
use_mistral_mode = "mistral" in model_name.lower() and "biomistral" not in model_name.lower()

# Arguments common to both loading modes.
llm_kwargs = dict(
    model=path_dir_model,
    tensor_parallel_size=num_gpus,
    dtype="bfloat16",
    seed=seed,
    max_model_len=args.max_token_all,
)
if use_mistral_mode:
    print(f"Loading {model_name} with mistral mode...")
    llm_kwargs.update(tokenizer_mode="mistral", load_format="mistral", config_format="mistral")
else:
    print(f"Loading {model_name} ...")
    llm_kwargs.update(gpu_memory_utilization=0.9, enforce_eager=True, device="cuda")
model = LLM(**llm_kwargs)

Loading Deepseek8B ...
INFO 04-30 22:33:41 [config.py:717] This model supports multiple tasks: {'generate', 'score', 'reward', 'embed', 'classify'}. Defaulting to 'generate'.
INFO 04-30 22:33:41 [config.py:1770] Defaulting to use mp for distributed inference
INFO 04-30 22:33:41 [config.py:1804] Disabled the custom all-reduce kernel because it is not supported on current platform.


NotImplementedError: 

In [None]:
# Sampling settings: temperature 0 with the notebook seed; the number of
# generated tokens is capped at args.max_token_output.
sampling_params = SamplingParams(seed=seed, temperature=0, max_tokens=args.max_token_output)

## Inference - test

In [None]:
# Pick a single test record (by index) for a one-off generation check and display it.
idx = 0
data = list_dict_data[idx]
data

In [None]:
# Build the model input for the selected record with format_chat, passing the
# input-token budget from `args` and the examples held by the task object.
formatted_input= format_chat(
    model_name=model_name,
    tokenizer=tokenizer,
    data=data,
    max_token_input=args.max_token_input,
    examples=task.examples,
)

In [None]:
# Show the formatted prompt exactly as it will be sent to the model.
print(formatted_input)

In [None]:
# Generate for the single formatted prompt, then print the prompt and the
# first completion of every returned request output.
output = model.generate(
    formatted_input,
    sampling_params=sampling_params,
    use_tqdm=False,
)
for output_one in output:
    generated_text = output_one.outputs[0].text
    print(
        f"Prompt:\n\t{formatted_input}",
        "-"*50,
        f"Generated text:\n\t{generated_text}",
        sep="\n",
    )

## Inference - task

In [None]:
# Number of test records, taken from the front of list_dict_data, used in this section.
num_sample = 100

In [None]:
# Build one prompt per selected record and record its token count, reporting
# any prompt whose length is above the input-token budget.
list_input, list_num_token = [], []
for position, record in enumerate(list_dict_data[:num_sample]):
    prompt = format_chat(
        model_name=model_name,
        tokenizer=tokenizer,
        data=record,
        max_token_input=args.max_token_input,
        examples=task.examples,
    )
    list_input.append(prompt)
    num_prompt_tokens = len(tokenizer.tokenize(prompt))
    list_num_token.append(num_prompt_tokens)
    if num_prompt_tokens > args.max_token_input:
        print(f"Input exceeds max token limit: id-{position} - {num_prompt_tokens} > {args.max_token_input}")
print(f"Data size: {len(list_input)}")

In [None]:
# Show the second formatted prompt (index 1) as a spot check.
print(list_input[1])

In [None]:
# Summary statistics of the prompt token counts, as a plain dict.
dict_stat_num_token = pd.Series(list_num_token).describe().to_dict()
print(dict_stat_num_token)

In [None]:
# Count the prompts whose token length exceeds the input budget
# (args.max_token_input); these are the ones that would be truncated.
num_truncate = sum(num_token > args.max_token_input for num_token in list_num_token)
# An empty sample has nothing to truncate; avoid dividing by zero.
proportion_truncate = num_truncate / len(list_num_token) if list_num_token else 0.0
print(f"The number of data will be truncated: {num_truncate}")
print(f"The proportion of data will be truncated: {proportion_truncate:.2%}")

In [None]:
# Histogram of prompt token counts. Drawn through pandas' plotting accessor
# because `plt` is not imported in this notebook (the matplotlib import in the
# import cell is commented out), so the pyplot calls raised NameError.
ax = pd.Series(list_num_token).plot.hist(bins=30, alpha=0.7)
ax.set_title('Token Count Distribution')
ax.set_xlabel('Number of Tokens')
# Trailing semicolon keeps the notebook from echoing the label object's repr.
ax.set_ylabel('Frequency');

In [None]:
# Generate for every prepared prompt and keep the text of the first
# completion of each request output, in the order returned.
list_pred = []
output = model.generate(
    list_input,
    sampling_params=sampling_params,
    use_tqdm=True,
)
list_pred.extend(request_output.outputs[0].text for request_output in output)

## Result

In [None]:
# Print each prompt next to its generated text, with rules between the parts.
for input_text, pred_text in zip(list_input, list_pred):
    print(
        f"Input text:\n\t{input_text}",
        "-"*50,
        f"Generated text:\n\t{pred_text}",
        "="*100,
        sep="\n",
    )

In [None]:
# Store each raw generation on its source record under the "pred" key.
# This mutates the records of list_dict_data in place.
for idx_data in range(len(list_dict_data[:num_sample])):
    list_dict_data[idx_data].update(pred=list_pred[idx_data])

In [None]:
# Let the task object derive predictions and labels from the selected records
# (which now carry a "pred" field), using the "direct" prompt mode.
list_pred_extracted = task.get_pred(list_dict_data[:num_sample], prompt_mode="direct")
list_label_extracted = task.get_label(list_dict_data[:num_sample], prompt_mode="direct")

### list_pred

In [None]:
# Post-process the extracted predictions; get_pred_none returns the processed
# list together with a failure count.
# NOTE(review): this rebinds list_pred_extracted, so re-running the cell feeds
# the already processed list back in.
list_pred_extracted, num_failed = task.get_pred_none(list_pred=list_pred_extracted[:num_sample], list_label=list_label_extracted[:num_sample])
# Report the rate against the number of records actually evaluated, which can
# be smaller than num_sample, and avoid dividing by zero on an empty sample.
num_evaluated = len(list_label_extracted[:num_sample])
failed_rate = num_failed / num_evaluated if num_evaluated else 0.0
print(f"The number of failed data: {num_failed} ({failed_rate:.2%})")

In [None]:
# Score predictions against labels; in this variant the result is bound to a single name.
dict_performance = task.get_performance(list_pred_extracted, list_label_extracted)

In [None]:
# Display the performance result.
dict_performance

### list_list_pred

In [None]:
# Post-process the extracted predictions; get_pred_none returns the processed
# list together with a failure count.
# NOTE(review): this rebinds list_pred_extracted, so re-running the cell feeds
# the already processed list back in.
list_pred_extracted, num_failed = task.get_pred_none(list_pred=list_pred_extracted[:num_sample], list_label=list_label_extracted[:num_sample])
# Report the rate against the number of records actually evaluated, which can
# be smaller than num_sample, and avoid dividing by zero on an empty sample.
num_evaluated = len(list_label_extracted[:num_sample])
failed_rate = num_failed / num_evaluated if num_evaluated else 0.0
print(f"The number of failed data: {num_failed} ({failed_rate:.2%})")

In [None]:
# In this variant the result of get_performance is unpacked into two values.
# NOTE(review): judging by the names, an overall result and a per-sample result — confirm.
dict_performance, dict_performance_sample = task.get_performance(list_pred_extracted, list_label_extracted)

In [None]:
# Display the performance result.
dict_performance

## End

In [None]:
# Marker that the notebook ran to the end.
print('Done!')