In [1]:
import os
import requests
import torch
from PIL import Image
import soundfile
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

# import datasets
from datasets import load_dataset
import torchcodec


  from .autonotebook import tqdm as notebook_tqdm


## Phi-4 Model

In [9]:
# Load the Phi-4 multimodal checkpoint, its processor and its default
# generation settings. `trust_remote_code=True` lets the custom model code
# shipped with the checkpoint run.
model_path = "kumapo/Phi-4-multimodal-instruct"
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="cuda",  # weights are placed on the GPU while loading
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation='eager',
)
# No trailing `.cuda()` here: `device_map="cuda"` above has already put the
# model on the GPU, so a second move is redundant.
generation_config = GenerationConfig.from_pretrained(model_path)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
`torch_dtype` is deprecated! Use `dtype` instead!
Phi4MMModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly defined. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner t

In [10]:
# Chat-template markers wrapped around every prompt; later cells reuse them.
user_prompt, assistant_prompt, prompt_suffix = '<|user|>', '<|assistant|>', '<|end|>'

# ------------------------------------------------ text-only ------------------------------------------------
prompt = user_prompt + 'what is the answer for 1+1? Explain it.' + prompt_suffix + assistant_prompt
print('>>> Prompt', prompt, sep='\n')
inputs = processor(prompt, images=None, return_tensors='pt').to('cuda:0')

generate_ids = model.generate(
    **inputs,
    max_new_tokens=1000,
    generation_config=generation_config,
    use_cache=False,  # added to address the DynamicCache AttributeError
)
# Drop the echoed prompt tokens so only the newly generated ones are decoded.
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = processor.batch_decode(
    generate_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]

print('>>> Response', response, sep='\n')

>>> Prompt
<|user|>what is the answer for 1+1? Explain it.<|end|><|assistant|>


You are not running the flash-attention implementation, expect numerical differences.


>>> Response
The answer for 1+1 is 2. This is because when you add one to another one, you are combining two units together, which results in a total of two units. In mathematical terms, addition is the process of combining two or more numbers to get a new total. So, 1 (one) plus 1 (one) equals 2 (two). This is a basic arithmetic operation and is universally accepted in mathematics.


In [7]:
# NOTE: Point AUDIO_FILE_1 at an existing recording before running this cell
#       (for the vision-speech demo, e.g. your own voice asking
#       "What is the traffic sign in the image?"). The same recording is used
#       by both demos below.
AUDIO_FILE_1 = 'data/0adaefab-c0fa-4d55-9564-100d2bd5bd93_86a60667f1b75930c7844e37494b97f7_UxiL1B07.wav'

if not os.path.exists(AUDIO_FILE_1):
    raise FileNotFoundError(f'Please prepare the audio file {AUDIO_FILE_1} before running the following code.')


def generate_phi4_response(prompt, images=None, audios=None, max_new_tokens=1000):
    """Print `prompt`, run Phi-4 generation on it and return the decoded reply.

    Uses the `processor`, `model` and `generation_config` objects created in
    the model-loading cell.

    Args:
        prompt: Full chat-formatted prompt, including any `<|image_N|>` /
            `<|audio_N|>` placeholders.
        images: Optional list of PIL images forwarded to the processor.
        audios: Optional list of `(samples, sample_rate)` tuples forwarded to
            the processor.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded text of the newly generated tokens (prompt excluded).
    """
    print(f'>>> Prompt\n{prompt}')
    inputs = processor(text=prompt, images=images, audios=audios, return_tensors='pt').to('cuda:0')
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        generation_config=generation_config,
        use_cache=False,  # added to address the DynamicCache AttributeError
    )
    # Keep only the tokens produced after the prompt.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(f'>>> Response\n{response}')
    return response


# Decode the recording once; both demos below reuse the (samples, sample_rate) tuple.
audio = soundfile.read(AUDIO_FILE_1)

########################## vision-speech ################################
prompt = f'{user_prompt}<|image_1|><|audio_1|>{prompt_suffix}{assistant_prompt}'
url = 'https://www.ilankelman.org/stopsigns/australia.jpg'
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() fails loudly instead of handing PIL an HTTP error page.
http_response = requests.get(url, stream=True, timeout=30)
http_response.raise_for_status()
image = Image.open(http_response.raw)
response = generate_phi4_response(prompt, images=[image], audios=[audio])

########################## speech only ################################
speech_prompt = "Based on the attached audio, generate a comprehensive text transcription of the spoken content."
prompt = f'{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}'
response = generate_phi4_response(prompt, audios=[audio])


>>> Prompt
<|user|><|image_1|><|audio_1|><|end|><|assistant|>




>>> Response
Yes
>>> Prompt
<|user|><|audio_1|>Based on the attached audio, generate a comprehensive text transcription of the spoken content.<|end|><|assistant|>
>>> Response
Okay, um, today's topic is are animals conscious? It's a very interesting topic, you know, because I feel that animals are conscious because they have feelings, they can do whatever we do, they can feel pain, they can feel happiness, you know, like that. Yes, because they are breathing, you know, I feel, I, I, I fully believe that God created animals with love, as much as he created us humans with love, you know, yes. So I remember one time, because I do have a dog, you know, inside our yard, we have a dog. So there was this time when this guy was actually fighting us, he was fighting us, he was full of rage, he was upset, so our dog could see that this person is actually angry at us, you know, and he was barking, he was barking, he was protecting us, and also I remember this one time incident that there was an e

In [None]:
import os
from tqdm import tqdm

# Chat-template markers and the instruction sent along with every audio chunk.
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'
speech_prompt = "Based on the attached audio, generate a comprehensive text transcription of the spoken content."

def transcribe_file_chunked(audio_path, chunk_seconds=300, max_new_tokens=500):
    """Transcribe one audio file with Phi-4, processing it in fixed-length chunks.

    The file is read with `soundfile`, down-mixed to mono when it has more than
    one channel, cut into consecutive chunks of `chunk_seconds` seconds and each
    chunk is sent through `processor` / `model.generate` separately.

    Args:
        audio_path: Path of the audio file to transcribe.
        chunk_seconds: Chunk length in seconds; must yield at least one sample.
        max_new_tokens: Generation budget for each chunk.

    Returns:
        The stripped chunk transcriptions joined with single spaces (empty
        ones are dropped). Failures are reported in-band instead of raised:
        'FILE_NOT_FOUND: <path>' when the file does not exist,
        "ERROR: invalid chunk_seconds" for an unusable chunk length, and
        'ERROR: <message>' for any other exception.
    """
    if not os.path.exists(audio_path):
        return f'FILE_NOT_FOUND: {audio_path}'
    try:
        # Reject a non-positive chunk length before paying for decoding the file.
        if chunk_seconds <= 0:
            return "ERROR: invalid chunk_seconds"
        data, sr = soundfile.read(audio_path)  # (numpy array, sample_rate)
        # ensure mono (if stereo, average channels)
        if data.ndim > 1:
            data = data.mean(axis=1)
        chunk_samples = int(chunk_seconds * sr)
        # A tiny positive chunk_seconds can still round down to zero samples.
        if chunk_samples <= 0:
            return "ERROR: invalid chunk_seconds"
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        # The prompt is identical for every chunk, so build it once.
        prompt = f'{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}'
        texts = []
        for start in range(0, data.shape[0], chunk_samples):
            # The processor receives each chunk as a (samples, sample_rate) tuple.
            seg = (data[start : start + chunk_samples], sr)
            inputs = processor(text=prompt, audios=[seg], return_tensors='pt').to(device)
            generate_ids = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                generation_config=generation_config,
                use_cache=False
            )
            # slice off prompt tokens
            generate_ids = generate_ids[:, inputs['input_ids'].shape[1] : ]
            resp = processor.batch_decode(
                generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )[0].strip()
            texts.append(resp)
            # cleanup to reduce peak memory
            del inputs, generate_ids
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        # join segment transcriptions (you can change separator)
        return " ".join(t for t in texts if t)
    except Exception as e:
        # Deliberate best-effort: one bad file must not abort a whole batch run.
        return f'ERROR: {e}'

# Transcribe every audio file listed in the dataframe (adjust chunk_seconds if needed).
audio_paths = all_datasets_df['audio_file'].tolist()
responses = []
try:
    for audio_path in tqdm(audio_paths, desc='Transcribing'):
        responses.append(transcribe_file_chunked(audio_path, chunk_seconds=300, max_new_tokens=500))
finally:
    # Runs on normal completion and on interruption alike, so transcriptions
    # finished before a KeyboardInterrupt are not lost. Rows that were never
    # reached are filled with None to keep the column aligned with the frame.
    all_datasets_df['ASR'] = responses + [None] * (len(audio_paths) - len(responses))

Transcribing:  60%|██████    | 24/40 [11:44<07:49, 29.37s/it]


KeyboardInterrupt: 

In [2]:
# Write the dataframe to CSV, leaving out the index column.
all_datasets_df.to_csv(path_or_buf='phi_4_asr_results.csv', index=False)

NameError: name 'all_datasets_df' is not defined

## Whisper ASR Model

In [45]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


# Choose the execution target once. `.to()` takes a torch device string,
# while `pipeline()` takes an integer device id (0 for cuda, -1 for cpu).
if torch.cuda.is_available():
    torch_device, device_id, torch_dtype = "cuda:0", 0, torch.float16
else:
    torch_device, device_id, torch_dtype = "cpu", -1, torch.float32

model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
# Place the freshly loaded weights on the selected device.
model.to(torch_device)

processor = AutoProcessor.from_pretrained(model_id)

# Build the ASR pipeline, passing the model, tokenizer and feature extractor
# explicitly.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device_id,
)

# Try the pipeline on the first audio file listed in the dataframe.
sample = all_datasets_df['audio_file'].iloc[0]

# Long audio (more than 30s / 3000 mel features) must either be shortened or
# transcribed in long-form mode, which requires timestamp prediction. Enabling
# timestamps and chunking into 30s segments avoids the
# "more than 3000 mel input features" ValueError.
result = pipe(sample, chunk_length_s=30, return_timestamps=True)

# `result` holds 'text' and, because return_timestamps=True, 'chunks' with timestamps.
print(result.get("text", ""))

# To inspect the timestamps:
# if "chunks" in result:
#     for c in result["chunks"]:
#         print(c)


Device set to use cuda:0
Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.


 How can I help you today? So I just came in today. I've been just feeling like my head's just been hurting for the past three or four days. And I also felt really cold throughout the day for the last few days as well. So I did end up checking my temperature just yesterday afternoon, it was 38.7 degrees celsius okay um well tell me about the headache where exactly is it yeah so it's kind of just uh throughout like the front of my forehead, and it kind of goes to the side into my temples. And what does it feel like? It feels like a constant aching. I've had migraines before, but they, like, it's not like the pulsating sensation that I usually get with them. I see. Okay, and how severe is the pain if you had to read it from one to ten um more than like severe it's definitely less severe than my migraines but it's just constant it's it's just been there for the last four days i'd say probably uh six out of Did you you um do anything for the pain to take any medications anything that makes

In [47]:
# Run Whisper ASR on every audio file listed in the dataframe.
audio_paths = all_datasets_df['audio_file'].tolist()
whisper_responses = []
try:
    for audio_path in tqdm(audio_paths, desc='Whisper Transcribing'):
        try:
            result = pipe(audio_path, return_timestamps=True, chunk_length_s=30)
            whisper_responses.append(result.get("text", ""))
        except Exception as e:
            # Best-effort batch: record the failure for this file and keep going.
            whisper_responses.append(f'ERROR: {e}')
finally:
    # Store whatever has been transcribed even if the loop is interrupted;
    # rows that were never reached are filled with None so the column length
    # still matches the dataframe.
    all_datasets_df['Whisper-ASR'] = whisper_responses + [None] * (len(audio_paths) - len(whisper_responses))

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used 

In [None]:
# Write the dataframe to CSV, leaving out the index column.
all_datasets_df.to_csv(path_or_buf='phi_4_asr_results.csv', index=False)