In [21]:
import torch
import wandb
from tqdm.auto import tqdm
from taudio import TAudio
import bitsandbytes as bnb

from dataset import *
from datasets import load_dataset, Dataset
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
from transformers import Qwen2_5OmniThinkerForConditionalGeneration, BitsAndBytesConfig

In [22]:
# LibriSpeech with word-level alignments; the split is streamed (streaming=True)
# rather than loaded as a regular in-memory dataset.
ds = load_dataset(
	"gilkeyio/librispeech-alignments",
	split="train_clean_100",
	streaming=True,
)

Resolving data files:   0%|          | 0/49 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/66 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/49 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/66 [00:00<?, ?it/s]

In [23]:
# Checkpoint identifier handed to both `from_pretrained` calls below
# (processor and model), so the two always come from the same checkpoint.
model_id = "Qwen/Qwen2.5-Omni-3B"

In [24]:
# Processor for `model_id`. Later cells use it to render the chat template,
# to build the model inputs from text + audio, to decode generated ids, and
# (via `processor.tokenizer`) to map token ids back to token strings.
processor = Qwen2_5OmniProcessor.from_pretrained(model_id)

In [25]:
# Loading options: "auto" leaves both the dtype and the device placement to
# the library. NOTE(review): presumably this picks the checkpoint's stored
# dtype and spreads weights over the available devices - confirm.
load_options = {
	"torch_dtype": "auto",
	"device_map": "auto",
}

# Only the "Thinker" model class is instantiated here.
model = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(model_id, **load_options)

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [26]:
# Take the first record yielded by the dataset. Later cells read its
# 'audio' -> 'array' samples and its 'words' list (entries carry
# 'start', 'end' and 'word').
example = next(iter(ds))

In [27]:
# System turn: the stock persona prompt.
system_turn = {
	"role": "system",
	"content": [
		{
			"type": "text",
			"text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.",
		},
	],
}

# User turn: one audio slot followed by the instruction. The audio value is a
# dummy; the real waveform is passed to the processor separately.
user_turn = {
	"role": "user",
	"content": [
		{"type": "audio", "audio": "PLACEHOLDER AUDIO"},
		{"type": "text", "text": "Repeat exactly what was said in the audio"},
	],
}

# Assistant turn: the transcript is pre-filled so that a forward pass sees the
# answer tokens (teacher forcing) instead of having to generate them.
assistant_turn = {
	"role": "assistant",
	"content": [
		{
			"type": "text",
			"text": "Habitual self possession and self respect happy and gracious willingness hard souled and joyously joyous.",
		},
	],
}

conversation = [system_turn, user_turn, assistant_turn]

# Render to a string (no tokenisation). `continue_final_message=True` leaves
# the assistant turn open - the rendered prompt ends right after the
# transcript, without a closing <|im_end|>.
text = processor.apply_chat_template(
	conversation,
	tokenize=False,
	add_generation_prompt=False,
	continue_final_message=True,
)

In [28]:
# Show the rendered prompt. The output below is the whole prompt rather than a
# single character, so `text` is indexable by conversation here -
# NOTE(review): presumably a one-element list; confirm for your transformers version.
print(text[0])

<|im_start|>system
You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.<|im_end|>
<|im_start|>user
<|audio_bos|><|AUDIO|><|audio_eos|>Repeat exactly what was said in the audio<|im_end|>
<|im_start|>assistant
Habitual self possession and self respect happy and gracious willingness hard souled and joyously joyous.


In [29]:
# Tokenise the rendered prompt and featurise the waveform in one call,
# returning PyTorch tensors.
processed = processor(
	text=text,
	audio=example['audio']['array'],
	return_tensors='pt',
	padding=True,
)

# Plain dict of tensors moved onto the model's device, ready for `model(**inputs)`.
inputs = {name: tensor.to(model.device) for name, tensor in processed.items()}

In [85]:
# Dump the first sequence position by position: index, token id, token string.
# Used to locate where the <|AUDIO|> placeholders start inside input_ids.
sequence_ids = inputs['input_ids'][0].tolist()
sequence_tokens = processor.tokenizer.convert_ids_to_tokens(sequence_ids)

for i, (token_id, token) in enumerate(zip(sequence_ids, sequence_tokens)):
	print(f"{i}: \t\t{token_id} \t\t{token}")

0: 		151644 		<|im_start|>
1: 		8948 		system
2: 		198 		Ċ
3: 		2610 		You
4: 		525 		Ġare
5: 		1207 		ĠQ
6: 		16948 		wen
7: 		11 		,
8: 		264 		Ġa
9: 		4108 		Ġvirtual
10: 		3738 		Ġhuman
11: 		7881 		Ġdeveloped
12: 		553 		Ġby
13: 		279 		Ġthe
14: 		1207 		ĠQ
15: 		16948 		wen
16: 		7909 		ĠTeam
17: 		11 		,
18: 		54364 		ĠAlibaba
19: 		5737 		ĠGroup
20: 		11 		,
21: 		12875 		Ġcapable
22: 		315 		Ġof
23: 		817 		Ġper
24: 		46344 		ceiving
25: 		82529 		Ġauditory
26: 		323 		Ġand
27: 		9124 		Ġvisual
28: 		11127 		Ġinputs
29: 		11 		,
30: 		438 		Ġas
31: 		1632 		Ġwell
32: 		438 		Ġas
33: 		23163 		Ġgenerating
34: 		1467 		Ġtext
35: 		323 		Ġand
36: 		8806 		Ġspeech
37: 		13 		.
38: 		151645 		<|im_end|>
39: 		198 		Ċ
40: 		151644 		<|im_start|>
41: 		872 		user
42: 		198 		Ċ
43: 		151647 		<|audio_bos|>
44: 		151646 		<|AUDIO|>
45: 		151646 		<|AUDIO|>
46: 		151646 		<|AUDIO|>
47: 		151646 		<|AUDIO|>
48: 		151646 		<|AUDIO|>
49: 		151646 		<|AUDIO|>
50: 		151646 		<|AUDIO|>
51: 		

In [None]:
# Single forward pass over the pre-filled conversation, asking the model to
# also return its attention weights (read later via `outputs.attentions`).
#
# The result is only inspected, never back-propagated, so autograd is switched
# off: otherwise the graph and intermediate activations stay alive next to the
# already large per-layer attention tensors.
with torch.no_grad():
	outputs = model(
		**inputs,
		output_attentions=True,
	)

In [137]:
# Position of the first <|AUDIO|> placeholder in input_ids: in the token dump
# printed earlier, index 43 is <|audio_bos|> and index 44 is the first <|AUDIO|>.
AUDIO_TOKEN_OFFSET = 44
# Audio duration represented by one <|AUDIO|> placeholder, in milliseconds.
# NOTE(review): assumes 40 ms per placeholder for this model - confirm.
MS_PER_AUDIO_TOKEN = 40


def audio_token_position(seconds):
	"""Map a timestamp inside the clip to a position in ``input_ids``.

	Args:
		seconds: Offset from the start of the audio, in seconds.

	Returns:
		Integer index into the token sequence (truncated towards zero) of the
		<|AUDIO|> placeholder covering that instant.
	"""
	return int((seconds * (1000) / MS_PER_AUDIO_TOKEN) + AUDIO_TOKEN_OFFSET)


# Second-to-last aligned word; its alignment record is printed first.
word_idx = -2
print(example['words'][word_idx])

# Start/end token positions for that word, then for the final word.
for word in (example['words'][word_idx], example['words'][-1]):
	print(audio_token_position(word['start']))
	print(audio_token_position(word['end']))

{'end': 11.03, 'start': 10.43, 'word': 'joyously'}
304
319
319
339


In [136]:
# Which attention map to inspect: the layer index into `outputs.attentions`
# and the query position (negative values count from the end of the sequence).
layer = 18
token_index = -5

# Colour bands, checked from the largest threshold down; the first threshold
# the value reaches decides the colour, anything smaller falls back to grey.
ATTN_COLOR_BANDS = (
	(1e-1, "\033[91m"),  # red
	(1e-2, "\033[38;5;208m"),  # orange (ANSI 208)
	(1e-3, "\033[93m"),  # yellow
)
ATTN_COLOR_DEFAULT = "\033[90m"  # grey
reset = "\033[0m"

# First batch element of the chosen layer; the loop below treats dim 0 as heads.
attentions = outputs.attentions[layer][0]

# Pull the selected query row of every head to the host in ONE transfer.
# Converting each scalar separately inside the loops costs a device-to-host
# copy per (head, key) pair. `.float()` keeps the float32 up-cast that was
# applied before the values were formatted.
query_rows = attentions[:, token_index].detach().float().cpu().tolist()

for head, row in enumerate(query_rows):
	print(f"Head: {head}")
	for i, attn_value in enumerate(row):
		color = next(
			(band_color for threshold, band_color in ATTN_COLOR_BANDS if attn_value >= threshold),
			ATTN_COLOR_DEFAULT,
		)
		print(f"{i}: \t{color}{attn_value:.5e}{reset}")


Head: 0
0: 	[38;5;208m4.61426e-02[0m
1: 	[90m1.87159e-05[0m
2: 	[90m1.13249e-05[0m
3: 	[90m7.80821e-06[0m
4: 	[90m7.97212e-07[0m
5: 	[90m1.00136e-05[0m
6: 	[90m1.88828e-04[0m
7: 	[90m2.91824e-04[0m
8: 	[90m5.07832e-05[0m
9: 	[90m7.33137e-06[0m
10: 	[90m3.95775e-05[0m
11: 	[90m3.56138e-06[0m
12: 	[90m4.73857e-06[0m
13: 	[90m2.77162e-06[0m
14: 	[90m5.69224e-06[0m
15: 	[90m2.55108e-05[0m
16: 	[90m1.20401e-05[0m
17: 	[90m3.95775e-05[0m
18: 	[90m3.48091e-05[0m
19: 	[90m3.95775e-05[0m
20: 	[90m2.89679e-05[0m
21: 	[90m8.82149e-06[0m
22: 	[90m5.03659e-06[0m
23: 	[90m2.36928e-06[0m
24: 	[90m1.36495e-05[0m
25: 	[90m1.36495e-05[0m
26: 	[90m1.53482e-06[0m
27: 	[90m5.69224e-06[0m
28: 	[90m2.95043e-06[0m
29: 	[90m6.07967e-06[0m
30: 	[90m2.28174e-07[0m
31: 	[90m1.22935e-06[0m
32: 	[90m1.19209e-06[0m
33: 	[90m1.08778e-06[0m
34: 	[90m2.86102e-06[0m
35: 	[90m7.04080e-07[0m
36: 	[90m4.67524e-07[0m
37: 	[90m5.79834e-04[0m
38: 	[9

In [33]:
# Let the model continue from the prepared inputs, capped at 128 new tokens.
text_ids = model.generate(**inputs, max_new_tokens=128)

# Decode the whole batch, keeping special tokens and the raw spacing so the
# prompt structure stays visible.
decoded_batch = processor.batch_decode(
	text_ids,
	skip_special_tokens=False,
	clean_up_tokenization_spaces=False,
)

# NOTE: this rebinds `text`, which earlier held the chat-template output, to
# the decoded string of the first sequence.
text = decoded_batch[0]

print(text)

<|im_start|>system
You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.<|im_end|>
<|im_start|>user
<|audio_bos|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUD

In [None]:
# Token ids that a single word maps to. The tokenizer is taken from
# `processor`, the same way the token dump cell does it; no bare `tokenizer`
# name is defined in this notebook, so relying on one only works if it is
# left over in the kernel or supplied by the star import.
processor.tokenizer.encode("joyously")

[4123, 7017]