*   Victor Muryn
*   Kostyantyn Savorona
*   Yurii Sahaidak

In [None]:
%pip install -q torch transformers accelerate bitsandbytes sentence-transformers faiss-gpu langchain langchain_community peft
%pip install -q git+https://github.com/robinhad/ukrainian-tts.git

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.2/411.2 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.5/49.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K

In [None]:
import json
import time
import pickle

import torch
import numpy as np
from tqdm import tqdm
from google.colab import userdata
import IPython.display as ipd

from langchain.embeddings import HuggingFaceEmbeddings

import faiss

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoProcessor, AutoModelForCTC

from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser
from peft import PeftModel, PeftConfig

from langchain_core.runnables import RunnablePassthrough

import scipy.io.wavfile as wavfile
import librosa

from ukrainian_tts.tts import TTS, Voices, Stress

In [None]:
# Fetch the project archive from Google Drive (by file id) and unpack it into the working directory.
# Per the recorded output, it contains embeddings_openai.pickle, questions.pickle and modified_questions.json.
!gdown 1FLg1UlQuH6E7fzIFeiATsawIby48hQs3
!unzip rag-assistant.zip

Downloading...
From: https://drive.google.com/uc?id=1FLg1UlQuH6E7fzIFeiATsawIby48hQs3
To: /content/rag-assistant.zip
  0% 0.00/1.88M [00:00<?, ?B/s] 56% 1.05M/1.88M [00:00<00:00, 9.57MB/s]100% 1.88M/1.88M [00:00<00:00, 14.5MB/s]
Archive:  rag-assistant.zip
  inflating: embeddings_openai.pickle  
  inflating: __MACOSX/._embeddings_openai.pickle  
  inflating: questions.pickle        
  inflating: __MACOSX/._questions.pickle  
  inflating: modified_questions.json  
  inflating: __MACOSX/._modified_questions.json  


In [None]:
# Read the pickled corpus entries and keep them as a NumPy array so they can be
# selected later with the index arrays returned by the FAISS search.
# NOTE(review): pickle.load can execute arbitrary code - only use it on a trusted archive.
with open('questions.pickle', 'rb') as handle:
    questions = np.array(pickle.load(handle))

# Speech to Text

In [None]:
# Load model directly
# Speech-to-text components: the processor and the CTC model come from the same checkpoint.
stt_checkpoint = "robinhad/wav2vec2-xls-r-300m-uk"
processor_stt = AutoProcessor.from_pretrained(stt_checkpoint)
model_stt = AutoModelForCTC.from_pretrained(stt_checkpoint)
model_stt = model_stt.to("cuda")

In [None]:
def process_audio(audio_path: str):
  """Transcribe a WAV file with the speech-to-text model.

  The audio is read with ``scipy.io.wavfile``, down-mixed to mono, resampled
  to 16 kHz when necessary and decoded greedily (arg-max over the CTC logits).

  Args:
    audio_path: Path to the WAV file to transcribe.

  Returns:
    The text produced by ``processor_stt.decode`` for the predicted ids.
  """
  target_rate = 16000
  sampling_rate, data = wavfile.read(audio_path)

  # wavfile.read yields integer PCM for typical files; work in float32 throughout.
  data = data.astype(np.float32)

  # Multi-channel audio: average the channels into a single mono track.
  if data.ndim > 1:
    data = data.mean(axis=1)

  if sampling_rate != target_rate:
    data = librosa.resample(data, orig_sr=sampling_rate, target_sr=target_rate)

  # Pass the sampling rate explicitly so the processor can validate it instead
  # of emitting the "strongly recommended to pass sampling_rate" warning.
  input_dict = processor_stt(
    data, sampling_rate=target_rate, return_tensors="pt", padding=True
  )

  # Inference only: no autograd graph is needed. Inputs follow the model's device.
  with torch.no_grad():
    logits = model_stt(input_dict.input_values.to(model_stt.device)).logits
  pred_ids = torch.argmax(logits, dim=-1)[0]

  return processor_stt.decode(pred_ids)

In [None]:
%%time
# Transcribe the sample recording; the trailing bare name displays the transcript.
# NOTE(review): question.wav is not among the files unpacked from rag-assistant.zip -
# presumably it has to be uploaded to the runtime manually; confirm.
question = process_audio('question.wav')
question

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


CPU times: user 11.5 s, sys: 141 ms, total: 11.6 s
Wall time: 12.6 s


'як я можу створити унікальний номер з цифир які мені подобаються'

# RAG

In [None]:
# Sentence-embedding model used both to index the corpus and to embed incoming queries.
# NOTE(review): the E5 model card recommends "query: " / "passage: " input prefixes; none are
# added anywhere in this notebook - confirm that this is intended.
embd_model_e5 = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")

  embd_model_e5 = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")


In [None]:
# Embed every corpus entry once; the rows become the vectors stored in the FAISS index.
# A comprehension is used so the loop variable does not leak into the notebook
# namespace and overwrite the `question` transcript produced by the speech-to-text cell.
embeddings_e5 = np.float32(
  [embd_model_e5.embed_query(entry) for entry in tqdm(questions)]
)

100%|██████████| 299/299 [00:08<00:00, 36.88it/s]


In [None]:
# Flat L2 index sized to the embedding dimensionality, filled with all corpus vectors.
embedding_dim = embeddings_e5.shape[1]
db = faiss.IndexFlatL2(embedding_dim)
db.add(embeddings_e5)

## RAG chain

### Set up LLM

In [None]:
# Quantization settings (4-bit, NF4, double quantization, bfloat16 compute) that are
# passed as `quantization_config` when the LLM is loaded.
bnb_config = BitsAndBytesConfig(
  load_in_4bit=True,
  bnb_4bit_use_double_quant=True,
  bnb_4bit_quant_type="nf4",
  bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
# Load the LLM checkpoint with the quantization config defined above.
# The Hugging Face access token is read from Colab's `userdata` store under "HF_TOKEN".
model_name = "meta-llama/Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    token=userdata.get("HF_TOKEN")
)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Move the model to the GPU; as the last expression of the cell it also displays the module tree.
# NOTE(review): the model was loaded with a bitsandbytes quantization config - depending on the
# transformers/bitsandbytes versions, `.to()` may be rejected or redundant; confirm it is needed.
model.to("cuda")



LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps

In [None]:
# Tokenizer for the same checkpoint as the LLM; downloaded files are cached under ./cache/.
# NOTE(review): `skip_special_tokens` is normally a decode-time argument - confirm it has
# any effect when passed to `from_pretrained`.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    skip_special_tokens=False,
    cache_dir="./cache/",
    token=userdata.get("HF_TOKEN")
)

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
# Text-generation pipeline around the quantized model, exposed to LangChain as `llm`.
# return_full_text=True means the output starts with the prompt; get_completion() cuts it off.
generation_options = dict(max_new_tokens=400, return_full_text=True)

text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_options,
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

  llm = HuggingFacePipeline(pipeline=text_generation_pipeline)


In [None]:
# Prompt for the RAG chain: system rules (answer only from the supplied context, otherwise
# reply with the fixed refusal sentence), followed by the retrieved context and the question.
# The trailing "<<ASSISTANT>>\nВідповідь: " is the marker get_completion() splits the output on.
# NOTE(review): the template mixes <<SYS>>/[INST]-style tags with <|begin_of_text|>/<|eot_id|>
# tokens - confirm this matches the chat format the model expects.
prompt_template = """<|begin_of_text|><<SYS>>
Ти – асистент бот у компанії Київстар. Твоє завдання – відповідати на запитання, що стосуються зв'язку та інтернету, виключно на основі наданого контексту.

ПРАВИЛА:
* Відповідай тільки тоді, коли питання має прямий зв’язок із наданим контекстом і контекст містить необхідну інформацію для відповіді.
* Не відповідай, якщо контекст не має відповідної інформації чи не стосується питання.
* Уникай спекуляцій або додавання деталей поза контекстом. Відповідь повинна чітко відповідати на питання на основі контексту.
* Якщо контекст не надано або не містить відповідної інформації, відповідай "Я не маю відповіді на це запитання".

Пам'ятай: відповідай тільки тоді, коли контекст чітко надає інформацію для цього.

Контекст:
{context}
<</SYS>>
<|eot_id|>
<<USER>>
[INST] Питання: {question} [/INST]
<</USER>>
<|eot_id|>
<<ASSISTANT>>
Відповідь: """

In [None]:
# Wrap the raw template; its two placeholders are filled with the retrieved context and the user question.
prompt = PromptTemplate(
  template = prompt_template,
  input_variables = ["context", "question"]
)

In [None]:
# Generation chain: fill the prompt, run the LLM, then pass the result through the string output parser.
llm_chain = prompt | llm | StrOutputParser()

In [None]:
def get_context_elements(query, k=5):
  """Return the corpus entries closest to ``query`` in embedding space.

  Args:
    query: Text to embed with ``embd_model_e5`` and look up in the index ``db``.
    k: Maximum number of nearest neighbours to return (defaults to 5).

  Returns:
    A NumPy array with up to ``k`` entries of ``questions``, nearest first.
  """
  query_embedding = np.float32([embd_model_e5.embed_query(query)])
  # The search is batched; only one query row is sent, so only row 0 is read back.
  _, neighbor_ids = db.search(query_embedding, k=k)
  hits = neighbor_ids[0]
  # FAISS fills missing neighbours with -1 when the index holds fewer than k
  # vectors; drop them so they do not silently select questions[-1].
  return questions[hits[hits >= 0]]

def transform_context(context):
  """Render retrieved passages as numbered, quoted excerpts separated by blank lines.

  Args:
    context: Iterable of passages; each is interpolated into the excerpt text.

  Returns:
    One string with the excerpts numbered from 1, stripped of surrounding whitespace.
  """
  excerpts = [
    f"Уривок #{i}: \"{doc}\"\n\n"
    for i, doc in enumerate(context, start=1)
  ]
  return "".join(excerpts).strip()

def retriever(x):
  """Look up the passages nearest to query ``x`` and format them as one context string."""
  nearest_passages = get_context_elements(x)
  return transform_context(nearest_passages)

In [None]:
# Full RAG chain: the input query is passed to `retriever` to build the context string and is
# forwarded unchanged as `question`; both values then feed the generation chain.
rag_chain = (
  {"context": retriever, "question": RunnablePassthrough()} | llm_chain
)

`question` is taken from the speech-to-text step.

In [None]:
def get_completion(query: str):
  """Run the RAG chain for ``query`` and extract the assistant's answer.

  The chain output contains the prompt followed by the generated text, so the
  answer is whatever follows the prompt's answer marker, cut at the first tag
  the model echoes after it.

  Args:
    query: The user question.

  Returns:
    A tuple ``(answer_text, input_price, output_price)``; both prices are
    always 0 in this implementation.
  """
  answer_marker = "<<ASSISTANT>>\nВідповідь: "
  # Text after any of these is not part of the answer. The closing
  # "<</ASSISTANT>>" tag is echoed by the model and would otherwise be spoken by TTS.
  stop_markers = (answer_marker, "<</ASSISTANT>>", "<|user|>", "<|eot_id|>")

  result = rag_chain.invoke(query)

  # If the marker is missing, fall back to the whole output instead of raising IndexError.
  _, found, tail = result.partition(answer_marker)
  output_text = tail if found else result

  for marker in stop_markers:
    output_text = output_text.split(marker)[0]

  return output_text.strip(), 0, 0

# Text to Speech

In [None]:
# Text-to-speech engine placed on the GPU; used by output_to_voice() below.
tts = TTS(device="cuda")

downloading https://github.com/robinhad/ukrainian-tts/releases/download/v6.0.0
Found ./model.pth. Skipping download...
Found ./config.yaml. Skipping download...
Found ./spk_xvector.ark. Skipping download...
Found ./feats_stats.npz. Skipping download...
downloaded.


  WeightNorm.apply(module, name, dim)
  model.load_state_dict(torch.load(model_file, map_location=device))


In [None]:
def output_to_voice(output_text: str, output_file: str = "reponse.wav", speed_factor: float = 1.2):
  """Synthesize ``output_text`` to a WAV file and adjust its playback speed.

  The speed change is made by rewriting the file with a scaled sampling rate;
  the samples themselves are left untouched.

  Args:
    output_text: Text to speak.
    output_file: Path of the WAV file to write.
    speed_factor: Playback-speed multiplier applied to the sampling rate
      (1.2 by default; 1 leaves the synthesized file as is).

  Returns:
    The second value returned by ``tts.tts`` (the text as processed by the engine).

  Raises:
    ValueError: If ``speed_factor`` is not positive.
  """
  if speed_factor <= 0:
    raise ValueError("speed_factor must be positive")

  with open(output_file, mode="wb") as file:
    _, processed_text = tts.tts(output_text, Voices.Dmytro.value, Stress.Dictionary.value, file)

  if speed_factor != 1:
    # Same samples, higher declared rate -> the player runs through them faster.
    sampling_rate, data = wavfile.read(output_file)
    wavfile.write(output_file, int(sampling_rate * speed_factor), data)

  return processed_text

# Final

In [None]:
def all_together(input_file: str, output_file: str = "reponse.wav"):
  """Run the whole assistant once: speech-to-text, RAG answer, text-to-speech.

  Args:
    input_file: Path to the WAV file containing the spoken question.
    output_file: Path where the spoken answer is written.

  Returns:
    A tuple ``(question, output_text, total_price, times)`` where ``times``
    maps ``"stt"``, ``"llm"``, ``"tts"`` and ``"total"`` to durations in seconds.
  """
  # perf_counter is monotonic, so the measured intervals cannot go negative
  # if the system clock is adjusted during a run.
  start = time.perf_counter()

  question = process_audio(input_file)
  step1 = time.perf_counter()

  output_text, input_price, output_price = get_completion(question)
  step2 = time.perf_counter()

  output_to_voice(output_text, output_file)
  step3 = time.perf_counter()

  total_price = input_price + output_price

  times = {
      "stt": step1 - start,
      "llm": step2 - step1,
      "tts": step3 - step2,
      "total": step3 - start
  }

  return question, output_text, total_price, times

In [None]:
%%time
# End-to-end run on the sample recording; the spoken answer is written to reponse.wav.
question, output_text, total_price, times = all_together("question.wav", "reponse.wav")

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


як я можу створити унікальний номер з цифир які мені подобаються


  return to_device(torch.from_numpy(data), device, dtype, non_blocking, copy)


RTF = 0.167748
CPU times: user 9.23 s, sys: 120 ms, total: 9.35 s
Wall time: 9.56 s


In [None]:
# Transcript returned by the speech-to-text step.
print(question)

як я можу створити унікальний номер з цифир які мені подобаються


In [None]:
# Answer text extracted from the RAG chain output.
print(output_text)

Скористайтеся послугою Ексклюзивний номер. Підключити її можна в нашому інтернет-магазині, найближчому магазині Київстар або магазинах партнерів. Деталі – в описі послуги Ексклюзивний номер. <</ASSISTANT>>


In [None]:
# Total request cost; always zero here because get_completion() returns constant 0 prices.
f"${total_price:.6f}"

'$0.000000'

In [None]:
# Per-stage durations in seconds, as returned by all_together().
print(times)

{'stt': 0.04455161094665527, 'llm': 6.619278907775879, 'tts': 2.897444248199463, 'total': 9.561274766921997}


## Generate questions

In [None]:
# Play back the synthesized answer written by the all_together() call above.
ipd.Audio(filename="reponse.wav")

In [None]:
!mkdir audios

In [None]:
import time

In [None]:
# Synthesize one spoken question per entry of modified_questions.json (using its
# "modified" text) and record how long each synthesis takes, in seconds.
with open("modified_questions.json", "r", encoding="utf-8") as f:
  modified_questions = json.load(f)

# The file is only needed for loading, so the loop runs after it is closed.
question_items = modified_questions["questions"]

times = []

# `total` lets tqdm show real progress; enumerate() alone has no length.
for i, item in tqdm(enumerate(question_items), total=len(question_items)):
  start = time.perf_counter()
  output_to_voice(item["modified"], f"./audios/question_{i}.wav")
  times.append(time.perf_counter() - start)

In [None]:
# Bundle the generated question recordings into a single archive.
!zip -r audios.zip audios

In [None]:
# Mean synthesis time per generated question, in seconds.
np.mean(times)

0.5902021445480048

In [None]:
# Standard deviation of the synthesis time per generated question, in seconds.
np.std(times)

0.1144984802447464

## Test the whole system

In [None]:
!mkdir llama_responses

In [None]:
from glob import glob

In [None]:
# Run the full pipeline on every generated question recording, in numeric order,
# saving the spoken answer (.wav) and the answer text (.txt) for each one.
wavs = sorted(
  glob("./audios/question_*.wav"),
  key=lambda path: int(path.split("_")[-1].split(".")[0]),
)

# `sst_times` keeps its original spelling because later cells read it under that name.
sst_times = []
tts_times = []
llm_times = []
total_times = []


for i, audio_file in tqdm(enumerate(wavs), total=len(wavs)):
  print(i, audio_file)

  # Bound to `timings` so the `times` list from the audio-generation cell is not overwritten.
  question, output_text, total_price, timings = all_together(
    audio_file,
    f"./llama_responses/response_{i}.wav"
  )

  sst_times.append(timings['stt'])
  tts_times.append(timings['tts'])
  llm_times.append(timings['llm'])
  total_times.append(timings['total'])

  # The answers are Ukrainian text; fix the encoding instead of relying on the platform default.
  with open(f"./llama_responses/response_{i}.txt", 'w', encoding="utf-8") as f:
    f.write(output_text)

  0%|          | 0/51 [00:00<?, ?it/s]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


0 ./audios/question_0.wav


  2%|▏         | 1/51 [00:04<03:31,  4.23s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.126898
1 ./audios/question_1.wav


  4%|▍         | 2/51 [00:08<03:20,  4.10s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.127650
2 ./audios/question_2.wav


  6%|▌         | 3/51 [00:15<04:23,  5.49s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.126616
3 ./audios/question_3.wav


  8%|▊         | 4/51 [00:17<03:18,  4.22s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.124736
4 ./audios/question_4.wav


 10%|▉         | 5/51 [00:22<03:25,  4.46s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.128397
5 ./audios/question_5.wav


 12%|█▏        | 6/51 [00:26<03:14,  4.32s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.134786
6 ./audios/question_6.wav


 14%|█▎        | 7/51 [00:42<05:56,  8.10s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.125978
7 ./audios/question_7.wav


 16%|█▌        | 8/51 [00:49<05:31,  7.72s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.126818
8 ./audios/question_8.wav


 18%|█▊        | 9/51 [01:04<06:55,  9.88s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.126220
9 ./audios/question_9.wav


 20%|█▉        | 10/51 [01:18<07:37, 11.15s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.126238
10 ./audios/question_10.wav


 22%|██▏       | 11/51 [01:26<06:53, 10.34s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.153822
11 ./audios/question_11.wav


 24%|██▎       | 12/51 [01:32<05:45,  8.87s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.126856
12 ./audios/question_12.wav


 25%|██▌       | 13/51 [01:40<05:36,  8.85s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.125797
13 ./audios/question_13.wav


 27%|██▋       | 14/51 [01:46<04:48,  7.81s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.132268
14 ./audios/question_14.wav


 29%|██▉       | 15/51 [02:30<11:21, 18.93s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.127848
15 ./audios/question_15.wav


 31%|███▏      | 16/51 [02:35<08:27, 14.51s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.131614
16 ./audios/question_16.wav


 33%|███▎      | 17/51 [03:20<13:26, 23.73s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.127253
17 ./audios/question_17.wav


 35%|███▌      | 18/51 [03:25<09:56, 18.07s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.128254
18 ./audios/question_18.wav


 37%|███▋      | 19/51 [03:47<10:16, 19.27s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.125947
19 ./audios/question_19.wav


 39%|███▉      | 20/51 [03:54<08:08, 15.76s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.125257
20 ./audios/question_20.wav


 41%|████      | 21/51 [04:00<06:21, 12.70s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.127064
21 ./audios/question_21.wav


 43%|████▎     | 22/51 [04:09<05:32, 11.46s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.126100
22 ./audios/question_22.wav


 45%|████▌     | 23/51 [04:31<06:53, 14.75s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.127081
23 ./audios/question_23.wav


 47%|████▋     | 24/51 [04:36<05:20, 11.88s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.126815
24 ./audios/question_24.wav


 49%|████▉     | 25/51 [04:44<04:35, 10.59s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.128525
25 ./audios/question_25.wav


 51%|█████     | 26/51 [04:53<04:14, 10.19s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.125539
26 ./audios/question_26.wav


 53%|█████▎    | 27/51 [04:59<03:34,  8.95s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.127131
27 ./audios/question_27.wav


 55%|█████▍    | 28/51 [05:02<02:43,  7.11s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.138420
28 ./audios/question_28.wav


 57%|█████▋    | 29/51 [05:15<03:19,  9.06s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.126494
29 ./audios/question_29.wav


 59%|█████▉    | 30/51 [05:26<03:18,  9.47s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.126494
30 ./audios/question_30.wav


 61%|██████    | 31/51 [05:29<02:30,  7.54s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.134420
31 ./audios/question_31.wav


 63%|██████▎   | 32/51 [05:31<01:52,  5.94s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.125778
32 ./audios/question_32.wav


 65%|██████▍   | 33/51 [05:39<01:57,  6.54s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.125433
33 ./audios/question_33.wav


 67%|██████▋   | 34/51 [05:49<02:06,  7.44s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.128980
34 ./audios/question_34.wav


 69%|██████▊   | 35/51 [06:04<02:36,  9.76s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.127086
35 ./audios/question_35.wav


 71%|███████   | 36/51 [06:17<02:42, 10.81s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.127200
36 ./audios/question_36.wav


 73%|███████▎  | 37/51 [06:30<02:42, 11.60s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.131656
37 ./audios/question_37.wav


 75%|███████▍  | 38/51 [06:37<02:12, 10.17s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.126357
38 ./audios/question_38.wav


 76%|███████▋  | 39/51 [06:45<01:51,  9.29s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.128429
39 ./audios/question_39.wav


 78%|███████▊  | 40/51 [06:50<01:30,  8.24s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.127704
40 ./audios/question_40.wav


 80%|████████  | 41/51 [06:55<01:11,  7.20s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.130239
41 ./audios/question_41.wav


 82%|████████▏ | 42/51 [06:59<00:56,  6.33s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.129787
42 ./audios/question_42.wav


 84%|████████▍ | 43/51 [07:20<01:24, 10.62s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.128872
43 ./audios/question_43.wav


 86%|████████▋ | 44/51 [07:41<01:36, 13.82s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.127125
44 ./audios/question_44.wav


 88%|████████▊ | 45/51 [07:48<01:10, 11.80s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.127804
45 ./audios/question_45.wav


 90%|█████████ | 46/51 [07:56<00:52, 10.48s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.127038
46 ./audios/question_46.wav


 92%|█████████▏| 47/51 [08:04<00:39,  9.82s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.127186
47 ./audios/question_47.wav


 94%|█████████▍| 48/51 [08:23<00:37, 12.53s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.128712
48 ./audios/question_48.wav


 96%|█████████▌| 49/51 [08:25<00:19,  9.54s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.148488
49 ./audios/question_49.wav


 98%|█████████▊| 50/51 [08:29<00:07,  7.74s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RTF = 0.130752
50 ./audios/question_50.wav


100%|██████████| 51/51 [08:38<00:00, 10.16s/it]

RTF = 0.129886





In [None]:
# Speech-to-text stage: mean and standard deviation of the duration, in seconds.
np.mean(sst_times), np.std(sst_times)

(0.04266776758081773, 0.007818094807109707)

In [None]:
# RAG/LLM stage: mean and standard deviation of the duration, in seconds.
np.mean(llm_times), np.std(llm_times)

(7.708785346910065, 6.669677050769595)

In [None]:
# Text-to-speech stage: mean and standard deviation of the duration, in seconds.
np.mean(tts_times), np.std(tts_times)

(2.4119862621905757, 2.143897027950164)

In [None]:
# Whole pipeline: mean and standard deviation of the end-to-end duration, in seconds.
np.mean(total_times), np.std(total_times)

(10.163439376681458, 8.807301590910315)

In [None]:
# Bundle the generated answers (audio and text) into a single archive.
!zip -r llama_responses.zip llama_responses