*   Victor Muryn
*   Kostyantyn Savorona
*   Yurii Sahaidak

In [None]:
# Install the notebook's runtime dependencies into the kernel environment:
# model/RAG stack (torch, transformers, sentence-transformers, faiss, langchain, ...).
%pip install -q torch transformers accelerate sentence-transformers faiss-gpu langchain langchain_community peft
# OpenAI client and tokenizer; httpx is pinned to 0.27.2
# NOTE(review): presumably pinned for openai client compatibility — confirm.
%pip install -q httpx==0.27.2 openai tiktoken
# Ukrainian text-to-speech package, installed straight from its Git repository.
%pip install -q git+https://github.com/robinhad/ukrainian-tts.git

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

from google.colab import userdata
import IPython.display as ipd

import time
import pickle

import torch
import numpy as np
from tqdm import tqdm

import tiktoken
from openai import OpenAI

import faiss

from transformers import AutoProcessor, AutoModelForCTC

import scipy.io.wavfile as wavfile
import librosa

from ukrainian_tts.tts import TTS, Voices, Stress

In [None]:
!gdown 1FLg1UlQuH6E7fzIFeiATsawIby48hQs3
!unzip rag-assistant.zip

In [None]:
# Load the pickled knowledge-base entries and keep them as a NumPy array so that
# they can later be indexed with the integer arrays returned by the vector search.
# NOTE(review): pickle.load can execute arbitrary code — only load this file if
# its origin is trusted.
with open('questions.pickle', 'rb') as pickle_file:
    questions = np.array(pickle.load(pickle_file))

# Speech to Text

In [None]:
# Speech-to-text checkpoint shared by the processor and the CTC model.
stt_checkpoint = "robinhad/wav2vec2-xls-r-300m-uk"

# The processor prepares raw audio and decodes predicted ids; the model runs on the GPU.
processor_stt = AutoProcessor.from_pretrained(stt_checkpoint)
model_stt = AutoModelForCTC.from_pretrained(stt_checkpoint).to("cuda")

In [None]:
def process_audio(audio_path: str):
  """Transcribe a WAV file to text with the module-level wav2vec2 CTC model.

  Args:
    audio_path: Path to a WAV file readable by ``scipy.io.wavfile.read``.

  Returns:
    The transcription decoded by ``processor_stt`` from the greedy
    (arg-max) CTC prediction.
  """
  # Rate the audio is resampled to and that is declared to the processor.
  target_rate = 16000

  sampling_rate, data = wavfile.read(audio_path)

  # Down-mix multi-channel recordings to mono by averaging the channels.
  if data.ndim > 1:
    data = data.mean(axis=1)

  data = data.astype(np.float32)
  if sampling_rate != target_rate:
    data = librosa.resample(data, orig_sr=sampling_rate, target_sr=target_rate)

  # Hand the NumPy array straight to the processor (no intermediate CUDA tensor)
  # and state the sampling rate explicitly so the processor can validate it
  # instead of emitting its "pass the sampling_rate argument" warning.
  input_dict = processor_stt(
      data, sampling_rate=target_rate, return_tensors="pt", padding=True
  )

  # Inference only: skip autograd bookkeeping to save memory and time.
  with torch.no_grad():
    logits = model_stt(input_dict.input_values.to(model_stt.device)).logits

  # Single-item batch: take the most likely token id per frame.
  pred_ids = torch.argmax(logits, dim=-1)[0]

  return processor_stt.decode(pred_ids)

In [None]:
%%time
# Transcribe the sample recording; the bare `question` on the last line displays the text.
question = process_audio('question.wav')
question

# RAG

### Set up embeddings

In [None]:
# Sentence-embedding model used both to index the knowledge base and to embed queries.
embd_model_e5 = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")

In [None]:
# Embed every knowledge-base entry, one embed_query call each, into a float32
# matrix with one row per entry.
# A comprehension is used so the loop variable does not leak into the notebook
# namespace and overwrite `question`, which holds the speech-to-text result.
embeddings_e5 = np.float32(
    [embd_model_e5.embed_query(kb_question) for kb_question in tqdm(questions)]
)


  0%|          | 0/299 [00:00<?, ?it/s][A
  0%|          | 1/299 [00:01<06:34,  1.32s/it][A
  2%|▏         | 6/299 [00:01<00:53,  5.44it/s][A
  4%|▎         | 11/299 [00:01<00:27, 10.54it/s][A
  5%|▌         | 16/299 [00:01<00:18, 15.65it/s][A
  7%|▋         | 21/299 [00:01<00:13, 20.75it/s][A
  9%|▊         | 26/299 [00:01<00:10, 25.35it/s][A
 10%|█         | 31/299 [00:02<00:09, 29.24it/s][A
 12%|█▏        | 36/299 [00:02<00:08, 32.31it/s][A
 14%|█▎        | 41/299 [00:02<00:08, 31.38it/s][A
 15%|█▌        | 46/299 [00:02<00:07, 33.99it/s][A
 17%|█▋        | 50/299 [00:02<00:07, 35.20it/s][A
 18%|█▊        | 55/299 [00:02<00:06, 36.86it/s][A
 20%|█▉        | 59/299 [00:02<00:06, 37.46it/s][A
 21%|██        | 63/299 [00:02<00:06, 37.90it/s][A
 23%|██▎       | 68/299 [00:03<00:05, 39.22it/s][A
 24%|██▍       | 73/299 [00:03<00:05, 39.54it/s][A
 26%|██▌       | 78/299 [00:03<00:05, 40.55it/s][A
 28%|██▊       | 83/299 [00:03<00:05, 40.39it/s][A
 29%|██▉       | 88/29

In [None]:
# Flat L2 FAISS index sized to the embedding dimensionality (number of columns),
# populated with all knowledge-base embeddings.
db = faiss.IndexFlatL2(embeddings_e5.shape[1])
db.add(embeddings_e5)

In [None]:
def get_context_elements(query, k: int = 5):
  """Return the knowledge-base entries closest to ``query`` in embedding space.

  Args:
    query: Text to embed and look up.
    k: Maximum number of nearest neighbours to retrieve (default 5).

  Returns:
    A 1-D array with up to ``k`` entries of ``questions``, in the order
    returned by the index search.
  """
  # The index expects a 2-D float32 batch; wrap the single embedding in a list.
  query_embedding = np.float32([embd_model_e5.embed_query(query)])

  # Distances are not needed, only the neighbour positions of the single query row.
  _, neighbor_ids = db.search(query_embedding, k=k)
  neighbor_ids = neighbor_ids[0]

  # FAISS reports missing neighbours as -1 (index holds fewer than k vectors);
  # drop them so they do not silently wrap around to the last entry.
  return questions[neighbor_ids[neighbor_ids >= 0]]

def transform_context(context):
  """Render retrieved passages as one numbered, quoted text block.

  Each passage becomes a line of the form ``Уривок #<n>: "<passage>"`` with
  numbering starting at 1; passages are separated by a blank line. An empty
  ``context`` yields an empty string.
  """
  passages = [
      f"Уривок #{position}: \"{passage}\""
      for position, passage in enumerate(context, start=1)
  ]
  return "\n\n".join(passages)

### Set up LLM

In [None]:
def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
  """Count the tokens ``string`` encodes to under the named tiktoken encoding.

  Args:
    string: Text to tokenize.
    encoding_name: Name passed to ``tiktoken.get_encoding``.

  Returns:
    Length of the encoded token sequence.
  """
  return len(tiktoken.get_encoding(encoding_name).encode(string))

def price_of_request(string: str, price_per_1m) -> float:
  """Estimate the cost of ``string`` from its token count.

  Args:
    string: Text whose tokens are billed.
    price_per_1m: Price for one million tokens.

  Returns:
    Token count (from ``num_tokens_from_string``) times the per-token price.
  """
  # Pricing reference: https://openai.com/api/pricing/
  token_count = num_tokens_from_string(string)
  return token_count * (price_per_1m / 1_000_000)

# Prices used in this notebook, in USD per 1M tokens:
# $0.50 for input
# $1.50 for output

In [None]:
# OpenAI client; the API key is read from Colab's user secrets rather than hard-coded.
client = OpenAI(api_key=userdata.get("OPENAI_API_KEY"))

In [None]:
def get_system_prompt(context):
  """Build the Ukrainian system prompt with ``context`` embedded at the end.

  The prompt tells the model it is a Kyivstar assistant that must answer
  questions only from the supplied context, and must reply with a fixed
  "I do not have an answer" sentence when the context is missing or irrelevant.

  Args:
    context: Text inserted verbatim after the "Контекст:" line.

  Returns:
    The complete system prompt string.
  """
  return f"""Ти – асистент бот у компанії Київстар. Твоє завдання – відповідати на запитання, що стосуються зв'язку та інтернету, виключно на основі наданого контексту.

ПРАВИЛА:
* Відповідай тільки тоді, коли питання має прямий зв’язок із наданим контекстом і контекст містить необхідну інформацію для відповіді.
* Не відповідай, якщо контекст не має відповідної інформації чи не стосується питання.
* Уникай спекуляцій або додавання деталей поза контекстом. Відповідь повинна чітко відповідати на питання на основі контексту.
* Якщо контекст не надано або не містить відповідної інформації, відповідай "Я не маю відповіді на це запитання".

Пам'ятай: відповідай тільки тоді, коли контекст чітко надає інформацію для цього.

Контекст:
{context}
"""

In [None]:
def get_completion(
    query,
    model: str = "gpt-3.5-turbo-0125",
    input_price_per_1m: float = 0.5,
    output_price_per_1m: float = 1.5,
):
  """Answer ``query`` with the chat model, grounded in retrieved context.

  Retrieves the nearest knowledge-base entries for the query, embeds them in
  the system prompt, and sends the query as the user message.

  Args:
    query: The user's question.
    model: Chat model name passed to the OpenAI API.
    input_price_per_1m: USD price per 1M prompt tokens (default $0.50).
    output_price_per_1m: USD price per 1M completion tokens (default $1.50).

  Returns:
    A tuple ``(answer_text, input_price, output_price)`` where the prices are
    computed from the token usage reported in the API response.
  """
  context = transform_context(get_context_elements(query))
  system_prompt = get_system_prompt(context)

  completion = client.chat.completions.create(
      model=model,
      messages=[
          {"role": "system", "content": system_prompt},
          {"role": "user", "content": query},
      ],
  )

  # Bill from the usage reported by the API rather than a local token estimate.
  usage = completion.usage
  input_price = usage.prompt_tokens * input_price_per_1m / 1_000_000
  output_price = usage.completion_tokens * output_price_per_1m / 1_000_000

  # `content` is optional in the SDK response; normalise to "" so callers
  # (e.g. text-to-speech) always receive a string.
  answer = completion.choices[0].message.content or ""

  return answer, input_price, output_price

In the full pipeline `question` comes from the Speech to Text step; here it is set by hand so that the LLM step can be tried on its own.

In [None]:
# Hand-typed question used to exercise the LLM step; this replaces any value
# `question` held from earlier cells.
question = "як створити унікальний номер з комбінації цифр яка мені подобається"

In [None]:
%%time
# Retrieve context and query the LLM for the current `question`; %%time reports the latency.
output_text, input_price, output_price = get_completion(question)

CPU times: user 63.1 ms, sys: 6.64 ms, total: 69.8 ms
Wall time: 2.06 s


In [None]:
# Total cost of the request above (prompt + completion), in USD.
input_price + output_price

0.0010255

In [None]:
# Show the model's answer.
print(output_text)

Для створення унікального номера з комбінації цифр, яка вам подобається, ви можете скористатися послугою "Ексклюзивний номер" від Київстар. Ця послуга дозволяє підключити індивідуальний та легко запам'ятовуваний номер. Ви можете підключити цю послугу в інтернет-магазині, найближчому магазині Київстар або партнерських магазинах. Деталі про цю послугу можна знайти в описі на сайті Київстар.


# Text to Speech

In [None]:
# Text-to-speech engine from ukrainian_tts, created on the GPU.
tts = TTS(device="cuda")

In [None]:
def output_to_voice(output_text: str, output_file: str = "reponse.wav", speed_factor: float = 1.2):
  """Synthesize ``output_text`` to a WAV file and speed up its playback.

  Args:
    output_text: Text to synthesize.
    output_file: Path of the WAV file to write.
    speed_factor: Playback speed multiplier (default 1.2). Applied by
      rewriting the same samples with a scaled sampling rate.

  Returns:
    The second value returned by ``tts.tts``
    (NOTE(review): presumably the stress-accented text — confirm against ukrainian_tts).

  Raises:
    ValueError: If ``speed_factor`` is not positive.
  """
  if speed_factor <= 0:
    raise ValueError("speed_factor must be positive")

  with open(output_file, mode="wb") as file:
    _, accented_text = tts.tts(output_text, Voices.Dmytro.value, Stress.Dictionary.value, file)

  # Speed-up trick: keep the samples, declare a higher sampling rate.
  # Nothing to rewrite when the factor is exactly 1.
  if speed_factor != 1:
    sampling_rate, data = wavfile.read(output_file)
    wavfile.write(output_file, int(sampling_rate * speed_factor), data)

  return accented_text

## Final

In [None]:
def all_together(input_file: str, output_file: str = "reponse.wav"):
  """Run the full voice pipeline: speech-to-text, RAG answer, text-to-speech.

  Args:
    input_file: Path to the WAV file with the spoken question.
    output_file: Path where the spoken answer is written.

  Returns:
    A tuple ``(question, output_text, prices, times)`` where ``prices`` is
    ``(input_price, output_price, total_price)`` and ``times`` is a dict with
    the per-stage and total durations in seconds under the keys
    ``"stt"``, ``"llm"``, ``"tts"`` and ``"total"``.
  """
  # perf_counter is monotonic and high-resolution, so the measured intervals
  # are not affected by system clock adjustments.
  start = time.perf_counter()

  question = process_audio(input_file)
  stt_done = time.perf_counter()

  output_text, input_price, output_price = get_completion(question)
  llm_done = time.perf_counter()

  # Only the written audio file is needed here; the returned text is not used.
  output_to_voice(output_text, output_file)
  tts_done = time.perf_counter()

  times = {
      "stt": stt_done - start,
      "llm": llm_done - stt_done,
      "tts": tts_done - llm_done,
      "total": tts_done - start
  }

  total_price = input_price + output_price

  return question, output_text, (input_price, output_price, total_price), times

In [None]:
%%time
question, output_text, total_price, times = all_together("question.wav", "reponse.wav")

In [None]:
# Show the question text returned by all_together (the speech-to-text result).
print(question)

як я можу створити унікальний номер з цифир які мені подобаються


In [None]:
# Show the answer text returned by all_together.
print(output_text)

Скористайтеся послугою "Ексклюзивний номер" в інтернет-магазині, найближчому магазині Київстар або магазинах партнерів. Ця послуга дозволить вам вибрати красивий номер, який легко запам'ятовується. Детальну інформацію ви знайдете у описі послуги "Ексклюзивний номер".


In [None]:
# Format the cost in USD with six decimals; this requires `total_price` to be a
# single number, not a tuple.
f"${total_price:.6f}"

'$0.000921'

In [None]:
# Per-stage and total durations (seconds) of the pipeline run.
print(times)

{'stt': 0.04222249984741211, 'llm': 2.2731988430023193, 'tts': 3.525733709335327, 'total': 5.841155052185059}


In [None]:
# Embed an audio player for the synthesized answer.
ipd.Audio(filename="reponse.wav")

## Test times and prices

In [None]:
!unzip -q audios.zip

In [None]:
!mkdir gpt_responses

In [None]:
from glob import glob

In [None]:
# Benchmark the whole pipeline on every test recording, collecting per-stage
# latencies and prices, and saving each answer as audio and text.

# Sort numerically by the number in "question_<n>.wav" so that 10 follows 9.
wavs = sorted(
    glob("./audios/question_*.wav"),
    key=lambda path: int(path.split("_")[-1].split(".")[0]),
)

# NOTE: `sst_times` (sic) holds the speech-to-text timings; the name is kept
# because later cells read it.
sst_times = []
tts_times = []
llm_times = []
total_times = []

input_prices = []
output_prices = []
total_prices = []


for i, audio_file in tqdm(enumerate(wavs), total=len(wavs)):
  print(i, audio_file)

  question, output_text, (input_price, output_price, total_price), times = all_together(
    audio_file,
    f"./gpt_responses/response_{i}.wav"
  )

  sst_times.append(times['stt'])
  tts_times.append(times['tts'])
  llm_times.append(times['llm'])
  total_times.append(times['total'])

  input_prices.append(input_price)
  output_prices.append(output_price)
  total_prices.append(total_price)

  # Explicit UTF-8: the answers are Ukrainian text and must not depend on the
  # platform's default encoding.
  with open(f"./gpt_responses/response_{i}.txt", 'w', encoding='utf-8') as f:
    f.write(output_text)

  0%|          | 0/51 [00:00<?, ?it/s]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


0 ./audios/question_0.wav


  2%|▏         | 1/51 [00:05<04:15,  5.11s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.128884
1 ./audios/question_1.wav


  4%|▍         | 2/51 [00:07<02:59,  3.66s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.124193
2 ./audios/question_2.wav


  6%|▌         | 3/51 [00:11<02:59,  3.74s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.125790
3 ./audios/question_3.wav


  8%|▊         | 4/51 [00:17<03:44,  4.78s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.126834
4 ./audios/question_4.wav


 10%|▉         | 5/51 [00:23<03:57,  5.16s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.130357
5 ./audios/question_5.wav


 12%|█▏        | 6/51 [00:24<02:48,  3.74s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.157622
6 ./audios/question_6.wav


 14%|█▎        | 7/51 [00:31<03:32,  4.82s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.131446
7 ./audios/question_7.wav


 16%|█▌        | 8/51 [00:37<03:44,  5.22s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.127020
8 ./audios/question_8.wav


 18%|█▊        | 9/51 [00:48<04:50,  6.92s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.126229
9 ./audios/question_9.wav


 20%|█▉        | 10/51 [00:54<04:35,  6.71s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.125005
10 ./audios/question_10.wav


 22%|██▏       | 11/51 [00:55<03:18,  4.96s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.151158
11 ./audios/question_11.wav


 24%|██▎       | 12/51 [00:58<02:43,  4.19s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.125051
12 ./audios/question_12.wav


 25%|██▌       | 13/51 [01:00<02:12,  3.48s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.128968
13 ./audios/question_13.wav


 27%|██▋       | 14/51 [01:01<01:48,  2.94s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.126096
14 ./audios/question_14.wav


 29%|██▉       | 15/51 [01:10<02:50,  4.73s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.130635
15 ./audios/question_15.wav


 31%|███▏      | 16/51 [01:15<02:42,  4.64s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.126394
16 ./audios/question_16.wav


 33%|███▎      | 17/51 [01:26<03:44,  6.60s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.125680
17 ./audios/question_17.wav


 35%|███▌      | 18/51 [01:28<02:56,  5.34s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.126380
18 ./audios/question_18.wav


 37%|███▋      | 19/51 [01:32<02:40,  5.01s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.129940
19 ./audios/question_19.wav


 39%|███▉      | 20/51 [01:33<01:57,  3.80s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.149662
20 ./audios/question_20.wav


 41%|████      | 21/51 [01:40<02:19,  4.67s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.126235
21 ./audios/question_21.wav


 43%|████▎     | 22/51 [01:44<02:04,  4.31s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.125663
22 ./audios/question_22.wav


 45%|████▌     | 23/51 [01:53<02:42,  5.81s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.127247
23 ./audios/question_23.wav


 47%|████▋     | 24/51 [01:55<02:04,  4.62s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.127171
24 ./audios/question_24.wav


 49%|████▉     | 25/51 [01:57<01:43,  3.97s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.125461
25 ./audios/question_25.wav


 51%|█████     | 26/51 [02:03<01:50,  4.41s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.126423
26 ./audios/question_26.wav


 53%|█████▎    | 27/51 [02:11<02:12,  5.51s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.125220
27 ./audios/question_27.wav


 55%|█████▍    | 28/51 [02:15<01:55,  5.04s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.126624
28 ./audios/question_28.wav


 57%|█████▋    | 29/51 [02:17<01:36,  4.38s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.124286
29 ./audios/question_29.wav


 59%|█████▉    | 30/51 [02:24<01:45,  5.02s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.126080
30 ./audios/question_30.wav


 61%|██████    | 31/51 [02:26<01:22,  4.10s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.128415
31 ./audios/question_31.wav


 63%|██████▎   | 32/51 [02:28<01:04,  3.38s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.130536
32 ./audios/question_32.wav


 65%|██████▍   | 33/51 [02:35<01:20,  4.45s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.127131
33 ./audios/question_33.wav


 67%|██████▋   | 34/51 [02:46<01:49,  6.43s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.125203
34 ./audios/question_34.wav


 69%|██████▊   | 35/51 [02:50<01:33,  5.85s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.130478
35 ./audios/question_35.wav


 71%|███████   | 36/51 [02:56<01:29,  5.97s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.126198
36 ./audios/question_36.wav


 73%|███████▎  | 37/51 [03:02<01:21,  5.84s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.128413
37 ./audios/question_37.wav


 75%|███████▍  | 38/51 [03:04<01:00,  4.65s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.127885
38 ./audios/question_38.wav


 76%|███████▋  | 39/51 [03:09<00:57,  4.79s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.146228
39 ./audios/question_39.wav


 78%|███████▊  | 40/51 [03:15<00:57,  5.20s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.125752
40 ./audios/question_40.wav


 80%|████████  | 41/51 [03:20<00:50,  5.04s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.125834
41 ./audios/question_41.wav


 82%|████████▏ | 42/51 [03:24<00:42,  4.75s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.126075
42 ./audios/question_42.wav


 84%|████████▍ | 43/51 [03:34<00:52,  6.54s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.125730
43 ./audios/question_43.wav


 86%|████████▋ | 44/51 [03:39<00:41,  5.97s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.126181
44 ./audios/question_44.wav


 88%|████████▊ | 45/51 [03:42<00:29,  4.97s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.123007
45 ./audios/question_45.wav


 90%|█████████ | 46/51 [03:44<00:21,  4.30s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.125045
46 ./audios/question_46.wav


 92%|█████████▏| 47/51 [03:49<00:17,  4.35s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.127081
47 ./audios/question_47.wav


 94%|█████████▍| 48/51 [03:59<00:18,  6.09s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.124843
48 ./audios/question_48.wav


 96%|█████████▌| 49/51 [04:01<00:09,  4.74s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.132219
49 ./audios/question_49.wav


 98%|█████████▊| 50/51 [04:04<00:04,  4.36s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RTF = 0.126156
50 ./audios/question_50.wav


100%|██████████| 51/51 [04:06<00:00,  4.84s/it]

RTF = 0.127072





In [None]:
# Mean and standard deviation of the speech-to-text latency (seconds).
np.mean(sst_times), np.std(sst_times)

(0.04137133617027133, 0.004840655899044445)

In [None]:
# Mean and standard deviation of the retrieval + LLM latency (seconds).
np.mean(llm_times), np.std(llm_times)

(1.7769849347133262, 0.9762185603038583)

In [None]:
# Mean and standard deviation of the text-to-speech latency (seconds).
np.mean(tts_times), np.std(tts_times)

(3.016692371929393, 1.9280179571154974)

In [None]:
# Mean and standard deviation of the end-to-end latency (seconds).
np.mean(total_times), np.std(total_times)

(4.835048642812991, 2.8322782385275254)

In [None]:
# Mean and standard deviation of the prompt (input) cost per request, in USD.
np.mean(input_prices), np.std(input_prices)

(0.0008988627450980392, 0.0002829277791591264)

In [None]:
# Mean and standard deviation of the completion (output) cost per request, in USD.
np.mean(output_prices), np.std(output_prices)

(0.00027282352941176475, 0.00017719123444463568)

In [None]:
# Mean and standard deviation of the total cost per request, in USD.
np.mean(total_prices), np.std(total_prices)

(0.001171686274509804, 0.00040619260331532564)

In [None]:
# Total cost of the whole benchmark run, in USD.
sum(total_prices)

0.059756000000000004

In [None]:
# Archive the generated answers (audio and text) recursively into one zip file.
!zip -r gpt_responses.zip gpt_responses