### RUN LLM

python3 -m vllm.entrypoints.api_server --model TheBloke/openchat-3.5-1210-AWQ --quantization awq --dtype auto


In [6]:
import requests


def chat(user_text=None):
    initial_prompt = f"GPT4 Correct User: Ты бот-помощник. Отвечай коротко и по существу.\n\n{user_text}<|end_of_turn|>GPT4 Correct Assistant:\n"
    result = requests.post(
        "http://localhost:8000/generate",
        json={
            "prompt": initial_prompt,
            "temperature": 0.5,
            "max_tokens": 1024,
        },
    ).json()["text"][0]
    result = result[len(initial_prompt) :]
    return result


# chat(user_text="Привет.")
print(chat(user_text="Привет, как дела?"))


Привет! Все хорошо. Как я могу помочь?


### voice generation

In [3]:
import torch

language = "ru"
model_id = "v4_ru"
device = torch.device("cpu")

model, example_text = torch.hub.load(
    repo_or_dir="snakers4/silero-models",
    model="silero_tts",
    language=language,
    speaker=model_id,
)
model.to(device)

Using cache found in /home/user-name-goes-here/.cache/torch/hub/snakers4_silero-models_master


In [7]:
from IPython.display import Audio, display

sample_rate = 48000
# sample_rate = 24000
# xenia - лучшая на мой взгляд
speaker = "xenia"
put_accent = True
put_yo = False
# example_text = "Привет! Как я не могу напрямую заказать пиццу, я могу помочь тебе выбрать пиццу и передать заказ в ресторан. Назови свой город, и я покажу тебе ближайшие рестораны, где можно заказать пиццу."
example_text = "Привет! Все хорошо. Как я могу помочь?"

audio = model.apply_tts(
    text=example_text,
    speaker=speaker,
    sample_rate=sample_rate,
    put_accent=put_accent,
    put_yo=put_yo,
)
print(example_text)
display(Audio(audio, rate=sample_rate))

Привет! Все хорошо. Как я могу помочь?


In [13]:
def generate_voice(
    text=None,
    sample_rate=48000,
):
    speaker = "xenia"
    put_accent = True
    put_yo = False
    audio = model.apply_tts(
        text=text,
        speaker=speaker,
        sample_rate=sample_rate,
        put_accent=put_accent,
        put_yo=put_yo,
    )
    return sample_rate, audio


sample_rate, audio = generate_voice(text="Привет, как дела? Hello world.")
print(audio)
display(Audio(audio, rate=sample_rate))

tensor([-3.5145e-05,  8.4325e-05, -5.4081e-06,  ..., -7.1276e-05,
        -1.4637e-05, -6.0406e-06])


### voice recognition

In [2]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset
import gradio as gr
import numpy as np
from functools import partial

device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16

model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=False,
    use_safetensors=True,
    use_flash_attention_2=True,
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=1,
    return_timestamps=False,
    torch_dtype=torch_dtype,
    device=device,
)

dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]
print(sample)
result = pipe(sample)
print(result["text"])

The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'path': '0d38672e0bbdbdc460af55b8bb84a15b2730db2819f2af64f9c777d4d586f2de', 'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00024414, 0.00048828,
       0.0005188 ]), 'sampling_rate': 16000}
 Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of Upguards and Adam paintings, and Mason's exquisite idylls are as national as a jingo poem. Mr. Burkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap on the back before he says, like a shampooer in a 

In [17]:
import gradio as gr
import numpy as np
from IPython.display import clear_output


def transcribe(audio):
    sr, y = audio
    y = y.astype(np.float32)
    y /= np.max(np.abs(y))
    sample = {"sampling_rate": sr, "raw": y}
    print(sample)
    result = pipe(sample)
    speech_text = result["text"]
    llm_response = chat(user_text=speech_text)
    sample_rate, generated_audio = generate_voice(
        text=llm_response,
    )
    return [
        gr.Markdown(f"{speech_text}-----{llm_response}"),
        gr.Audio((sample_rate, generated_audio.numpy())),
    ]


markdown = gr.Markdown()
audio = gr.Audio()
demo = gr.Interface(
    transcribe,
    inputs=[gr.Audio(sources=["microphone"])],
    outputs=[markdown, audio],
)
demo.launch(
    inbrowser=True,
    # server_port=7866,
)
clear_output()

{'sampling_rate': 48000, 'raw': array([0.        , 0.        , 0.        , ..., 0.00038597, 0.0011579 ,
       0.0011579 ], dtype=float32)}


