In [50]:
import miniaudio

def choose_device():
    """Interactively pick one of the available capture (recording) devices.

    Prints a numbered menu of the devices reported by
    ``miniaudio.Devices().get_captures()`` and keeps prompting on stdin until
    a valid menu number is entered.

    Returns:
        The selected entry of the capture-device list. The menu shows its
        ``'name'`` key.

    Raises:
        RuntimeError: if no capture devices are reported, since no menu
            number could ever be valid.
    """
    captures = miniaudio.Devices().get_captures()
    if not captures:
        raise RuntimeError("no recording devices available")

    print("Available recording devices:")
    for num, device in enumerate(captures):
        print("{num} = {name}".format(num=num, name=device['name']))

    while True:
        answer = input("record from which device? ")
        try:
            choice = int(answer)
        except ValueError:
            print("please enter a device number")
            continue
        # Reject negative numbers explicitly: Python would otherwise treat
        # them as indexing from the end and silently pick the wrong device.
        if 0 <= choice < len(captures):
            return captures[choice]
        print("please enter a number between 0 and {}".format(len(captures) - 1))


In [51]:
# Raw audio chunks appended by record_to_buffer(); they are joined into one
# bytes object after recording stops.
buffer_chunks = []

In [52]:
import array


def record_to_buffer():
    """Generator that stores the chunks sent to it in ``buffer_chunks``.

    Once primed with ``next()``, the first value sent in is received but not
    stored. Every later value is appended to the module-level
    ``buffer_chunks`` list, right after a "." is printed as a progress mark.
    The generator never finishes on its own.
    """
    yield  # the first sent value is received here and not stored
    while True:
        chunk = yield
        print(".", end="", flush=True)
        buffer_chunks.append(chunk)

# Start from an empty buffer so that re-running this cell does not prepend the
# audio collected by a previous run to the new recording.
buffer_chunks.clear()

capture = miniaudio.CaptureDevice(buffersize_msec=1000, sample_rate=44100, device_id=choose_device()['id'])
generator = record_to_buffer()
next(generator)  # advance to the first `yield` so the generator can accept sent data
capture.start(generator)
try:
    input('Enter to stop recording')
finally:
    # Stop the device even when the prompt is interrupted, so it is not left
    # running after this cell fails.
    capture.stop()

# Assemble everything that was collected into one sample array and save it.
buffer = b"".join(buffer_chunks)
samples = array.array('h')  # 'h' = signed 16-bit items; assumes capture.format is 16-bit signed — TODO confirm
samples.frombytes(buffer)
sound = miniaudio.DecodedSoundFile('capture', capture.nchannels, capture.sample_rate, capture.format, samples)
miniaudio.wav_write_file('capture.wav', sound)

Available recording devices:
0 = Microphone (USB Live Camera audio)
1 = Microphone (fifine Microphone)
...

In [53]:
import whisperx

In [54]:
import torch

# Show whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

# Switch TF32 off for both CUDA matmul and cuDNN.
# NOTE(review): presumably done for numerical reproducibility of the models
# loaded later — confirm.
for _tf32_backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
    _tf32_backend.allow_tf32 = False
del _tf32_backend  # do not leave the loop variable behind in the notebook namespace

True


In [55]:
# Use the GPU when PyTorch reports one; otherwise fall back to the CPU so the
# cell does not fail on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
audio_file = "capture.wav"
batch_size = 16  # reduce if low on GPU mem
# On the GPU keep float16 (change to "int8" if low on GPU mem; may reduce
# accuracy). On the CPU use int8, because float16 is not available there.
compute_type = "float16" if device == "cuda" else "int8"

# save model to local path (optional)
model_dir = "model"
model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint c:\Users\andrew\AppData\Local\Programs\Python\Python312\Lib\site-packages\whisperx\assets\pytorch_model.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.5.1+cu121. Bad things might happen unless you revert torch to 1.x.


In [56]:
# Load the recorded file named by `audio_file` and transcribe it with `model`.
audio = whisperx.load_audio(audio_file)
result = model.transcribe(audio, batch_size=batch_size)
# Print the segments exactly as transcribe() returned them.
print(result["segments"]) # before alignment


Detected language: en (0.64) in first 30s of audio...
[{'text': ' Who am I talking to?', 'start': 1.178, 'end': 2.242}]


In [57]:
# Keep only the "text" value of every transcription segment.
segments = [entry["text"] for entry in result["segments"]]
print(segments)

[' Who am I talking to?']


In [58]:
import nltk
import pymorphy3

morph = pymorphy3.MorphAnalyzer()
nltk_stopwords = nltk.corpus.stopwords.words('russian')
# Set copy of the stop-word list: membership tests become O(1) instead of a
# linear scan of the list for every token.
stopword_lookup = frozenset(nltk_stopwords)


def normalize_segment(text):
    """Turn one transcribed segment into a space-separated string of lemmas.

    Steps, in order: lower-case the text, tokenize it with NLTK's Russian
    tokenizer, drop tokens found in the Russian stop-word list, replace each
    remaining token by the normal form of pymorphy3's first parse, and keep
    only results that are purely alphabetic.

    Args:
        text: the segment text to normalize.

    Returns:
        The surviving lemmas joined by single spaces. This is an empty string
        when nothing survives the filters.
    """
    tokens = nltk.word_tokenize(text.lower(), language='russian')
    lemmas = (
        morph.parse(token)[0].normal_form
        for token in tokens
        if token not in stopword_lookup
    )
    return ' '.join(lemma for lemma in lemmas if lemma.isalpha())


corpuses = [normalize_segment(text) for text in segments]

corpuses, len(corpuses)

INFO:pymorphy3.opencorpora_dict.wrapper:Loading dictionaries from c:\Users\andrew\AppData\Local\Programs\Python\Python312\Lib\site-packages\pymorphy3_dicts_ru\data
INFO:pymorphy3.opencorpora_dict.wrapper:format: 2.4, revision: 417150, updated: 2022-01-08T22:09:24.565962


(['who am i talking to'], 1)

In [59]:
# Base URL passed to the Mistral client as `server_url`.
# NOTE(review): localhost:11434 looks like a locally running Ollama server — confirm.
MISTRAL_URI = 'http://localhost:11434'

In [60]:
# System prompt sent with every chat request. It is built from one item per
# instruction and joined with ". " so the instructions stay separate
# sentences; plain adjacent-literal concatenation would glue them together
# ("...etcPlease respond...").
sys_prompt = ". ".join([
    "You are a voice-powered neural network, every prompt you receive might be a question, a command, etc",
    "Please respond with a single word or a short phrase. If you don't know the answer, just say 'I don't know'",
    "Anything you'll recieve as a prompt will be stripped of filler words, so you can focus on the main idea",
    "You may recieve prompts in either english or russian",
    "Respond in the same language you were asked",
    "Please, don't try to be funny or sarcastic, just be helpful and informative",
    "Remember, you are a voice-powered neural network, you are here to help and inform, not to entertain",
    "Please, be polite and respectful, and always try to provide the most accurate and relevant information",
    "Thank you for your cooperation, and remember, you are a voice-powered neural network, you are here to help and inform, not to entertain",
]) + "."

In [61]:
from mistralai import Mistral

client = Mistral(server_url=MISTRAL_URI)

# Send each normalized segment as its own chat request and print the reply.
for corpus in corpuses:
    # A segment can normalize to an empty string; skip it instead of sending
    # an empty user message.
    if not corpus.strip():
        continue

    resp = client.chat.complete(
        model='mistral',
        messages=[
            { 'role': 'system', 'content': sys_prompt },
            { 'role': 'user', 'content': corpus }
        ],
        max_tokens=128
    )

    # Guard against a reply that carries no choices rather than failing on
    # the [0] lookup below.
    if resp is None or not resp.choices:
        print("(no response)")
        continue

    print(resp.choices[0].message.content)

INFO:httpx:HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 200 OK"


 Assistant
