In [None]:
%%capture
# Install the third-party packages for this notebook into the running environment.
# NOTE(review): no versions are pinned, so a later re-run may resolve to different,
# incompatible releases — consider pinning once a working set is known.
# NOTE(review): librosa, pydub and datasets are imported below but not installed here;
# presumably they are preinstalled in the Kaggle image — confirm.
!pip install huggingsound
!pip install -U transformers
!pip install -q ipython-autotime
!pip install -q accelerate optimum
!pip install moviepy
!pip install langchain
!pip install chromadb
!pip install sentence-transformers
!pip install imutils
!pip install llama-cpp-python
!pip install faiss-cpu
!pip install langchain-experimental
!pip install yandexcloud

In [None]:
# Build llama.cpp from source with the LLAMA_CUBLAS=1 make flag and produce libllama.so.
# NOTE(review): `git clone` fails on a re-run when ~/llama.cpp already exists; the
# following `%cd` still succeeds, so the build then reuses the existing checkout.
%cd ~
!git clone --recursive https://github.com/ggerganov/llama.cpp.git
%cd llama.cpp
!make LLAMA_CUBLAS=1 -j libllama.so

# HACK: Use custom compiled libllama.so
# Overwrites the shared library shipped inside the installed llama_cpp package.
# NOTE(review): the destination hardcodes the python3.10 conda site-packages path —
# it must be updated if the image's Python version or install location changes.
%cp ~/llama.cpp/libllama.so /opt/conda/lib/python3.10/site-packages/llama_cpp/libllama.so

In [None]:
import os
import torch
import librosa
import time
import cv2
import imutils
import shutil
import glob
import argparse

import numpy as np

from collections import defaultdict
from pydub import AudioSegment
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    Speech2TextProcessor,
    Speech2TextForConditionalGeneration,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    AutoModelForSpeechSeq2Seq, 
    AutoProcessor, 
    pipeline
)
from datasets import load_dataset
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from chromadb.config import Settings
from llama_cpp import Llama

from langchain.agents import Tool
from langchain.agents import load_tools
from langchain.agents import initialize_agent
from langchain.agents import AgentType
from typing import Any, List, Mapping, Optional
from langchain_experimental.agents.agent_toolkits import create_csv_agent
import pandas as pd
from tqdm import tqdm
import json
import re

In [None]:
import gc
# Force a garbage-collection pass; as the last expression of the cell, the number
# returned by gc.collect() is displayed as the cell output.
gc.collect()

In [None]:
# Download the quantized Saiga-Mistral GGUF weights into /kaggle/tmp.
# The cell is safe to re-run:
#   * `mkdir -p` does not fail when the directory already exists;
#   * `wget -c` resumes a partial download and skips a complete one instead of
#     saving a second multi-gigabyte copy as `model-q4_K.gguf.1`.
%cd /kaggle/
!mkdir -p tmp
%cd tmp

!wget -c https://huggingface.co/IlyaGusev/saiga_mistral_7b_gguf/resolve/main/model-q4_K.gguf

In [None]:
# --- Model identifiers ------------------------------------------------------
# Hugging Face id of the speech-to-text checkpoint loaded in the transcription cell.
TRANSCRIBER_ID = "openai/whisper-large-v2"

# Hugging Face id of the sentence-embedding model passed to HuggingFaceEmbeddings.
# EMBEDDER_ID = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
EMBEDDER_ID ="ai-forever/sbert_large_nlu_ru"


# --- Frame / background-subtraction parameters ------------------------------
# NOTE(review): none of the six constants below is referenced in the visible cells;
# presumably leftovers from a video-frame processing variant (cv2 and imutils are
# imported) — confirm and remove if unused.
FRAME_RATE = 1
WARMUP = FRAME_RATE
FGBG_HISTORY = round(FRAME_RATE * 15)
VAR_THRESHOLD = 16
MIN_PERCENT = 0.1
MAX_PERCENT = 3

# --- Saiga chat format ------------------------------------------------------
# System message placed in front of every prompt by get_system_tokens().
SYSTEM_PROMPT = "Ты — Сайга, русскоязычный автоматический ассистент. Ты разговариваешь с людьми и помогаешь им."
# Token ids inserted by get_message_tokens()/chat_saiga() to mark the speaker role
# and the line break after it.
# NOTE(review): presumably vocabulary ids of the Saiga-Mistral tokenizer — verify
# against the model actually loaded.
SYSTEM_TOKEN = 1587
USER_TOKEN = 2188
BOT_TOKEN = 12435
LINEBREAK_TOKEN = 13


# --- Runtime device ---------------------------------------------------------
# Use the first CUDA device with half precision when a GPU is available,
# otherwise fall back to CPU with float32.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

print(DEVICE)

In [None]:
# Speech-to-text setup: load the checkpoint named by TRANSCRIBER_ID
# (openai/whisper-large-v2), move it to DEVICE and wrap it in a chunked ASR pipeline.
processor = AutoProcessor.from_pretrained(TRANSCRIBER_ID)

transcriber_load_options = dict(
    torch_dtype=TORCH_DTYPE,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model_t = AutoModelForSpeechSeq2Seq.from_pretrained(TRANSCRIBER_ID, **transcriber_load_options)
model_t.to(DEVICE)

# return_timestamps=True matters downstream: build_index() reads the per-segment
# 'chunks' / 'timestamp' fields of the pipeline result.
asr_pipeline_options = {
    "model": model_t,
    "tokenizer": processor.tokenizer,
    "feature_extractor": processor.feature_extractor,
    "max_new_tokens": 128,
    "chunk_length_s": 30,
    "batch_size": 16,
    "return_timestamps": True,
    "torch_dtype": TORCH_DTYPE,
    "device": DEVICE,
}
pipe = pipeline("automatic-speech-recognition", **asr_pipeline_options)


In [None]:
# Sentence-embedding model shared by every vector index built in build_index().
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDER_ID)

# LangChain helpers: windowing, index construction and retrieval.
def sliding_window(lst, window_size, step_size):
    """Return every full-length window of `lst`, advancing `step_size` items each time.

    Only complete windows are produced: a trailing remainder shorter than
    `window_size` is dropped, and an input shorter than `window_size` yields [].
    Each window is a slice of `lst`, so it has the same sequence type as the input.
    """
    last_start = len(lst) - window_size
    return [
        lst[start:start + window_size]
        for start in range(0, last_start + 1, step_size)
    ]

def build_index(text, chunk_size, chunk_overlap):
    '''Build a Chroma vector index over sliding windows of transcript segments.

    Args:
        text: the ASR pipeline result, i.e. a dict with a 'chunks' list whose items
            carry 'text' and 'timestamp' (start, end). For backward compatibility,
            when `text` is not such a dict (existing callers pass the plain
            transcript string) the segments are read from the module-level
            `full_text`, which is what this function previously always did.
        chunk_size: number of consecutive segments joined into one document.
        chunk_overlap: forwarded to sliding_window() as the step between window
            starts. NOTE(review): despite the name this is a step, not an overlap;
            the two coincide only when it equals chunk_size / 2 — confirm the intent.

    Returns:
        A Chroma store with one document per window. Each document's metadata holds
        'start' and 'end': the start timestamps of the window's first and last segment.
    '''
    chunks = text.get('chunks') if isinstance(text, dict) else None
    if chunks is None:
        # Legacy call style: fall back to the transcript kept in the global namespace.
        chunks = full_text['chunks']

    # A transcript shorter than one window would otherwise produce no documents at
    # all; shrink the window so the whole transcript becomes a single document.
    window_size = min(chunk_size, len(chunks)) if chunks else chunk_size

    documents = []
    for window in sliding_window(chunks, window_size, chunk_overlap):
        window_text = ' '.join(segment['text'] for segment in window)
        if not window_text.strip():
            # Skip windows without any text (the previous `if doc` filter never
            # removed anything because Document objects are always truthy).
            continue
        documents.append(
            Document(
                page_content=window_text,
                metadata={
                    'start': window[0]['timestamp'][0],
                    'end': window[-1]['timestamp'][0],
                },
            )
        )

    db = Chroma.from_documents(
        documents,
        embeddings,
        client_settings=Settings(
            anonymized_telemetry=False
        ),
    )
    return db

def retrieve(text, db, k_documents):
    '''Return the text of the `k_documents` indexed chunks closest to `text`.

    Args:
        text: query string.
        db: vector store exposing `as_retriever`; a falsy value (e.g. None) means
            "no index available".
        k_documents: number of documents to request from the retriever.

    Returns:
        The page contents of the retrieved documents joined by blank lines, or an
        empty string when `db` is falsy (previously this case raised
        UnboundLocalError because the result variable was never assigned).
    '''
    if not db:
        return ""
    retriever = db.as_retriever(search_kwargs={"k": k_documents})
    docs = retriever.get_relevant_documents(text)
    return "\n\n".join(doc.page_content for doc in docs)


# Функция для генерации датасета  
Датасет был сгенерирован с помощью YandexGPT, однако дообучить модель Saiga не удалось из-за нехватки ресурсов и проблем с DataSphere.  
Предобученная Saiga также хорошо справляется с данной задачей, поэтому дообучение решено оставить на дальнейшее развитие проекта.

In [None]:
from langchain.llms import YandexGPT
import time

# SECURITY: the API key must never be stored in the notebook. It is read from the
# YC_API_KEY environment variable; a missing variable fails fast with KeyError.
# The key that was previously embedded here has been published with the notebook
# and should be revoked in Yandex Cloud.
ygpt = YandexGPT(api_key=os.environ["YC_API_KEY"])

def exctract_term(text, max_retries=None, delay=3):
    """Ask YandexGPT for the key term defined in `text` and return a training record.

    The request is repeated until the model returns a non-empty answer. Failures
    are reported instead of being silently swallowed, and only `Exception` is
    caught, so KeyboardInterrupt/SystemExit stop the loop as expected.

    NOTE: a later cell defines another function with the same name (the Saiga-based
    extractor); once that cell has run, it replaces this one.

    Args:
        text: passage to search for a defined term.
        max_retries: maximum number of attempts; None (default) keeps retrying
            indefinitely, which is the previous behaviour.
        delay: seconds to sleep before every attempt, including the first one.

    Returns:
        dict with keys 'system', 'user' (the full prompt) and 'bot' (model answer).

    Raises:
        RuntimeError: if `max_retries` attempts finish without a non-empty answer.
    """
    prompt = f"""
Найди ключевой термин для которого дано опеределение в данном тексте.
Важно: для термина должно быть дано опредление в тексте.
Если термин с определением есть, выводи {{термин}}
Если термина с определением нет, то выводи {{None}}.
Текст:
{text}
"""
    attempt = 0
    last_error = None
    while True:
        attempt += 1
        time.sleep(delay)
        try:
            output = ygpt(prompt, temperature = 0.005)
        except Exception as error:
            # Make the failure visible; an invalid key or quota error would
            # otherwise look like an endless hang.
            print(f'YandexGPT request failed (attempt {attempt}): {error!r}')
            last_error = error
            output = None
        if output:
            break
        if max_retries is not None and attempt >= max_retries:
            raise RuntimeError(
                f'YandexGPT returned no answer after {attempt} attempts'
            ) from last_error

    return {'system': 'Ты ищешь термин для которого есть определение в тексте','user': prompt, 'bot': output}


# Generate one record and store it as JSON.
# NOTE(review): `text` is not assigned in any cell visible above this one, so this
# line raises NameError on a fresh kernel unless `text` is defined elsewhere — confirm.
data = exctract_term(text)
with open('/kaggle/working/testoviy.json', "w", encoding='utf-8') as w:
    # ensure_ascii=False keeps Cyrillic readable instead of \uXXXX escapes.
    w.write(json.dumps(data, ensure_ascii=False))

## Метод извлечения терминов

In [None]:
# Write a two-record placeholder payload to the same JSON file that the
# dataset-generation cell writes to (its previous contents are overwritten).
data = [{'user': 12344} for _ in range(2)]
with open('/kaggle/working/testoviy.json', "w", encoding='utf-8') as w:
    w.write(json.dumps(data, ensure_ascii=False))

In [None]:
# Saiga generation settings.
# n_ctx is the context size handed to Llama(...) in the model-loading cell, which
# assigns the same value again.
n_ctx = 3000 
# Sampling parameters forwarded to model.generate() by chat_saiga().
top_k = 40
top_p = 0.5
temperature = 0.05
repeat_penalty = 1.1

# Maps a chat role name to the token id that get_message_tokens() inserts
# after the first token of the tokenized message.
ROLE_TOKENS = {
    "user": USER_TOKEN,
    "bot": BOT_TOKEN,
    "system": SYSTEM_TOKEN
}


def get_message_tokens(model, role, content):
    """Tokenize one chat message and frame it with role, line-break and EOS tokens.

    The role token and LINEBREAK_TOKEN are placed at positions 1 and 2 of the list
    returned by `model.tokenize`, and `model.token_eos()` is appended at the end.
    The list returned by the tokenizer is modified in place and returned.
    """
    framed = model.tokenize(content.encode("utf-8"))
    # Slice assignment inserts both marker tokens right after the first token.
    framed[1:1] = [ROLE_TOKENS[role], LINEBREAK_TOKEN]
    framed.append(model.token_eos())
    return framed


def get_system_tokens(model):
    """Return the token sequence of the fixed SYSTEM_PROMPT framed as a "system" message."""
    return get_message_tokens(model, role="system", content=SYSTEM_PROMPT)

def chat_saiga(message, model):
    """Run a single-turn Saiga chat and return the generated answer as text.

    The prompt is: system message, then the user `message`, then the opening of a
    bot turn (BOS, BOT_TOKEN, LINEBREAK_TOKEN). Tokens are sampled with the
    module-level top_k / top_p / temperature / repeat_penalty until EOS.

    Args:
        message: user text to send to the model.
        model: llama-cpp model object providing tokenize, detokenize, generate,
            token_bos and token_eos.

    Returns:
        The generated answer, without the EOS token.
    """
    tokens = get_system_tokens(model)
    tokens += get_message_tokens(model=model, role="user", content=message)
    tokens += [model.token_bos(), BOT_TOKEN, LINEBREAK_TOKEN]

    generator = model.generate(
        tokens,
        top_k = top_k,
        top_p = top_p,
        temp = temperature,
        repeat_penalty = repeat_penalty,
        reset = True
    )

    eos_token = model.token_eos()
    answer_bytes = bytearray()
    for token in generator:
        if token == eos_token:
            break
        # Collect raw bytes and decode once at the end. Decoding each token
        # separately with errors="ignore" silently drops characters whose UTF-8
        # bytes are split across tokens, which happens with Cyrillic text.
        answer_bytes += model.detokenize([token])
    return answer_bytes.decode("utf-8", errors="ignore")

In [None]:
%%capture
# Release a model left over from a previous run of this cell before loading a new
# one, so two copies are not held at once.
try:
    del model_s
except NameError:
    # First run: nothing to release. Only NameError is expected here; any other
    # error should surface instead of being swallowed.
    pass

model_path = '/kaggle/tmp/model-q4_K.gguf'
n_ctx = 3000

# NOTE(review): n_gpu_layers=-1 presumably offloads all layers to the GPU, and
# main_gpu=1 presumably selects the device with index 1, which would require at
# least two GPUs — confirm against the llama-cpp-python documentation and the
# target machine.
model_s = Llama(
    model_path = model_path,
    n_ctx = n_ctx,
    n_gpu_layers=-1,
    main_gpu = 1
)

In [None]:
def clear_output(output):
    """Normalize a raw model answer to a short term, or the string 'None'.

    Removes the substring "термин" and the characters { } : ' " plus line breaks,
    then splits on whitespace. An answer of one or two words is returned joined by
    single spaces; an empty answer or one with three or more words gives 'None'.

    NOTE(review): "термин" is removed wherever it occurs, including inside longer
    words — confirm that this is intended.
    """
    # Raw string: `\{` and `\}` are invalid escapes in a normal string literal
    # (a warning today, an error in future Python versions). The regex is unchanged.
    words = re.sub(r"""термин|[\{\}:\n\r'"]""", "", output).split()
    if words and len(words) < 3:
        return ' '.join(words)
    return 'None'

def exctract_term(text):
    """Ask the local Saiga model for the key term defined in `text`.

    Builds the extraction prompt around `text`, sends it through chat_saiga() with
    the module-level `model_s`, and returns the answer normalized by clear_output()
    (a short term, or the string 'None').

    NOTE: this definition replaces the earlier YandexGPT-based function of the same
    name once this cell has been executed.
    """
    question = f"""
Найди ключевой термин для которого дано опеределение в данном тексте.
Важно: для термина должно быть дано опредление в тексте.
Если термин с определением есть, выводи {{термин}}
Если термина с определением нет, то выводи {{None}}.
Текст:
{text}
    """
    with torch.no_grad():
        raw_answer = chat_saiga(question, model_s)
    return clear_output(raw_answer)

In [None]:
# Main pipeline: transcribe every .mp3 in PATH_TO_AUDIO, index the transcript,
# extract one candidate term per indexed chunk and collect (file, term) rows.
start_time = time.time()
data_terms = {"File":[], 'Term': []}

PATH_TO_AUDIO = "/kaggle/input/test-data"
files = os.listdir(PATH_TO_AUDIO)

# os.listdir() returns names in arbitrary order; sort for a reproducible run order.
mp3_files = sorted(file for file in files if file.endswith('.mp3'))

# Loop over the audio files in the folder.
for audio_file in tqdm(mp3_files):
    print(f'Работаем в аудио {audio_file}')
    path_audio_file = f'{PATH_TO_AUDIO}/{audio_file}'
    with torch.no_grad():
        audio = librosa.load(path_audio_file, sr=16_000)[0]
        print('Транскрибируем аудио')
        # The name `full_text` must stay module-level: build_index() reads the
        # transcript segments from it when it is handed the plain text string.
        full_text = pipe(audio, generate_kwargs={"language": "russian"})

    db = build_index(full_text['text'], 10, 5)

    # Extract one candidate term per indexed chunk.
    print(f'Ищем список терминов')
    data = [exctract_term(batch) for batch in tqdm(db.get()['documents'])]

    # Deduplicate while keeping first-seen order and drop the 'None' marker.
    # The previous list(set(...)) produced a different term order on every run,
    # and a bare `except` was wrapped around the removal of 'None'.
    data = [term for term in dict.fromkeys(data) if term != 'None']

    data_terms['Term'] += data
    data_terms['File'] += [audio_file] * len(data)

    # Empty the index so the next file does not see this file's chunks.
    for document_id in db.get()['ids']:
        db._collection.delete(ids=document_id)
    db.persist()
    torch.cuda.empty_cache()
print(f'Прошлло времени: {time.time() - start_time}')
submission = pd.DataFrame(data_terms)

In [26]:
# Order the rows by file name and write the submission CSV without the index column.
submission_by_file = submission.sort_values(by='File')
submission_by_file.to_csv('/kaggle/working/sample_submission.csv', index=False)

In [30]:
# Switch to the output directory and list it to check that the submission file exists.
%cd /kaggle/working/
%ls

/kaggle/working
sample_submission.csv


In [31]:
from IPython.display import FileLink
# Display a link to the submission file. The path is relative, so it resolves
# against the current directory (/kaggle/working after the `%cd` cell).
FileLink(r'sample_submission.csv')