In [1]:
import warnings
import logging
import torch
import os
from log_utils import setup_logger, configure_root_logger
from ai_transcribe import AudioTranscription, AudioToTextResult
from transformers import AutoTokenizer, pipeline
from typing import List
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from tqdm.notebook import tqdm
from enum import Enum

  from speechbrain.pretrained import EncoderClassifier


In [2]:
# Report the installed PyTorch version and the state of the Metal (MPS) backend.
print(
    f"PyTorch version: {torch.__version__}",
    f"MPS (Metal GPU) available: {torch.backends.mps.is_available()}",
    f"MPS device name: {torch.device('mps')}",
    sep="\n",
)

PyTorch version: 2.6.0
MPS (Metal GPU) available: True
MPS device name: mps


In [3]:
# Model identifiers. BERT_MODEL_NAME is the checkpoint the tokenizer is loaded from;
# TRANSCRIBE_MODEL_NAME is not referenced in the visible cells (presumably the
# transcription model size — TODO confirm).
BERT_MODEL_NAME = 'bert-base-multilingual-cased'
TRANSCRIBE_MODEL_NAME = 'medium'

# NER class index -> tag; the position in the list is the class index.
id_to_label = dict(enumerate(['DOC', 'MDT', 'NAME', 'O', 'ORG', 'POS', 'TEL', 'VOL']))
# Russian display names for a subset of the tags (DOC, MDT, POS and O have no entry).
dict_names = {
    'ORG': 'Организация',
    'VOL': 'Объем обработки документов',
    'NAME': 'Имя',
    'TEL': 'Телефон',
}

# Silence a known warning message and lower the verbosity of the speechbrain logger.
warnings.filterwarnings("ignore", message="FP16 is not supported on CPU")
logging.getLogger("speechbrain").setLevel(logging.WARNING)
LOGGER_LEVEL = logging.INFO

In [4]:
# Create the notebook's own logger and configure the root logger through the
# log_utils helpers, both at the 'DEBUG' level.
# NOTE(review): the LOGGER_LEVEL constant (logging.INFO) is not used here — confirm
# whether the literal 'DEBUG' is intentional.
logger = setup_logger(__name__, level='DEBUG')
configure_root_logger(level='DEBUG')

In [5]:
# Shared objects created once up front: the BERT tokenizer and the torch device
# (Metal/MPS when the backend reports itself available, otherwise CPU).
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")


[36m2025-04-13 21:01:04 - urllib3.connectionpool - DEBUG    - Starting new HTTPS connection (1): huggingface.co:443[0m
[36m2025-04-13 21:01:05 - urllib3.connectionpool - DEBUG    - https://huggingface.co:443 "HEAD /bert-base-multilingual-cased/resolve/main/tokenizer_config.json HTTP/1.1" 200 0[0m


In [10]:
@dataclass
class NamedEntity:
    """All distinct phrases found for one entity type in a piece of text."""
    entity_type: str  # tag taken from the label map, e.g. 'NAME' or 'ORG'
    text: List[str]   # de-duplicated phrases, in first-seen order


def parse_result(results, label_map=None):
    """Merge token-level NER predictions into phrases grouped by entity type.

    Consecutive tokens carrying the same tag are joined into one phrase:
    word pieces starting with ``##`` are glued to the phrase without a space,
    other tokens are appended after a single space. A token tagged ``"O"`` or
    a change of tag closes the current phrase.

    Args:
        results: iterable of prediction dicts. Each needs a ``"word"`` key and
            an ``"entity"`` key of the form ``"<prefix>_<class index>"``
            (for example ``"LABEL_2"``).
        label_map: mapping from class index to tag. Defaults to the
            module-level ``id_to_label``.

    Returns:
        list[NamedEntity]: one item per entity type, in the order the types
        were first completed; each item's ``text`` holds the distinct phrases
        in first-seen order. Empty when no entity was found.

    Raises:
        KeyError: if a class index is missing from ``label_map``.
    """
    if label_map is None:
        label_map = id_to_label

    phrases_by_type = {}
    current_group = None
    current_phrase = ''

    def flush():
        # Store the phrase that is being built, if an entity is currently open.
        if current_group is not None:
            phrases_by_type.setdefault(current_group, []).append(current_phrase)

    for pred in results:
        word = pred["word"]
        label = label_map[int(pred["entity"].split("_")[1])]

        if label == "O":
            flush()
            current_group = None
            current_phrase = ''
            continue

        if word.startswith('##'):
            # Word-piece continuation: glue it on regardless of its own tag.
            # With no entity open the fragment is never stored (it is
            # overwritten when the next entity starts).
            current_phrase += word.replace('##', '')
        elif label == current_group:
            current_phrase += ' ' + word
        else:
            flush()
            current_group = label
            current_phrase = word

    # The original dropped an entity that ran up to the last token; close it here.
    flush()

    # dict.fromkeys removes duplicates while keeping a deterministic order.
    return [
        NamedEntity(entity_type, list(dict.fromkeys(phrases)))
        for entity_type, phrases in phrases_by_type.items()
    ]

In [25]:
class ProcessingStatus(Enum):
    """Pipeline stages; each member's value is a ``(progress, description)`` pair."""

    INITIALIZED = (0.0, 'Processing not started')
    TRANSCRIBING = (0.2, 'Transcribing')
    DIARIZING = (0.4, 'Diarizing')
    BUILDING_AUDIO_TO_TEXT = (0.6, 'Building audio to text')
    RETRIEVING_ENTITIES = (0.8, 'Retrieving entities')
    # FAILED and DONE share the progress 1.0 but stay distinct members because
    # their descriptions (and therefore their values) differ.
    FAILED = (1.0, 'Failed')
    DONE = (1.0, 'Done')

    def __init__(self, fraction, label):
        # Enum unpacks each member's tuple value into these two arguments.
        self.progress = fraction
        self.description = label

    @classmethod
    def get_description(cls, status):
        """Return the ``description`` attribute of ``status``."""
        return status.description

    @classmethod
    def get_progress(cls, status):
        """Return the ``progress`` attribute of ``status``."""
        return status.progress


class AudioAnalysis:
    """Drive one audio file through transcription, diarization and entity extraction.

    Attributes:
        audio_path: the constructor argument joined onto the current user's
            home directory.
        progress: progress fraction of the current status (0.0 to 1.0).
        status: the current ``ProcessingStatus`` member.
        results: dict with the keys ``"audio_process"`` (the
            ``AudioTranscription`` object doing the audio work),
            ``"audio_text"`` and ``"entities"`` (both ``None`` until
            ``run_pipeline`` fills them).
        pbar: tqdm progress bar with a total of 1.0, so ``n`` is a fraction.
    """

    def __init__(self, audio_path: str):
        """Prepare the analysis of ``audio_path`` (resolved under the home directory)."""
        logger.debug(f'Создан Audio Analisys для {audio_path}')
        self.audio_path = os.path.join(os.path.expanduser("~"), audio_path)
        self.progress = 0.0
        self.status = ProcessingStatus.INITIALIZED
        self.results = {
            "audio_process": AudioTranscription(self.audio_path),
            "audio_text": None,
            "entities": None
        }
        self.pbar = tqdm(total=1.0, initial=0, mininterval=0.5, leave=False)

    def run_pipeline(self):
        """Run every stage in order and record the outcome in ``self.status``.

        Any exception is logged with its traceback and turns the status into
        ``FAILED`` instead of propagating. The progress bar is closed in both
        the success and the failure case.

        Returns:
            None. Inspect ``self.status`` and ``self.results`` afterwards.
        """
        audio_process = self.results['audio_process']
        try:
            # Transcribe
            self._update_status(ProcessingStatus.TRANSCRIBING)
            audio_process.full_transcribe()

            # Diarization
            self._update_status(ProcessingStatus.DIARIZING)
            audio_process.just_diarize()

            # Unite: pick up the combined result exposed by the audio worker.
            self._update_status(ProcessingStatus.BUILDING_AUDIO_TO_TEXT)
            self.results['audio_text'] = audio_process.results['audio_to_text']

            # NER processing
            self.results["entities"] = self._extract_entities()

            self._update_status(ProcessingStatus.DONE)
        except Exception as e:
            # logger.exception keeps the traceback that a plain error message loses.
            logger.exception(f"Error in {self.audio_path}: {e}")
            self._update_status(ProcessingStatus.FAILED)
        finally:
            # Previously the bar was closed only on success and leaked on failure.
            self.pbar.close()
        return None

    def _extract_entities(self) -> "List[List[NamedEntity]]":
        """Run the NER model over every item of ``self.results["audio_text"]``.

        Returns:
            One list of ``NamedEntity`` per item whose ``text`` produced at
            least one entity; items without entities are skipped.
        """
        self._update_status(ProcessingStatus.RETRIEVING_ENTITIES)
        # NOTE(review): the pipeline is rebuilt on every call, i.e. once per file.
        ner_pipeline = pipeline("ner", model="./ner-model-2.0", tokenizer=tokenizer, device=device)

        entities = []
        for segment in self.results["audio_text"]:
            parsed = parse_result(ner_pipeline(segment.text))
            if parsed:
                entities.append(parsed)
        return entities

    def _update_status(self, status):
        """Store ``status`` and mirror its progress and description on the bar."""
        self.status = status
        # Keep the numeric attribute in step with the status (it used to stay at 0.0).
        self.progress = status.progress
        self.pbar.set_description(f'{status.description} - {self.audio_path}')
        self.pbar.n = status.progress
        self.pbar.refresh()


In [26]:
# Build the analysis object for a single recording; the file name is resolved
# relative to the current user's home directory by AudioAnalysis.__init__.
ar = AudioAnalysis('Разговор-15.mp3')

[36m2025-04-13 21:15:57 - __main__ - DEBUG    - Создан Audio Analisys для Разговор-15.mp3[0m
[36m2025-04-13 21:15:57 - ai_transcribe - DEBUG    - Создан Audio Analisys для /Users/igorlapin/Разговор-15.mp3[0m


  0%|          | 0/1.0 [00:00<?, ?it/s]

  0%|          | 0/1.0 [00:00<?, ?it/s]

In [27]:
# Run all pipeline stages for the recording; the outcome is stored on the object
# (ar.status, ar.results) rather than returned.
ar.run_pipeline()


[36m2025-04-13 21:16:00 - ai_transcribe - DEBUG    - Разговор-15.mp3 - TRANSCRIBING..[0m
[36m2025-04-13 21:16:26 - ai_transcribe - DEBUG    - Разговор-15.mp3 - DIARIZING..[0m
[32m2025-04-13 21:16:26 - speechbrain.utils.fetching - INFO     - Fetch hyperparams.yaml: Using symlink found at '/Users/igorlapin/text-analisys/notebooks/pretrained_models/hyperparams.yaml'[0m
[32m2025-04-13 21:16:26 - speechbrain.utils.fetching - INFO     - Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached[0m
[36m2025-04-13 21:16:26 - urllib3.connectionpool - DEBUG    - Resetting dropped connection: huggingface.co[0m
[36m2025-04-13 21:16:27 - urllib3.connectionpool - DEBUG    - https://huggingface.co:443 "HEAD /speechbrain/spkrec-ecapa-voxceleb/resolve/main/custom.py HTTP/1.1" 404 0[0m
[36m2025-04-13 21:16:27 - speechbrain.utils.parameter_transfer - DEBUG    - Collecting files (or symlinks) for pretraining in pretrained_models.[0m
[32m2025-04-13 21:16

Device set to use mps


In [None]:
# Recordings to process (resolved under the home directory by AudioAnalysis);
# several names are repeated.
file_paths = ['Разговор-15.mp3', 'Разговор-14.mp3', 'Разговор-13.mp3', 'Разговор-12.mp3',
              'Разговор-11.mp3', 'Разговор-10.mp3', 'Разговор-9.mp3', 'Разговор-8.mp3',
              'Разговор-7.mp3', 'Разговор-6.mp3', 'Разговор-20.mp3', 'Разговор-15.mp3',
              'Разговор-15.mp3', 'Разговор-15.mp3', 'Разговор-15.mp3', 'Разговор-15.mp3',
              'Разговор-15.mp3', 'Разговор-15.mp3', 'Разговор-15.mp3', 'Разговор-15.mp3', ]

# One (entities, file name) pair per processed file, consumed by the reporting
# cell, which reads result[0] as an iterable of dicts and result[1] as the name.
results_to_print = []


def process_file(audio_path):
    """Run the whole pipeline for one file and return its entities for reporting.

    Args:
        audio_path: file name, resolved under the home directory by ``AudioAnalysis``.

    Returns:
        tuple: ``(entities, audio_path)``. ``entities`` is a list with one
        ``{entity_type: phrases}`` dict per text segment that produced
        entities; it is empty when the pipeline failed or found nothing.
    """
    audio_analisys = AudioAnalysis(audio_path)
    audio_analisys.run_pipeline()
    logger.debug(audio_analisys.results['entities'])
    # results['entities'] stays None when run_pipeline failed.
    segments = audio_analisys.results['entities'] or []
    entities = [
        {entity.entity_type: entity.text for entity in segment}
        for segment in segments
    ]
    return entities, audio_path


# Start processing the files
with ThreadPoolExecutor(max_workers=1) as executor:
    # Consuming the map iterator keeps the results (in input order) and
    # re-raises any exception from a worker instead of silently dropping it.
    results_to_print.extend(executor.map(process_file, file_paths))

[36m2025-04-13 21:17:10 - __main__ - DEBUG    - Создан Audio Analisys для Разговор-15.mp3[0m
[36m2025-04-13 21:17:10 - ai_transcribe - DEBUG    - Создан Audio Analisys для /Users/igorlapin/Разговор-15.mp3[0m


  0%|          | 0/1.0 [00:00<?, ?it/s]

  0%|          | 0/1.0 [00:00<?, ?it/s]

[36m2025-04-13 21:17:10 - ai_transcribe - DEBUG    - Разговор-15.mp3 - TRANSCRIBING..[0m


In [None]:
# Print the extracted entities, file by file.
for result in results_to_print:
    # Merge the per-segment dicts of this file into one {key: set of values} mapping.
    merged = {}
    for segment in result[0]:
        for entity_type, texts in segment.items():
            merged.setdefault(entity_type, set()).update(texts)

    print(f'Файл {result[1]}:')
    # NOTE(review): a per-type heading, print(f'{dict_names[key]}:'), was commented
    # out in the original; dict_names has no entry for DOC, MDT or POS, so indexing
    # it directly could raise KeyError — confirm whether a heading is wanted.
    for texts in merged.values():
        for text in texts:
            print(f'  {text}')
    print()