In [1]:
import glob
import logging
import os
import re
import warnings
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional

import torch
import whisper
from pydub import AudioSegment
from speechbrain.pretrained import EncoderClassifier
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, pipeline

from ai_transcribe import AudioTranscription
from intonation_resolover import predict_emotion
from log_utils import setup_logger, configure_root_logger


In [2]:
# Report the installed PyTorch version and whether the Metal (MPS) backend is usable.
print(
    f"PyTorch version: {torch.__version__}",
    f"MPS (Metal GPU) available: {torch.backends.mps.is_available()}",
    f"MPS device name: {torch.device('mps')}",
    sep="\n",
)

PyTorch version: 2.7.0
MPS (Metal GPU) available: True
MPS device name: mps


In [3]:
# Checkpoint name used to load the tokenizer for the NER pipeline.
BERT_MODEL_NAME = 'bert-base-multilingual-cased'
# Presumably the Whisper model size used for transcription -- TODO confirm, it is not read in this notebook.
TRANSCRIBE_MODEL_NAME = 'medium'
LOGGER_LEVEL = logging.INFO

# Numeric class index (the "<n>" in a "LABEL_<n>" prediction) -> entity tag.
id_to_label = {
    0: 'DOC',
    1: 'MDT',
    2: 'NAME',
    3: 'O',
    4: 'ORG',
    5: 'POS',
    6: 'TEL',
    7: 'VOL',
}
# Human-readable (Russian) captions for the entity tags that are shown to the user.
dict_names = {
    'ORG': 'Организация',
    'VOL': 'Объем обработки документов',
    'NAME': 'Имя',
    'TEL': 'Телефон',
}

# Hide the FP16-on-CPU warning by matching its message text.
warnings.filterwarnings("ignore", message="FP16 is not supported on CPU")

In [4]:
# Create the notebook-level logger and configure the root logger via the
# project's log_utils helpers (both are asked for the 'INFO' level).
logger = setup_logger(__name__, level='INFO')
configure_root_logger(level='INFO')
# Silence the "speechbrain" logger so its records do not clutter the cell output.
sb_logger = logging.getLogger("speechbrain")
sb_logger.setLevel(logging.CRITICAL)  # Drop every record below CRITICAL
sb_logger.propagate = False  # Do not pass records on to ancestor loggers' handlers

In [5]:
# Preload models
# Pick the compute device: Metal (MPS) when PyTorch reports it, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Load the tokenizer once so every NER pipeline built later can share it.
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)


In [6]:
from huggingface_hub import login

# The access token is a secret and must not be stored in the notebook.
# Read it from the HF_TOKEN environment variable instead.
token = os.environ.get("HF_TOKEN", "")
if token:
    login(token)
else:
    # No token configured: let huggingface_hub ask for it interactively.
    login()

In [7]:
@dataclass
class NamedEntity:
    """All distinct phrases found for one entity tag.

    Instances are built by ``parse_result``, which passes the tag name and a
    de-duplicated list of phrase strings.
    """
    # Entity tag, one of the values of ``id_to_label`` other than "O".
    entity_type: str
    # Distinct phrases recognised for this tag (a list, as produced by parse_result).
    text: List[str]


def parse_result(results):
    """Group token-level NER predictions into phrases per entity tag.

    Args:
        results: Iterable of prediction dicts. Each needs a ``"word"`` key and
            an ``"entity"`` key of the form ``"<prefix>_<n>"`` where ``<n>`` is
            a key of ``id_to_label``.

    Returns:
        A list with one ``NamedEntity`` per tag that occurred. Each holds the
        distinct phrases for that tag in order of first appearance.

    Notes:
        * Consecutive tokens with the same tag are joined with a space.
        * WordPiece continuations (``"##..."``) are glued to the current phrase
          without a space, whatever their own tag is.
        * A ``"##"`` piece that arrives while no phrase is open is not emitted.
    """
    phrases_by_label = {}

    def add_pair(key, value):
        phrases_by_label.setdefault(key, []).append(value)

    current_group = None
    current_phrase = ''
    for pred in results:
        word = pred["word"]
        # "LABEL_3" -> 3 -> 'O'
        group = id_to_label[int(pred["entity"].split("_")[1])]

        if group == "O":
            # An outside token closes whatever phrase was being collected.
            if current_group is not None:
                add_pair(current_group, current_phrase)
            current_group = None
            current_phrase = ''
            continue

        if word.startswith('##'):
            current_phrase = current_phrase + word.replace('##', '')
        elif group == current_group:
            current_phrase = current_phrase + ' ' + word
        else:
            # The tag changed: store the finished phrase and start a new one.
            if current_group is not None:
                add_pair(current_group, current_phrase)
            current_group = group
            current_phrase = word

    # Flush the phrase that is still open when the predictions end; without
    # this an entity at the very end of the text would be lost.
    if current_group is not None:
        add_pair(current_group, current_phrase)

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return [
        NamedEntity(label, list(dict.fromkeys(phrases)))
        for label, phrases in phrases_by_label.items()
    ]

In [8]:
class ProcessingStatus(Enum):
    """Pipeline stages, each with a progress value and a display text.

    A member's value is a ``(progress, description)`` tuple; ``__init__``
    stores both parts as attributes of the member.
    """

    INITIALIZED = (0.0, 'Processing not started')
    TRANSCRIBING = (0.2, 'Transcribing')
    DIARIZING = (0.4, 'Diarizing')
    BUILDING_AUDIO_TO_TEXT = (0.6, 'Building audio to text')
    RETRIEVING_ENTITIES = (0.8, 'Retrieving entities')
    RESOLVING_EMOTION = (0.9, 'Resolving emotion')
    FAILED = (1.0, 'Failed')
    DONE = (1.0, 'Done')

    def __init__(self, progress, description):
        # Enum unpacks the member's tuple value into these two arguments.
        self.progress, self.description = progress, description

    @classmethod
    def get_progress(cls, status):
        """Return the ``progress`` attribute of ``status``."""
        return status.progress

    @classmethod
    def get_description(cls, status):
        """Return the ``description`` attribute of ``status``."""
        return status.description


class VoiceTone(Enum):
    """Voice tone of a phrase.

    A member's value is an ``(id, point, description)`` tuple; ``__init__``
    stores the three parts as attributes. ``str(member)`` is the description.
    """

    NEUTRAL = (0, 0, 'Нейтрально')
    ANGRY = (1, -1, 'Злобно')
    POSITIVE = (2, 1, 'Позитивно')
    SAD = (3, -0.5, 'Грустно')
    UNKNOWN = (4, -0.1, 'Другое')

    def __init__(self, id, point, description):
        self.id = id
        self.point = point
        self.description = description

    def __str__(self):
        return f"{self.description}"

    @classmethod
    def from_id(cls, id):
        """Return the member whose ``id`` attribute equals ``id``.

        Args:
            id: Numeric tone id (0-4 for the members defined here).

        Returns:
            The matching ``VoiceTone`` member.

        Raises:
            ValueError: If no member has that id.
        """
        # Compare against the requested id. The previous code compared each
        # member's id with constants and so always returned NEUTRAL.
        for member in cls:
            if member.id == id:
                return member
        raise ValueError(f"No enum member with id {id}")

    @classmethod
    def get_progress(cls, status):
        """Return ``status.progress``.

        NOTE(review): VoiceTone members have no ``progress`` attribute, so this
        only works for objects that define one (e.g. ProcessingStatus members).
        It looks copied from ProcessingStatus; kept so existing callers do not break.
        """
        return status.progress


@dataclass
class PhraseEntity:
    """One transcribed phrase of a single speaker."""
    # Phrase boundaries. AudioAnalysis._resolve_emotion multiplies them by 1000
    # before slicing the audio, i.e. it treats them as seconds.
    start_time: float
    end_time: float
    # Transcribed text of the phrase.
    text: str
    # Speaker label taken from the transcription result.
    speaker_id: str
    # Tone of the phrase; None until emotion resolution has filled it in.
    voice_tone: Optional["VoiceTone"] = None

    def __str__(self):
        return f"{self.speaker_id} | {self.start_time:.2f} - {self.end_time:.2f}: {self.text} [{self.voice_tone}]"


class AudioAnalysis:
    """Analyse one audio file: transcription, entity extraction, emotion.

    After ``run_pipeline`` the outcome is available on the instance:
    ``phrases`` (list of PhraseEntity), ``entities`` (NER output per phrase),
    ``status`` (a ProcessingStatus member) and ``progress`` (its progress value).
    """

    def __init__(self, audio_path: str):
        """Prepare the analysis of ``audio_path``.

        Args:
            audio_path: Path of the audio file; it is joined onto the current
                user's home directory.
        """
        logger.debug(f'Создан Audio Analisys для {audio_path}')
        self.audio_path = os.path.join(os.path.expanduser("~"), audio_path)
        self.progress = 0.0
        self.status = ProcessingStatus.INITIALIZED
        self.phrases = []
        self.entities = None
        # The bar spans 0..1.0 and is positioned from ProcessingStatus progress values.
        self.pbar = tqdm(total=1.0, initial=0, mininterval=0.5, leave=False)

    def run_pipeline(self):
        """Run all stages and store their results on the instance.

        Errors are logged with their traceback and reflected in ``status``
        (``FAILED``); they are not re-raised.

        Returns:
            None, on success as well as on failure.
        """
        try:
            # Transcribe
            self._update_status(ProcessingStatus.TRANSCRIBING)
            audio_transcription = AudioTranscription(self.audio_path)
            audio_transcription.full_transcribe()
            for transcription_result in audio_transcription.results["audio_to_text"]:
                self.phrases.append(PhraseEntity(
                    start_time=transcription_result.start_time,
                    end_time=transcription_result.end_time,
                    speaker_id=transcription_result.speaker_id,
                    text=transcription_result.text,
                    voice_tone=None
                ))

            # NER processing
            self.entities = self._extract_entities()

            self._update_status(ProcessingStatus.RESOLVING_EMOTION)
            self._resolve_emotion()

            self._update_status(ProcessingStatus.DONE)
        except Exception as e:
            # logger.exception keeps the traceback, which logger.error dropped.
            logger.exception(f"Error in {e}")
            self._update_status(ProcessingStatus.FAILED)
        finally:
            # Release the progress bar on failure too, not only on success.
            self.pbar.close()
        return None

    def _resolve_emotion(self):
        """Set ``voice_tone`` on every phrase from its slice of the audio."""
        audio = AudioSegment.from_file(self.audio_path)
        for phrase in self.phrases:
            # Phrase times are multiplied by 1000 to get the slice bounds.
            audio_part = audio[phrase.start_time * 1000:phrase.end_time * 1000]
            emotion = predict_emotion(audio_part)
            phrase.voice_tone = VoiceTone.from_id(emotion)

    def _extract_entities(self) -> List[List[NamedEntity]]:
        """Run NER over every phrase.

        Returns:
            One list of NamedEntity per phrase in which something was found;
            phrases without entities contribute nothing.
        """
        self._update_status(ProcessingStatus.RETRIEVING_ENTITIES)
        results = []
        # The pipeline is built on every call from the local model directory.
        ner_pipeline = pipeline("ner", model="./ner-model-2.0", tokenizer=tokenizer, device=device)

        for phrase in self.phrases:
            parsed = parse_result(ner_pipeline(phrase.text))
            if len(parsed) > 0:
                results.append(parsed)
        return results

    def _update_status(self, status):
        """Record ``status`` and mirror it on the progress bar."""
        self.status = status
        # Keep the public progress attribute in step with the status.
        self.progress = ProcessingStatus.get_progress(self.status)
        self.pbar.set_description(f'{ProcessingStatus.get_description(self.status)} - {self.audio_path}')
        self.pbar.n = self.progress
        self.pbar.refresh()


In [None]:
# Transcribe a single recording with the project's AudioTranscription class.
# NOTE(review): how the relative file name is resolved is up to AudioTranscription -- confirm.
ar = AudioTranscription('Разговор-7.mp3')
ar.full_transcribe()
# ar.just_diarize()

  0%|          | 0/1.0 [00:00<?, ?it/s]

In [27]:
# Show the `results` attribute of `ar`; AudioAnalysis.run_pipeline reads its
# "audio_to_text" entry from an AudioTranscription object.
ar.results












  0%|                                              | 0.00/1.42G [00:00<?, ?iB/s][A[A[A[A[A[A[A[A[A[A[A










  0%|                                      | 176k/1.42G [00:00<14:18, 1.78MiB/s][A[A[A[A[A[A[A[A[A[A[A










  0%|                                      | 352k/1.42G [00:00<17:56, 1.42MiB/s][A[A[A[A[A[A[A[A[A[A[A










  0%|                                     | 496k/1.42G [00:01<1:21:17, 313kiB/s][A[A[A[A[A[A[A[A[A[A[A










  0%|                                     | 584k/1.42G [00:01<1:39:35, 256kiB/s][A[A[A[A[A[A[A[A[A[A[A










  0%|                                     | 648k/1.42G [00:02<1:48:55, 234kiB/s][A[A[A[A[A[A[A[A[A[A[A










  0%|                                     | 696k/1.42G [00:02<2:02:16, 208kiB/s][A[A[A[A[A[A[A[A[A[A[A










  0%|                                     | 736k/1.42G [00:02<2:21:07, 180kiB/s][A[A[A[A[A[A[A[A[A[A[A









Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0-23): 24 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=1024, out_features=1024, bias=True)
          (key): Linear(in_features=1024, out_features=1024, bias=False)
          (value): Linear(in_features=1024, out_features=1024, bias=True)
          (out): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=1024, out_features=4096, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=4096, out_features=1024, bias=True)
        )
        (mlp_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm((

In [42]:
# Print every phrase of `ar`, one per line.
# NOTE(review): in this notebook `phrases` is defined on AudioAnalysis, while
# `ar` is created above as an AudioTranscription. Confirm which object this
# cell expects before relying on Restart & Run All.
for a in ar.phrases:
    print(a)

Speaker_2 | 0.00 - 14.00:  Добрый день, я Алина, администратор бутик отеля Неба. Каждый месяц у нас накапливается около 300-350 документов бронирования, паспортные данные, гостей, акты сверки. Можно как-то автоматизировать? [Нейтрально]
Speaker_1 | 15.00 - 34.00:  Добрый день, Алина. Я Артем, менеджер компании HotelDoc. Наше решение для отелей. Атоматически извлекаю данные из паспортов и виз, даже иностранных. Синхронизируется с пмс-системами, например, опера. Упробатывает до 500 документов в месяц. Предлагаю бесплатно обработать 30 ваших документов. [Нейтрально]
Speaker_1 | 35.00 - 42.00:  Также хотелось бы подробнее обсудить некоторые детали. Вы можете оставить свой сотовый номер телефона? [Нейтрально]
Speaker_2 | 42.00 - 50.00:  Да, конечно. Мой номер 7-915-222-33-44. [Нейтрально]
Speaker_1 | 51.00 - 55.00:  Отлично, мы связываемся с вами. Спасибо за то, что выбрали нас. [Нейтрально]
Speaker_2 | 56.00 - 57.00:  Спасибо, до свидания. [Нейтрально]
Speaker_2 | 58.00 - 59.00:  До свидан

In [28]:
# file_paths = ['Разговор-15.mp3', 'Разговор-14.mp3', 'Разговор-13.mp3', 'Разговор-12.mp3',
#               'Разговор-11.mp3', 'Разговор-10.mp3', 'Разговор-9.mp3', 'Разговор-8.mp3',
#               'Разговор-7.mp3', 'Разговор-6.mp3', 'Разговор-20.mp3', 'Разговор-15.mp3',
#               'Разговор-15.mp3', 'Разговор-15.mp3', 'Разговор-15.mp3', 'Разговор-15.mp3',
#               'Разговор-15.mp3', 'Разговор-15.mp3', 'Разговор-15.mp3', 'Разговор-15.mp3', ]
#
# results_to_print = []
#
#
# def process_file(audio_path):
#     audio_analisys = AudioAnalysis(audio_path)
#     audio_analisys.run_pipeline()
#     logger.debug(audio_analisys.results['entities'])
#
#
# #ЗАПУСКАЕМ ОБРАБОТКУ ФАЙЛОВ
# with ThreadPoolExecutor(max_workers=4) as executor:
#     executor.map(process_file, file_paths)

In [29]:
# #ВЫВОД ИЗВЛЕЧЕННЫХ СУЩНОСТЕЙ
# for result in results_to_print:
#     res_dict = {}
#     for data in result[0]:
#         for key, value in data.items():
#             if key not in res_dict:
#                 res_dict[key] = set()
#             for val in value:
#                 res_dict[key].add(val)
#     print(f'Файл {result[1]}:')
#     for key, value in res_dict.items():
#         # print(f'{dict_names[key]}:')
#         for val in value:
#             print(f'  {val}')
#     print()

In [29]:
def convert_to_huggingface_format(tagged_tokens):
    """Split ``(token, label)`` pairs into two parallel lists.

    Args:
        tagged_tokens: Sequence of two-item ``(token, label)`` pairs.

    Returns:
        ``{"tokens": [...], "ner_tags": [...]}`` with both lists in input order.
    """
    return {
        "tokens": [word for word, _ in tagged_tokens],
        "ner_tags": [tag for _, tag in tagged_tokens],
    }


def normalize_phone_in_text(text):
    """Rewrite phone numbers found in ``text`` as ``+7`` followed by ten digits.

    A number is an optional country code (``+7``, ``8`` or ``7``) followed by
    digit groups of 3, 3, 2 and 2, optionally separated by whitespace, hyphens
    or brackets. Everything else in the text is returned unchanged.

    Args:
        text: Text that may contain phone numbers.

    Returns:
        The text with every recognised number replaced by ``+7XXXXXXXXXX``.
    """
    phone_pattern = r'''
        (?<!\d)                       # never start in the middle of a longer digit run
        (?:
            (?:\+7|8|7)[\s\-()]*      # country code (+7/8/7) and the separators after it
          | \(\s*                     # or only the opening bracket of the area code
        )?
        (\d{3})                       # 3 digits (XXX)
        [\s\-()]*
        (\d{3})                       # 3 digits (YYY)
        [\s\-()]*
        (\d{2})                       # 2 digits (ZZ)
        [\s\-()]*
        (\d{2})                       # 2 digits (ZZ)
        (?!\d)                        # never stop in the middle of a longer digit run
    '''

    # Separators are only consumed after a country code or opening bracket, so
    # whitespace in front of a number stays in the text and the number is not
    # glued to the preceding word.
    normalized_text = re.sub(
        phone_pattern,
        lambda m: f"+7{m.group(1)}{m.group(2)}{m.group(3)}{m.group(4)}",
        text,
        flags=re.VERBOSE
    )

    return normalized_text

def _tag_marked_words(words, known_labels):
    """Turn tokens with inline ``NERTAG`` markers into ``(token, label)`` pairs.

    Marker shapes that are handled:
      * ``NERTAG<text>NERTAG<label>``: a single-word entity;
      * ``NERTAG<text>``: the first word of a multi-word entity;
      * ``<text>NERTAG<label>``: the last word of a multi-word entity.
    Tokens between an opening and a closing marker belong to that entity.
    Tokens outside any entity are labelled ``"O"``. Entities whose label is not
    in ``known_labels`` and marker tokens of any other shape are dropped.
    """
    tagged_tokens = []
    words_in_tag = []  # words of the multi-word entity that is currently open
    for word in words:
        if 'NERTAG' not in word:
            if words_in_tag:
                words_in_tag.append(word)
            else:
                tagged_tokens.append((word, "O"))
            continue

        parts = word.split('NERTAG')
        if word.startswith('NERTAG'):
            if len(parts) == 3:
                entity_text, entity_label = parts[1], parts[2]
                if entity_label in known_labels:
                    tagged_tokens.append((entity_text, entity_label))
                    words_in_tag = []
            else:
                words_in_tag.append(parts[1])
            continue

        if len(parts) == 2:
            entity_label = parts[1]
            words_in_tag.append(parts[0])
            if entity_label in known_labels:
                tagged_tokens.extend((inner, entity_label) for inner in words_in_tag)
            # Always close the entity, also for an unknown label; otherwise all
            # following plain words would be swallowed into the next entity.
            words_in_tag = []
    return tagged_tokens


def build_model_dataset():
    """Build the NER dataset from the annotated text files.

    Every file matched by ``EMBEDDED_FOLDER_PATH`` is read as UTF-8, its phone
    numbers are normalised, ``%%`` markers are replaced by ``NERTAG``, and the
    text is tokenised with the ``ru_core_news_sm`` spaCy model and converted to
    ``{"tokens": [...], "ner_tags": [...]}``.

    NOTE(review): ``spacy``, ``EMBEDDED_FOLDER_PATH`` and ``TAG_LIST`` are not
    defined in the visible part of this notebook. They must be provided before
    this function is called -- confirm where they come from.

    Returns:
        A list with one dataset record per input file.
    """
    nlp = spacy.load("ru_core_news_sm")
    files = glob.glob(EMBEDDED_FOLDER_PATH)
    data = []
    for file in files:
        with open(file, 'r', encoding='utf-8') as f:
            content = f.read()
        content = normalize_phone_in_text(content)
        doc = nlp(content.replace('%%', 'NERTAG'))
        words = [token.text for token in doc]
        tagged_tokens = _tag_marked_words(words, TAG_LIST)
        data.append(convert_to_huggingface_format(tagged_tokens))
    return data