In [None]:
!pip install pyannote.audio
!pip install openai-whisper
!pip install torchaudio
!pip install -q torch
!pip install python-Levenshtein
!pip install pydu
!pip install speechbrain
!pip install torchaudio
!pip install silero-vad

In [2]:
import statsmodels.stats.proportion as proportion
from sklearn.cluster import KMeans
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as snswsadx
import pandas as pd
import numpy as np
import Levenshtein
import torchaudio
import sqlalchemy
import psycopg2
import colorlog
import warnings
import logging
import whisper
import timeit
import torch
import spacy
import time
import sys
import ast
import os
import re
import re
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
from transformers import AutoTokenizer, AutoModelForTokenClassification
from scipy.stats import ttest_ind,mannwhitneyu,shapiro,norm
from speechbrain.pretrained import EncoderClassifier
from concurrent.futures import ThreadPoolExecutor
from statsmodels.stats.weightstats import ztest
from IPython.display import display, HTML
from datetime import datetime, date, time
from sqlalchemy import create_engine
from scipy.stats import skew, mode
from transformers import pipeline
from tqdm import tqdm_notebook
from itertools import groupby
from datasets import Dataset
from scipy import stats
from tqdm import tqdm


INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [allow_tf32, disable_jit_profiling]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
  from speechbrain.pretrained import EncoderClassifier
INFO:datasets:PyTorch version 2.6.0 available.


In [3]:
print(f"PyTorch version: {torch.__version__}")
print(f"MPS (Metal GPU) available: {torch.backends.mps.is_available()}")
print(f"MPS device name: {torch.device('mps')}")

PyTorch version: 2.6.0
MPS (Metal GPU) available: True
MPS device name: mps


In [4]:
FILE_NAME = "test_audio.mp3"
BERT_MODEL_NAME = 'bert-base-multilingual-cased'
TRANSCRIBE_MODEL_NAME = 'medium'

id_to_label = {0: 'DOC', 1: 'MDT', 2: 'NAME', 3: 'O', 4: 'ORG', 5: 'POS', 6: 'TEL', 7: 'VOL'}
dict_names = {'ORG': 'Организация', 'VOL': 'Объем обработки документов', 'NAME': 'Имя', 'TEL': 'Телефон'}

warnings.filterwarnings("ignore", message="FP16 is not supported on CPU")
logging.getLogger("speechbrain").setLevel(logging.WARNING)
LOGGER_LEVEL = logging.DEBUG

In [5]:
def setup_logger():
    logger = colorlog.getLogger(__name__)
    
    logger.handlers = []
    logger.propagate = False
    
    root_logger = logging.getLogger()
    if any(isinstance(h, colorlog.StreamHandler) for h in root_logger.handlers):
        root_logger.handlers = []
    
    handler = colorlog.StreamHandler(sys.stdout)
    handler.setFormatter(colorlog.ColoredFormatter(
        fmt='%(log_color)s%(asctime)s - %(levelname)-8s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        log_colors={
            'DEBUG': 'cyan',
            'INFO': 'green',
            'WARNING': 'yellow',
            'ERROR': 'red',
            'CRITICAL': 'white,bg_red',
        }
    ))
    
    logger.addHandler(handler)
    logger.setLevel(LOGGER_LEVEL)
    return logger

logging.getLogger().handlers = []
logging.root.handlers = []

logger = setup_logger()
logger.info("Logging configured")

[32m2025-04-05 14:38:01 - INFO     - Logging configured[0m


In [6]:
def transcribe(file_path):
	logger.info(f"{os.path.basename(file_path)} - TRANSCRIBING..")
	model = whisper.load_model(TRANSCRIBE_MODEL_NAME)
	return model.transcribe(file_path)

In [7]:
from huggingface_hub import login
login()

In [9]:
def diarize(file_path, min_segment_length=1.5):
    logger.info(f"{os.path.basename(file_path)} - DIARIZING..")
    vad_model = load_silero_vad()
    embedding_model = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-ecapa-voxceleb",
        savedir="pretrained_models"
    )
    
    audio = read_audio(file_path)
    speech_timestamps = get_speech_timestamps(audio, vad_model, return_seconds=True)
    
    valid_segments = []
    embeddings = []
    
    for segment in speech_timestamps:
        duration = segment['end'] - segment['start']
        if duration < min_segment_length:
            continue
            
        start = int(segment['start'] * 16000)
        end = int(segment['end'] * 16000)
        segment_wav = audio[start:end]
        
        if duration < 1.5:
            padding = int((1.5 - duration) * 16000)
            segment_wav = torch.nn.functional.pad(segment_wav, (0, padding))
            
        segment_wav = segment_wav.unsqueeze(0)
        
        try:
            embedding = embedding_model.encode_batch(segment_wav)
            embeddings.append(embedding.squeeze().numpy())
            valid_segments.append(segment)
        except Exception as e:
            logger.error(f"Error processing segment: {e}")
            continue
    kmeans = KMeans(n_clusters=2, random_state=42)
    labels = kmeans.fit_predict(np.array(embeddings))
    
    return valid_segments, labels

In [10]:
def print_united_result():
	for view_res in silero_vad_speakers:
		logger.debug(f'Speaker: {view_res[0]} - {view_res[1]}')
		
def unite_results(transcribed_result, diarized_result, labels, logging = True):
	diarization_result = []
	base_string_res = []
	for i, segment in enumerate(diarized_result):
		speaker = f"Speaker_{labels[i] + 1}"
		diarization_result.append({
			"start": segment['start'],
			"end": segment['end'],
			"speaker": speaker
		})
	silero_vad_speakers = []
	for segment in transcribed_result["segments"]:
			start = segment["start"]
			end = segment["end"]
			text = segment["text"]
			max_overlap = 0
			best_speaker = None
	
			for diarization_segment in diarization_result:
				overlap_start = max(start, diarization_segment["start"])
				overlap_end = min(end, diarization_segment["end"])
				overlap = max(0, overlap_end - overlap_start)
	
				if overlap > max_overlap:
					max_overlap = overlap
					best_speaker = diarization_segment["speaker"]
	
			if best_speaker is None:
				for diarization_segment in diarization_result:
					if diarization_segment["end"] >= start or diarization_segment["start"] <= end:
						best_speaker = diarization_segment["speaker"]
						break
	
			speaker = best_speaker if best_speaker else "Unknown"
			silero_vad_speakers.append((speaker, text))
			base_string_res.append(text)
	return base_string_res

In [11]:
def parse_result(results):
	readable_result = []
	for pred in results:
		label = id_to_label[int(pred["entity"].split("_")[1])]
		readable_result.append({
			"word": pred["word"],
			"entity": label,
			"score": pred["score"],
			"start": pred["start"],
			"end": pred["end"]
		})
	current_group = None
	result = {}
	current_phrase = ''
	def add_pair(key, value):
		if key not in result:
			result[key] = []
		result[key].append(value)
	for res in readable_result:
		word = res['word']
		group = res['entity']
		if group == "O":
			if current_group != None:
				add_pair(current_group, current_phrase)
			current_group = None
			current_phrase = ''
			continue
		if word.startswith('##'):
			current_phrase = current_phrase + word.replace('##', '')
		elif group == current_group:
			current_phrase = current_phrase + ' ' + word
		else:
			if current_group != None:
				add_pair(current_group, current_phrase)
			current_group = group
			current_phrase = word
	for key in result:
	    result[key] = list(set(result[key]))
	return result

In [17]:
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

def processFile(file_name):
	file_path = os.path.join(os.path.expanduser("~"), file_name)
	
	transcribed_result = transcribe(file_path)
	
	diarized_result, labels = diarize(file_path)
	
	processed_results = unite_results(transcribed_result, diarized_result, labels)
	
	ner_pipeline = pipeline("ner", model="./ner-model-2.0", tokenizer=tokenizer, device=device)

	results = []
	for processed_result in processed_results:
		parsed = parse_result(ner_pipeline(processed_result))
		if len(parsed) > 0:
			results.append(parsed)
	return results

In [26]:
# ПОДАЕМ НА ВХОД 3 АУДИО ФАЙЛА
file_paths = ['Разговор-14.mp3', 'Разговор-13.mp3', 'Разговор-12.mp3',]
results_to_print = []
    
def process_file(file_name):
    logger.info(f'Processing: {file_name}')
    results = processFile(file_name)
    for res in results:
        logger.debug(f'{file_name} - {res}')
    results_to_print.append((results, file_name))
    return results

# ЗАПУСКАЕМ ОБРАБОТКУ ФАЙЛОВ
with ThreadPoolExecutor(max_workers=4) as executor:
    executor.map(process_file, file_paths)

[32m2025-04-05 14:39:21 - INFO     - Processing: Разговор-14.mp3[0m
[32m2025-04-05 14:39:21 - INFO     - Processing: Разговор-13.mp3[0m
[32m2025-04-05 14:39:21 - INFO     - Processing: Разговор-12.mp3[0m
[32m2025-04-05 14:39:21 - INFO     - Разговор-14.mp3 - TRANSCRIBING..[0m
[32m2025-04-05 14:39:21 - INFO     - Разговор-13.mp3 - TRANSCRIBING..[0m
[32m2025-04-05 14:39:21 - INFO     - Разговор-12.mp3 - TRANSCRIBING..[0m
[32m2025-04-05 14:39:55 - INFO     - Разговор-14.mp3 - DIARIZING..[0m


  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)


[32m2025-04-05 14:40:01 - INFO     - Разговор-12.mp3 - DIARIZING..[0m
[36m2025-04-05 14:40:04 - DEBUG    - Разговор-14.mp3 - {'NAME': ['Марина Сергеевна']}[0m
[36m2025-04-05 14:40:04 - DEBUG    - Разговор-14.mp3 - {'VOL': ['450 документов']}[0m
[36m2025-04-05 14:40:04 - DEBUG    - Разговор-14.mp3 - {'NAME': ['Антон', 'Марина Сергеев'], 'ORG': ['SchoolD']}[0m
[36m2025-04-05 14:40:04 - DEBUG    - Разговор-14.mp3 - {'VOL': ['50 документов', '600 документов']}[0m
[36m2025-04-05 14:40:04 - DEBUG    - Разговор-14.mp3 - {'TEL': ['8983 572 92 31']}[0m
[32m2025-04-05 14:40:06 - INFO     - Разговор-13.mp3 - DIARIZING..[0m
[36m2025-04-05 14:40:08 - DEBUG    - Разговор-12.mp3 - {'NAME': ['Анна'], 'ORG': ['Вокруг света']}[0m
[36m2025-04-05 14:40:08 - DEBUG    - Разговор-12.mp3 - {'VOL': ['300', '200']}[0m
[36m2025-04-05 14:40:08 - DEBUG    - Разговор-12.mp3 - {'NAME': ['Максим', 'Анна']}[0m
[36m2025-04-05 14:40:08 - DEBUG    - Разговор-12.mp3 - {'VOL': ['20 документов']}[0m
[

In [27]:
#ВЫВОД ИЗВЛЕЧЕННЫХ СУЩНОСТЕЙ
for result in results_to_print:
	res_dict = {}
	for data in result[0]:
		for key, value in data.items():
			if key not in res_dict:
				res_dict[key] = set()
			for val in value:
				res_dict[key].add(val)
	print(f'Файл {result[1]}:')
	for key, value in res_dict.items():
		print(f'{dict_names[key]}:')
		for val in value:
			print(f'  {val}')
	print()

Файл Разговор-14.mp3:
Имя:
  Марина Сергеевна
  Антон
  Марина Сергеев
Объем обработки документов:
  50 документов
  450 документов
  600 документов
Организация:
  SchoolD
Телефон:
  8983 572 92 31

Файл Разговор-12.mp3:
Имя:
  Максим
  Анна
Организация:
  Вокруг света
Объем обработки документов:
  200
  300
  20 документов

Файл Разговор-13.mp3:
Имя:
  Екатерина
  Елена Смирнова
  Елена
Объем обработки документов:
  200 документов в месяц
  30 - 40 документов
  20 карт
Организация:
  BeautyDoc
Телефон:
  8357453112

