Автор: Лещенко Сергей \
Год: 2025

In [1]:
import json
import math
import xml.etree.ElementTree as ET
from typing import Dict, List

import evaluate
import numpy as np
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import (
    T5ForConditionalGeneration,
    AutoTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from transformers.trainer_utils import get_last_checkpoint

## Подготовка данных

In [2]:
# Paths: the annotated XML corpus parsed for training, and the directory that is
# passed to the trainer as output_dir and receives the saved model and tokenizer.
XML_train_path = "./text_leshchenko.Result.xml"
output_dir     = "./Allophone-model/"

# model_name   = "ai-forever/ruT5-base"  # base model to fine-tune from
# NOTE(review): the active value below points at the already fine-tuned model in
# the output directory; presumably the base model above must be used on a first
# run, before that directory contains a saved model — confirm.
model_name     = "./Allophone-model/"  # fine-tuned model

In [3]:
# Parse the training XML and build one {"input", "target"} record per <word>.
tree = ET.parse(XML_train_path)
root = tree.getroot()

data = []

for sentence_node in root.findall(".//sentence"):
    for word_node in sentence_node.findall(".//word"):
        # Non-empty `char` attributes of the <letter> children, in document order.
        letter_chars = [
            child.get("char")
            for child in word_node
            if child.tag == "letter" and child.get("char")
        ]
        # Non-empty `ph` attributes of the <allophone> children, in document order.
        phones = [
            child.get("ph")
            for child in word_node
            if child.tag == "allophone" and child.get("ph")
        ]

        # Input: the word's `original` attribute followed by its space-separated letters.
        # Target: the space-separated allophone labels.
        spelled_out = " ".join(letter_chars)
        data.append({
            "input": f"{word_node.get('original')} {spelled_out}",
            "target": " ".join(phones),
        })

In [4]:
data[10]

{'input': 'товарищей т о в а р и щ е й', 'target': "t a1 v a0 r' i4 sc i4 j"}

In [5]:
# Load the tokenizer and the T5 seq2seq model from the same location (`model_name`).
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

The module name  (originally ) is not a valid Python identifier. Please rename the original module to avoid import issues.


In [6]:
def prepare_dataset(data: List[Dict], tokenizer, max_input_length=128, max_target_length=128):
    """Tokenize grapheme/allophone pairs into a `datasets.Dataset` for seq2seq training.

    Args:
        data: Records with an "input" string and a "target" string.
        tokenizer: Tokenizer called on lists of strings; its "input_ids" output
            for the targets becomes the "labels" column.
        max_input_length: Inputs longer than this many tokens are truncated.
        max_target_length: Targets longer than this many tokens are truncated.

    Returns:
        A dataset holding the tokenizer outputs for the prefixed inputs plus a
        "labels" column; the raw "input"/"target" columns are removed.

    Note:
        Neither inputs nor labels are padded here. Batches are expected to be
        padded dynamically by the data collator (this notebook uses
        `DataCollatorForSeq2Seq(..., padding=True)`), which avoids padding
        every short single-word example up to `max_input_length`.
    """

    def preprocess_function(samples):
        # Task prefix used for task-specific fine-tuning.
        inputs = ["grapheme to phoneme: " + text for text in samples["input"]]

        # Variable-length encodings; padding is deferred to the collator.
        model_inputs = tokenizer(
            inputs,
            max_length=max_input_length,
            truncation=True,
            padding=False
        )

        labels = tokenizer(
            samples["target"],
            max_length=max_target_length,
            truncation=True,
            padding=False
        )

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    dataset = Dataset.from_dict({
        "input": [sample["input"] for sample in data],
        "target": [sample["target"] for sample in data],
    })

    return dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=dataset.column_names
    )

In [7]:
# Hold out 10% of the records for validation (fixed seed), then tokenize both parts.
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

train_dataset, val_dataset = (
    prepare_dataset(split, tokenizer) for split in (train_data, val_data)
)

Map:   0%|          | 0/24844 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

In [8]:
train_data[0:10]

[{'input': 'скамейках с к а м е й к а х', 'target': "s k a1 m' e0 j k a4 h"},
 {'input': 'вообще в о о б щ е', 'target': 'v a2 a1 p sc e0'},
 {'input': 'опередил о п е р е д и л', 'target': "a1 p' i1 r' i1 d' i0 l"},
 {'input': 'пополудни. п о п о л у д н и',
  'target': "p a2 p a1 l u0 d' n' i4"},
 {'input': 'знать з н а т ь', 'target': "z n a0 d'"},
 {'input': 'была б ы л а', 'target': 'b y1 l a0'},
 {'input': 'Буй б у й', 'target': 'b u0 j'},
 {'input': 'и и', 'target': 'i1'},
 {'input': 'июля и ю л я', 'target': "i1 j u0 l' a4"},
 {'input': 'за з а', 'target': 'z a2'}]

In [9]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 24844
})

## Обучение и тестирование модели

In [10]:
def compute_metrics(eval_pred, tokenizer):
    """Compute character error rate and exact-match accuracy for generated sequences.

    Args:
        eval_pred: Pair `(predictions, labels)` of token-id arrays. `predictions`
            may also be a tuple whose first element is the token-id array.
        tokenizer: Tokenizer providing `pad_token_id` and `batch_decode`.

    Returns:
        Dict with "cer" (value returned by the `evaluate` "cer" metric on the
        decoded strings) and "accuracy" (share of predictions that equal their
        reference after stripping surrounding whitespace).
    """
    predictions, labels = eval_pred

    # Some model outputs arrive as a tuple; the token ids are its first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Ignored positions are marked with negative ids; map them to the pad token
    # so that decoding works and `skip_special_tokens` drops them.
    predictions = np.where(predictions < 0, tokenizer.pad_token_id, predictions)
    labels = np.where(labels < 0, tokenizer.pad_token_id, labels)

    # Token ids back to strings.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Load the CER metric once and reuse it: this function runs at every
    # evaluation step, and reloading the metric each time is pure overhead.
    cer_metric = getattr(compute_metrics, "_cer_metric", None)
    if cer_metric is None:
        cer_metric = evaluate.load("cer")
        compute_metrics._cer_metric = cer_metric
    cer = cer_metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Word accuracy: exact match of the whole allophone string.
    correct = sum(pred.strip() == label.strip() for pred, label in zip(decoded_preds, decoded_labels))
    accuracy = correct / len(decoded_preds)

    return {"cer": cer, "accuracy": accuracy}

In [11]:
# Pads inputs and labels per batch, so examples only need to be as long as the
# longest item in their batch.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=f"{output_dir}/logs",
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    # Kept a multiple of eval_steps so every saved checkpoint has an evaluation
    # result to compare when the best model is selected.
    save_steps=1000,
    save_total_limit=3,
    load_best_model_at_end=True,
    # Best model = lowest character error rate reported by compute_metrics.
    metric_for_best_model="cer",
    greater_is_better=False,
    fp16=False,
    # Evaluate on generated sequences, which is what compute_metrics decodes.
    predict_with_generate=True,
    generation_max_length=128,
    max_grad_norm=1.0,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated for the trainer and emitted a warning on this
    # call; `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=lambda eval_pred: compute_metrics(eval_pred, tokenizer),
    # Stop when "cer" has not improved for 6 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=6)]
)

  trainer = Seq2SeqTrainer(


In [None]:
# Resume from the most recent checkpoint found under `output_dir` instead of a
# hard-coded checkpoint number: old checkpoints are rotated out by
# save_total_limit, and a fixed path fails as soon as that folder is gone.
# get_last_checkpoint returns None when no checkpoint folder exists, in which
# case training starts from the currently loaded weights.
last_checkpoint = get_last_checkpoint(output_dir)

trainer.train(resume_from_checkpoint=last_checkpoint)

In [None]:
# Persist the trainer's model and the tokenizer into `output_dir`.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

In [12]:
# Evaluate on the validation split and report both metrics with four decimals.
results = trainer.evaluate(val_dataset)

for title, metric_key in (
    ("Character Error Rate", "eval_cer"),
    ("Word Accuracy", "eval_accuracy"),
):
    print(f"{title}: {results[metric_key]:.4f}")

Character Error Rate: 0.0185
Word Accuracy: 0.8732


In [13]:
# Loaded (tokenizer, model) pairs keyed by (model_path, device). Without this
# cache every call re-read the model from disk, including once per batch in a
# batched prediction loop. Call `_ALLOPHONE_MODEL_CACHE.clear()` after saving
# new weights to the same path so that they are picked up.
_ALLOPHONE_MODEL_CACHE = {}


def _load_allophone_model(model_path, device):
    """Return the (tokenizer, model) pair for `model_path` on `device`, loading it on first use."""
    key = (str(model_path), device)
    if key not in _ALLOPHONE_MODEL_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)
        _ALLOPHONE_MODEL_CACHE[key] = (tokenizer, model)
    return _ALLOPHONE_MODEL_CACHE[key]


def predict_allophones(texts, model_path):
    """Generate allophone strings for a batch of input texts.

    Args:
        texts: Iterable of input strings in the same "<word> <letters>" form as
            the training inputs, without the task prefix (it is added here).
        model_path: Location of the saved tokenizer and model.

    Returns:
        List of decoded predictions, one per input text, in input order.
        An empty input yields an empty list without loading the model.
    """
    texts = list(texts)
    if not texts:
        return []

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer, model = _load_allophone_model(model_path, device)

    input_text = ["grapheme to phoneme: " + text for text in texts]
    # Pad only to the longest item in this batch rather than to 128 tokens.
    inputs = tokenizer(input_text, max_length=128, truncation=True, padding=True, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=128, num_beams=5, early_stopping=True)

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [14]:
# Smoke test: predict three words with the model stored in ./Allophone-model
# and print each word above its predicted allophones.
allophones = predict_allophones(
    ["Больше б о л ь ш е", "всего в с е г о", "человек ч е л о в е к"],
    "./Allophone-model",
)

for position, shown_word in enumerate(("Больше", "всего", "человек")):
    if position > 0:
        print("-" * 30)  # separator between entries, none after the last one
    print(shown_word)
    print(allophones[position])

Больше
b o0 l' sh y4
------------------------------
всего
f s' i1 v o0
------------------------------
человек
ch i1 l a1 v' e0 k


## Предсказание аллофонов

In [15]:
XML_test_path = "./Example_Input_Phonetics.xml"

tree = ET.parse(XML_test_path)
root = tree.getroot()

# One model input per <word>: its `original` attribute followed by the
# space-separated, non-empty `char` attributes of its <letter> children.
data = [
    "{} {}".format(
        word_node.get("original"),
        " ".join(
            child.get("char")
            for child in word_node
            if child.tag == "letter" and child.get("char")
        ),
    )
    for sentence_node in root.findall(".//sentence")
    for word_node in sentence_node.findall(".//word")
]

In [16]:
# Predict allophones batch by batch

batch_size = 64
allophones = []

# Number of batches needed to cover `data`, the last one possibly shorter.
steps = math.ceil(len(data) / batch_size)

for step in tqdm(range(steps)):
    start = step * batch_size
    chunk = data[start:start + batch_size]
    allophones += predict_allophones(chunk, "./Allophone-model")

100%|██████████| 1/1 [00:01<00:00,  1.84s/it]


In [17]:
data

['Больше б о л ь ш е',
 'всего в с е г о',
 'человек ч е л о в е к',
 'ненавидит н е н а в и д и т',
 'тех, т е х',
 'кто к т о',
 'смеется с м е ё т с я',
 'ему е м у',
 'прямо п р я м о',
 'в в',
 'глаза, г л а з а',
 'а а',
 'не н е',
 'тех, т е х',
 'кто к т о',
 'смеется с м е ё т с я',
 'за з а',
 'его е г о',
 'спиной. с п и н о й']

In [18]:
allophones

["b o0 l' sh y4",
 "f s' i1 v o0",
 "ch i1 l a1 v' e0 k",
 "n' i1 n a1 v' i0 d' i4 t",
 "t' e0 h",
 'k t o0',
 "s m' i1 j o0 c a4",
 'j i1 m u0',
 "p r' a0 m a4",
 'v',
 'g l a1 z a0',
 'a1',
 "n' i1",
 "t' e0 h",
 'k t o0',
 "s m' i1 j o0 c a4",
 'z a2',
 'j i1 v o0',
 "s p' i1 n o0 j"]

In [19]:
# Save the results to a JSON file

# One entry per word: the first whitespace-separated token of the model input
# as "content", and the predicted allophones split into a list.
word_entries = []
for source_text, predicted_text in zip(data, allophones):
    word_entries.append({
        "content": source_text.split()[0],
        "allophones": predicted_text.split(),
    })

output = [{"words": word_entries}]

with open("leshchenko_phonetics.json", "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=4)