# Kyrgyz Diacritics Restorer using Transformer

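This notebook trains and tests a Transformer-based model that restores diacritics in Kyrgyz text. It is intended to run on Google Colab: the model and training code are imported from the `ky_diacritics_restorer` repository, and the dataset archive is read from Google Drive.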

In [None]:
# Attach Google Drive to the Colab runtime; the dataset archive is read from
# /content/drive/MyDrive in a later cell.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# Sanity check: list the dataset archive on the mounted Drive (long listing
# with human-readable sizes) before a later cell tries to unzip it.
!ls -lah /content/drive/MyDrive/ky_diacritics_dataset.zip

In [None]:
# Clone the repository
!git clone https://github.com/jumasheff/ky_diacritics_restorer.git
%cd ky_diacritics_restorer

In [None]:
!unzip /content/drive/MyDrive/ky_diacritics_dataset.zip -d /content/ky_diacritics_restorer/

In [None]:
# Import necessary libraries
import torch
from model import KyrgyzTextDataset, DiacriticsRestorer
from train import train, test_model

In [None]:
# Training hyperparameters; these are passed to train() in the training cell.
EPOCHS = 10
BATCH_SIZE = 32
LEARNING_RATE = 1e-4

# Seed PyTorch's random number generators so that PyTorch-side randomness
# starts from the same state on every run. (The dataset sampling takes its
# own seed argument.)
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

In [None]:
# Build the dataset from the TSV file in the current working directory.
dataset = KyrgyzTextDataset(
    'dataset.tsv',
    max_len=512,
    sample_ratio=0.25,  # work with a quarter of the data
    val_ratio=0.1,      # 10% of that quarter is held out for validation
    seed=42,            # fixed seed for reproducibility
)

# Report the dataset statistics, one "label: value" line per entry.
info = dataset.get_dataset_info()
print("\nDataset Information:")
for label, key in (
    ("Total samples", 'total_samples'),
    ("Used samples", 'used_samples'),
    ("Training samples", 'train_samples'),
    ("Validation samples", 'val_samples'),
    ("Vocabulary size", 'vocab_size'),
    ("Max sequence length", 'max_len'),
):
    print(f"{label}: {info[key]}")

In [None]:
# Instantiate the model. The vocabulary size is the number of entries in the
# dataset's character-to-index mapping.
char_vocab_size = len(dataset.char_to_idx)
model = DiacriticsRestorer(
    vocab_size=char_vocab_size,
    d_model=256,
    nhead=8,
    num_encoder_layers=6,
    dim_feedforward=1024,
    dropout=0.1,
    max_len=512,  # same value as the max_len given to KyrgyzTextDataset
)

In [None]:
# Run training with the hyperparameters from the configuration cell.
train_kwargs = dict(
    model=model,
    train_dataset=dataset,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
)
# train() yields two values; only the first (the training losses) is kept.
train_losses, _ = train(**train_kwargs)

In [None]:
# Test the model on a few short phrases.
# The phrases are deliberately written without diacritics (plain 'о'/'у'),
# so the model has something to restore.
# NOTE: every letter must be Cyrillic. The first sample previously appeared
# to contain a Latin 'c' (U+0063) in place of the look-alike Cyrillic 'с'
# (U+0441), which is a different character for a character-level vocabulary.
test_samples = [
    "кыргызстан онугот",
    "мен онугом",
    "биз онугобуз",
]

test_model(model, dataset, test_samples)