# Kyrgyz Diacritics Restorer Training

This notebook trains a model to restore diacritics in Kyrgyz text using a Transformer architecture. Training is tracked with Weights & Biases, and the final model is uploaded to the Hugging Face Hub.

In [None]:
# Install required packages
!pip install wandb transformers huggingface-hub

# Clone the repository
!git clone https://github.com/jumasheff/ky_diacritics_restorer.git
%cd ky_diacritics_restorer

In [None]:
# Unzip the dataset from Kaggle input directory
import os
import glob
import zipfile

# Find the dataset zip file anywhere below the Kaggle input directory.
# Sorting makes the choice deterministic if more than one archive matches.
matches = sorted(glob.glob("/kaggle/input/**/ky_diacritics_dataset.zip", recursive=True))
if not matches:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        "ky_diacritics_dataset.zip was not found under /kaggle/input; "
        "attach the dataset to this notebook before running this cell."
    )
dataset_path = matches[0]
print(f"Found dataset at: {dataset_path}")

# Create directory if it doesn't exist
os.makedirs("data", exist_ok=True)

# Unzip the dataset in Python rather than through the shell, so a path that
# contains spaces or shell metacharacters is handled correctly.
# Existing files are overwritten.
with zipfile.ZipFile(dataset_path) as archive:
    archive.extractall("data")

# Verify the files
print("\nExtracted files:")
for entry in sorted(os.listdir("data")):
    size_kib = os.path.getsize(os.path.join("data", entry)) / 1024
    print(f"  {entry}  ({size_kib:.1f} KiB)")

In [None]:
# Import necessary libraries
import torch
from model import KyrgyzTextDataset, DiacriticsRestorer
from train import train, test_model
import wandb
from huggingface_hub import HfApi, upload_file
import os
from getpass import getpass

In [None]:
# Authenticate with Weights & Biases; getpass keeps the key out of the
# notebook source and does not echo it while typing.
wandb_key = getpass("Enter your Weights & Biases API key: ")
wandb.login(key=wandb_key)

# Build the Hugging Face API client from a token entered the same way.
hf_token = getpass("Enter your Hugging Face token: ")
api = HfApi(token=hf_token)

# Target repository on the Hub. Creation is best-effort: an already existing
# repo is accepted (exist_ok=True) and any other failure is only reported.
repo_name = "murat/ky-diacritics-restorer"
try:
    api.create_repo(repo_id=repo_name, exist_ok=True)
except Exception as err:
    print(f"Note: {err}")

In [None]:
# Set training parameters
EPOCHS = 10
BATCH_SIZE = 32
LEARNING_RATE = 1e-4
PROJECT_NAME = "ky-diacritics-restorer"
SEED = 42  # same value the dataset cell passes as its `seed` argument

# Seed PyTorch's random number generators so that weight initialisation and
# any other torch-driven randomness repeat across Restart & Run All.
torch.manual_seed(SEED)

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

In [None]:
# Build the dataset.
# The bundled example file keeps this an end-to-end demo run; pass
# 'data/dataset.tsv' as the first argument to use the extracted dataset.
dataset = KyrgyzTextDataset(
    'example_dataset.tsv',
    max_len=512,
    sample_ratio=0.25,  # use 25% of the data
    val_ratio=0.1,      # 10% of that 25% becomes validation
    seed=42             # for reproducibility
)

# Report the summary returned by the dataset, one "label: value" line per entry
info = dataset.get_dataset_info()
summary_rows = (
    ("Total samples", 'total_samples'),
    ("Used samples", 'used_samples'),
    ("Training samples", 'train_samples'),
    ("Validation samples", 'val_samples'),
    ("Vocabulary size", 'vocab_size'),
    ("Max sequence length", 'max_len'),
)
print("\nDataset Information:")
for label, key in summary_rows:
    print(f"{label}: {info[key]}")

In [None]:
# Model hyperparameters; vocab_size follows the dataset's char_to_idx mapping
model_hparams = dict(
    vocab_size=len(dataset.char_to_idx),
    d_model=256,
    nhead=8,
    num_encoder_layers=6,
    dim_feedforward=1024,
    dropout=0.1,
    max_len=512,
)

# Instantiate the restorer from those settings
model = DiacriticsRestorer(**model_hparams)

In [None]:
# Run training with the settings from the configuration cell.
# PROJECT_NAME is handed to train() as project_name (presumably the
# Weights & Biases project to log to; confirm in train.py).
train_losses, val_losses = train(
    model=model, dataset=dataset,
    epochs=EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE,
    project_name=PROJECT_NAME,
)

In [None]:
# Test the model on a few inputs written without diacritics.
# Every sample must be pure Cyrillic: a Latin look-alike (such as "c" typed
# in place of "с") is a different code point and would presumably fall
# outside the dataset's character vocabulary.
test_samples = [
    "кыргызстан онугот",
    "мен онугом",
    "биз онугобуз"
]

test_model(model, dataset, test_samples)

In [None]:
# Save and upload model to Hugging Face Hub
def save_and_upload_model(model, dataset, repo_name, hf_api=None):
    """Write the model weights, vocabulary and config to disk and upload them.

    Three files are created in the current working directory: ``model.pt``,
    ``vocab.json`` and ``config.json``. Each is then uploaded to ``repo_name``
    under the same file name.

    Args:
        model: Trained model; must provide ``state_dict()`` and ``d_model``.
        dataset: Dataset exposing ``char_to_idx`` and ``idx_to_char``.
        repo_name: Repository id passed as ``repo_id`` to ``upload_file``.
        hf_api: Optional client with an ``upload_file`` method. When omitted,
            the notebook-level ``api`` object is used, as before.
    """
    import json

    # Resolve the client explicitly so the function no longer depends only on
    # a hidden global; the global is looked up only when no client is passed.
    client = api if hf_api is None else hf_api

    model_path = "model.pt"
    vocab_path = "vocab.json"
    config_path = "config.json"

    # Save model state
    torch.save(model.state_dict(), model_path)

    # Save vocabulary (ensure_ascii=False keeps Cyrillic characters readable).
    # NOTE(review): if idx_to_char is a dict keyed by ints, JSON stores those
    # keys as strings; confirm the loading code converts them back.
    with open(vocab_path, 'w', encoding='utf-8') as f:
        json.dump({
            'char_to_idx': dataset.char_to_idx,
            'idx_to_char': dataset.idx_to_char
        }, f, ensure_ascii=False, indent=2)

    # Save model config.
    # Only vocab_size and d_model are read from the live objects. The other
    # values are literals matching the model-creation cell of this notebook
    # and must be kept in sync with it by hand.
    config = {
        'vocab_size': len(dataset.char_to_idx),
        'd_model': model.d_model,
        'nhead': 8,
        'num_encoder_layers': 6,
        'dim_feedforward': 1024,
        'dropout': 0.1,
        'max_len': 512
    }

    # Same explicit encoding as the vocabulary file
    with open(config_path, 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=2)

    # Upload files to Hugging Face, each under its local file name
    for file_path in (model_path, vocab_path, config_path):
        client.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=file_path,
            repo_id=repo_name
        )

    print(f"Model and associated files uploaded to {repo_name}")

# Persist the trained model locally and push the files to the Hub repository
save_and_upload_model(model=model, dataset=dataset, repo_name=repo_name)