In [6]:
import csv
import re
from collections import defaultdict

def contains_non_cyrillic_letter(text: str) -> bool:
    """
    Report whether `text` has at least one letter outside the Russian alphabet.

    Only characters for which `str.isalpha()` is true are examined, so digits,
    whitespace and punctuation never trigger a match. "Cyrillic" here means
    exactly the 33 Russian letters (including ё/Ё) in lower and upper case;
    any other alphabetic character counts as non-Cyrillic.

    Args:
        text: The string to inspect.

    Returns:
        True as soon as one such letter is found, False otherwise
        (including for the empty string).
    """
    russian_letters = "абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"
    return any(
        symbol.isalpha() and symbol not in russian_letters
        for symbol in text
    )

def _tidy_punctuation_spacing(text):
    """Drop the space that a plain " ".join leaves around punctuation tokens."""
    # Closing / trailing marks attach to the preceding token.
    for punc in ('.', ',', '?', '!', ')', ':', ';'):
        text = text.replace(f' {punc}', punc)
    # An opening parenthesis attaches to the following token.
    return text.replace('( ', '(')


def reconstruct_sentences_from_csv(file_path):
    """
    Rebuild original and normalized sentences from a token-per-row CSV.

    Every row must provide the columns 'sentence_id', 'before' and 'after'.
    Rows sharing a 'sentence_id' are concatenated, in the order they appear
    in the file, into one 'before' sentence and one 'after' sentence. A
    sentence is discarded entirely when any of its 'before' tokens contains
    a letter outside the Russian alphabet (as decided by
    `contains_non_cyrillic_letter`), not only Latin letters.

    Args:
        file_path (str): The path to the input CSV file (read as UTF-8).

    Returns:
        list: A list of dictionaries ordered by ascending sentence_id, each
              with the keys 'sentence' and 'normalized_sentence'. An empty
              list is returned, after printing a message, when the file is
              missing or any other error occurs while reading it.
    """
    # Tokens are collected per sentence_id; defaultdict avoids key checks.
    sentences_before = defaultdict(list)
    sentences_after = defaultdict(list)
    # Ids of sentences that must be dropped from the result.
    latin_sentence_ids = set()

    print(f"Reading and processing file: {file_path}...")

    try:
        # newline='' lets the csv module do its own line-ending handling,
        # which is required for quoted fields to be parsed correctly.
        with open(file_path, mode='r', encoding='utf-8', newline='') as infile:
            reader = csv.DictReader(infile)
            for row in reader:
                sentence_id = int(row['sentence_id'])
                before_token = row['before']
                after_token = row['after']

                # One offending token is enough to reject the whole sentence.
                if contains_non_cyrillic_letter(before_token):
                    latin_sentence_ids.add(sentence_id)

                sentences_before[sentence_id].append(before_token)
                sentences_after[sentence_id].append(after_token)

    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        return []
    except Exception as e:
        # Best effort by design: report the problem and hand back no data
        # rather than a partially built result.
        print(f"An error occurred: {e}")
        return []

    print("Reconstructing and filtering sentences...")

    final_results = []
    # Emit sentences in ascending sentence_id order.
    for sentence_id in sorted(sentences_before):
        if sentence_id in latin_sentence_ids:
            continue

        # A single-space join followed by a light punctuation fix-up; this is
        # a heuristic and does not restore the exact original spacing.
        final_results.append({
            "sentence": _tidy_punctuation_spacing(
                " ".join(sentences_before[sentence_id])
            ),
            "normalized_sentence": _tidy_punctuation_spacing(
                " ".join(sentences_after[sentence_id])
            ),
        })

    return final_results

In [7]:
# Input CSV (absolute, machine-specific path; adjust when running elsewhere).
TRAIN_CSV_PATH = "/mnt/d/text-normalization-challenge-russian-language/ru_train.csv/ru_train.csv"

# Build the list of {'sentence', 'normalized_sentence'} pairs that survive the
# alphabet filter, then report how many were kept.
results = reconstruct_sentences_from_csv(TRAIN_CSV_PATH)
print(f"\nFound {len(results)} Cyrillic-only sentences.\n")

Reading and processing file: /mnt/d/text-normalization-challenge-russian-language/ru_train.csv/ru_train.csv...
Reconstructing and filtering sentences...

Found 596921 Cyrillic-only sentences.



In [8]:
# Preview only the first few reconstructed pairs; echoing the whole list would
# render every sentence into the notebook output.
results[:5]

[{'sentence': 'По состоянию на 1862 год.',
  'normalized_sentence': 'По состоянию на тысяча восемьсот шестьдесят второй год.'},
 {'sentence': 'Оснащались латными рукавицами и сабатонами с не длинными носками.',
  'normalized_sentence': 'Оснащались латными рукавицами и сабатонами с не длинными носками.'},
 {'sentence': 'В конце 1811 года, вследствие конфликта с проезжим вельможей (графом Салтыковым) вынужден был оставить службу по личному прошению.',
  'normalized_sentence': 'В конце тысяча восемьсот одиннадцатого года, вследствие конфликта с проезжим вельможей (графом Салтыковым) вынужден был оставить службу по личному прошению.'},
 {'sentence': 'Севернее Дудинки и северо - восточнее Белочи, в низменной долине Неруссы — урочище Узлив.',
  'normalized_sentence': 'Севернее Дудинки и северо - восточнее Белочи, в низменной долине Неруссы — урочище Узлив.'},
 {'sentence': 'Получение информации об адресах, почтовых индексах, странах, городах.',
  'normalized_sentence': 'Получение информации 

In [16]:
# Characters that may survive cleaning: a small punctuation set (including the
# space and '^') plus Cyrillic letters, among them Ң/Ө/Ү and their lowercase forms.
_punctuation = '!? ^,;.'
_letters = 'АБВГДЕЁЖЗИЙКЛМНҢОӨПРСТУҮФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнңоөпрстуүфхцчшщъыьэюя'
ALLOWED_CHARS = set(_punctuation) | set(_letters)

def clean_and_format_text(text: str) -> str:
    """
    Lowercase `text` and strip every character not listed in ALLOWED_CHARS.

    The string is lowercased first and filtered second, so only the lowercase
    letters of ALLOWED_CHARS can actually match. Characters are removed without
    replacement: dropping a standalone token such as a dash or a number leaves
    the spaces on both sides of it next to each other.

    Args:
        text: The string to clean.

    Returns:
        The lowercased string restricted to the allowed characters.
    """
    lowered = text.lower()
    kept = [symbol for symbol in lowered if symbol in ALLOWED_CHARS]
    return "".join(kept)


# --- 2. Convert to Gemma's Conversational Format ---
# Each example becomes {"messages": [user turn, assistant turn]}: the user turn
# carries the reconstructed sentence as-is, the assistant turn carries the
# normalized sentence after it has been passed through clean_and_format_text.
formatted_data = [
    {
        "messages": [
            {'role': 'user', 'content': item['sentence']},
            {'role': 'assistant', 'content': clean_and_format_text(item['normalized_sentence'])},
        ]
    }
    for item in results
]

print(f"Successfully converted {len(formatted_data)} items to the conversational format.")
print("Example of the new format:")
# `results` is an empty list when the CSV could not be read, so guard the
# preview instead of failing with an IndexError.
if formatted_data:
    print(formatted_data[0])
else:
    print("No examples available: the input list was empty.")

Successfully converted 596921 items to the conversational format.
Example of the new format:
{'messages': [{'role': 'user', 'content': 'По состоянию на 1862 год.'}, {'role': 'assistant', 'content': 'по состоянию на тысяча восемьсот шестьдесят второй год.'}]}


In [20]:
# --- 3. Shuffle the Data ---
# Randomize the example order before splitting so that the training and
# validation sets are both drawn from the whole dataset. The seed is fixed
# for reproducibility.
# Note: random.shuffle reorders formatted_data in place, so running this cell
# a second time shuffles the already-shuffled list and yields a different order.
import random

SHUFFLE_SEED = 42
random.seed(SHUFFLE_SEED)
random.shuffle(formatted_data)
print("\nDataset has been shuffled.")


Dataset has been shuffled.


In [21]:
# --- 4. Create Train/Validation Split ---
# 90/10 split: the first 90% of the list (rounded down) is used for training,
# the remainder for validation.
TRAIN_FRACTION = 0.9
split_index = int(TRAIN_FRACTION * len(formatted_data))
train_data, validation_data = formatted_data[:split_index], formatted_data[split_index:]

print(f"Data split into {len(train_data)} training examples and {len(validation_data)} validation examples.")

Data split into 537228 training examples and 59693 validation examples.


In [23]:
from datasets import Dataset, DatasetDict
from huggingface_hub import notebook_login, whoami

# --- 5. Create a Hugging Face DatasetDict ---
# Dataset.from_dict expects a mapping of column name -> list of values, so the
# list of {"messages": ...} records is first flattened into one "messages" column
# per split.
train_messages = [record["messages"] for record in train_data]
validation_messages = [record["messages"] for record in validation_data]

train_dataset = Dataset.from_dict({"messages": train_messages})
validation_dataset = Dataset.from_dict({"messages": validation_messages})

# Bundle both splits under their split names.
final_dataset = DatasetDict({'train': train_dataset, 'validation': validation_dataset})

print("\nCreated a Hugging Face DatasetDict:")
print(final_dataset)


Created a Hugging Face DatasetDict:
DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 537228
    })
    validation: Dataset({
        features: ['messages'],
        num_rows: 59693
    })
})


In [25]:
# --- 6. Authenticate and Upload to Hugging Face Hub ---
print("\n--- Uploading to Hugging Face Hub ---")

# IMPORTANT: You need to be logged in.
# In a Jupyter/Colab notebook, use notebook_login().
# In a terminal, run `huggingface-cli login` before executing the script.
try:
    # whoami() doubles as the "already logged in?" probe: if it fails with
    # OSError, or its response has no 'name' key, fall back to the login widget.
    hf_username = whoami()['name']
    print(f"Authenticated as: {hf_username}")
except (OSError, KeyError):
    print("Not logged in. Please log in to Hugging Face.")
    # For notebooks, this will open a login widget
    notebook_login()
    hf_username = whoami()['name']

# **IMPORTANT**: Change this to your desired repository name!
# It should be in the format "your-username/your-dataset-name".
# The target is hard-coded and does not depend on hf_username.
repo_id = "kenenbek/gemma-russian-normalization-dataset"

print(f"Preparing to upload the dataset to: https://huggingface.co/datasets/{repo_id}")

# Push both splits to the Hub.
# No `private` argument is passed, so the repository visibility is left at the
# library default; pass `private=True` to push_to_hub for a private dataset.
final_dataset.push_to_hub(repo_id)

print("\n✅ Successfully uploaded the dataset to the Hugging Face Hub!")


--- Uploading to Hugging Face Hub ---
Authenticated as: kenenbek
Preparing to upload the dataset to: https://huggingface.co/datasets/kenenbek/gemma-russian-normalization-dataset


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            


✅ Successfully uploaded the dataset to the Hugging Face Hub!
