# Translating the HellaSwag Dataset to Russian

The goal of this notebook is to translate the [HellaSwag dataset](https://huggingface.co/datasets/Rowan/hellaswag) to Russian. See [README.md](../README.md) for more information about the HellaSwag benchmark, and how it can be used to assess common-sense sentence-completion in Large Language Models (LLMs).

In [1]:
# load initial libs
%pip install datasets huggingface_hub[hf_xet] ipywidgets --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
# pull the dataset from hugging face hub
from datasets import load_dataset

dataset = load_dataset("Rowan/hellaswag")

In [3]:
# cool to see that hellaswag already supports the new, more efficient xet file storage
# read more about xet here: https://huggingface.co/blog/xet-on-the-hub

# let's have a look at a few sample from the hellaswag dataset
sample = dataset["train"][0]
print("Context (ctx):", sample["ctx"])
print("\nEndings:", sample["endings"])
print("\nLabel (correct ending index):", sample["label"])


Context (ctx): Then, the man writes over the snow covering the window of a car, and a woman wearing winter clothes smiles. then

Endings: [', the man adds wax to the windshield and cuts it.', ', a person board a ski lift, while two men supporting the head of the person wearing winter clothes snow as the we girls sled.', ', the man puts on a christmas coat, knitted with netting.', ', the man continues removing the snow on his car.']

Label (correct ending index): 3


In [4]:
# print out the features of the dataset
print("Features:", dataset["train"].features)

# check number of rows for each split
print("Train samples:", len(dataset["train"]))
print("Validation samples:", len(dataset["validation"]))
print("Test samples:", len(dataset["test"]))

Features: {'ind': Value(dtype='int32', id=None), 'activity_label': Value(dtype='string', id=None), 'ctx_a': Value(dtype='string', id=None), 'ctx_b': Value(dtype='string', id=None), 'ctx': Value(dtype='string', id=None), 'endings': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'source_id': Value(dtype='string', id=None), 'split': Value(dtype='string', id=None), 'split_type': Value(dtype='string', id=None), 'label': Value(dtype='string', id=None)}
Train samples: 39905
Validation samples: 10042
Test samples: 10003


## Using Open Source Translation

For this exercise, we'll use the [Opus MT](https://huggingface.co/Helsinki-NLP/opus-mt-en-ru) translation model from the [University of Helsinki NLP department](https://huggingface.co/Helsinki-NLP).

Once we've translated the data, we'll revisit it with human annotators to confirm its accuracy.

In [5]:
# install additional libs
%pip install transformers sentencepiece sacremoses --quiet

Note: you may need to restart the kernel to use updated packages.


In [6]:
from transformers import MarianMTModel, MarianTokenizer
import torch

# load model and tokenizer
model_name = "Helsinki-NLP/opus-mt-en-ru"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# use gpu if possible (this increases the speed of translation)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(62518, 512, padding_idx=62517)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(62518, 512, padding_idx=62517)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [7]:
# function to translate our english-language dataset and component elements into russian language
def translate_texts(texts, tokenizer, model, max_length=512):
    """Batch-translate a list of texts from English to Russian"""
    batch = tokenizer.prepare_seq2seq_batch(
        src_texts=texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length
    ).to(device)

    translated = model.generate(**batch)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)


In [8]:
from concurrent.futures import ThreadPoolExecutor

def translate_texts_parallel(ctx_batch, endings_batch, tokenizer, model):
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_ctx = executor.submit(translate_texts, ctx_batch, tokenizer, model)
        flat_endings = sum(endings_batch, [])
        future_endings = executor.submit(translate_texts, flat_endings, tokenizer, model)
        return future_ctx.result(), future_endings.result()

In [9]:
import os
import json
from pathlib import Path

def save_jsonl(path, data):
    with open(path, "w", encoding="utf-8") as f:
        for entry in data:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

def load_jsonl(path):
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line))
    return data


In [10]:
# let's test our function on a single sample
sample = dataset["train"][0]

# translate context
translated_ctx = translate_texts([sample["ctx"]], tokenizer, model)[0]

# translate all possible endings
translated_endings = translate_texts(sample["endings"], tokenizer, model)

# show results
print("Original ctx:\n", sample["ctx"])
print("\nTranslated ctx:\n", translated_ctx)
print("\nOriginal endings:\n", sample["endings"])
print("\nTranslated endings:\n", translated_endings)


`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Original ctx:
 Then, the man writes over the snow covering the window of a car, and a woman wearing winter clothes smiles. then

Translated ctx:
 Затем мужчина пишет по снегу, закрывая окно машины, и женщина в зимней одежде улыбается.

Original endings:
 [', the man adds wax to the windshield and cuts it.', ', a person board a ski lift, while two men supporting the head of the person wearing winter clothes snow as the we girls sled.', ', the man puts on a christmas coat, knitted with netting.', ', the man continues removing the snow on his car.']

Translated endings:
 ['Человек добавляет воск к лобовому стеклу и разрезает его.', ', человек сидит на лыжном подъемнике, в то время как двое мужчин поддерживают голову человека в зимнем снегу, как мы, девочки, сапливали.', 'Мужчина надевает рождественское пальто, вязанное сеткой.', 'Мужчина продолжает удалять снег на своей машине.']


## Translating the Full Dataset

We've written a function to programmatically translate the HellaSwag dataset and confirmed that it is working to an acceptable standard. 

Let's iterate through the entire dataset now, using batch processing to avoid rates limits from Hugging Face, and create a new Russian-language HellaSwag dataset!

In [11]:
# install additional libs
%pip install tqdm --quiet

Note: you may need to restart the kernel to use updated packages.


In [None]:
from tqdm import tqdm

def batched_translate_with_checkpoint(dataset_split, split_name, output_dir="checkpoints", batch_size=4):
    os.makedirs(output_dir, exist_ok=True)
    checkpoint_path = Path(output_dir) / f"{split_name}.jsonl"

    translated_data = load_jsonl(checkpoint_path) if checkpoint_path.exists() else []
    start_index = len(translated_data)

    print(f"📝 Resuming {split_name} from index {start_index}...")

    for i in tqdm(range(start_index, len(dataset_split), batch_size), desc=f"Translating {split_name}"):
        batch = dataset_split.select(range(i, min(i + batch_size, len(dataset_split))))
        batch_dicts = batch.to_list()

        ctx_batch = [ex["ctx"] for ex in batch_dicts]
        endings_batch = [ex["endings"] for ex in batch_dicts]

        try:
            translated_ctx, translated_flat_endings = translate_texts_parallel(ctx_batch, endings_batch, tokenizer, model)
        except Exception as e:
            print(f"⚠️ Error on batch {i}-{i+batch_size}: {e}")
            continue

        translated_endings = [
            translated_flat_endings[j:j+4] for j in range(0, len(translated_flat_endings), 4)
        ]

        for j in range(len(batch_dicts)):
            translated_entry = {
                "ctx": translated_ctx[j],
                "endings": translated_endings[j],
                "label": batch_dicts[j].get("label", -1)
            }
            translated_data.append(translated_entry)

            with open(checkpoint_path, "a", encoding="utf-8") as f:
                f.write(json.dumps(translated_entry, ensure_ascii=False) + "\n")

    print(f"✅ Done: {split_name} — {len(translated_data)} total entries")
    return translated_data


: 

In [None]:
translated_train = batched_translate_with_checkpoint(dataset["train"], "train")
translated_val = batched_translate_with_checkpoint(dataset["validation"], "validation")

translated_test = None
if "test" in dataset:
    translated_test = batched_translate_with_checkpoint(dataset["test"], "test")


📝 Resuming train from index 0...


Translating train:   2%|▏         | 186/9977 [33:56<22:49:41,  8.39s/it]

In [None]:
from datasets import Dataset, DatasetDict

# Load from saved files (ensures full resumption)
train_data = load_jsonl("checkpoints/train.jsonl")
val_data = load_jsonl("checkpoints/validation.jsonl")
test_data = load_jsonl("checkpoints/test.jsonl") if Path("checkpoints/test.jsonl").exists() else None

hf_dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "validation": Dataset.from_list(val_data),
})

if test_data:
    hf_dataset["test"] = Dataset.from_list(test_data)

hf_dataset


In [None]:
# upload resulting dataset to hugging face hub
from huggingface_hub import notebook_login
notebook_login()

hf_dataset.push_to_hub("ZennyKenny/hellaswag-ru")
