In [None]:
!pip install transformers==4.38.2 accelerate datasets evaluate sentencepiece sacrebleu rouge-score

Collecting transformers==4.38.2
  Downloading transformers-4.38.2-py3-none-any.whl.metadata (130 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/130.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.7/130.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tokenizers<0.19,>=0.14 (from transformers==4.38.2)
  Downloading tokenizers-0.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py

In [3]:
# ===== 2) Training & inference script (CPU/GPU auto) =====
import os, warnings, random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    set_seed,
)
from datasets import Dataset, DatasetDict
from accelerate import Accelerator
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
from pathlib import Path
import zipfile

# ---------------- Config ----------------
SEED = 42
set_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

MODEL_NAME = "bigscience/mt0-small"

TASK_PREFIX = "Detoxify the following Tatar sentence while preserving the meaning: "
TOXIC_COL = "toxic"
DETOX_COL = "detox"

MAX_SOURCE_LENGTH = 128
MAX_TARGET_LENGTH = 128
BATCH_SIZE = 8
NUM_EPOCHS = 6
LEARNING_RATE = 5e-5
BEAMS_FOR_TEST = 1  # подними до 4 после проверки

OUTPUT_FILE = "/content/submission.tsv"
OUTPUT_ZIP  = "/content/submission.zip"

warnings.filterwarnings("ignore")

# ---------------- Data ----------------
df = pd.read_csv(TRAIN_PATH)
assert TOXIC_COL in df.columns and DETOX_COL in df.columns, f"В csv нет колонок {TOXIC_COL}/{DETOX_COL}"
df = df[[TOXIC_COL, DETOX_COL]].dropna().reset_index(drop=True)

train_df, test_df = train_test_split(df, test_size=0.3, random_state=SEED, shuffle=True)
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset  = Dataset.from_pandas(test_df.reset_index(drop=True))
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

# ---------------- Tokenizer & Model ----------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# pad_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# ---------------- Preprocess ----------------
def preprocess_function(examples):
    inputs = [TASK_PREFIX + t for t in examples[TOXIC_COL]]
    model_inputs = tokenizer(inputs, max_length=MAX_SOURCE_LENGTH, truncation=True)
    labels = tokenizer(text_target=examples[DETOX_COL],
                       max_length=MAX_TARGET_LENGTH, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# ---------------- Accelerator ----------------
use_mixed = "fp16" if torch.cuda.is_available() else "no"
accelerator = Accelerator(mixed_precision=use_mixed if use_mixed!="no" else None)
device = accelerator.device
print("Device:", device)

# ---------------- Dataloaders & Collator ----------------
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="longest",
    return_tensors="pt",
)

train_loader = DataLoader(tokenized["train"], batch_size=BATCH_SIZE, shuffle=True,  collate_fn=data_collator)
eval_loader  = DataLoader(tokenized["test"],  batch_size=BATCH_SIZE, shuffle=False, collate_fn=data_collator)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

model, optimizer, train_loader, eval_loader = accelerator.prepare(
    model, optimizer, train_loader, eval_loader
)

# ---------------- Sanity step ----------------
model.train()
sanity_batch = next(iter(train_loader))
optimizer.zero_grad()
sanity_out = model(**sanity_batch)
sanity_loss = sanity_out.loss
accelerator.backward(sanity_loss)
optimizer.step()
accelerator.print(f"Sanity step OK | loss: {float(sanity_loss):.4f}")

# ---------------- Training ----------------
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0.0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}", leave=False)
    for step, batch in enumerate(pbar, start=1):
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        total_loss += loss.item()
        if step % 50 == 0 or step == 1:
            pbar.set_postfix(step_loss=f"{loss.item():.4f}")
    avg = total_loss / max(1, len(train_loader))
    accelerator.print(f"Epoch {epoch+1} | Loss: {avg:.4f}")

# ---------------- Inference helper ----------------
def generate_detox(sentence: str, num_beams: int = BEAMS_FOR_TEST) -> str:
    inp = TASK_PREFIX + sentence
    enc = tokenizer(inp, return_tensors="pt", truncation=True, max_length=MAX_SOURCE_LENGTH)
    enc = {k: v.to(device) for k, v in enc.items()}
    unwrapped = accelerator.unwrap_model(model)
    unwrapped.eval()
    with torch.no_grad():
        out_ids = unwrapped.generate(
            **enc,
            max_length=MAX_TARGET_LENGTH,
            num_beams=num_beams,
            early_stopping=True,
            no_repeat_ngram_size=2,
        )
    result = tokenizer.decode(out_ids[0], skip_special_tokens=True)
    return result.strip() if result.strip() else sentence

# ---------------- Submission ----------------
submit_df = pd.read_csv(SUBMIT_PATH, sep="\t")
assert "ID" in submit_df.columns and "tat_toxic" in submit_df.columns, "В TSV нет колонок ID/tat_toxic"

preds = []
pbar = tqdm(submit_df["tat_toxic"], desc="Generating", leave=False)
for text in pbar:
    preds.append(generate_detox(text))

final_df = pd.DataFrame({
    "ID": submit_df["ID"],
    "tat_toxic": submit_df["tat_toxic"],
    "tat_detox1": preds,
})
final_df.to_csv(OUTPUT_FILE, sep="\t", index=False)

with zipfile.ZipFile(OUTPUT_ZIP, "w", compression=zipfile.ZIP_DEFLATED) as zf:
    zf.write(OUTPUT_FILE, arcname=Path(OUTPUT_FILE).name)

print(f"Saved: {OUTPUT_FILE}")
print(f"Zipped: {OUTPUT_ZIP}")


Map:   0%|          | 0/1276 [00:00<?, ? examples/s]

Map:   0%|          | 0/547 [00:00<?, ? examples/s]

Device: cuda
Sanity step OK | loss: nan


Epoch 1/6:   0%|          | 0/160 [00:00<?, ?it/s]

Epoch 1 | Loss: nan


Epoch 2/6:   0%|          | 0/160 [00:00<?, ?it/s]

Epoch 2 | Loss: nan


Epoch 3/6:   0%|          | 0/160 [00:00<?, ?it/s]

Epoch 3 | Loss: nan


Epoch 4/6:   0%|          | 0/160 [00:00<?, ?it/s]

Epoch 4 | Loss: nan


Epoch 5/6:   0%|          | 0/160 [00:00<?, ?it/s]

Epoch 5 | Loss: nan


Epoch 6/6:   0%|          | 0/160 [00:00<?, ?it/s]

Epoch 6 | Loss: nan


Generating:   0%|          | 0/701 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Saved: /content/submission.tsv
Zipped: /content/submission.zip
