# Kaggle link: https://www.kaggle.com/code/l1ghtsource/transaltor-finetune

In [1]:
! pip install deep_translator

Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m722.2 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m
[?25hInstalling collected packages: deep_translator
Successfully installed deep_translator-1.11.4


In [113]:
import pandas as pd
import re

df = pd.read_csv('/kaggle/input/yappy-tags-and-links/yappy_hackaton_2024_400k.csv')
df = df.dropna().reset_index(drop=True)

func = lambda text: re.sub(r'[^А-Яа-яЁё\s]+', '', text) # оставим только русские слова (без эмодзи и английских слов)
df['cleaned_description'] = df['description'].apply(func)

df = df.dropna().reset_index(drop=True)

russian_phrases = []

for i in range(len(df)):
    russian_phrases.extend([word.lower() for word in df.cleaned_description[i].split() if len(word) > 3])
    
russian_phrases = list(set(russian_phrases))[:10000] # нам хватит 10000

# добавим запросы для ценителей высокого искусства
russian_phrases.extend([
    'роблокс',
    'роблоксер',
    'бравл старс',
    'бравлстарс',
    'кейпоп',
    'кей поп',
    'майнкрафт',
    'скибиди туалет',
    'мэйби бэйби',
    'дота',
    'дота 2',
    'гача лайф',
    'таро'
])

In [162]:
from deep_translator import GoogleTranslator
from tqdm import tqdm

translator = GoogleTranslator(source='ru', target='en')

translations = []

for phrase in tqdm(russian_phrases):
    translated = translator.translate(phrase)
    translations.append((phrase, translated))

with open('translations.txt', 'w', encoding='utf-8') as f:
    for ru, en in translations:
        f.write(f'{ru}\t{en}\n')

100%|██████████| 10014/10014 [1:48:15<00:00,  1.54it/s] 


In [1]:
! pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


In [2]:
import os
from transformers import MarianMTModel, MarianTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, load_metric
from evaluate import load

2024-06-10 12:42:03.299057: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-10 12:42:03.299175: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-10 12:42:03.435828: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
path = '/kaggle/input/translations-dataset/translations.txt'

In [21]:
data= {'translation': []}

with open(path, 'r', encoding='utf-8') as f:
    for line in f:
        ru, en = line.strip().split('\t')
        data['translation'].append({'ru': ru,  'en': en})

In [68]:
# добавим еще слова для самых маленьких любителей Yappy
new = [
    {'ru': 'бравл старс', 'en': 'brawl stars'},
    {'ru': 'бравл старсу', 'en': 'brawl stars'},
    {'ru': 'бравл старсе', 'en': 'brawl stars'},
    {'ru': 'бравл старсом', 'en': 'brawl stars'},
    {'ru': 'бравл старсы', 'en': 'brawl stars'},
    {'ru': 'бравл старсер', 'en': 'brawl stars'},
    {'ru': 'бравл старсов', 'en': 'brawl stars'},
    {'ru': 'бравл старсеры', 'en': 'brawl stars'},
    {'ru': 'бровл старс', 'en': 'brawl stars'},
    {'ru': 'о бравл старсе', 'en': 'about brawl stars'},
    {'ru': 'о бравл старсерах', 'en': 'about brawl stars'},
    {'ru': 'о бравл старсах', 'en': 'about brawl stars'},
    {'ru': 'с бравл старсом', 'en': 'about brawl stars'},
    {'ru': 'перед бравл старсом', 'en': 'before brawl stars'},
    {'ru': 'после бравл старса', 'en': 'after brawl stars'},
    {'ru': 'за бравл старсом', 'en': 'after brawl stars'},
    {'ru': 'о бравл старсе', 'en': 'about brawl stars'},
    {'ru': 'бравлстарс', 'en': 'brawl stars'},
    {'ru': 'бравлстарсу', 'en': 'brawl stars'},
    {'ru': 'бравлстарсе', 'en': 'brawl stars'},
    {'ru': 'бравлстарсер', 'en': 'brawl stars'},
    {'ru': 'бравлстарсеры', 'en': 'brawl stars'},
    {'ru': 'бравлстарсом', 'en': 'brawl stars'},
    {'ru': 'бравлстарсы', 'en': 'brawl stars'},
    {'ru': 'бровлстарс', 'en': 'brawl stars'},
    {'ru': 'роблокс', 'en': 'roblox'},
    {'ru': 'роблоксе', 'en': 'roblox'},
    {'ru': 'роблоксу', 'en': 'roblox'},
    {'ru': 'роблоксер', 'en': 'roblox'},
    {'ru': 'роблоксеры', 'en': 'roblox'},
    {'ru': 'роблоксы', 'en': 'roblox'},
    {'ru': 'роблоксом', 'en': 'roblox'},
    {'ru': 'роблоксы', 'en': 'roblox'},
    {'ru': 'роблоксерам', 'en': 'roblox'},
    {'ru': 'о роблоксе', 'en': 'about roblox'},
    {'ru': 'о роблоксах', 'en': 'about roblox'},
    {'ru': 'в роблоксе', 'en': 'in roblox'},
    {'ru': 'после роблокса', 'en': 'after roblox'},
    {'ru': 'перед роблоксом', 'en': 'before roblox'},
    {'ru': 'за роблоксом', 'en': 'after roblox'},
    {'ru': 'скибиди туалет', 'en': 'skibidi toilet'},
    {'ru': 'скибиди туалетом', 'en': 'skibidi toilet'},
    {'ru': 'скибиди туалетами', 'en': 'skibidi toilet'},
    {'ru': 'скибидитуалет', 'en': 'skibidi toilet'},
    {'ru': 'скибидитуалету', 'en': 'skibidi toilet'},
    {'ru': 'скибидитуалеты', 'en': 'skibidi toilet'},
    {'ru': 'скибидитуалетами', 'en': 'skibidi toilet'},
    {'ru': 'скибиди', 'en': 'skibidi'},
    {'ru': 'капибара', 'en': 'capybara'},
    {'ru': 'капибары', 'en': 'capybaras'},
    {'ru': 'много капибара', 'en': 'many capybaras'},
    {'ru': 'капибары в воде', 'en': 'capybaras in water'},
    {'ru': 'видео с капибарами', 'en': 'video with capybaras'},
    {'ru': 'райан гослинг', 'en': 'ryan gosling'},
    {'ru': 'гослинг', 'en': 'gosling'},
]

In [133]:
import pickle

with open('zoomer_words.pkl', 'wb') as f:
    pickle.dump(new, f)

In [69]:
d_train = data['translation'][:8000]
for _ in range(5):
    d_train.extend(new)
    
d_eval = data['translation'][8000:]
for _ in range(2):
    d_eval.extend(new)

In [70]:
import random

random.shuffle(d_train)
random.shuffle(d_eval)

In [71]:
data_train = {'translation': d_train}
data_eval = {'translation': d_eval}

In [72]:
train_dataset = Dataset.from_dict(data_train)
eval_dataset = Dataset.from_dict(data_eval)

In [73]:
model_name = 'Helsinki-NLP/opus-mt-ru-en'

tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

In [74]:
def preprocess_function(examples):
    inputs = [ex['ru'] for ex in examples['translation']]
    targets = [ex['en'] for ex in examples['translation']]
    model_inputs = tokenizer(inputs, 
                             text_target=targets, 
                             truncation=True, 
                             padding='max_length', 
                             max_length=128)
    return model_inputs

tokenized_dataset_train = train_dataset.map(preprocess_function, batched=True)
tokenized_dataset_eval = eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/8275 [00:00<?, ? examples/s]

Map:   0%|          | 0/2124 [00:00<?, ? examples/s]

In [75]:
bleu = load("bleu")

def compute_bleu(model, tokenizer, dataset):
    inputs = [ex['ru'] for ex in dataset['translation']]
    targets = [[ex['en']] for ex in dataset['translation']]
    predictions = []
    
    for input_text in inputs:
        translated = model.generate(**tokenizer(input_text, 
                                                return_tensors="pt", 
                                                padding=True, 
                                                truncation=True))
        translated_text = tokenizer.batch_decode(translated, 
                                                 skip_special_tokens=True)[0]
        predictions.append(translated_text)
    
    results = bleu.compute(predictions=predictions, references=targets)
    return results

In [None]:
# bleu_before = compute_bleu(model, tokenizer, data[250:750])
# print(f"BLEU до дообучения: {bleu_before['bleu']}")

In [76]:
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

In [77]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_eval,
)

In [78]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1064,0.037829
2,0.0335,0.03687
3,0.0259,0.03643
4,0.0219,0.036473
5,0.0194,0.036398


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


TrainOutput(global_step=2590, training_loss=0.04062958210131377, metrics={'train_runtime': 662.8172, 'train_samples_per_second': 62.423, 'train_steps_per_second': 3.908, 'total_flos': 1402544848896000.0, 'train_loss': 0.04062958210131377, 'epoch': 5.0})

In [79]:
model.save_pretrained('./fine-tuned-opus-mt-ru-en')
tokenizer.save_pretrained('./fine-tuned-opus-mt-ru-en')

Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


('./fine-tuned-opus-mt-ru-en/tokenizer_config.json',
 './fine-tuned-opus-mt-ru-en/special_tokens_map.json',
 './fine-tuned-opus-mt-ru-en/vocab.json',
 './fine-tuned-opus-mt-ru-en/source.spm',
 './fine-tuned-opus-mt-ru-en/target.spm',
 './fine-tuned-opus-mt-ru-en/added_tokens.json')

In [80]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [81]:
fine_tuned_model = MarianMTModel.from_pretrained('./fine-tuned-opus-mt-ru-en').to(device)
fine_tuned_tokenizer = MarianTokenizer.from_pretrained('./fine-tuned-opus-mt-ru-en')



In [132]:
%%time
text = 'бравл старс роблокс скибиди туалет' # ура зумерский переводчик

inputs = fine_tuned_tokenizer(text, return_tensors='pt', padding=True).to(device)
translated = fine_tuned_model.generate(**inputs)
translated_text = [fine_tuned_tokenizer.decode(t, skip_special_tokens=True) for t in translated]
translated_text[0]

CPU times: user 136 ms, sys: 1.88 ms, total: 137 ms
Wall time: 124 ms


'brawl stars roblox skibidi toilet'

In [94]:
!cd /kaggle/working/
!zip -r tuned.zip /kaggle/working/fine-tuned-opus-mt-ru-en

  adding: kaggle/working/fine-tuned-opus-mt-ru-en/ (stored 0%)
  adding: kaggle/working/fine-tuned-opus-mt-ru-en/config.json (deflated 61%)
  adding: kaggle/working/fine-tuned-opus-mt-ru-en/target.spm (deflated 49%)
  adding: kaggle/working/fine-tuned-opus-mt-ru-en/source.spm (deflated 58%)
  adding: kaggle/working/fine-tuned-opus-mt-ru-en/tokenizer_config.json (deflated 68%)
  adding: kaggle/working/fine-tuned-opus-mt-ru-en/generation_config.json (deflated 43%)
  adding: kaggle/working/fine-tuned-opus-mt-ru-en/vocab.json (deflated 79%)
  adding: kaggle/working/fine-tuned-opus-mt-ru-en/model.safetensors (deflated 7%)
  adding: kaggle/working/fine-tuned-opus-mt-ru-en/special_tokens_map.json (deflated 35%)
