# Importing Dependencies

In [1]:
from datasets import load_dataset
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
import evaluate
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import spacy
import gensim.downloader as api
from gensim.models import KeyedVectors

In [2]:
# Fixed seed so the same training subset is drawn on every
# Restart & Run All; without it the sample (and the reported BLEU) changes per run.
SEED = 42
TRAIN_SUBSET_SIZE = 90_000

ds = load_dataset("Helsinki-NLP/opus-100", "en-fr")
# Keep only a shuffled subset of the training split; other splits are left untouched.
ds['train'] = ds['train'].shuffle(seed=SEED).select(range(TRAIN_SUBSET_SIZE))

# Implementation

**1) Transformer**

In [3]:
# Checkpoint name shared by the tokenizer and the model loaded from it.
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
# `return_tensors` is an argument of the tokenizer *call*, not of `from_pretrained`,
# so it is not passed here; batching into tensors is left to the data collator.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [4]:
def tokenize(examples):
    """Tokenize a batch of English/French translation pairs.

    Args:
        examples: A batch whose ``'translation'`` entry is a sequence of
            dicts, each holding an ``'en'`` and a ``'fr'`` string.

    Returns:
        The tokenizer output for the English sentences as inputs and the
        French sentences as ``text_target``, each truncated to 128 tokens.
    """
    sources, targets = [], []
    for pair in examples['translation']:
        sources.append(pair['en'])
        targets.append(pair['fr'])
    return tokenizer(sources, text_target=targets, max_length=128, truncation=True)

In [5]:
# Tokenize every split in batches and drop the original columns
# (taken from the train split) so only the tokenizer output remains.
raw_columns = ds['train'].column_names
tknized_ds = ds.map(
    tokenize,
    batched=True,
    remove_columns=raw_columns,
)

Map:   0%|          | 0/90000 [00:00<?, ? examples/s]

In [6]:
# Metric used by `metrics` below to score translations.
metric = evaluate.load("sacrebleu")

# Seq2seq model loaded from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Collator that batches tokenized examples; it is given the model as well.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [7]:
def metrics(results):
    """Compute SacreBLEU for a set of generated translations.

    Args:
        results: A ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        A dict with a single key ``'bleu'`` holding the SacreBLEU score.
    """
    preds, labels = results
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding sentinel that the tokenizer cannot decode. It marks
    # ignored label positions, and it can also appear in generated predictions
    # when batches of different lengths are padded together, so both arrays
    # are mapped back to the real pad token before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    # SacreBLEU expects a list of references per prediction.
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {'bleu' : result['score']}

In [11]:
# Fine-tuning configuration: no evaluation during training, a checkpoint saved
# every epoch, and generation-based predictions so BLEU can be computed.
args = Seq2SeqTrainingArguments(
    output_dir = "transformer_translator",
    eval_strategy = "no",
    save_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    weight_decay = 0.01,
    num_train_epochs = 3,
    predict_with_generate = True,
    fp16 = True,
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset = tknized_ds['train'],
    eval_dataset = tknized_ds['test'],
    data_collator = data_collator,
    # `tokenizer=` is deprecated on the trainer (it triggers a FutureWarning);
    # `processing_class=` is its replacement.
    processing_class = tokenizer,
    compute_metrics = metrics,
)

  trainer = Seq2SeqTrainer(


In [12]:
# Baseline: score the loaded checkpoint on the evaluation split before training.
baseline_metrics = trainer.evaluate(max_length=128)
baseline_metrics

{'eval_loss': 1.3903475999832153,
 'eval_model_preparation_time': 0.0,
 'eval_bleu': 35.736604792825936,
 'eval_runtime': 162.5981,
 'eval_samples_per_second': 12.3,
 'eval_steps_per_second': 0.387}

In [13]:
# Run fine-tuning with the arguments configured above and show the summary.
train_output = trainer.train()
train_output

Step,Training Loss
500,1.3896
1000,1.3587
1500,1.358
2000,1.3566
2500,1.3389
3000,1.2933
3500,1.2471
4000,1.242
4500,1.2342
5000,1.2553




TrainOutput(global_step=8439, training_loss=1.260590876147481, metrics={'train_runtime': 4093.424, 'train_samples_per_second': 65.959, 'train_steps_per_second': 2.062, 'total_flos': 5651700655325184.0, 'train_loss': 1.260590876147481, 'epoch': 3.0})

In [15]:
# Re-score on the evaluation split after training, with the same generation length.
finetuned_metrics = trainer.evaluate(max_length=128)
finetuned_metrics

{'eval_loss': 1.255053997039795,
 'eval_model_preparation_time': 0.0,
 'eval_bleu': 36.62139560959779,
 'eval_runtime': 168.4536,
 'eval_samples_per_second': 11.873,
 'eval_steps_per_second': 0.374,
 'epoch': 3.0}