**Installing necessary packages**

In [None]:
!pip install -q simpletransformers
!pip install -q sacremoses
!pip install -q datasets
!pip install -q sacrebleu
!pip install -q evaluate
!pip install -q torch
!pip install -q accelerate
!pip install -q tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.8/250.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m77.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m72.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m29.2 MB/s[0m et

**Importing necessary libraries**

In [None]:
import datasets
from datasets import Dataset, DatasetDict
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, AdamW, get_scheduler, pipeline
import evaluate
from torch.utils.data import DataLoader
from accelerate import Accelerator
from tqdm.auto import tqdm
import torch

**Reading the dataset**

In [None]:
# Load the tab-separated sentence-pair file. It has no header row, so pandas
# labels the columns with the integers 0, 1 and 2.
data_path = '/content/drive/MyDrive/nlp/translation/por.txt'
df = pd.read_csv(data_path, sep='\t', header=None)

# Show ten randomly chosen rows as a quick sanity check of the parsed content.
df.sample(10)

Unnamed: 0,0,1,2
110958,Tom needs to sign this document.,Tom precisa assinar este documento.,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
45843,They'll love that one.,Eles vão amar aquele ali.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
122421,"You don't know what it is, do you?","Você não sabe o que é, né?",CC-BY 2.0 (France) Attribution: tatoeba.org #9...
6048,I'm not angry.,Eu não estou bravo.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
62334,He was elected president.,Ele foi eleito presidente.,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
61573,Are you off duty tonight?,O senhor está de folga esta noite?,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
98268,These boxes are the same size.,Estas caixas são do mesmo tamanho.,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
115698,This isn't going to happen again.,Isto não irá acontecer novamente.,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
83637,I stayed in Boston with Tom.,Eu fiquei em Boston com o Tom.,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
163232,Mississippi is the poorest state in the United...,Mississippi é o estado mais pobre dos Estados ...,CC-BY 2.0 (France) Attribution: tatoeba.org #3...


**Removing the unnecessary column**

In [None]:
# Column 2 holds the licence/attribution text, which is not needed for training.
df = df.drop(columns=[2])

In [None]:
# Preview the first three rows to confirm that only the two text columns remain.
df.head(3)

Unnamed: 0,0,1
0,Go.,Vai.
1,Go.,Vá.
2,Hi.,Oi.


**Checking for missing values**

In [None]:
# Count the missing values in each column.
df.isnull().sum()

0    0
1    0
dtype: int64

**Removing duplicates**

In [None]:
# Print the frame's (rows, columns) shape before and after removing exact duplicate
# rows so the number of dropped rows is visible.
shape_before = df.shape
print(shape_before)
df = df.drop_duplicates()
shape_after = df.shape
print(shape_after)

(168903, 2)
(168903, 2)


**Renaming columns**

In [None]:
# Replace the integer column labels with language codes: 0 -> 'en', 1 -> 'pt'.
column_names = {0: 'en', 1: 'pt'}
df = df.rename(columns=column_names)

In [None]:
# Show one row to confirm the new 'en' / 'pt' column names.
df.head(1)

Unnamed: 0,en,pt
0,Go.,Vai.


**Making an index column**

In [None]:
# Give every sentence pair a sequential integer id (0 .. number_of_rows - 1).
# The length is taken from the frame itself rather than hard-coded, so the cell keeps
# working if the source file changes or duplicate removal drops rows.
df['id'] = np.arange(len(df))

**Making train, test and validation splits**

In [None]:
# Two-stage split with a fixed seed so the partitions are reproducible:
#   1. hold out 25% of all rows as the validation set;
#   2. hold out 25% of the remaining rows as the test set.
split_options = dict(test_size=0.25, random_state=1)
train_old, validation = train_test_split(df, **split_options)
train, test = train_test_split(train_old, **split_options)

**Converting the DataFrames to Hugging Face `Dataset` objects (Apache Arrow format) for faster processing**

In [None]:
# Wrap each pandas split in a Hugging Face `Dataset`, keeping the same variable names.
train, test, validation = (
    Dataset.from_pandas(frame) for frame in (train, test, validation)
)

In [None]:
# Display the training Dataset's feature names and row count.
train

Dataset({
    features: ['en', 'pt', 'id', '__index_level_0__'],
    num_rows: 95007
})

**Removing the index column just generated**

In [None]:
# The conversion from pandas left an extra '__index_level_0__' feature in each split;
# drop it everywhere.
index_column = ['__index_level_0__']
train = train.remove_columns(index_column)
test = test.remove_columns(index_column)
validation = validation.remove_columns(index_column)

**Further data preprocessing to convert the data into the format accepted by the model**

In [None]:
# Add the 'translation' key containing 'en' and 'pt' columns to the dataset

def _nest_translation(example):
    """Build the per-row update: a nested 'translation' dict plus the unchanged 'id'."""
    sentence_pair = {'en': example['en'], 'pt': example['pt']}
    return {'translation': sentence_pair, 'id': example['id']}


# Apply the same row-wise transformation to every split.
train = train.map(_nest_translation)

test = test.map(_nest_translation)

validation = validation.map(_nest_translation)

Map:   0%|          | 0/95007 [00:00<?, ? examples/s]

Map:   0%|          | 0/31670 [00:00<?, ? examples/s]

Map:   0%|          | 0/42226 [00:00<?, ? examples/s]

In [None]:
# The sentences now live inside 'translation', so the flat 'en' / 'pt' columns are redundant.
flat_text_columns = ['en', 'pt']

train = train.remove_columns(flat_text_columns)

test = test.remove_columns(flat_text_columns)

validation = validation.remove_columns(flat_text_columns)

In [None]:
# Look at the first training example to confirm the nested {'en': ..., 'pt': ...} layout.
train['translation'][0]

{'en': 'Are you studying French?', 'pt': 'Vocês estão estudando Francês?'}

In [None]:
# Bundle the three splits into a single DatasetDict keyed by split name.
ds = DatasetDict({
    'train': train,
    'test': test,
    'validation': validation,
})

# Display the resulting structure (features and row counts per split).
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 95007
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 31670
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 42226
    })
})

In [None]:
# Same check as before, this time through the DatasetDict: first training example.
ds['train']['translation'][0]

{'en': 'Are you studying French?', 'pt': 'Vocês estão estudando Francês?'}

**Downloading the suitable pretrained model for fine-tuning**

In [None]:
# Pretrained English -> Portuguese checkpoint used as the starting point for fine-tuning.
model_checkpoint = "Helsinki-NLP/opus-mt-tc-big-en-pt"

# Load the tokenizer that belongs to the checkpoint.
# `return_tensors` is an argument of the tokenizer *call*, not of `from_pretrained`, where
# it has no effect on tokenization — so it is not passed here. Batches are turned into
# tensors later by the data collator.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/337 [00:00<?, ?B/s]

Downloading source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

Downloading target.spm:   0%|          | 0.00/825k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

**Defining tokenization function**

In [None]:
max_length = 128


def preprocess_function(examples):
    """Tokenize a batch of sentence pairs.

    Args:
        examples: a batch whose "translation" entry is a sequence of dicts, each with
            an "en" (source) and a "pt" (target) string.

    Returns:
        Whatever the module-level `tokenizer` returns when called with the English
        sentences as inputs and the Portuguese sentences as `text_target`, truncated
        to `max_length`.
    """
    source_texts = []
    target_texts = []
    for pair in examples["translation"]:
        source_texts.append(pair["en"])
        target_texts.append(pair["pt"])

    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=max_length,
        truncation=True,
    )

**Tokenizing the dataset**

In [None]:
# Tokenize every split in batches. The original columns are dropped so that only the
# features produced by `preprocess_function` remain.
raw_columns = ds["train"].column_names
tokenized_datasets = ds.map(preprocess_function, batched=True, remove_columns=raw_columns)

Map:   0%|          | 0/95007 [00:00<?, ? examples/s]

Map:   0%|          | 0/31670 [00:00<?, ? examples/s]

Map:   0%|          | 0/42226 [00:00<?, ? examples/s]

**Initializing the model, data collator and evaluation metric**

In [None]:
# Sequence-to-sequence model initialised from the pretrained checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Collator used by the data loaders to assemble batches; it is given both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# SacreBLEU metric used to score the generated translations.
metric = evaluate.load("sacrebleu")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/465M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

**Preparing train and evaluation data loaders**

In [None]:
# Make the tokenized datasets return torch tensors.
tokenized_datasets.set_format("torch")

# Settings shared by both loaders: same collator, 16 examples per batch.
loader_options = dict(collate_fn=data_collator, batch_size=16)

# Training batches are reshuffled; validation batches keep their order.
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_options)
eval_dataloader = DataLoader(tokenized_datasets["validation"], **loader_options)

**Initializing the AdamW optimizer**

In [None]:
# Use PyTorch's own AdamW: the `AdamW` class shipped with transformers is deprecated in
# favour of `torch.optim.AdamW`.
# `weight_decay=0.0` and `eps=1e-6` are the defaults of the transformers implementation,
# spelled out so the hyper-parameters stay the same (torch defaults to 0.01 and 1e-8).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0, eps=1e-6)



**Initializing accelerator**

In [None]:
# Let Accelerate wrap the model, optimizer and both data loaders; the wrapped objects
# replace the originals under the same names.
accelerator = Accelerator()
prepared_objects = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)
model, optimizer, train_dataloader, eval_dataloader = prepared_objects

**Defining training arguments**

In [None]:
# Total number of optimizer steps = epochs x batches per epoch (one update per batch).
num_train_epochs = 2
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear learning-rate schedule over the whole run, with no warm-up steps.
scheduler_options = dict(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
lr_scheduler = get_scheduler("linear", **scheduler_options)

**Defining the name and output directory of trained model**

In [None]:
# Label for the fine-tuned model; it is not referenced by any other visible cell.
# NOTE(review): the name mentions "kde4" although the data is loaded from por.txt — confirm.
model_name = "marian-finetuned-kde4-en-to-pt-accelerate"

# Directory where the model and tokenizer are saved at the end of every epoch.
output_dir = "/content/drive/MyDrive/nlp/translation/translation"

**Defining the post processing function**

In [None]:
def postprocess(predictions, labels):
    """Turn generated and reference token ids into text ready for the metric.

    Args:
        predictions: tensor of generated token ids.
        labels: tensor of reference token ids, where -100 marks positions to ignore.

    Returns:
        A tuple `(decoded_preds, decoded_labels)`: a list of stripped prediction strings,
        and a list of single-element lists holding the stripped reference strings.
    """
    pred_ids = predictions.cpu().numpy()
    label_ids = labels.cpu().numpy()

    pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

    # -100 cannot be decoded, so swap it for the pad token id before decoding.
    label_ids = np.where(label_ids == -100, tokenizer.pad_token_id, label_ids)
    label_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Trim surrounding whitespace; each reference is wrapped in its own list.
    decoded_preds = [text.strip() for text in pred_texts]
    decoded_labels = [[text.strip()] for text in label_texts]
    return decoded_preds, decoded_labels

**Model Training**

In [None]:
# One progress bar shared by all epochs; it advances once per training batch.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        # The batch includes the labels, so the model output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        # Backpropagate through the accelerator instead of calling loss.backward() directly.
        accelerator.backward(loss)

        # Apply the update, advance the learning-rate schedule, then clear the
        # gradients so they do not accumulate into the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation on the validation loader, run once at the end of every epoch.
    model.eval()
    for batch in tqdm(eval_dataloader):
        # Generate translations with gradient tracking disabled; generation is called
        # on the unwrapped model and limited by max_length=128.
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128,
            )
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        # (predictions are padded with the pad token id, labels with -100).
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)

        # Decode ids to text and add this batch to the metric's running state.
        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Score everything added during this epoch's evaluation pass.
    results = metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

    # Save the model and tokenizer. The same output_dir is reused every epoch, so
    # later epochs write over the files of earlier ones.
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    # Only the main process writes the tokenizer files.
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)

  0%|          | 0/11876 [00:00<?, ?it/s]

  0%|          | 0/2640 [00:00<?, ?it/s]

epoch 0, BLEU score: 61.33


  0%|          | 0/2640 [00:00<?, ?it/s]

epoch 1, BLEU score: 61.90


In [None]:
# Pick one example from the test split to use as a reference for the demo translation.
ds['test']['translation'][5:6]

[{'en': 'I teach mathematics and physics.',
  'pt': 'Eu ensino matemática e física.'}]

**Translating a custom English sentence into Portuguese with the fine-tuned model**

In [None]:
# Point the checkpoint at the directory the training loop saved to, and build a
# translation pipeline from it.
model_checkpoint = "/content/drive/MyDrive/nlp/translation/translation/"
translator = pipeline("translation", model=model_checkpoint)

# Translate one sentence; the text is read from the first result's 'translation_text' entry.
source_sentence = "I teach mathematics and physics."
output = translator(source_sentence)
first_result = output[0]
translated_text = first_result['translation_text']
print(translated_text)

Eu ensino matemática e física.
