In [1]:
# Mount Google Drive at /content/drive so files stored there can be read from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q datasets sacrebleu torch transformers sentencepiece transformers[sentencepiece]
!pip install -q accelerate -U
!pip install -q wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m252.4 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.1/316.1 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency 

Neural Machine Translation using T5 (Seq2Seq Trainer) from huggingface. <br>
Source Language - English <br>
Target Language - Portuguese (Later Nepali)

In [3]:
import pathlib
import numpy as np
import pandas as pd
import warnings
import torch
import transformers

from tqdm import tqdm
from sklearn.model_selection import train_test_split

from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer

from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries imported above.
warnings.filterwarnings("ignore")

### Raw Dataset to CSV File

In [39]:
# Central configuration: file paths, dictionary/column key names, sequence
# lengths, and the model checkpoint shared by the cells of this notebook.
config = {
    # Raw tab-separated corpus that is read and converted to a CSV file.
    'RAW_DATASET_PATH': "/content/drive/MyDrive/Colab Code/Week 1 - Neural Machine Translation/por-eng/por.txt",
    # Per-device batch size used for both training and evaluation.
    'BATCH_SIZE': 32,
    # Key under which English text is kept inside a translation record.
    'ENGLISH' : "en",
    'ENGLISH_TEXT' : "english_text",
    # Location the sentence-pair CSV is written to and read back from.
    'CSV_FILENAME' : "/content/en-pt-translation.csv",
    'GEN_LEN' : "gen_len",
    'MAX_GEN_LENGTH': 32,
    'MAX_INPUT_LENGTH' : 32, # maximum sequence length (in tokens) passed to the tokenizer as max_length
    'MAX_TARGET_LENGTH' : 32,
    # Key names used when assembling tokenized examples.
    'LABELS' : "labels",
    'INPUT_IDS': "input_ids",
    # Text prefix (currently empty) - presumably meant to be prepended to source sentences; confirm.
    'PREFIX' : "",
    # Key under which Portuguese text is kept inside a translation record.
    'PORTUGUESE' : "pt",
    'PORTUGUESE_TEXT' : "portuguese_text",
    'SCORE' : "score",
    # NOTE(review): the notebook header and the en->pt checkpoint below suggest the
    # source language is English and the target Portuguese; these two values look
    # swapped - confirm before relying on them.
    'SOURCE_LANG' : "pt",
    'TARGET_LANG' : "en",
    # Key that holds the list of translation records.
    'TRANSLATION' : "translation",
    # CSV column headers: English sentences first, Portuguese sentences second.
    'COLUMN_NAMES': ['English Translation', 'Portuguese Translation'],
    # Checkpoint name handed to `from_pretrained` for both the tokenizer and the model.
    'MODEL_CHECKPOINT': "unicamp-dl/translation-en-pt-t5",
    # Use the GPU when one is available, otherwise fall back to the CPU.
    'DEVICE': torch.device('cuda' if torch.cuda.is_available() else 'cpu')
}

In [40]:
# Load the raw tab-separated corpus and persist the sentence pairs as a CSV file.
raw_dataset = pathlib.Path(config['RAW_DATASET_PATH'])
text_data = raw_dataset.read_text(encoding='utf-8')

# Every line carries tab-separated fields; the first two are the sentence pair
# (in the order given by config['COLUMN_NAMES']) and anything after is ignored.
# Blank or malformed lines (fewer than two fields) are skipped instead of
# aborting the whole conversion with an unpacking ValueError.
translation_pairs = [
    fields[:2]
    for fields in (line.split('\t') for line in text_data.splitlines())
    if len(fields) >= 2
]

df = pd.DataFrame(translation_pairs, columns=config['COLUMN_NAMES'])
# index=False keeps the row index out of the file, so reading it back does not
# produce a spurious "Unnamed: 0" column.
df.to_csv(config['CSV_FILENAME'], index=False)

# Free the intermediates; the data is reloaded from the CSV file afterwards.
del translation_pairs
del df

In [41]:
# Reload the sentence pairs from disk, keeping only the two text columns.
df = pd.read_csv(config['CSV_FILENAME']).loc[:, config['COLUMN_NAMES']]
df.head()

Unnamed: 0,English Translation,Portuguese Translation
0,Go.,Vai.
1,Go.,Vá.
2,Hi.,Oi.
3,Run!,Corre!
4,Run!,Corra!


In [42]:
def postprocess_text(preds: list, labels: list) -> tuple:
    """Trim whitespace from decoded text and shape the references for scoring.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple in which every prediction is stripped and
        every label is stripped and wrapped in its own single-item list.
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        # Each reference sits in its own one-element list.
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels


def prep_data_for_model_fine_tuning(source_lang: list, target_lang: list) -> dict:
    """Pair up source and target sentences as translation records.

    Args:
        source_lang: Source-language sentences. This notebook passes the
            English column here.
        target_lang: Target-language sentences, aligned index-by-index with
            ``source_lang``. This notebook passes the Portuguese column here.

    Returns:
        A dict with the single key ``config['TRANSLATION']`` whose value is a
        list of ``{config['ENGLISH']: ..., config['PORTUGUESE']: ...}`` records,
        one per sentence pair (``zip`` stops at the shorter input).
    """
    # The source sentences are English and the targets Portuguese, so each text
    # is stored under the key of its own language. Storing them the other way
    # round makes a later lookup of the 'en' field return Portuguese text.
    records = [
        {config['ENGLISH']: sr_text, config['PORTUGUESE']: tr_text}
        for sr_text, tr_text in zip(source_lang, target_lang)
    ]

    return {config['TRANSLATION']: records}


def generate_model_ready_dataset(dataset: list, source: str, target: str,
                                 model_checkpoint: str,
                                 tokenizer: AutoTokenizer):
    """Tokenize translation records into per-example model features.

    Args:
        dataset: Iterable of translation records (dicts keyed by language).
        source: Key of the field in each record that is used as model input.
        target: Key of the field in each record that is used as label text.
        model_checkpoint: Unused; kept so existing callers keep working.
        tokenizer: Tokenizer applied to both the input and the label text.

    Returns:
        A list with one tokenizer output per record. Each entry additionally
        holds the original record under ``config['TRANSLATION']`` and the
        label token ids under ``config['LABELS']``.
    """
    preped_data = []

    for row in dataset:
        # Prepend the configured prefix instead of a hard-coded empty string.
        inputs = config['PREFIX'] + row[source]
        targets = row[target]

        # No padding at this stage: padding a single sentence to the "longest"
        # sequence of the call adds nothing, so it is left to batching time.
        model_inputs = tokenizer(inputs, max_length=config['MAX_INPUT_LENGTH'],
                                 truncation=True)

        model_inputs[config['TRANSLATION']] = row

        # `text_target` tokenizes in target mode and replaces the deprecated
        # `as_target_tokenizer()` context manager; labels are truncated with
        # their own length limit rather than the input one.
        labels = tokenizer(text_target=targets,
                           max_length=config['MAX_TARGET_LENGTH'],
                           truncation=True)
        model_inputs[config['LABELS']] = labels[config['INPUT_IDS']]

        preped_data.append(model_inputs)

    return preped_data


# Cache for the evaluation helpers so they are loaded once rather than on
# every evaluation pass.
_EVAL_RESOURCES = {}


def _get_eval_resources() -> tuple:
    """Return the cached ``(metric, tokenizer)`` pair, loading it on first use."""
    if not _EVAL_RESOURCES:
        _EVAL_RESOURCES['metric'] = load_metric("sacrebleu")
        _EVAL_RESOURCES['tokenizer'] = AutoTokenizer.from_pretrained(config['MODEL_CHECKPOINT'])

    return _EVAL_RESOURCES['metric'], _EVAL_RESOURCES['tokenizer']


def compute_metrics(eval_preds: tuple) -> dict:
    """Compute the BLEU score and the mean generated length for an evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. When the
            predictions are themselves a tuple, only its first element is used.

    Returns:
        A dict with the keys ``'BLEU'`` (sacrebleu score) and ``'gen_len'``
        (mean number of non-padding tokens per prediction), both rounded to
        four decimals.
    """
    metric, tokenizer = _get_eval_resources()

    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a valid token id and cannot be decoded. Labels use it for
    # ignored positions, and predictions can contain it as padding as well,
    # so both are mapped to the pad token before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip whitespace and wrap every reference in a list, as sacrebleu expects.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {'BLEU': result['score']}

    # Length of a prediction = number of tokens that are not padding.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]

    result['gen_len'] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}

    return result


In [43]:
# Show the number of rows (sentence pairs) and columns in the frame.
# Only the last expression of a cell is displayed, so a preceding `df.head()`
# would be evaluated and silently discarded.
df.shape

(190639, 2)

#### Train-Test-Validation Data Split

In [44]:
# X holds the English sentences, y the Portuguese ones.
X = df['English Translation']
y = df['Portuguese Translation']

# First split: hold out 10% of all pairs as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.10,
    shuffle=True,
    random_state=69,
)

for _caption, _split in (
    ("INITIAL X-TRAIN SHAPE: ", x_train),
    ("INITIAL Y-TRAIN SHAPE: ", y_train),
    ("X-TEST SHAPE: ", x_test),
    ("Y-TEST SHAPE: ", y_test),
):
    print(_caption, _split.shape)

# Second split: 20% of the remaining training pairs become the validation set.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train,
    test_size=0.20,
    shuffle=True,
    random_state=69,
)

for _caption, _split in (
    ("FINAL X-TRAIN SHAPE: ", x_train),
    ("FINAL Y-TRAIN SHAPE: ", y_train),
    ("X-VAL SHAPE: ", x_val),
    ("Y-VAL SHAPE: ", y_val),
):
    print(_caption, _split.shape)

INITIAL X-TRAIN SHAPE:  (171575,)
INITIAL Y-TRAIN SHAPE:  (171575,)
X-TEST SHAPE:  (19064,)
Y-TEST SHAPE:  (19064,)
FINAL X-TRAIN SHAPE:  (137260,)
FINAL Y-TRAIN SHAPE:  (137260,)
X-VAL SHAPE:  (34315,)
Y-VAL SHAPE:  (34315,)


In [45]:
# Tokenizer that matches the checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(config['MODEL_CHECKPOINT'])

# Turn each split (train, validation, test - in that order) into a dict of
# translation records built from its X and y values.
training_data, validation_data, test_data = (
    prep_data_for_model_fine_tuning(sources.values, targets.values)
    for sources, targets in ((x_train, y_train), (x_val, y_val), (x_test, y_test))
)

In [46]:
# Tokenize every split with identical settings: model inputs are read from the
# 'en' field of each record and labels from the 'pt' field.
# The tuple of splits is evaluated before any name is rebound, so
# `validation_data` and `test_data` still refer to the record dicts here.
train_data, validation_data, test_data = (
    generate_model_ready_dataset(dataset=split['translation'],
                                 tokenizer=tokenizer,
                                 source='en',
                                 target='pt',
                                 model_checkpoint=config['MODEL_CHECKPOINT'])
    for split in (training_data, validation_data, test_data)
)

In [47]:
# Tokenized examples -> pandas frames (one row per example).
train_df, validation_df, test_df = (
    pd.DataFrame.from_records(examples)
    for examples in (train_data, validation_data, test_data)
)

# pandas frames -> `datasets.Dataset` objects consumed by the trainer.
train_dataset, validation_dataset, test_dataset = (
    Dataset.from_pandas(frame)
    for frame in (train_df, validation_df, test_df)
)

In [48]:
# Load the pretrained seq2seq checkpoint, then move it onto the configured device.
model = AutoModelForSeq2SeqLM.from_pretrained(config['MODEL_CHECKPOINT'])
model = model.to(config['DEVICE'])

In [37]:
import wandb

# Start a Weights & Biases run; the training arguments further down report to it
# through report_to=["wandb"].
# NOTE(review): this cell's execution count (37) is lower than its neighbours',
# i.e. it was run out of order - on a fresh kernel it has to run before training.
wandb.init(project = "translation-en-pt", name = "t5-finetuning-en-to-pt")

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [49]:
# Hyper-parameters and bookkeeping options for the fine-tuning run.
model_args = Seq2SeqTrainingArguments(
    "translation-en-pt-t5-finetuned-en-to-pt",  # output directory
    evaluation_strategy='epoch',                # evaluate at the end of every epoch
    learning_rate=2e-4,
    per_device_train_batch_size=config['BATCH_SIZE'],
    per_device_eval_batch_size=config['BATCH_SIZE'],
    weight_decay=0.02,
    save_total_limit=3,                         # keep at most three checkpoints on disk
    num_train_epochs=10,
    predict_with_generate=True,                 # evaluation metrics are computed on generated text
    report_to=["wandb"],                        # send training logs to Weights & Biases
    logging_dir="./logs",                       # directory for storing logs
    logging_steps=50,                           # log every 50 steps
)

# Collates examples into batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model,
    model_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Always close the W&B run, even when training fails or is interrupted by hand
# (KeyboardInterrupt is a BaseException, hence the broad clause). A non-zero
# exit code marks the run as failed; the exception is re-raised unchanged.
try:
    trainer.train()
except BaseException:
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 