<a href="https://colab.research.google.com/github/highplainscomputing/HPC_T5/blob/main/HPC_T5/T5%20/02-language-translatio/language_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install necessary dependencies

In [1]:
!pip install accelerate -U
!pip install -q -U datasets
!pip install scipy
!pip install ipywidgets
!pip install wandb
!pip install transformers
!pip install torch
!pip install sentencepiece
!pip install tqdm
!pip install evaluate
!pip install rouge_score
!pip install huggingface_hub

Collecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.26.1
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00

# Import dependencies

In [2]:
# Data libraries
from datasets import load_dataset, Dataset, concatenate_datasets, list_metrics, load_metric

# preprocessing Libraries
import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import nltk
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Model libraries
import transformers
import ast
import torch
import sklearn
from transformers import (
    T5Tokenizer, T5TokenizerFast, T5ForConditionalGeneration,
    AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM,
    DefaultDataCollator, T5ForQuestionAnswering, DataCollatorForSeq2Seq, pipeline, AutoModelForQuestionAnswering, BertTokenizer, BertForQuestionAnswering
)

# Metrics library
import evaluate
from evaluate import evaluator

# Visualization library
import wandb

# Warning libraries
import warnings
warnings.filterwarnings('ignore')

# Utility functions
import t5_utils as utils


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Global variables

In [6]:
DATASET = "enimai/MuST-C-and-WMT16-de-en"
PREFIX = "Translate English in German"
INPUT_COL = "en"
OUTPUT_COL = "de"
MODEL = "google/flan-t5-base"
TRAIN_SPLIT = "train[:8000]"
VALIDATION_SPLIT = "train[8000:8900]"
TEST_SPLIT = "test"



# Training args

MAX_STEPS = 1000
PER_DEVICE_TRAIN_BATCH_SIZE = 4
PER_DEVICE_EVAL_BATCH_SIZE = 4
SAVE_STEPS = 100
EVAL_STEPS = 100
LOGGING_STEPS = 100
CHECKPOINT_SAVED = 1000
OPTIMIZER = "adamw_torch"
# adamw_hf, adamw_torch, adamw_torch_fused, adamw_apex_fused, adamw_anyprecision or adafactor.
LEARNING_RATE = 2.5e-5
COLLATOR = "seq2seq" # seq2seq or default

# for compute metrics
TASK = "text2text-generation"
METRICS = "rouge"

In [7]:
utils.metric = evaluate.load(METRICS)
utils.tokenizer = T5Tokenizer.from_pretrained(MODEL)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Data gathering and preparation

This function takes data from HuggingFace or local data, split ( train, test, validation, train[:slicing] ), input_col, target_col, prefix(column or text)

In [8]:
train_data = utils.prepare_data(
                                DATASET,
                                TRAIN_SPLIT,
                                INPUT_COL,
                                OUTPUT_COL,
                                PREFIX
                                )

eval_data = utils.prepare_data(
                               DATASET,
                               VALIDATION_SPLIT,
                               INPUT_COL,
                               OUTPUT_COL,
                               PREFIX
                               )

test_data = utils.prepare_data(
                               DATASET,
                               TEST_SPLIT,
                               INPUT_COL,
                               OUTPUT_COL,
                               PREFIX
                               )

Downloading data:   0%|          | 0.00/1.41G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/837k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Here is how a data looks like

In [9]:
input_data_first_train = train_data[0]["input_text"]
targer_data_first_train = train_data[0]["target_text"]
print(f"input_text : {input_data_first_train}")
print("--------------------------------")
print(f"target_text : {targer_data_first_train}")

input_text : Translate English in German : In the proposals made by the Commission in Agenda 2000, that issue is also taken up in the proposal that for all the candidates without exception, accession partnerships should represent the framework on which a continuous course towards membership should be pursued, with everything that implies.
--------------------------------
target_text : In den von der Kommission in der Agenda 2000 unterbreiteten Empfehlungen wird diese Frage auch in dem Vorschlag aufgegriffen, daß für alle Kandidaten ohne Ausnahme Beitrittspartnerschaften den Rahmen bilden sollen, innerhalb dessen ein stetiger Kurs auf die Mitgliedschaft verfolgt werden sollte, mit allem, was dazugehört.


In [10]:
input_data_first_eval = eval_data[0]["input_text"]
targer_data_first_eval = eval_data[0]["target_text"]
print(f"input_text : {input_data_first_eval}")
print("--------------------------------")
print(f"target_text : {targer_data_first_eval}")

input_text : Translate English in German : Charles Weissmann will give a keynote lecture on Thursday evening and Adriano Aguzzi will open the meeting with an overview of the history of prion research in Zurich.
--------------------------------
target_text : Adriano Aguzzi eröffnet das Symposium mit einer historischen Übersicht über die Prionenforschung in Zürich. Am Donnerstagabend wird Charles Weissmann einen Vortrag halten, zu dem auch alle Studierenden herzlich eingeladen sind.


In [11]:
input_data_first_test = test_data[0]["input_text"]
targer_data_first_test = test_data[0]["target_text"]
print(f"input_text : {input_data_first_test}")
print("--------------------------------")
print(f"target_text : {targer_data_first_test}")

input_text : Translate English in German : In fact, their life depended on doing the job.
--------------------------------
target_text : Denn ihr Leben hing davon ab, ob sie den Job richtig machten.


In [12]:
max_source_length, max_target_length = utils.identify_max_lengths(
                                                                  train_data,
                                                                  eval_data,
                                                                  model = MODEL
                                                                  )

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/8900 [00:00<?, ? examples/s]

Map:   0%|          | 0/8900 [00:00<?, ? examples/s]

In [13]:
print(f"Input max length : {max_source_length}")
print("--------------------------------")
print(f"Target max length : {max_target_length}")


Input max length : 260
--------------------------------
Target max length : 260


In [14]:
train_data_tokenized, eval_data_tokenized = utils.preprocessed_data(
                                                                    train_data,
                                                                    eval_data,
                                                                    max_source_length,
                                                                    max_target_length,
                                                                    model = MODEL)


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Training

Look at utility functions to get in-depth understanding of training function

In [15]:
utils.train_model(
           train_data_tokenized,
           train_data_tokenized,
           project = "Finetunning-t5",
           model = MODEL,
           base_model_name = "T5",
           max_steps = MAX_STEPS,
           per_device_train_batch_size = PER_DEVICE_TRAIN_BATCH_SIZE,
           per_device_eval_batch_size = PER_DEVICE_EVAL_BATCH_SIZE,
           save_steps = SAVE_STEPS,
           eval_steps = EVAL_STEPS,
           logging_steps = LOGGING_STEPS,
           checkpoint_saved = CHECKPOINT_SAVED,
           optimizer = OPTIMIZER,
           learning_rate = LEARNING_RATE,
           collator = COLLATOR
           )

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
100,2.7784,2.407845,14.8794,6.7971,13.8713,14.0366,264.0
200,2.7537,2.40695,14.6219,6.6549,13.6487,13.7921,264.0
300,2.8589,2.398396,13.7876,6.2717,12.8434,12.9862,264.0
400,2.7918,2.398496,14.3999,6.5467,13.4217,13.5697,264.0
500,2.8644,2.390487,14.3385,6.5399,13.3712,13.5192,264.0
600,2.8044,2.385367,14.0313,6.4017,13.0948,13.2312,264.0


KeyboardInterrupt: 

# Evaluation

# Inference

This function takes 3 argumnets : task(summarization, classification, etc), text(data), model(pretrained, fine tuned)

In [17]:
utils.inference("Translate English in German",
                "In the proposals made by the Commission in Agenda 2000, that issue is also taken up in the proposal that for all the candidates without exception, accession partnerships should represent the framework on which a continuous course towards membership should be pursued, with everything that implies.",
                fined_tuned_path = "/content/T5-Finetunning-t5/checkpoint-600")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


'In den Vorschlägen der Kommission im Agenda 2000 wird diese Frage auch in der Vorschlag gestellt, dass für alle Kandidaten ohne Ausnahme die Beitrittspartnerschaften die Rahmen für die Mitgliedschaft ein kontinuierliches Weg zu folgen, mit allen, die erfordert.'

In [18]:
utils.inference("Translate English in German",
                "Why the image is not opening",
                fined_tuned_path = "/content/T5-Finetunning-t5/checkpoint-600")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


'Warum die Bild nicht öffnen?'

In [19]:
tokenizer = T5Tokenizer.from_pretrained("/content/T5-Finetunning-t5/checkpoint-600")
model = T5ForConditionalGeneration.from_pretrained("/content/T5-Finetunning-t5/checkpoint-600")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
tokenizer.push_to_hub("HuzaifaHPC/T5_English_German")
model.push_to_hub("HuzaifaHPC/T5_English_German")

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/HuzaifaHPC/T5_English_German/commit/e674fb0344a6e745ffa4d388797ba3477eb2d140', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='e674fb0344a6e745ffa4d388797ba3477eb2d140', pr_url=None, pr_revision=None, pr_num=None)