In [33]:
import pandas as pd
import numpy as np
from unidecode import unidecode
import tensorflow as tf 
from sklearn.model_selection import train_test_split
from transformers import T5TokenizerFast, TFAutoModelForSeq2SeqLM
from tensorflow.keras.callbacks import EarlyStopping
import re
import warnings
warnings.filterwarnings("ignore")

In [2]:
def get_data(url: str, n_rows: int, random_state=None):
    """Load sentence pairs from a CSV file, shuffle them and drop incomplete rows.

    Parameters
    ----------
    url : str
        Path or URL passed to ``pandas.read_csv``. The file must contain
        exactly two columns (source sentence, target sentence); otherwise the
        column rename below raises ``ValueError``.
    n_rows : int
        Number of leading rows of the file to keep before shuffling.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` so the shuffle can be reproduced.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pd.DataFrame
        Shuffled frame with columns ``input`` and ``target`` and no missing
        values in either column. The original row labels are kept as index.
    """
    data = pd.read_csv(url).iloc[:n_rows, :]
    # ``sample(frac=1)`` returns a new, fully shuffled frame, so no extra copy is needed.
    data = data.sample(frac=1, random_state=random_state)
    data.columns = ["input", "target"]
    # A pair is only usable when both sides are present.
    return data.dropna(subset=["input", "target"])

In [3]:
# Load the first 80,000 rows of the local English/French CSV (shuffled, incomplete pairs dropped).
data = get_data(url="data/eng_french.csv", n_rows=80000)

In [4]:
# Preview a few shuffled sentence pairs to sanity-check the column layout.
data.head()

Unnamed: 0,input,target
15382,Is everyone here?,Tout le monde est-il là ?
5072,Give it to me.,Donne-le-moi.
26116,They're all normal.,Elles sont toutes normales.
26590,We adopted a child.,Nous avons adopté un enfant.
1295,I know CPR.,Je connais la RCP.


In [5]:
def preprocess_data(data: pd.DataFrame):
    """Return a copy of ``data`` whose ``input`` column is ready for the model.

    Every input sentence is transliterated with ``unidecode`` and prefixed with
    the task instruction ``"translate English to French: "``. The frame passed
    in is not modified; all other columns are carried over unchanged.
    """
    task_prefix = "translate English to French: "
    transliterated = data["input"].map(unidecode)
    # ``assign`` works on a fresh copy, so the caller's frame stays intact.
    return data.assign(input=task_prefix + transliterated)

In [6]:
# Add the task prefix to every input sentence (note: rebinds `data` to the preprocessed frame).
data = preprocess_data(data)

In [7]:
def split_data(data: pd.DataFrame, input_col: str="input", target_col: str="target", test_size: float=0.1):
    """Split the sentence pairs into train and test lists.

    The split uses ``train_test_split`` with a fixed ``random_state`` of 42.
    The shapes of the four resulting series are printed before they are
    converted to plain Python lists.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``, each a list.
    """
    parts = train_test_split(
        data[input_col],
        data[target_col],
        test_size=test_size,
        random_state=42,
    )
    x_train, x_test, y_train, y_test = parts

    print(f'x_train.shape: {x_train.shape}, x_test.shape: {x_test.shape}, '+
          f'y_train.shape: {y_train.shape}, y_test.shape: {y_test.shape}')
    return tuple(part.to_list() for part in parts)

In [8]:
# Hold out the default 10% of the pairs for validation.
x_train, x_test, y_train, y_test = split_data(data=data)

x_train.shape: (72000,), x_test.shape: (8000,), y_train.shape: (72000,), y_test.shape: (8000,)


In [9]:
# Show the first training pair to confirm the task prefix was applied.
x_train[0], y_train[0]

("translate English to French: That's what worrying me.",
 "C'est ce qui m'inquiète.")

In [10]:
# Training configuration shared by the tokenisation, training and inference cells.
strategy = tf.distribute.MirroredStrategy()  # the model is built inside this strategy's scope
CHECKPOINT = "t5-small"  # checkpoint name used for both the tokenizer and the model
N_TOKENS = 100 # considering only 100 tokens due to memory constraints
BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # global batch size: 16 examples per replica

In [11]:
def tokenize(input: list, target: list, n_tokens: int, tokenizer=None):
    """Tokenize source/target sentence pairs to fixed-length id sequences.

    Parameters
    ----------
    input : list
        Source sentences (already carrying the task prefix).
    target : list
        Target sentences, aligned with ``input``.
    n_tokens : int
        Length every sequence is truncated or padded to.
    tokenizer : optional
        Tokenizer to use. When omitted, the ``CHECKPOINT`` tokenizer is loaded,
        which is the previous behaviour. Passing one avoids reloading it for
        every split.

    Returns
    -------
    The tokenizer output (``input_ids``, ``attention_mask``, ``labels``), with
    padded label positions set to -100.
    """
    if tokenizer is None:
        tokenizer = T5TokenizerFast.from_pretrained(CHECKPOINT)

    # Only show the example when there is one; indexing an empty list would raise.
    if len(input) > 0:
        print(f'Example:\n{input[0]}\n{tokenizer.tokenize(input[0])}')

    tokenized_data = tokenizer(text=input, text_target=target, max_length=n_tokens, truncation=True, padding="max_length")

    # With padding="max_length" most label positions are padding. Hugging Face
    # models ignore label id -100 when computing the loss, so mask the padding
    # instead of training the model to predict it.
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is not None and "labels" in tokenized_data:
        tokenized_data["labels"] = [
            [-100 if token_id == pad_id else token_id for token_id in label_ids]
            for label_ids in tokenized_data["labels"]
        ]
    return tokenized_data

In [12]:
# Smoke-test the tokenizer on two pairs and inspect the raw output.
tokenize(input=x_train[:2], target=y_train[:2], n_tokens=N_TOKENS)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Example:
translate English to French: That's what worrying me.
['▁translate', '▁English', '▁to', '▁French', ':', '▁That', "'", 's', '▁what', '▁worrying', '▁me', '.']


{'input_ids': [[13959, 1566, 12, 2379, 10, 466, 31, 7, 125, 19348, 140, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [13959, 1566, 12, 2379, 10, 27, 737, 31, 17, 214, 255, 47, 3, 1092, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1

In [13]:
# Tokenize both splits to fixed-length sequences of N_TOKENS ids.
tokenized_train = tokenize(input=x_train, target=y_train, n_tokens=N_TOKENS)
tokenized_test = tokenize(input=x_test, target=y_test, n_tokens=N_TOKENS)

Example:
translate English to French: That's what worrying me.
['▁translate', '▁English', '▁to', '▁French', ':', '▁That', "'", 's', '▁what', '▁worrying', '▁me', '.']
Example:
translate English to French: Don't yell at me.
['▁translate', '▁English', '▁to', '▁French', ':', '▁Don', "'", 't', '▁', 'y', 'ell', '▁at', '▁me', '.']


In [14]:
# Integer indexing returns the encoding object of a single example.
tokenized_train[0]

Encoding(num_tokens=100, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [15]:
# Slicing returns a list of per-example encoding objects.
tokenized_train[:5]

[Encoding(num_tokens=100, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=100, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=100, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=100, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=100, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

In [16]:
def return_tf_tensors(data):
    """Wrap tokenized data in a ``tf.data.Dataset`` that yields one example at a time.

    ``data`` is converted to a plain dict first, so every key becomes a feature
    of the dataset elements. The resulting dataset is unbatched and has
    prefetching enabled with ``tf.data.AUTOTUNE``.
    """
    features = dict(data)
    dataset = tf.data.Dataset.from_tensor_slices(features)
    return dataset.prefetch(tf.data.AUTOTUNE)

In [17]:
# Convert both tokenized splits to unbatched tf.data datasets; batching happens in fit_model.
train_tf_data = return_tf_tensors(tokenized_train)
test_tf_data = return_tf_tensors(tokenized_test)

In [18]:
# Print a single dataset element to check the feature names, shapes and dtypes.
for i in train_tf_data.take(1):
    print(i)

{'input_ids': <tf.Tensor: shape=(100,), dtype=int32, numpy=
array([13959,  1566,    12,  2379,    10,   466,    31,     7,   125,
       19348,   140,     5,     1,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(100,), dtype=int32, numpy=
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   

In [19]:
def fit_model(train_data, val_data, epochs=2, eta=1e-4, early_stopping_patience=1, batch_size=BATCH_SIZE):
    """Fine-tune the ``CHECKPOINT`` seq2seq model and return it.

    Parameters
    ----------
    train_data, val_data
        Unbatched ``tf.data.Dataset`` objects of tokenized examples.
    epochs : int
        Maximum number of training epochs.
    eta : float
        Learning rate of the Adam optimizer.
    early_stopping_patience : int
        Epochs without ``val_loss`` improvement tolerated before stopping.
    batch_size : int
        Global batch size used for both datasets.

    Returns
    -------
    The fitted model.
    """
    # Variables must be created inside the strategy scope to be distributed.
    with strategy.scope():
        model = TFAutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT)
        # No loss is passed: the model is expected to fall back to its own
        # internal loss computed from the "labels" feature (Hugging Face
        # behaviour; confirm for the installed transformers version).
        model.compile(optimizer=tf.keras.optimizers.Adam(eta))

    # summary() prints by itself and returns None, so it must not be wrapped in print().
    model.summary()

    # Keep the weights of the best epoch instead of whatever the last epoch produced.
    early_stop = EarlyStopping(monitor="val_loss", patience=early_stopping_patience, mode="min",
                               restore_best_weights=True)

    # Shuffle over the whole training set; prefetch last so batches are prepared ahead of time.
    train_batches = train_data.shuffle(len(train_data)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    # Validation loss does not depend on example order, so the validation set is not shuffled.
    val_batches = val_data.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    model.fit(train_batches, validation_data=val_batches, epochs=epochs, callbacks=[early_stop])
    return model

In [20]:
# Fine-tune with the default hyper-parameters; `model` is used as a global by the inference cells.
model = fit_model(train_data=train_tf_data, val_data=test_tf_data)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Model: "tft5_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  16449536  
                                                                 
 encoder (TFT5MainLayer)     multiple                  35330816  
                                                                 
 decoder (TFT5MainLayer)     multiple                  41625344  
                                                                 
Total params: 60,506,624
Trainable params: 60,506,624
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/2
Epoch 2/2


**INFERENCE**

In [21]:
def inference_tokenize(input: list, n_tokens: int):
    """Tokenize source sentences for generation.

    Loads the ``CHECKPOINT`` tokenizer and encodes ``input`` as TensorFlow
    tensors, truncated or padded to ``n_tokens``.

    Returns
    -------
    tuple
        ``(tokenizer, tokenized_data)``. The tokenizer is returned as well so
        the caller can decode the generated ids.
    """
    t5_tokenizer = T5TokenizerFast.from_pretrained(CHECKPOINT)
    encode_options = dict(max_length=n_tokens, truncation=True, padding="max_length", return_tensors="tf")
    encoded = t5_tokenizer(text=input, **encode_options)
    return t5_tokenizer, encoded

In [25]:
def inference(txt: str):
    """Translate one English text to French with the fine-tuned global ``model``.

    Parameters
    ----------
    txt : str
        English text to translate.

    Returns
    -------
    tuple
        ``(txt, result)``: the untouched input text and the decoded translation.
        Both are also printed.
    """
    # Apply the same preparation as the training inputs: transliterate with
    # unidecode, then prepend the task prefix.
    test_data = ["translate English to French: " + unidecode(txt)]
    inference_tokenizer, tokenized_data = inference_tokenize(input=test_data, n_tokens=N_TOKENS)
    pred = model.generate(**tokenized_data, max_new_tokens=N_TOKENS)
    # Let the tokenizer drop its special tokens; a "<...>" regex would also
    # delete legitimate angle-bracket text from the translation.
    result = inference_tokenizer.decode(pred[0], skip_special_tokens=True).strip()
    print(f"ENGLISH:\n{txt}\n\nFRENCH:\n{result}")
    return (txt, result)

In [26]:
# Example 1: a news sentence, much longer than the short training pairs shown above.
txt = '''
Heavy rainfall in several parts of north India has plunged the region into chaos, with more than 28 reported dead in the past three days.
'''
txt, result = inference(txt)

ENGLISH:

Heavy rainfall in several parts of north India has plunged the region into chaos, with more than 28 reported dead in the past three days.


FRENCH:
Des pluies intenses dans plusieurs parties du nord de l'Inde ont plongé la région dans le chaos, plus de 28 personnes ayant été déclarées mortes au cours des trois derniers jours.


In [27]:
# Example 2: a technical definition containing parentheses and an acronym.
txt = '''
Machine learning is a branch of artificial intelligence (AI) and computer science which focuses on the use of data and algorithms to imitate the way that humans learn, gradually improving its accuracy.
'''
txt, result = inference(txt)

ENGLISH:

Machine learning is a branch of artificial intelligence (AI) and computer science which focuses on the use of data and algorithms to imitate the way that humans learn, gradually improving its accuracy.


FRENCH:
L'apprentissage par machine est une branche de l'intelligence artificielle (AI) et de l'informatique qui se concentre sur l'utilisation de données et d'algorithmes pour imiter la façon dont les humains apprennent, améliorant progressivement sa précision.
