In [1]:
# Transformers installation
! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m


# Translation

##  Data

In [2]:
import pandas as pd

In [3]:
combined_df_all = pd.read_csv('combined_df.csv', usecols=['EN', 'TI', 'EN_tokenized', 'TI_tokenized'])

In [4]:
combined_df_all.drop(columns=['EN_tokenized', 'TI_tokenized'], inplace=True)
combined_df_all["EN"] = combined_df_all["EN"].str.replace(',', '')
combined_df_all["TI"] = combined_df_all["TI"].str.replace(',', '')
combined_df_all.reset_index(drop=True, inplace=True)
combined_df_all.to_csv('dat_1k.csv')

In [5]:
bi_dat_2 = pd.read_csv('bi_dat_2.csv', usecols = ['TI', 'EN', 'TI_tokenized', 'EN_tokenized'])
combined_df = pd.read_csv('combined_df.csv', usecols=['EN', 'TI', 'EN_tokenized', 'TI_tokenized'])

In [6]:
dat_5k = pd.concat([bi_dat_2, combined_df])

In [7]:
dat_5k.reset_index(drop=True, inplace=True)

In [8]:
dat_5k = dat_5k.iloc[:,[1,0,2,3]]

In [9]:
dat_5k.drop(columns=['EN_tokenized', 'TI_tokenized'], inplace=True)
dat_5k["EN"] = dat_5k["EN"].str.replace(',', '')
dat_5k["TI"] = dat_5k["TI"].str.replace(',', '')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_5k.drop(columns=['EN_tokenized', 'TI_tokenized'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_5k["EN"] = dat_5k["EN"].str.replace(',', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_5k["TI"] = dat_5k["TI"].str.replace(',', '')


In [10]:
dat_5k.reset_index(drop=True, inplace=True)

In [11]:
dat_5k.to_csv('dat_5k.csv')

## Processing

In [12]:
import numpy as np
np.random.seed(1)
text_file = 'dat_5k.csv'
with open(text_file) as f:
    lines = f.read().split('\n')[:-1]

prefix = "translate English to Tigrinya: "
text_pairs = []
for line in lines:
    ind,orig, target = line.split(',')
    orig = orig.replace('"', '')
    target = target.replace('"', '')
    text_pairs.append({'orig': orig, 'target': target})

#Let's create some splits
np.random.shuffle(text_pairs)
num_valid_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_valid_samples
train_pairs = text_pairs[:num_train_samples]
valid_pairs = text_pairs[num_train_samples : num_train_samples + num_valid_samples]
test_pairs = text_pairs[num_train_samples + num_valid_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(valid_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

5036 total pairs
3526 training pairs
755 validation pairs
755 test pairs


In [13]:
train_pairs[0]

{'orig': 'Aaron shall be gathered to his people; for he shall not enter into the land which I have given to the children of Israel because you rebelled against my word at the waters of Meribah.',
 'target': 'ኣብ ማይ መሪባ ንትእዛዘይ ስለ ዝአቤኹምዎ፡ ኣሮን ናብቲ ንደቂ እስራኤል ዝሀብክዎም ምድሪ ኣይኣቱን እዩ እሞ፡ ናብ ሰቡ ይተአከብ።'}

In [14]:
train_list = []

In [15]:
for i in range(len(train_pairs)):
  train_list.append({'id': i, 'translation': {'en': train_pairs[i]['orig'],'ti':train_pairs[i]['target']}})

In [16]:
val_list = []
test_list = []

for i in range(len(valid_pairs)):
  val_list.append({'id': i, 'translation': {'en': valid_pairs[i]['orig'],'ti':valid_pairs[i]['target']}})

for i in range(len(test_pairs)):
  test_list.append({'id': i, 'translation': {'en': test_pairs[i]['orig'],'ti':test_pairs[i]['target']}})

## Preprocess

In [17]:
from transformers import AutoTokenizer


checkpoint = "google/mt5-large"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [18]:
source_lang = "en"
target_lang = "ti"
prefix = "translate English to Tigrinya: "


def preprocess_function(examples):
    inputs = [prefix + example['translation'][source_lang] for example in examples]
    targets = [example['translation'][target_lang] for example in examples]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs
def preprocess_functionold(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [19]:
import tensorflow as tf
import pandas as pd
from datasets import Dataset

train_data = Dataset.from_list(train_list)
val_data = Dataset.from_list(val_list)
test_data = Dataset.from_list(test_list)

In [20]:
train_data

Dataset({
    features: ['id', 'translation'],
    num_rows: 3526
})

In [21]:
tokenized_train = train_data.map(preprocess_functionold, batched=True)

Map:   0%|          | 0/3526 [00:00<?, ? examples/s]

In [22]:
tokenized_val = val_data.map(preprocess_functionold, batched=True)


Map:   0%|          | 0/755 [00:00<?, ? examples/s]

In [23]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")

## Evaluate

In [24]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


In [25]:
import evaluate

metric = evaluate.load("bleu")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [26]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

## Train

In [27]:
from transformers import AdamWeightDecay

optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [28]:
from transformers import TFAutoModelForSeq2SeqLM

model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)

tf_model.h5:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFMT5ForConditionalGeneration.

All the layers of TFMT5ForConditionalGeneration were initialized from the model checkpoint at google/mt5-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMT5ForConditionalGeneration for predictions without further training.


generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [29]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_train,
    shuffle=True,
    batch_size=4,
    collate_fn=data_collator,
)
tf_val_set = model.prepare_tf_dataset(
    tokenized_val,
    shuffle=False,
    batch_size=4,
    collate_fn=data_collator,
)


In [30]:
import tensorflow as tf
tf.random.set_seed(1234)
model.compile(optimizer=optimizer)  # No loss argument!

In [31]:
from transformers.keras_callbacks import KerasMetricCallback
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_val_set)

In [32]:
callbacks = [metric_callback]

In [33]:
model.fit(x=tf_train_set, validation_data=tf_val_set, epochs=60)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


<tf_keras.src.callbacks.History at 0x7ccd0c47ed40>

## Inference

In [34]:
tokenized_test = test_data.map(preprocess_functionold, batched=True)

Map:   0%|          | 0/755 [00:00<?, ? examples/s]

In [35]:
tf_test_set = model.prepare_tf_dataset(
    tokenized_test,
    shuffle=False,
    batch_size=4,
    collate_fn=data_collator,
)

In [36]:
train_pairs[0:5]

[{'orig': 'Aaron shall be gathered to his people; for he shall not enter into the land which I have given to the children of Israel because you rebelled against my word at the waters of Meribah.',
  'target': 'ኣብ ማይ መሪባ ንትእዛዘይ ስለ ዝአቤኹምዎ፡ ኣሮን ናብቲ ንደቂ እስራኤል ዝሀብክዎም ምድሪ ኣይኣቱን እዩ እሞ፡ ናብ ሰቡ ይተአከብ።'},
 {'orig': 'Send you men that they may spy out the land of Canaan which I give to the children of Israel: of every tribe of their fathers shall you send a man everyone a prince among them.',
  'target': 'ነታ ኣነ ንደቂ እስራኤል ዝህቦም ምድሪ ኸነኣን ዚስልዩ ሰባት ስደድ፡ ከካብ ነገድ ኣቦታቶም ሓደ ሰብ፡ ኲሎም ካብቶም ሹማምቶም ይስደዱ።'},
 {'orig': 'In the greatness of your excellency you overthrow those who rise up against you. You send forth your wrath. It consumes them as stubble.',
  'target': 'ብዕቤት ግርማኻ ንዝተንስኡካ ጨፍለቕካዮም፡ ቍጥዓኻ ሰደድካ፡ ከም ሓሰር በልዓቶም።'},
 {'orig': 'You may make them an inheritance for your children after you to hold for a possession; of them may you take your slaves forever: but over your brothers the children of Israel you shal

In [37]:
from transformers import pipeline

In [38]:
from transformers import AutoTokenizer

In [39]:
from transformers import TFAutoModelForSeq2SeqLM


In [40]:
search_toks = {v:k for k, v in tokenizer.get_vocab().items()}


In [41]:
preds = []
trues = []
for text in test_pairs:
  inputs = tokenizer(text["orig"], return_tensors="tf").input_ids
  outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
  preds.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
  trues.append(text["target"])

In [42]:
preds[0:10]

['ብመጠን እቲ ፍርቂ ደቂ እስራኤል ሙሴ ካብቲ ፍርቂ ደቂ እስራኤል ዝአመነ ሹድሽተ',
 'እግዚኣብሄር ኻብ ማእከል ሓዊ ኺዛረበኩም ነበረ እሞ፡ ቃል ዘረባ ሰሚዕኩም ኰይኑ፡ ግ',
 'ያእቆብ ድማ ከምኡ ገበረ፡ ምሸትውን ቀረበት። ፍረዩ ድማ ንራሄል ጓሉ ሰበይቱ',
 'ካብ ቤት ኣቦይን ካብ ምድሪ ወለደይን፡ እቲ ንኣይ ዝነበረ፡ ንዘርእይ እዛ ምድሪ እዚ',
 'ኲሎም ቊጽራት ሰፈር ኤፍሬም ከም ሰራዊቶም ሚእትን ሾሞንተ ሽሕን ሓደን ሽሕን ኰኑ። ',
 'ሙሴ ድማ ኵሉ ግብሪ ረኣየ፡ እንሆ ኸኣ፡ ከምቲ እግዚኣብሄር ዝአዘዞ ዅሉ ገ',
 'ንእግዚኣብሄር ብናይዛ መገዲ እዚኣ ንኺዛረብ ደኣ ኽንሐልፍ፡ በዚ መገዲ እዚ ',
 'ኣቕርቡ እቲ ዚውዝወዝ መዓሙቚን ከም ቍራዕ ዓውድን ከም ፍሬ ናይ መጽመቚ ወይንን ',
 'ድማ ኪኸውን ድማ እዩ፡ ካብ ሓደ ኻብዚ ንሓደ ምስ በዳሊ፡ እቲ ዝገበሮ ሓጢኣት ምስ',
 'እቶም ሰባት ከኣ ካብኡ ተንስኡ፡ ናብ ሶዶም ከኣ ርእዩ። ኣብርሃም ከኣ ምሳታቶም ኪ']

In [43]:
bleu = evaluate.load("bleu")
results = bleu.compute(predictions=preds, references=trues)
print(results)

{'bleu': 0.10672498417968665, 'precisions': [0.42749757516973813, 0.20616493194555643, 0.11406110946306734, 0.06967827971328555], 'brevity_penalty': 0.6560247638179094, 'length_ratio': 0.703454157782516, 'translation_length': 8248, 'reference_length': 11725}


In [44]:
bleu = evaluate.load("bleu")
results = bleu.compute(predictions=preds, references=trues, max_order = 1)
print(results)

{'bleu': 0.28044899578345645, 'precisions': [0.42749757516973813], 'brevity_penalty': 0.6560247638179094, 'length_ratio': 0.703454157782516, 'translation_length': 8248, 'reference_length': 11725}


In [49]:
res_df = pd.DataFrame({"Translation": trues, "Predictions": preds})

In [50]:
res_df.to_csv('predsmt5large5k.csv')