In [None]:
!pip install igbo-text


In [None]:
!pip install datasets

In [None]:
!pip install --upgrade tensorflow

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

In [None]:
# Mount Google Drive into the runtime's filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## *Data*

In [None]:
# Load the three parallel corpora: the Hugging Face ig-en set (through
# tensorflow_datasets) and two local CSV files.
ds = tfds.load('huggingface:igbo_english_machine_translation/ig-en')

train_df1, train_df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/english-igbo-bible.csv',
        '/content/english-igbo-dictionary.csv',
    )
)

In [None]:
# Collect the English and Igbo side of every pair, across all three splits,
# as plain Python strings (the tensors hold UTF-8 bytes).
eng, igbo = [], []

for split in ('train', 'validation', 'test'):
  for elem in ds[split]:
    pair = elem['translation']
    eng.append(pair['en'].numpy().decode('utf-8'))
    igbo.append(pair['ig'].numpy().decode('utf-8'))

In [None]:
# Stack the three sources in the same order for both languages (extracted
# pairs, then train_df1, then train_df2) so row i of each column is one pair.
combined_ig = pd.concat(
    [pd.Series(igbo), train_df1['ig'], train_df2['ig']],
    axis=0,
).reset_index(drop=True)
combined_en = pd.concat(
    [pd.Series(eng), train_df1['en'], train_df2['en']],
    axis=0,
).reset_index(drop=True)

# Build the frame, treat empty strings as missing, and discard every row that
# is missing either side. The surviving rows keep their original index labels.
combined_df = (
    pd.DataFrame({'ig': combined_ig, 'en': combined_en})
    .replace('', pd.NA)
    .dropna()
)

In [None]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55944 entries, 0 to 55954
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ig             55944 non-null  object
 1   en             55944 non-null  object
 2   normalized-ig  55944 non-null  object
dtypes: object(3)
memory usage: 1.7+ MB


In [None]:
from igbo_text import IgboText

# Build the normalizer once and reuse it for every sentence instead of
# constructing a new IgboText per row.
# NOTE(review): assumes IgboText.normalize keeps no state between calls — confirm.
igbo_text = IgboText()

igbo_sentences = combined_df['ig'].to_list()

# normalize igbo text (lower-casing and abbreviation removal requested via flags)
normalized_sentences = [
    igbo_text.normalize(elem, convert_to_lower=True, remove_abbreviations=True)
    for elem in igbo_sentences
]


In [None]:
# Attach the normalized Igbo text as a new column. This lines up row-for-row
# only because normalized_sentences was built from combined_df['ig'] in order.
combined_df['normalized-ig'] = normalized_sentences

In [None]:
# save preprocessed data
#combined_df.to_csv('eng-ig.csv')

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

# Reload the preprocessed corpus.
# keep_default_na=False: this is a text corpus, so cells whose content is
# literally "NA", "null", "nan", "None", ... (or an empty normalized string)
# must stay strings. With pandas' default NA parsing they become float NaN and
# are later rendered as the word "nan" by the f-strings that build the
# training text.
combined_df = pd.read_csv('/content/eng-ig.csv', keep_default_na=False)

In [None]:
# Fixed seed so the train/validation/test membership is identical on every
# run; without it, a reloaded notebook evaluates on a different test set.
SPLIT_SEED = 42

# 80% train, 10% validation, remainder (about 10%) test.
total_count = len(combined_df)
train_size = int(0.80 * total_count)
valid_test_size = total_count - train_size
valid_size = int(0.10 * total_count)
test_size = valid_test_size - valid_size

# Shuffle the row positions once, reproducibly.
indices = np.random.default_rng(SPLIT_SEED).permutation(total_count)

# Carve the permutation into three disjoint, contiguous slices.
valid_indices = indices[:valid_size]
test_indices = indices[valid_size:valid_size + test_size]
train_indices = indices[valid_size + test_size:]

# Positional selection (iloc), so the frame's index labels do not matter.
valid_data = combined_df.iloc[valid_indices]
test_data = combined_df.iloc[test_indices]
train_data = combined_df.iloc[train_indices]

# The three parts must cover every row exactly once.
assert len(train_data) == train_size
assert len(train_data) + len(valid_data) + len(test_data) == total_count

# Verify sizes
print(f"Train set: {len(train_data)} samples")
print(f"Validation set: {len(valid_data)} samples")
print(f"Test set: {len(test_data)} samples")

Train set: 44755 samples
Validation set: 5594 samples
Test set: 5595 samples


In [None]:
max_length = 128

# One vectorizer per language; both pad/truncate to max_length token ids.
# Only the Igbo vocabulary is capped (28800 entries).
text_vec_layer_ig = tf.keras.layers.TextVectorization(
    output_sequence_length=max_length,
    max_tokens=28800,
)
text_vec_layer_en = tf.keras.layers.TextVectorization(
    output_sequence_length=max_length,
)

# Fit each vocabulary on the full corpus, with the start/end markers wrapped
# around every sentence so they receive vocabulary ids too.
text_vec_layer_ig.adapt(
    [f"[START] {s} [END]" for s in combined_df['normalized-ig'].to_list()]
)
text_vec_layer_en.adapt(
    [f"[START] {s} [END]" for s in combined_df['en']]
)

In [None]:
encoding = [  3,  3126, 28590,   513,  1141, 15703]

# Fetch the vocabulary list once; the original asked the layer for the whole
# list again for every single token id.
ig_vocab = text_vec_layer_ig.get_vocabulary()

# Map each id back to its token, keeping the leading space before every token.
trans = ''.join(" " + ig_vocab[i] for i in encoding)

trans

' start kaara achuzie dike bụrụgodị alala'

In [None]:
def prepare_batch(en_list, ig_list):
  """Build model inputs and labels from two parallel sentence lists.

  Args:
    en_list: English sentences; each is wrapped as "[START] ... [END]".
    ig_list: Igbo sentences. Iterated twice, so it must be re-iterable.

  Returns:
    ((en_input, ig_input), ig_label): en_input and ig_input are string
    tensors built with tf.constant; ig_label is whatever text_vec_layer_ig
    returns for the sentences followed by "[END]".
  """
  encoder_text = [f"[START] {s} [END]" for s in en_list]
  # Decoder input starts with the start marker; the label ends with the end
  # marker, so the label is the decoder input shifted by one token.
  decoder_text = [f"[START] {s}" for s in ig_list]
  target_text = [f"{s} [END]" for s in ig_list]

  en_input = tf.constant(encoder_text)
  ig_input = tf.constant(decoder_text)
  ig_label = text_vec_layer_ig(target_text)

  return (en_input, ig_input), ig_label

In [None]:
# Convert lists to a TensorFlow dataset for training
#(en_input, ig_input), ig_label = prepare_batch(combined_df['en'].to_list(), normalized_sentences)

##
# Batch / shuffle settings shared by every pipeline below.
batch_size = 64
BUFFER_SIZE = 20000


def _make_dataset(frame, shuffle):
  """Turn one split of the corpus into a batched, prefetched tf.data pipeline.

  Args:
    frame: DataFrame with 'en' and 'normalized-ig' columns.
    shuffle: when True, shuffle examples with a BUFFER_SIZE buffer before
      batching.

  Returns:
    A dataset yielding ((en_input, ig_input), ig_label) batches of up to
    batch_size examples.
  """
  dataset = tf.data.Dataset.from_tensor_slices(
      prepare_batch(frame['en'].to_list(), frame['normalized-ig'].to_list()))
  if shuffle:
    dataset = dataset.shuffle(BUFFER_SIZE)
  return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)


# Only the training stream is shuffled. Shuffling evaluation data buys nothing:
# it costs a buffer fill on every pass and makes the batches (and any
# per-batch statistics) differ from one evaluate() call to the next.
train_ds = _make_dataset(train_data, shuffle=True)

## create validation and test set
valid_ds = _make_dataset(valid_data, shuffle=False)
test_ds = _make_dataset(test_data, shuffle=False)

In [None]:
for elem in train_ds.take(1):
  print(elem)

In [None]:
# Pipeline over the whole corpus (every row of combined_df), shuffled,
# batched and prefetched.
batch_size = 64
BUFFER_SIZE = 20000

full_ds = (
    tf.data.Dataset
    .from_tensor_slices(
        prepare_batch(
            combined_df['en'].to_list(),
            combined_df['normalized-ig'].to_list(),
        )
    )
    .shuffle(BUFFER_SIZE)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)



In [None]:
import os

# `!export VAR=...` runs in a throw-away subshell, so the variable never
# reaches the kernel process. Set both flags on the kernel's own environment.
# NOTE(review): TensorFlow is already imported by earlier cells; any flag it
# reads only at import time will not be affected by setting it here — confirm.
os.environ['TF_USE_CUDNN'] = '1'
os.environ['TF_DISABLE_MKL'] = '1'

## Training

In [None]:
%reload_ext autoreload
%autoreload 2

from Transformer import CustomSchedule, masked_loss, masked_accuracy, Transformer

# Hyperparameters handed to the project's Transformer class below.
# NOTE(review): their exact meaning is defined in the Transformer module,
# which is not visible here; the names follow the usual Transformer
# conventions (layer count, model width, feed-forward width, attention heads).
num_layers = 4
d_model = 256
dff = 750
num_heads = 8
dropout_rate = 0.1


# Learning-rate schedule parameterized by the model width (project class).
learning_rate = CustomSchedule(d_model)

# Adam driven by the schedule above instead of a fixed learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)


# The model receives the two TextVectorization layers themselves as its
# vocabularies, and the datasets feed it raw strings.
# NOTE(review): presumably it tokenizes internally — confirm in Transformer.py.
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab = text_vec_layer_en,
    target_vocab = text_vec_layer_ig,
    dropout_rate=dropout_rate)

# Loss and metric are the project's masked variants. The metric is reported
# as 'masked_accuracy' in the training logs.
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])

In [None]:
# Stop once validation masked accuracy has gone 5 epochs without improving,
# then roll the model back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_masked_accuracy',
    mode='max',                  # higher accuracy is better
    patience=5,
    restore_best_weights=True,
    verbose=1,
)


In [None]:
# Train on the training split for up to 100 epochs, validating on valid_ds
# after each epoch; early_stopping decides when to stop.
# NOTE(review): the recorded output below mentions a model being saved to
# Drive, but no checkpoint callback is passed here — the log appears to come
# from a different version of this cell.
history = transformer.fit(
    train_ds,
    epochs=100,
    validation_data=valid_ds,
    callbacks=[early_stopping]
)


Epoch 1/100




[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 408ms/step - loss: 8.6992 - masked_accuracy: 0.0615
Epoch 1: val_masked_accuracy improved from -inf to 0.18369, saving model to /content/drive/MyDrive/best_trans_model/my_model.keras
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m338s[0m 438ms/step - loss: 8.6971 - masked_accuracy: 0.0616 - val_loss: 5.3159 - val_masked_accuracy: 0.1837
Epoch 2/100
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 408ms/step - loss: 4.9445 - masked_accuracy: 0.2160
Epoch 2: val_masked_accuracy improved from 0.18369 to 0.29127, saving model to /content/drive/MyDrive/best_trans_model/my_model.keras
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m304s[0m 434ms/step - loss: 4.9442 - masked_accuracy: 0.2160 - val_loss: 4.2783 - val_masked_accuracy: 0.2913
Epoch 3/100
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 408ms/step - loss: 3.9418 - masked_accuracy: 0.3271
Epoch 3: val_masked

In [None]:
# Training on all data (full_ds is built from every row of combined_df).
# NOTE(review): `transformer` is not re-created in this cell, so this fit
# continues from whatever weights the model already has.
# NOTE(review): full_ds contains the rows of valid_data and test_data, so any
# evaluate(test_ds) run after this cell scores examples the model trained on.

# Early stopping callback: there is no validation set in this run, so the
# training metric itself is monitored.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='masked_accuracy',  # Metric to monitor
    patience=5,          # Number of epochs to wait for improvement
    verbose=1,
    mode='max',
    restore_best_weights=True
)


history = transformer.fit(
    full_ds,
    epochs=100,
    callbacks=[early_stopping]
)


Epoch 1/100




[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m393s[0m 413ms/step - loss: 8.3251 - masked_accuracy: 0.0694
Epoch 2/100
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m362s[0m 413ms/step - loss: 4.6997 - masked_accuracy: 0.2446
Epoch 3/100
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m362s[0m 413ms/step - loss: 3.7342 - masked_accuracy: 0.3565
Epoch 4/100
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m360s[0m 412ms/step - loss: 3.1641 - masked_accuracy: 0.4250
Epoch 5/100
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m360s[0m 412ms/step - loss: 2.7826 - masked_accuracy: 0.4740
Epoch 6/100
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m361s[0m 412ms/step - loss: 2.4188 - masked_accuracy: 0.5260
Epoch 7/100
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m361s[0m 412ms/step - loss: 2.0541 - masked_accuracy: 0.5836
Epoch 8/100
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m361s[0m 412ms/st

KeyboardInterrupt: 

In [None]:


# Show the model architecture
transformer.summary()

In [None]:
transformer.evaluate(test_ds)



[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 179ms/step - loss: 0.2036 - masked_accuracy: 0.9653


[0.20758189260959625, 0.9651038646697998]

In [None]:
transformer.evaluate(test_ds)

[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 181ms/step - loss: 0.2196 - masked_accuracy: 0.9633


[0.2141198366880417, 0.9639821648597717]

In [None]:
# Take the English sentence stored under index label 1 and wrap it as a
# one-element string tensor (a batch of one).
sentence = tf.constant([combined_df['en'][1]])

sentence

<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Joseph Achuzie, the Biafran brave man is gone.'], dtype=object)>

In [None]:
predictions =transformer([sentence, tf.constant(['[START] joseph nna onye obi agụ eze ata bụ onye'])], training=False)


In [None]:
# Take the scores at index 9 of the first (only) batch element and report the
# highest-scoring id.
# NOTE(review): presumably predictions is (batch, position, vocab); the decoder
# prompt above is "[START]" plus nine words, so index 9 would be its last
# position — confirm against the model's tokenization.
preds = predictions[0][9]
tf.argmax(preds)

<tf.Tensor: shape=(), dtype=int64, numpy=4>

## *Export Model*

In [None]:
class ExportTranslator(tf.Module):
  """Expose a translator behind a fixed, string-only signature for export.

  The wrapped translator is called as translator(sentence, max_tokens); this
  module pins the second argument so the exported function takes just the
  sentence.
  """

  def __init__(self, translator, max_tokens=1000):
    """Store the translator and the generation cap.

    Args:
      translator: callable invoked as translator(sentence, max_tokens).
      max_tokens: value passed as the translator's second argument. Defaults
        to 1000, the value previously hard-coded.
        NOTE(review): the vectorizers elsewhere in this notebook use a
        sequence length of 128 — confirm the model supports a larger cap.
    """
    # Let tf.Module initialize its own state (name, name scope) first.
    super().__init__()
    self.translator = translator
    self.MAX_TOKENS = tf.constant(max_tokens, dtype=tf.int64)

  @tf.function(input_signature=[tf.TensorSpec(shape=(1,), dtype=tf.string)])
  def __call__(self, sentence):
    """Translate a batch of exactly one sentence (string tensor of shape (1,))."""
    return self.translator(sentence, self.MAX_TOKENS)

In [None]:
%reload_ext autoreload
%autoreload 2

from Transformer import CustomTokenizer, Translator

# Bundle the English and Igbo vectorization layers, then pair them with the
# model to get a callable translator (it is invoked below as
# translator(sentence, 40)).
# NOTE(review): CustomTokenizer / Translator are defined in the Transformer
# module, which is not visible here.
tokenizers = CustomTokenizer(text_vec_layer_en, text_vec_layer_ig)
translator = Translator(tokenizers, transformer)

In [None]:
text = translator(sentence, 40)
text.numpy()[0].decode('utf-8')

<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b'[START] joseph nna onye obi ag\xe1\xbb\xa5 eze ata b\xe1\xbb\xa5 onye end'],
      dtype=object)>

In [None]:
tf.saved_model.save(translator, export_dir='translator_en-ig')

In [None]:
export_translator = ExportTranslator(translator)
text = export_translator(['Joseph Achuzie, the Biafran brave man is gone.'])
text.numpy()[0].decode('utf-8')

In [None]:
tf.saved_model.save(export_translator, export_dir='translator')

In [None]:
!unzip /content/translator_en-ig.zip

In [None]:
test_trans = tf.saved_model.load('/content/content/translator_en-ig')

In [None]:
text = test_trans(np.array(['5 You must love Jehovah your God with all your heart and all your soul and all your strength.']), 60)
text.numpy()[0].decode('utf-8')

'[START] unu ga eji obi unu dum na mkpụrụ obi unu dum na ike unu dum hụ ya n anya end'

In [None]:
# NOTE(review): BeamTranslator is not imported in any visible cell (only
# CustomTokenizer and Translator are imported from Transformer), so this
# raises NameError on a fresh kernel unless it is imported first.
beam = BeamTranslator(tokenizers, transformer)

In [None]:
beam(sentence, beam_width=3, beam_length=4)

## Deploy

In [None]:
import sys
# Shell commands below are prefixed with sudo everywhere except on Google
# Colab, detected by the google.colab module being loaded.
SUDO_IF_NEEDED = '' if 'google.colab' in sys.modules else 'sudo'

In [None]:
# This is the same as you would do from your command line, but without the [arch=amd64], and no sudo
# You would instead do:
# echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | sudo tee /etc/apt/sources.list.d/tensorflow-serving.list && \
# curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | sudo apt-key add -

!echo "deb http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | {SUDO_IF_NEEDED} tee /etc/apt/sources.list.d/tensorflow-serving.list && \
curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | {SUDO_IF_NEEDED} apt-key add -
!{SUDO_IF_NEEDED} apt update

deb http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  2943  100  2943    0     0   2329      0  0:00:01  0:00:01 --:--:--  2330
OK
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 http://storage.googleapis.com/tensorflow-serving-apt stable InRelease [3,026 B]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:7 http://storage.googleapis.com/tensorflow-serving-apt stable/tensorflow-model-server-universal amd64 Packages [349 B]
Hit:8 https://ppa.launchpadcontent.net/c2d4u.team/c

In [None]:
# TODO: Use the latest model server version when colab supports it.
#!{SUDO_IF_NEEDED} apt-get install tensorflow-model-server
# We need to install Tensorflow Model server 2.8 instead of latest version
# Tensorflow Serving >2.9.0 required `GLIBC_2.29` and `GLIBCXX_3.4.26`. Currently colab environment doesn't support latest version of`GLIBC`,so workaround is to use specific version of Tensorflow Serving `2.8.0` to mitigate issue.
!wget 'http://storage.googleapis.com/tensorflow-serving-apt/pool/tensorflow-model-server-2.8.0/t/tensorflow-model-server/tensorflow-model-server_2.8.0_all.deb'
!dpkg -i tensorflow-model-server_2.8.0_all.deb
!pip3 install tensorflow-serving-api==2.8.0

In [None]:
# Start TensorFlow Serving in the background, logging to server.log.
# Only the REST port (8501) is set here; the gRPC port is left at the
# server's default (8500 per the TF Serving docs — confirm).
# NOTE(review): the versioned export to /content/translator_en-ig/1 happens in
# the next cell, i.e. after the server has been launched.
!nohup tensorflow_model_server --rest_api_port=8501 --model_name=translator_en_ig --model_base_path=/content/translator_en-ig >server.log 2>&1 &


In [None]:
# Write the loaded translator into a numbered sub-directory ("1") of the
# directory the model server was given as --model_base_path.
export_path = "/content/translator_en-ig/1"
tf.saved_model.save(test_trans, export_path)


In [None]:
import grpc
from tensorflow_serving.apis import predict_pb2, prediction_service_pb2_grpc

# gRPC endpoint of the model server. The server cell only sets
# --rest_api_port=8501, which is the HTTP/REST listener; a gRPC channel
# pointed at it cannot complete a Predict call. gRPC is served on --port,
# which defaults to 8500 when not given.
GRPC_TARGET = "localhost:8500"
channel = grpc.insecure_channel(GRPC_TARGET)

# Stub used to send gRPC prediction requests to the TF Server
stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)

# Read the serving signature's input names from a local copy of the model so
# requests can be keyed correctly.
loaded_model = tf.saved_model.load('/content/content/translator')
input_name = list(
    loaded_model.signatures["serving_default"].structured_input_signature[1].keys()
)

input_name

['sentence', 'max_length']

In [None]:
def predict_grpc(data, input_name, stub, max_length=10,
                 model_name="translator_en_ig", timeout=None):
    """Send one Predict request to the TF Serving gRPC endpoint.

    Args:
        data: value for the first signature input (the sentence batch);
            converted with tf.make_tensor_proto.
        input_name: sequence of signature input names; index 0 receives
            `data`, index 1 receives `max_length`.
        stub: PredictionServiceStub bound to the server's gRPC channel.
        max_length: value sent as the second input, as an int64 scalar.
            Defaults to 10, the value previously hard-coded.
        model_name: served model name; must match the server's --model_name.
        timeout: optional deadline in seconds for the call. None (default)
            waits indefinitely, as before.

    Returns:
        The PredictResponse returned by the server.
    """
    # Create a gRPC request made for prediction
    request = predict_pb2.PredictRequest()

    # Name of the served model and the signature used to interpret the inputs
    request.model_spec.name = model_name
    request.model_spec.signature_name = "serving_default"

    # tf.make_tensor_proto turns a tensor / array into a Protobuf tensor
    request.inputs[input_name[0]].CopyFrom(tf.make_tensor_proto(data))
    request.inputs[input_name[1]].CopyFrom(
        tf.make_tensor_proto(np.int64(max_length)))

    # Send the gRPC request to the TF Server
    return stub.Predict(request, timeout=timeout)


# np.bytes_ is the portable spelling; the np.string_ alias was removed in
# NumPy 2.0.
query = np.array(
    ['5 You must love Jehovah your God with all your heart and all your soul and all your strength.'],
    dtype=np.bytes_,
)
grpc_response = predict_grpc(query, input_name, stub)

# The translator returns text, and a TensorProto carries string data in
# string_val; float_val is empty for a string tensor.
# NOTE(review): 'predictions' must match the output key of the served
# signature — confirm against the model's serving_default outputs.
grpc_outputs = np.array([list(grpc_response.outputs['predictions'].string_val)])

print(f"gRPC output: {grpc_outputs}")