<a href="https://colab.research.google.com/github/johnobodai/capstone/blob/main/Copy_of_notebookd99dbcab17.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Weights & Biases (used for logging training metrics to the web dashboard)


In [1]:
pip install wandb



#Log in to Weights & Biases

In [2]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mj-obodai[0m ([33mj-obodai-african-leadership-group[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


# 📦 Import Weights & Biases and its Keras integration for experiment tracking


In [3]:
import wandb
from wandb.integration.keras import WandbCallback


#Mount Google Drive to access/save model files and training artifacts


In [4]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


#Load & Preview Data

In [2]:
import pandas as pd

# Path to your TSV file
file_path = '/content/drive/MyDrive/pretraining/data.tsv'

# Load the TSV file
df = pd.read_csv(file_path, sep='\t', names=['English', 'Ga'])

# Preview
print("🔍 Sample data:")
print(df.sample(3))


🔍 Sample data:
                                                English  \
585   Then Jehovah said to Moses: “Stretch out your ...   
8530  The One who made me feel secure on my mother’s...   
2551   The people of the region thought of them all ...   

                                                     Ga  
585   Ni Yehowa kɛɛ Mose akɛ: “Kpã onine mli yɛ ŋshɔ...  
8530      Mɔ ni bu mihe be ni mikã minyɛ fufɔi anaa lɛ.  
2551   Mɛi ni yɔɔ kpokpaa lɛ nɔ lɛ susu akɛ Noa kɛ e...  


Clean and Normalize Text

In [3]:
# Lowercase and strip whitespace
df['English'] = df['English'].astype(str).str.lower().str.strip()
df['Ga'] = df['Ga'].astype(str).str.lower().str.strip()

# Add special tokens to Ga side
df['Ga'] = df['Ga'].apply(lambda x: 'startseq ' + x + ' endseq')

# Confirm changes
df.sample(3)

Unnamed: 0,English,Ga
3527,unaware of the unfolding drama bethlehem slept...,startseq yosef kɛ maria wo amɛbi yesu kɛtsɔ du...
8941,"he commanded, and it stood firm.",startseq efã ni ema shi shiŋŋ. endseq
3925,"however, jonʹa·than said to saul his father wh...",startseq shi yonatan bi etsɛ saul akɛ mɛni hew...


Split into Training and Testing Sets

In [4]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(df, test_size=0.1, random_state=42)

print(f"Train size: {len(train_data)} | Test size: {len(test_data)}")


Train size: 9477 | Test size: 1053


#Tokenize Sentences

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Initialize tokenizers for both English and Ga
eng_tokenizer = Tokenizer(filters='', oov_token='<unk>')
ga_tokenizer = Tokenizer(filters='', oov_token='<unk>')

# Fit the tokenizers on the training data
eng_tokenizer.fit_on_texts(train_data['English'])
ga_tokenizer.fit_on_texts(train_data['Ga'])

# Convert sentences to sequences of token IDs
train_eng_seq = eng_tokenizer.texts_to_sequences(train_data['English'])
train_ga_seq = ga_tokenizer.texts_to_sequences(train_data['Ga'])

# Calculate vocabulary sizes (used for defining model input/output dimensions)
eng_vocab_size = len(eng_tokenizer.word_index) + 1
ga_vocab_size = len(ga_tokenizer.word_index) + 1

# Print vocabulary sizes
print("English Vocabulary Size:", eng_vocab_size)
print("Ga Vocabulary Size:", ga_vocab_size)


English Vocabulary Size: 20547
Ga Vocabulary Size: 21225


#Determine Max Sequence Lengths

In [6]:
# Determine the maximum sequence length for encoder (English) and decoder (Ga)
max_encoder_len = max(len(seq) for seq in train_eng_seq)
max_decoder_len = max(len(seq) for seq in train_ga_seq)

# Print the maximum lengths
print("Maximum encoder sequence length:", max_encoder_len)
print("Maximum decoder sequence length:", max_decoder_len)


Maximum encoder sequence length: 349
Maximum decoder sequence length: 376


#Pad Sequences

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad English sequences (encoder input)
encoder_input_data = pad_sequences(train_eng_seq, maxlen=max_encoder_len, padding='post')

# Pad Ga sequences (decoder input)
decoder_input_data = pad_sequences(train_ga_seq, maxlen=max_decoder_len, padding='post')


#Create Decoder Target Data

In [8]:
import numpy as np

# Create target data by shifting decoder input one step to the left
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]

# Ensure the final token is the 'endseq' token
decoder_target_data[:, -1] = ga_tokenizer.word_index.get('endseq', 0)

# Expand the target data shape to match model output
decoder_target_data = np.expand_dims(decoder_target_data, -1)


#Define the Model Architecture

In [9]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Set hyperparameters
embedding_dim = 256
lstm_units = 512

# ----- Encoder -----
# Input for English sentences
encoder_inputs = Input(shape=(max_encoder_len,))
# Embedding layer for English input
encoder_embedding = Embedding(input_dim=eng_vocab_size, output_dim=embedding_dim)(encoder_inputs)
# LSTM to process the embedded input, return the hidden and cell states
encoder_lstm, state_h, state_c = LSTM(lstm_units, return_state=True)(encoder_embedding)
# Encoder state to pass to decoder
encoder_states = [state_h, state_c]

# ----- Decoder -----
# Input for Ga sentences
decoder_inputs = Input(shape=(max_decoder_len,))
# Embedding for target (Ga) language
decoder_embedding = Embedding(input_dim=ga_vocab_size, output_dim=embedding_dim)(decoder_inputs)
# LSTM that uses encoder state as its initial state
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Final dense layer with softmax activation to predict vocabulary tokens
decoder_dense = Dense(ga_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# ----- Final Model -----
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


#Compile the Model

In [10]:
# Compile the model
model.compile(
    optimizer='adam',  # Adaptive optimizer commonly used in NLP
    loss='sparse_categorical_crossentropy',  # Suitable for integer-labeled sequence data
    metrics=['accuracy']  # Track accuracy during training
)

# Print model architecture and parameter count
model.summary()


 Add Callbacks for Checkpoints & Early Stopping

In [11]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import os
import wandb
from wandb.integration.keras import WandbCallback

# Initialize WandB project
wandb.init(project="english-ga-translation", name="lstm-512-embed-256")


# Create output directory to store checkpoints and best model
output_path = "/content/drive/MyDrive/pretraining/"
os.makedirs(output_path, exist_ok=True)

# Checkpoint path to save every epoch with epoch number
all_epochs_path = os.path.join(output_path, "epoch_{epoch:02d}.keras")
save_all = ModelCheckpoint(
    filepath=all_epochs_path,
    save_freq='epoch',        # Save the model after every epoch
    save_best_only=False,     # Save every epoch regardless of performance
    verbose=1
)

# Checkpoint to save only the best-performing model
best_model_path = os.path.join(output_path, "best_model.keras")
save_best = ModelCheckpoint(
    filepath=best_model_path,
    monitor='val_loss',       # Use validation loss to determine "best"
    save_best_only=True,      # Only save the best model
    verbose=1
)

# Stop training early if validation loss doesn't improve for 3 consecutive epochs
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,               # Wait 3 epochs before stopping
    restore_best_weights=True,
    verbose=1
)

# Bundle all callbacks
callbacks = [save_all, save_best, early_stop, WandbCallback()]



Load or Create Model (Optional Resume)

Train One Epoch and Save

In [12]:
!pip install --upgrade wandb




# For all Epoch

In [None]:
import wandb
import tensorflow as tf

# Start a new run for your translation model
wandb.init(
    project="english-ga-translation",
    name="lstm-512-embed-256",
    config={
        "embedding_dim": 256,
        "lstm_units": 512,
        "batch_size": 64,
        "epochs": 20,
        "learning_rate": 0.01,  # Add your actual learning rate
        "architecture": "Seq2Seq-LSTM",
        "dataset": "English-Ga-Translation"
    }
)

# Custom callback to manually log to WandB (like the example)
class ManualWandbLogger(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        if logs:
            # Log exactly like the working example
            wandb.log({
                "loss": logs.get('loss', 0),
                "val_loss": logs.get('val_loss', 0),
                "accuracy": logs.get('accuracy', logs.get('acc', 0)),
                "val_accuracy": logs.get('val_accuracy', logs.get('val_acc', 0)),
                "epoch": epoch + 1
            })
            print(f"📊 Epoch {epoch+1}/20 - Loss: {logs.get('loss', 0):.4f} - Val Loss: {logs.get('val_loss', 0):.4f}")

# Train your model
history = model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=64,
    epochs=20,
    validation_split=0.2,
    callbacks=[
        save_all,
        save_best,
        early_stop,
        ManualWandbLogger()  # Use manual logging like the example
    ],
    verbose=1
)

# Mark the run as finished (like the example)
wandb.finish()

Epoch 1/20
[1m 23/119[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m1:49:38[0m 69s/step - accuracy: 0.7449 - loss: 5.9894

Save the Final Model & Tokenizers

In [None]:
import pickle

# Save model
model.save("/content/drive/MyDrive/pretraining/english_ga_final_model.keras")

# Save tokenizers
with open("/content/drive/MyDrive/pretraining/eng_tokenizer.pkl", "wb") as f:
    pickle.dump(eng_tokenizer, f)

with open("/content/drive/MyDrive/pretraining/ga_tokenizer.pkl", "wb") as f:
    pickle.dump(ga_tokenizer, f)

# Save max sequence lengths
with open("/content/drive/MyDrive/pretraining/max_lengths.pkl", "wb") as f:
    pickle.dump({"encoder_len": max_encoder_len, "decoder_len": max_decoder_len}, f)

print("✅ All model components saved to /content/drive/MyDrive/pretraining/")


In [None]:
import shutil

# Source path from epoch 20
src = "/content/drive/MyDrive/pretraining/epoch_20.keras"
# Destination path
dst = "/content/drive/MyDrive/pretraining/english_ga_best_model.keras"

# Copy or rename
shutil.copy(src, dst)
print("✅ Model saved as 'english_ga_best_model.keras'")


Define the Inference Models (Encoder + Decoder)

In [None]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load saved model
model = load_model("/content/drive/MyDrive/pretraining/english_ga_final_model.keras")

# Load tokenizers
with open("/content/drive/MyDrive/pretraining/eng_tokenizer.pkl", "rb") as f:
    eng_tokenizer = pickle.load(f)

with open("/content/drive/MyDrive/pretraining/ga_tokenizer.pkl", "rb") as f:
    ga_tokenizer = pickle.load(f)

# Load max lengths
with open("/content/drive/MyDrive/pretraining/max_lengths.pkl", "rb") as f:
    max_lengths = pickle.load(f)

max_encoder_len = max_lengths["encoder_len"]
max_decoder_len = max_lengths["decoder_len"]


Define the Translate Function

In [None]:
def translate_input(input_text):
    input_seq = eng_tokenizer.texts_to_sequences([input_text.lower()])
    input_seq = pad_sequences(input_seq, maxlen=max_encoder_len, padding='post')

    start_token = ga_tokenizer.word_index.get('startseq', 1)
    end_token = ga_tokenizer.word_index.get('endseq', 2)

    target_seq = np.zeros((1, max_decoder_len))
    target_seq[0, 0] = start_token

    translated_sentence = []

    for i in range(1, max_decoder_len):
        predictions = model.predict([input_seq, target_seq], verbose=0)
        predicted_id = np.argmax(predictions[0, i, :])
        predicted_word = ga_tokenizer.index_word.get(predicted_id, '<unk>')

        if predicted_word in ['endseq', '<pad>']:
            break

        translated_sentence.append(predicted_word)
        target_seq[0, i] = predicted_id

    return ' '.join(translated_sentence)


Try Sample Translations

In [None]:
while True:
    user_input = input("Enter English (or 'exit'): ")
    if user_input.lower() == 'exit':
        break
    result = translate_input(user_input)
    print("Ga Translation:", result)


In [None]:
with open("/content/drive/MyDrive/pretraining/eng_tokenizer.pkl", "wb") as f:
    pickle.dump(eng_tokenizer, f)


In [None]:
import pickle

with open("/content/drive/MyDrive/pretraining/eng_tokenizer.pkl", "rb") as f:
    eng_tokenizer = pickle.load(f)

with open("/content/drive/MyDrive/pretraining/ga_tokenizer.pkl", "rb") as f:
    ga_tokenizer = pickle.load(f)

with open("/content/drive/MyDrive/pretraining/max_lengths.pkl", "rb") as f:
    max_lengths = pickle.load(f)

max_encoder_len = max_lengths["encoder_len"]
max_decoder_len = max_lengths["decoder_len"]


In [None]:
# Get input from user
input_text = input("Enter English: ").strip().lower()

# Tokenize and pad
input_seq = eng_tokenizer.texts_to_sequences([input_text])
input_seq = pad_sequences(input_seq, maxlen=max_encoder_len, padding='post')


In [None]:
print("Encoded input sequence:", input_seq)


In [None]:
def decode_sequence(input_seq):
    # Encode the input
    states_value = encoder_model.predict(input_seq)

    # Create empty target sequence with just the start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = ga_tokenizer.word_index['startseq']

    stop_condition = False
    decoded_sentence = []

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = ga_tokenizer.index_word.get(sampled_token_index, '<unk>')

        if sampled_word == 'endseq' or len(decoded_sentence) > max_decoder_len:
            stop_condition = True
        else:
            decoded_sentence.append(sampled_word)

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_sentence)


In [None]:
print(decode_sequence(pad_sequences(
    eng_tokenizer.texts_to_sequences(["how are you?"]),
    maxlen=max_encoder_len, padding='post')))


In [None]:
from tensorflow.keras.models import load_model
import pickle
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the trained model
model = load_model("/content/drive/MyDrive/pretraining/english_ga_best_model.keras")

# Load tokenizers
with open("/content/drive/MyDrive/pretraining/eng_tokenizer.pkl", "rb") as f:
    eng_tokenizer = pickle.load(f)

with open("/content/drive/MyDrive/pretraining/ga_tokenizer.pkl", "rb") as f:
    ga_tokenizer = pickle.load(f)

with open("/content/drive/MyDrive/pretraining/max_lengths.pkl", "rb") as f:
    max_lengths = pickle.load(f)

max_encoder_len = max_lengths['encoder_len']
max_decoder_len = max_lengths['decoder_len']

eng_vocab_size = len(eng_tokenizer.word_index) + 1
ga_vocab_size = len(ga_tokenizer.word_index) + 1


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Get layers from the trained model
encoder_inputs = model.input[0]
decoder_inputs = model.input[1]
decoder_outputs = model.output

# Extract the layers we need
encoder_embedding_layer = model.get_layer('embedding')
decoder_embedding_layer = model.get_layer('embedding_1')
encoder_lstm_layer = model.get_layer('lstm')
decoder_lstm_layer = model.get_layer('lstm_1')
decoder_dense = model.get_layer('dense')

# Encoder model
encoder_embedded = encoder_embedding_layer(encoder_inputs)
_, state_h_enc, state_c_enc = encoder_lstm_layer(encoder_embedded)
encoder_model = Model(encoder_inputs, [state_h_enc, state_c_enc])

# Decoder model
decoder_state_input_h = Input(shape=(512,))
decoder_state_input_c = Input(shape=(512,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_embedded = decoder_embedding_layer(decoder_inputs)
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm_layer(
    decoder_embedded, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)


In [None]:
def decode_sequence(input_seq):
    # Encode the input sentence
    states_value = encoder_model.predict(input_seq)

    # Start with "startseq"
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = ga_tokenizer.word_index.get('startseq', 1)

    stop_condition = False
    decoded_sentence = []

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = ga_tokenizer.index_word.get(sampled_token_index, '<unk>')

        if sampled_word == 'endseq' or len(decoded_sentence) > max_decoder_len:
            stop_condition = True
        else:
            decoded_sentence.append(sampled_word)

        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_sentence)


In [None]:
test_sentence = "God"
input_seq = eng_tokenizer.texts_to_sequences([test_sentence.lower()])
input_seq = pad_sequences(input_seq, maxlen=max_encoder_len, padding='post')

translation = decode_sequence(input_seq)
print(f"Ga Translation: {translation}")
