In [1]:
import pandas as pd

# Load the parallel corpus from a CSV in the working directory.
# The preview below shows two columns: 'ga' (source) and 'english' (target).
file_path = 'data-ga-en.csv'
data = pd.read_csv(file_path)

# Print the first rows (DataFrame.head default) as a quick sanity check
# that the file parsed into the expected columns.
print(data.head())


                     ga          english
0  Bo ni otswa mi aloo?  Did you call me
1  Bo ni otswa mi aloo?  Did you call me
2  Bo ni otswa mi aloo?  Did you call me
3  Bo ni otswa mi aloo?  Did you call me
4  Bo ni otswa mi aloo?  Did you call me


In [2]:

def preprocess_texts(texts):
    """Normalise a Series of sentences and wrap each one in sequence markers.

    Args:
        texts: pandas Series of raw sentences. Entries may be missing or
            non-string (e.g. a cell that the CSV reader parsed as a number).

    Returns:
        pandas Series, aligned with the input index, whose entries are
        lower-cased strings of the form ``'<start> sentence <end>'``.
        Missing entries become ``'<start>  <end>'``.
    """
    # Fill gaps first so missing entries become empty sentences, then coerce
    # every remaining value to str. Without the coercion the .str accessor
    # yields NaN for non-string cells (or raises on a numeric column), which
    # silently re-introduces the missing values that fillna just removed.
    texts = texts.fillna('').astype(str)
    texts = texts.str.lower()  # Case-fold so the vocabulary is case-insensitive
    texts = '<start> ' + texts + ' <end>'  # Markers delimiting each sequence
    return texts

# Normalise both language columns with the same routine.
# NOTE: this overwrites the columns in place, so running the cell a second
# time wraps the sentences in a second pair of markers.
for column in ('ga', 'english'):
    data[column] = preprocess_texts(data[column])

# Show the leading rows after preprocessing
print(data.head())


                                   ga                        english
0  <start> bo ni otswa mi aloo? <end>  <start> did you call me <end>
1  <start> bo ni otswa mi aloo? <end>  <start> did you call me <end>
2  <start> bo ni otswa mi aloo? <end>  <start> did you call me <end>
3  <start> bo ni otswa mi aloo? <end>  <start> did you call me <end>
4  <start> bo ni otswa mi aloo? <end>  <start> did you call me <end>


In [5]:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Character filter shared by both tokenizers: the Keras default with '<' and
# '>' removed. Under the default filter the '<start>' / '<end>' markers added
# during preprocessing are reduced to the plain words 'start' / 'end', which
# then collide with the real English words and cannot be told apart from them
# when decoding model output.
TOKEN_FILTERS = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'

# Create a tokenizer for the Ga (source) texts
ga_tokenizer = Tokenizer(filters=TOKEN_FILTERS)
ga_tokenizer.fit_on_texts(data['ga'])
ga_sequences = ga_tokenizer.texts_to_sequences(data['ga'])
# +1 because word indices start at 1; index 0 is used for padding below
ga_vocab_size = len(ga_tokenizer.word_index) + 1

# Create a tokenizer for the English (target) texts
english_tokenizer = Tokenizer(filters=TOKEN_FILTERS)
english_tokenizer.fit_on_texts(data['english'])
english_sequences = english_tokenizer.texts_to_sequences(data['english'])
english_vocab_size = len(english_tokenizer.word_index) + 1

# Pad both sides to one common length: the longest sequence in either language.
# Padding is appended ('post') so real tokens always start at position 0.
max_len = max(max(len(seq) for seq in ga_sequences), max(len(seq) for seq in english_sequences))
ga_sequences = pad_sequences(ga_sequences, maxlen=max_len, padding='post')
english_sequences = pad_sequences(english_sequences, maxlen=max_len, padding='post')


In [6]:
import numpy as np

# Encoder consumes the padded Ga token ids
encoder_input_data = np.array(ga_sequences)

# Decoder consumes the padded English token ids (teacher forcing)
decoder_input_data = np.array(english_sequences)

# Training target = decoder input moved one step to the left: drop the first
# column and append a column of zeros (the padding id) on the right, so the
# target at position t is the decoder input token at position t + 1.
decoder_output_data = np.pad(decoder_input_data[:, 1:], ((0, 0), (0, 1)))


In [7]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# Hyperparameters
embedding_dim = 256  # width of each token embedding vector
latent_dim = 512     # number of LSTM units (size of the hidden and cell states)

# Encoder: embed the source token ids and keep only the final LSTM states,
# which summarise the whole input sentence.
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=ga_vocab_size, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm, state_h, state_c = LSTM(units=latent_dim, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: embed the target token ids and run an LSTM that starts from the
# encoder's final states; a softmax layer scores every English vocabulary
# entry at each timestep.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=english_vocab_size, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(units=latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(units=english_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: (source ids, target ids) -> per-timestep token distribution
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

# Integer targets, hence the sparse variant of categorical cross-entropy
model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop')

# Training schedule
batch_size = 64
epochs = 100

# Targets get a trailing axis of size 1; the last 20% of the rows are held
# out for validation.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_output_data[..., np.newaxis],
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)


Epoch 1/100
[1m 23/307[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1:47[0m 377ms/step - loss: 2.8234

KeyboardInterrupt: 

In [6]:
# Persist the network in the native Keras format
model.save("fine-tuned-translation-model.keras")

# Serialise the fitted tokenizers so inference can rebuild the exact same
# word <-> index mappings. The source-language tokenizer fitted earlier in
# this notebook is `ga_tokenizer`; referring to it as `twi_tokenizer` here
# raised NameError on a fresh kernel because that name is only created when
# the tokenizers are loaded back from disk.
ga_tokenizer_json = ga_tokenizer.to_json()
english_tokenizer_json = english_tokenizer.to_json()

with open("ga_tokenizer.json", "w") as f:
    f.write(ga_tokenizer_json)

with open("english_tokenizer.json", "w") as f:
    f.write(english_tokenizer_json)


In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import json

# Restore the trained network from disk
model = load_model("fine-tuned-translation-model.keras")

# Restore the source-language tokenizer. The file is parsed to a dict and
# re-serialised because tokenizer_from_json expects a JSON string.
with open("ga_tokenizer.json") as f:
    twi_tokenizer_json = json.load(f)
twi_tokenizer = tokenizer_from_json(json.dumps(twi_tokenizer_json))

# Restore the English tokenizer the same way
with open("english_tokenizer.json") as f:
    english_tokenizer_json = json.load(f)
english_tokenizer = tokenizer_from_json(json.dumps(english_tokenizer_json))

def translate_twi_to_english(twi_sentence):
    """Translate one source-language sentence to English by greedy decoding.

    The model was trained with teacher forcing: at every timestep the decoder
    is given the previous target token. Feeding it an all-zero decoder input
    in a single pass therefore never shows it the start marker or its own
    earlier predictions. This function instead decodes one token at a time:
    it seeds the decoder with the start marker, takes the most likely next
    token, appends it to the decoder input and repeats.

    Relies on names defined elsewhere in the notebook: ``model``,
    ``twi_tokenizer``, ``english_tokenizer``, ``preprocess_texts`` and
    ``max_len`` (the padded length used during training).

    Args:
        twi_sentence: raw source sentence as a string.

    Returns:
        The decoded English sentence as a string, without the start/end
        markers. Empty if the model predicts the end marker immediately.

    Raises:
        ValueError: if the English tokenizer has no start marker in its
            vocabulary, so decoding cannot be seeded.
    """
    # Apply the same normalisation used on the training data
    twi_sentence = preprocess_texts(pd.Series([twi_sentence]))[0]

    # Tokenize and pad the source sentence to the training length
    encoder_input = twi_tokenizer.texts_to_sequences([twi_sentence])
    encoder_input = pad_sequences(encoder_input, maxlen=max_len, padding='post')

    # A Tokenizer built with the default filters strips '<' and '>', storing
    # the markers as 'start' / 'end'; accept either spelling.
    word_index = english_tokenizer.word_index
    start_id = word_index.get('<start>', word_index.get('start'))
    end_id = word_index.get('<end>', word_index.get('end'))
    if start_id is None:
        raise ValueError("English tokenizer vocabulary has no start marker")

    # Decoder input starts as [start, 0, 0, ...] and is filled left to right
    decoder_input = np.zeros_like(encoder_input)
    decoder_input[0, 0] = start_id

    decoded_ids = []
    # The output at position t is the prediction for token t + 1, so at most
    # max_len - 1 tokens can be generated and fed back.
    for step in range(max_len - 1):
        probabilities = model.predict([encoder_input, decoder_input], verbose=0)
        next_id = int(np.argmax(probabilities[0, step]))
        # Stop at the end marker, or at id 0 which is the padding value
        if next_id == end_id or next_id == 0:
            break
        decoded_ids.append(next_id)
        decoder_input[0, step + 1] = next_id

    # Map the generated ids back to words
    return english_tokenizer.sequences_to_texts([decoded_ids])[0]

# Example translation: run one sentence through translate_twi_to_english and
# print the input next to the model's output.
# NOTE(review): the corpus file is 'data-ga-en.csv' with a 'ga' column, so this
# input is presumably Ga despite the 'Twi' naming and label — confirm.
twi_sentence = "Ani nakai lo"
translated_sentence = translate_twi_to_english(twi_sentence)
print(f"Twi: {twi_sentence}\nEnglish: {translated_sentence}")
