In [1]:
import pandas as pd

# Load the dataset
file_path = 'data-tw-en.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(data.head())


                   twi                 english Unnamed: 2
0  Maa wɔn ho yɛɛ huam  For them to smell good        NaN
1  Maa wɔn ho yɛɛ huam  For them to smell good        NaN
2  Maa wɔn ho yɛɛ huam  For them to smell good        NaN
3  Maa wɔn ho yɛɛ huam  For them to smell good        NaN
4  Maa wɔn ho yɛɛ huam  For them to smell good        NaN


In [2]:
def preprocess_texts(texts):
    texts = texts.fillna('')  # Fill missing values
    texts = texts.str.lower()  # Convert to lower case
    texts = '<start> ' + texts + ' <end>'  # Add start and end tokens
    return texts

# Preprocess the texts
data['twi'] = preprocess_texts(data['twi'])
data['english'] = preprocess_texts(data['english'])

# Display the first few rows of the preprocessed data
print(data.head())

                                 twi                               english  \
0  <start> maa wɔn ho yɛɛ huam <end>  <start> for them to smell good <end>   
1  <start> maa wɔn ho yɛɛ huam <end>  <start> for them to smell good <end>   
2  <start> maa wɔn ho yɛɛ huam <end>  <start> for them to smell good <end>   
3  <start> maa wɔn ho yɛɛ huam <end>  <start> for them to smell good <end>   
4  <start> maa wɔn ho yɛɛ huam <end>  <start> for them to smell good <end>   

  Unnamed: 2  
0        NaN  
1        NaN  
2        NaN  
3        NaN  
4        NaN  


In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Create a tokenizer for Twi texts
twi_tokenizer = Tokenizer()
twi_tokenizer.fit_on_texts(data['twi'])
twi_sequences = twi_tokenizer.texts_to_sequences(data['twi'])
twi_vocab_size = len(twi_tokenizer.word_index) + 1

# Create a tokenizer for English texts
english_tokenizer = Tokenizer()
english_tokenizer.fit_on_texts(data['english'])
english_sequences = english_tokenizer.texts_to_sequences(data['english'])
english_vocab_size = len(english_tokenizer.word_index) + 1

# Pad the sequences
max_len = max(max(len(seq) for seq in twi_sequences), max(len(seq) for seq in english_sequences))
ga_sequences = pad_sequences(twi_sequences, maxlen=max_len, padding='post')
english_sequences = pad_sequences(english_sequences, maxlen=max_len, padding='post')

In [4]:
import numpy as np

# Prepare encoder input data
encoder_input_data = np.array(ga_sequences)

# Prepare decoder input and output data
decoder_input_data = np.array(english_sequences)
decoder_output_data = np.zeros_like(decoder_input_data)

# Shift the decoder output data by one timestep
decoder_output_data[:, :-1] = decoder_input_data[:, 1:]
decoder_output_data[:, -1] = 0

In [6]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# Define model parameters
embedding_dim = 256
latent_dim = 512

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(twi_vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm, state_h, state_c = LSTM(latent_dim, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(english_vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(english_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

# Train the model
batch_size = 64
epochs = 100

model.fit([encoder_input_data, decoder_input_data],
          np.expand_dims(decoder_output_data, -1),
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)


Epoch 1/100
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 441ms/step - loss: 1.3716 - val_loss: 1.8981
Epoch 2/100
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 399ms/step - loss: 0.6575 - val_loss: 2.2914
Epoch 3/100
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 367ms/step - loss: 0.2797 - val_loss: 2.7258
Epoch 4/100
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m974s[0m 3s/step - loss: 0.1488 - val_loss: 2.9601
Epoch 5/100
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 372ms/step - loss: 0.0675 - val_loss: 3.1690
Epoch 6/100
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24777s[0m 71s/step - loss: 0.0265 - val_loss: 3.3590
Epoch 7/100
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m994s[0m 3s/step - loss: 0.0125 - val_loss: 3.4750
Epoch 8/100
[1m307/351[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m15s[0m 362ms/step - loss: 0.0141


KeyboardInterrupt

