In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from transformers import BertTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow as tf
from gensim.models import KeyedVectors

In [None]:
# Load the parallel English/Persian corpus.
# Rows with a missing sentence on either side are dropped before the cast:
# `astype(str)` would otherwise turn NaN into the literal text "nan" and that
# string would be tokenized and trained on as if it were a real sentence.
df = pd.read_csv('cleaned_infopankki-fa.csv').dropna(subset=['English', 'Persian'])
english_sentences = df['English'].astype(str).tolist()
farsi_sentences = df['Persian'].astype(str).tolist()

In [None]:
# Pre-trained BERT tokenizers, one per language (English first, then Farsi)
tokenizer_en, tokenizer_fa = (
    BertTokenizer.from_pretrained(checkpoint)
    for checkpoint in ('bert-base-uncased', 'HooshvareLab/bert-base-parsbert-uncased')
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



vocab.txt:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

In [None]:
# Split every sentence into tokens with the tokenizer of its language
tokenized_en = list(map(tokenizer_en.tokenize, english_sentences))
tokenized_fa = list(map(tokenizer_fa.tokenize, farsi_sentences))

In [None]:
# Collect the distinct tokens that occur on each side of the corpus
vocab_en = {token for tokens in tokenized_en for token in tokens}
vocab_fa = {token for tokens in tokenized_fa for token in tokens}

In [None]:
# Hold out 20% of the sentence pairs for testing; random_state pins the shuffle
(english_train_sentences,
 english_test_sentences,
 farsi_train_sentences,
 farsi_test_sentences) = train_test_split(
    english_sentences,
    farsi_sentences,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Assign an integer id to every vocabulary token. Ids start at 1 because 0 is
# used for padding and for tokens missing from the lookup.
# The vocabularies are sets, and the iteration order of a set of strings can
# differ from one interpreter session to the next. Sorting first makes the
# token -> id mapping stable, so a model saved in one session still lines up
# with the ids rebuilt in another.
word2idx_en = {word: idx for idx, word in enumerate(sorted(vocab_en), start=1)}
word2idx_fa = {word: idx for idx, word in enumerate(sorted(vocab_fa), start=1)}
# Reverse lookup used when decoding predicted Farsi ids back into tokens
idx2word_fa = {idx: word for word, idx in word2idx_fa.items()}

In [None]:
# Helper that encodes raw sentences as lists of vocabulary ids
def sentences_to_sequences(tokenizer, sentences, word2idx):
    """Tokenize each sentence and map its tokens to integer ids.

    Args:
        tokenizer: object exposing ``tokenize(sentence)`` that returns tokens.
        sentences: iterable of sentence strings.
        word2idx: mapping from token to integer id.

    Returns:
        A list with one list of ids per sentence; tokens that are not in
        ``word2idx`` are encoded as 0.
    """
    sequences = []
    for sentence in sentences:
        token_ids = [word2idx.get(token, 0) for token in tokenizer.tokenize(sentence)]
        sequences.append(token_ids)
    return sequences

In [None]:
# Encode the train and test sentences of each language as id sequences
sequences_en_train, sequences_en_test = (
    sentences_to_sequences(tokenizer_en, split, word2idx_en)
    for split in (english_train_sentences, english_test_sentences)
)
sequences_fa_train, sequences_fa_test = (
    sentences_to_sequences(tokenizer_fa, split, word2idx_fa)
    for split in (farsi_train_sentences, farsi_test_sentences)
)

In [None]:
# Longest training sequence per language; used below as the padded width
max_len_en = max(map(len, sequences_en_train))
max_len_fa = max(map(len, sequences_fa_train))

In [None]:
# Right-pad every split to the training maximum of its language
pad_kwargs_en = dict(maxlen=max_len_en, padding='post')
pad_kwargs_fa = dict(maxlen=max_len_fa, padding='post')

sequences_en_train_padded = pad_sequences(sequences_en_train, **pad_kwargs_en)
sequences_fa_train_padded = pad_sequences(sequences_fa_train, **pad_kwargs_fa)
sequences_en_test_padded = pad_sequences(sequences_en_test, **pad_kwargs_en)
sequences_fa_test_padded = pad_sequences(sequences_fa_test, **pad_kwargs_fa)

In [None]:
# Prepare target sequences for the decoder (Farsi sentences shifted left by one)
def _next_token_targets(padded_sequences):
    """Build decoder targets from padded decoder inputs.

    Position ``t`` of the result holds the id found at position ``t + 1`` of
    the input; the last position is 0. A trailing axis of size 1 is appended,
    so the result has shape ``padded_sequences.shape + (1,)``.
    """
    targets = np.zeros_like(padded_sequences)  # last column stays 0
    targets[:, :-1] = padded_sequences[:, 1:]
    return targets[..., np.newaxis]


# One helper for both splits replaces the previously duplicated statements
decoder_target_data_train = _next_token_targets(sequences_fa_train_padded)
decoder_target_data_test = _next_token_targets(sequences_fa_test_padded)


In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
!gunzip cc.en.300.vec.gz

--2024-09-10 10:03:40--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.162.163.11, 3.162.163.51, 3.162.163.19, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.162.163.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1325960915 (1.2G) [binary/octet-stream]
Saving to: ‘cc.en.300.vec.gz’


2024-09-10 10:03:44 (346 MB/s) - ‘cc.en.300.vec.gz’ saved [1325960915/1325960915]



In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.vec.gz
!gunzip cc.fa.300.vec.gz

--2024-09-10 10:04:16--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.165.83.44, 18.165.83.35, 18.165.83.79, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.165.83.44|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1258183862 (1.2G) [binary/octet-stream]
Saving to: ‘cc.fa.300.vec.gz’


2024-09-10 10:05:06 (24.1 MB/s) - ‘cc.fa.300.vec.gz’ saved [1258183862/1258183862]



In [None]:
# Load pre-trained FastText vectors for English and Farsi
def load_fasttext_vectors(vector_file, vocab, seed=42):
    """Return a vector for every token in ``vocab``.

    Args:
        vector_file: path to a text-format (``binary=False``) word2vec-style
            vector file.
        vocab: iterable of token strings.
        seed: seed of the generator used for tokens that are absent from
            ``vector_file``. Pass ``None`` for non-deterministic draws.

    Returns:
        ``(embeddings, vector_size)`` where ``embeddings`` maps each token to
        its vector and ``vector_size`` is the dimensionality reported by the
        loaded vectors. Tokens missing from the file get a vector drawn
        uniformly from [-0.1, 0.1).
    """
    ft_model = KeyedVectors.load_word2vec_format(vector_file, binary=False, limit=None)
    # Dedicated, seeded generator: the fallback vectors no longer depend on
    # whatever state the global NumPy RNG happens to be in.
    rng = np.random.default_rng(seed)
    embeddings = {}
    # Sorted traversal fixes the order in which fallback vectors are drawn, so a
    # given token receives the same vector regardless of set iteration order.
    for word in sorted(vocab):
        if word in ft_model:
            embeddings[word] = ft_model[word]
        else:
            embeddings[word] = rng.uniform(-0.1, 0.1, ft_model.vector_size)
    return embeddings, ft_model.vector_size

In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m41.0/73.4 kB[0m [31m966.6 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m71.7/73.4 kB[0m [31m1.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m867.9 kB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.5-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.5-py3-none-any.whl (240 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.to

In [None]:
import fasttext
# Build the token -> vector lookup (and read the vector width) for each language
lookup_en = load_fasttext_vectors('cc.en.300.vec', vocab_en)
lookup_fa = load_fasttext_vectors('cc.fa.300.vec', vocab_fa)
embeddings_en, embedding_dim_en = lookup_en
embeddings_fa, embedding_dim_fa = lookup_fa

In [None]:
# Assemble the English embedding matrix: one row per token id, plus row 0
n_rows_en = len(word2idx_en) + 1
embedding_matrix_en = np.zeros((n_rows_en, embedding_dim_en))
for word in word2idx_en:
    # Fallback vector for any token without an entry in the lookup
    random_fallback = np.random.uniform(-0.1, 0.1, embedding_dim_en)
    embedding_matrix_en[word2idx_en[word]] = embeddings_en.get(word, random_fallback)

In [None]:
# Assemble the Farsi embedding matrix: one row per token id, plus row 0
n_rows_fa = len(word2idx_fa) + 1
embedding_matrix_fa = np.zeros((n_rows_fa, embedding_dim_fa))
for word in word2idx_fa:
    # Fallback vector for any token without an entry in the lookup
    random_fallback = np.random.uniform(-0.1, 0.1, embedding_dim_fa)
    embedding_matrix_fa[word2idx_fa[word]] = embeddings_fa.get(word, random_fallback)

In [None]:
# Define LSTM model for English to Farsi translation

# Encoder: frozen pre-trained embeddings followed by an LSTM whose final
# hidden/cell states initialise the decoder.
encoder_inputs = Input(shape=(None,), name='encoder_inputs')
encoder_embedding = Embedding(input_dim=len(word2idx_en) + 1,
                              output_dim=embedding_dim_en,
                              weights=[embedding_matrix_en],
                              # Id 0 is padding. Masking it makes the LSTM skip the
                              # padded timesteps, so the returned states summarise the
                              # sentence rather than the trailing run of pad ids.
                              mask_zero=True,
                              trainable=False,  # FastText embeddings are not trainable
                              name='encoder_embedding')(encoder_inputs)
# `input_length` was dropped: it is deprecated in current Keras and conflicted
# with the variable-length Input(shape=(None,)) declared above.
encoder_lstm, state_h, state_c = LSTM(256, return_state=True, name='encoder_lstm')(encoder_embedding)
encoder_states = [state_h, state_c]

In [None]:
# Decoder: consumes the Farsi id sequence, starts from the encoder states and
# emits a probability distribution over the Farsi vocabulary at every timestep.
decoder_inputs = Input(shape=(None,), name='decoder_inputs')
decoder_embedding = Embedding(input_dim=len(word2idx_fa) + 1,
                              output_dim=embedding_dim_fa,
                              weights=[embedding_matrix_fa],
                              # Id 0 is padding. With the mask, padded timesteps are
                              # skipped by the LSTM and left out of the loss, so the
                              # reported loss is no longer dominated by trivially
                              # predictable pad positions.
                              mask_zero=True,
                              trainable=False,  # FastText embeddings are not trainable
                              name='decoder_embedding')(decoder_inputs)
# `input_length` was dropped: it is deprecated in current Keras and conflicted
# with the variable-length Input(shape=(None,)) declared above.
decoder_lstm = LSTM(256, return_sequences=True, return_state=True, name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
# The +1 accounts for id 0, which is not a key of word2idx_fa
decoder_dense = Dense(len(word2idx_fa) + 1, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Training model: (English ids, Farsi ids) -> per-timestep Farsi token distribution
model = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)

In [None]:
# Configure training; the sparse loss takes integer token ids as targets
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
)

In [None]:
# Training callbacks.
# restore_best_weights=True puts the weights of the best validation epoch back
# into `model` when training ends. Without it the in-memory model is not
# guaranteed to match the checkpoint written to best_model.keras, and later
# cells evaluate and translate with whatever the final epoch produced.
early_stopping = EarlyStopping(monitor='val_loss', patience=5,
                               restore_best_weights=True, verbose=1)
# Keeps only the epoch with the lowest validation loss on disk
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss', mode='min', verbose=1)

In [None]:
# Fit on the training pairs; validation_split sets 20% of them aside for validation
train_inputs = [sequences_en_train_padded, sequences_fa_train_padded]
training_callbacks = [early_stopping, model_checkpoint]
model.fit(
    x=train_inputs,
    y=decoder_target_data_train,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
    callbacks=training_callbacks,
)

Epoch 1/10
Epoch 1: val_loss improved from inf to 1.06324, saving model to best_model.keras
Epoch 2/10
Epoch 2: val_loss improved from 1.06324 to 0.99006, saving model to best_model.keras
Epoch 3/10
Epoch 3: val_loss improved from 0.99006 to 0.94784, saving model to best_model.keras
Epoch 4/10
Epoch 4: val_loss improved from 0.94784 to 0.91089, saving model to best_model.keras
Epoch 5/10
Epoch 5: val_loss improved from 0.91089 to 0.88520, saving model to best_model.keras
Epoch 6/10
Epoch 6: val_loss improved from 0.88520 to 0.86674, saving model to best_model.keras
Epoch 7/10
Epoch 7: val_loss did not improve from 0.86674
Epoch 8/10
Epoch 8: val_loss improved from 0.86674 to 0.83111, saving model to best_model.keras
Epoch 9/10
Epoch 9: val_loss improved from 0.83111 to 0.81635, saving model to best_model.keras
Epoch 10/10
Epoch 10: val_loss improved from 0.81635 to 0.80429, saving model to best_model.keras


<keras.src.callbacks.History at 0x79575118f040>

In [None]:
# Measure the loss on the held-out test pairs
test_inputs = [sequences_en_test_padded, sequences_fa_test_padded]
test_loss = model.evaluate(test_inputs, decoder_target_data_test)
print(f"Test Loss: {test_loss}")

Test Loss: 0.7981439828872681


In [None]:
# Inference encoder: English ids -> final [hidden, cell] states of the encoder LSTM
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# Inference decoder: reuses the trained embedding, LSTM and Dense layers, but
# takes its initial states as explicit inputs and returns the updated states,
# so the caller can feed them back in on the next decoding step.
decoder_state_input_h = Input(shape=(256,), name='decoder_state_input_h')
decoder_state_input_c = Input(shape=(256,), name='decoder_state_input_c')
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

lstm_results = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
decoder_outputs_inference, state_h_inference, state_c_inference = lstm_results
decoder_outputs_inference = decoder_dense(decoder_outputs_inference)
decoder_states = [state_h_inference, state_c_inference]

decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs_inference, *decoder_states],
)

In [None]:
def translate_sentence(input_sentence, max_len_fa=max_len_fa):
    """Greedily decode a Farsi translation of one English sentence.

    Args:
        input_sentence: English text to translate.
        max_len_fa: maximum number of tokens to generate.

    Returns:
        The generated Farsi tokens joined by single spaces. Padding and the
        '[CLS]' / '[SEP]' markers are never included.
    """
    # Tokenize, map to ids (0 for unknown tokens) and pad to the encoder width
    input_sequence = [word2idx_en.get(token, 0) for token in tokenizer_en.tokenize(input_sentence)]
    input_sequence = pad_sequences([input_sequence], maxlen=max_len_en, padding='post')

    # Encode the sentence. verbose=0 stops Keras printing a progress bar per call.
    states_value = encoder_model.predict(input_sequence, verbose=0)

    # NOTE(review): the Farsi vocabulary is built from `tokenizer.tokenize`
    # output, so '[CLS]' is only a key if it occurs literally in the corpus.
    # Otherwise this falls back to id 1, an arbitrary vocabulary token, and the
    # decoder starts from a token it was not trained to start from. A proper fix
    # needs explicit start/end markers added to the training sequences.
    target_sequence = np.zeros((1, 1))
    target_sequence[0, 0] = word2idx_fa.get('[CLS]', 1)

    translated_sentence = []
    while len(translated_sentence) < max_len_fa:
        # Predict the next token and the updated decoder states
        output_tokens, h, c = decoder_model.predict([target_sequence] + states_value, verbose=0)
        predicted_word_index = int(np.argmax(output_tokens[0, -1, :]))
        predicted_word = idx2word_fa.get(predicted_word_index, '')

        # Id 0 is padding. The training targets are 0 after the last real token,
        # so a predicted 0 is the end-of-sentence signal the model has learned.
        # Stopping here avoids decoding all the way to max_len_fa and returning
        # a long run of blanks.
        if predicted_word_index == 0 or predicted_word == '[SEP]':
            break

        translated_sentence.append(predicted_word)

        # Feed the prediction and the new states back in for the next step
        target_sequence = np.zeros((1, 1))
        target_sequence[0, 0] = predicted_word_index
        states_value = [h, c]

    # Drop empty strings (ids without a vocabulary entry) and any start marker
    return ' '.join(word for word in translated_sentence if word and word != '[CLS]')

In [None]:
# Example usage: translate one sentence and show it next to the source
input_sentence = "Swedish English is music and concerts."
translation = translate_sentence(input_sentence)
for label, text in (("English:", input_sentence), ("Farsi Translation:", translation)):
    print(label, text)


English: Swedish English is music and concerts.
Farsi Translation: به زبان فنلاندی                                                                                                  
