In [1]:
import json
import warnings
from difflib import get_close_matches

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:


def load_translations(french_file, tamil_file):
    """Build a French -> Tamil lookup table from two line-aligned text files.

    Line ``i`` of ``french_file`` is paired with line ``i`` of ``tamil_file``.
    Leading/trailing whitespace of each file as a whole is stripped before
    splitting on newlines; individual lines are not stripped.

    Args:
        french_file: Path to the UTF-8 file holding one French entry per line.
        tamil_file: Path to the UTF-8 file holding one Tamil entry per line.

    Returns:
        dict mapping each French line to the Tamil line at the same position.
        When a French line occurs more than once, the last pairing wins.
        An empty (or whitespace-only) file contributes no entries.

    Warns:
        UserWarning: if the two files do not contain the same number of
            lines. The surplus lines of the longer file are ignored, as
            before, but the mismatch usually means the files are misaligned.
    """
    def _read_lines(path):
        # Read one file and return its lines, or [] when it has no content.
        with open(path, 'r', encoding='utf-8') as file:
            content = file.read().strip()
        # ''.split('\n') would yield [''] and create a bogus '' -> '' entry.
        return content.split('\n') if content else []

    french_sentences = _read_lines(french_file)
    tamil_sentences = _read_lines(tamil_file)

    if len(french_sentences) != len(tamil_sentences):
        # zip() below silently drops the unmatched tail; make that visible.
        warnings.warn(
            f"Line count mismatch: {french_file} has {len(french_sentences)} "
            f"line(s) but {tamil_file} has {len(tamil_sentences)}; "
            "extra lines are ignored.",
            stacklevel=2,
        )

    return dict(zip(french_sentences, tamil_sentences))

def suggest_similar_words(word, word_list, n=3, cutoff=0.6):
    """Return the entries of ``word_list`` that most closely resemble ``word``.

    Thin wrapper around ``difflib.get_close_matches``. The defaults for
    ``n`` and ``cutoff`` are difflib's own, so two-argument calls behave
    exactly as before.

    Args:
        word: The string to find near-matches for.
        word_list: Candidate strings to compare against.
        n: Maximum number of suggestions to return.
        cutoff: Minimum similarity score, between 0.0 and 1.0, a candidate
            must reach to be returned.

    Returns:
        A list of at most ``n`` candidates, most similar first. Empty when
        no candidate reaches ``cutoff``.

    Raises:
        ValueError: Propagated from difflib when ``n`` is not positive or
            ``cutoff`` lies outside [0.0, 1.0].
    """
    return get_close_matches(word, word_list, n=n, cutoff=cutoff)

def translate_to_tamil(sentence, french_to_tamil, french_sentences, wrong_words):
    """Print the Tamil translation of ``sentence``, or suggestions when unknown.

    The translation is an exact-key lookup in ``french_to_tamil``. When the
    lookup yields nothing (or an empty value), close matches drawn from
    ``french_sentences`` are printed and ``sentence`` is appended to
    ``wrong_words``.

    Args:
        sentence: French text to look up.
        french_to_tamil: Mapping from French text to Tamil text.
        french_sentences: Candidates used for "did you mean" suggestions.
        wrong_words: List mutated in place; collects every failed lookup.

    Returns:
        None. All results are reported on stdout.
    """
    match = french_to_tamil.get(sentence)

    # Happy path first: report the hit and stop.
    if match:
        print(f"French Word: {sentence}")
        print(f"Tamil Translation --> {match}")
        return

    print("Translation not found.")
    close_matches = suggest_similar_words(sentence, french_sentences)
    print(f"Suggestions: {close_matches}")

    wrong_words.append(sentence)
    # The running list is only shown once it holds at least two misses.
    if len(wrong_words) >= 2:
        print(f"Wrong words so far: {wrong_words}")

In [3]:


def create_tokenizer_and_vocab(sentences, file_path_prefix):
    """Fit a Keras ``Tokenizer`` on ``sentences`` and save it with its vocabulary.

    Two UTF-8 files are written (and overwritten if present):
    ``<file_path_prefix>_tokenizer.json`` and ``<file_path_prefix>_vocab.json``.

    Args:
        sentences: Texts the tokenizer is fitted on.
        file_path_prefix: Prefix (may include a directory) for both output files.

    Returns:
        Tuple ``(tokenizer, vocab)`` where ``vocab`` is ``tokenizer.word_index``.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentences)
    vocab = tokenizer.word_index

    # NOTE(review): Tokenizer.to_json() is documented as already returning a
    # JSON string, so running it through json.dumps again stores a JSON-encoded
    # string rather than an object. Kept as is because readers of this file
    # presumably json.load() it before tokenizer_from_json() - confirm.
    outputs = (
        (f"{file_path_prefix}_tokenizer.json", tokenizer.to_json()),
        (f"{file_path_prefix}_vocab.json", vocab),
    )
    for path, payload in outputs:
        with open(path, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            f.write(json.dumps(payload, ensure_ascii=False))

    return tokenizer, vocab

def create_tensorflow_model(french_sentences, tamil_sentences):
    """Tokenize both corpora, pad them to a common length and build the model.

    Side effects: writes the ``french_*`` / ``tamil_*`` tokenizer and vocab
    JSON files via ``create_tokenizer_and_vocab`` and prints the model summary.
    The model is compiled but not trained here.

    Args:
        french_sentences: Source-language texts.
        tamil_sentences: Target-language texts.

    Returns:
        Tuple ``(model, french_sequences, tamil_sequences)`` where both
        sequence arrays are post-padded to the same length.

    Raises:
        ValueError: From ``max()`` when either input yields no sequences.
    """
    french_tokenizer, french_vocab = create_tokenizer_and_vocab(french_sentences, "french")
    tamil_tokenizer, tamil_vocab = create_tokenizer_and_vocab(tamil_sentences, "tamil")

    french_ids = french_tokenizer.texts_to_sequences(french_sentences)
    tamil_ids = tamil_tokenizer.texts_to_sequences(tamil_sentences)

    # Both sides share one padded length: the longest sequence in either corpus.
    longest_french = max(len(ids) for ids in french_ids)
    longest_tamil = max(len(ids) for ids in tamil_ids)
    max_sequence_length = max(longest_french, longest_tamil)

    french_sequences = pad_sequences(french_ids, maxlen=max_sequence_length, padding='post')
    tamil_sequences = pad_sequences(tamil_ids, maxlen=max_sequence_length, padding='post')

    # Vocabulary sizes are len(vocab) + 1; presumably the extra slot is for
    # the 0 id used by padding - confirm.
    source_vocab_size = len(french_vocab) + 1
    target_vocab_size = len(tamil_vocab) + 1

    # NOTE(review): the final Dense layer yields one softmax over the Tamil
    # vocabulary per sentence, whereas tamil_sequences holds
    # max_sequence_length ids per sentence. Fitting one directly against the
    # other looks incompatible - confirm how training is meant to pair them.
    layers = tf.keras.layers
    stack = [
        layers.Embedding(input_dim=source_vocab_size, output_dim=128, input_length=max_sequence_length),
        layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
        layers.Bidirectional(layers.LSTM(64)),
        layers.Dense(64, activation='relu'),
        layers.Dense(target_vocab_size, activation='softmax'),
    ]
    model = tf.keras.Sequential(stack)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.summary()

    return model, french_sequences, tamil_sequences




In [4]:
# Save the model
def save_model(model, model_file):
    """Write ``model`` to ``model_file`` and report the destination on stdout.

    Args:
        model: Object exposing a ``save(path)`` method.
        model_file: Destination path handed to ``model.save``.
    """
    model.save(model_file)
    confirmation = f"Model saved to {model_file}"
    print(confirmation)

# Load the model
def load_model_from_file(model_file):
    """Load a saved model from ``model_file`` and report the source on stdout.

    Args:
        model_file: Path handed to Keras ``load_model``.

    Returns:
        Whatever ``load_model`` returns for that path.
    """
    restored = load_model(model_file)
    print(f"Model loaded from {model_file}")
    return restored

# Usage example: build the lookup table and the model, round-trip the model
# through disk, then translate one word via the lookup table.
french_file = "data4/french.txt"
tamil_file = "data4/tamil.txt"
french_to_tamil = load_translations(french_file, tamil_file)
# Taken from the dict, so repeated French lines are already collapsed to one entry.
french_sentences = list(french_to_tamil.keys())
tamil_sentences = list(french_to_tamil.values())
# Side effect: also writes the french_/tamil_ tokenizer and vocab JSON files.
model, french_sequences, tamil_sequences = create_tensorflow_model(french_sentences, tamil_sentences)
# NOTE(review): no fit() call precedes this save in the visible cells, so the
# saved file holds untrained weights - confirm that is intended.
model_file = "french_to_tamil_model.h5"
save_model(model, model_file)
loaded_model = load_model_from_file(model_file)
sentence = "maison"
# Collects inputs with no dictionary entry; translate_to_tamil appends to it.
wrong_words = []
# The translation is a dictionary lookup; loaded_model is not consulted here.
translate_to_tamil(sentence, french_to_tamil, french_sentences, wrong_words)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 7, 128)            1435520   
                                                                 
 bidirectional (Bidirectiona  (None, 7, 128)           98816     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 8541)              555165    
                                                                 
Total params: 2,196,573
Trainable params: 2,196,573
Non-