In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

import re

In [27]:
df = pd.read_csv('/kaggle/input/language-translation-englishfrench/eng_-french.csv')
df.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [28]:
df.columns = ['English', 'French']
df.head()

Unnamed: 0,English,French
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [29]:
df['English'] = df['English'].str.lower()
df['French'] = df['French'].str.lower()


df['English'] = df['English'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
df['French'] = df['French'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

df['English'] = df['English'].str.replace(r'\s+', ' ', regex=True).str.strip()
df['French'] = df['French'].str.replace(r'\s+', ' ', regex=True).str.strip()

In [30]:
"""
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")

df['English_tokens'] = df['English'].apply(lambda x: tokenizer.encode(x, truncation=True, padding="max_length", max_length=128))
df['French_tokens'] = df['French'].apply(lambda x: tokenizer.encode(x, truncation=True, padding="max_length", max_length=128))
"""

'\ntokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")\n\ndf[\'English_tokens\'] = df[\'English\'].apply(lambda x: tokenizer.encode(x, truncation=True, padding="max_length", max_length=128))\ndf[\'French_tokens\'] = df[\'French\'].apply(lambda x: tokenizer.encode(x, truncation=True, padding="max_length", max_length=128))\n'

# 0.IMPORTING LIBS

In [31]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, Input, Bidirectional, Attention, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# 1. Preprocessing the Data

In [None]:
# Load your dataset (replace the file path with your CSV file)
df = pd.read_csv('/kaggle/input/language-translation-englishfrench/eng_-french.csv')

df = df.dropna()

df.columns = ['English', 'French']

df['English'] = df['English'].str.lower()
df['French'] = df['French'].str.lower()

train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

input_sentences = train_data['English'].tolist()
output_sentences = train_data['French'].tolist()

output_sentences_input = ['\t' + sentence for sentence in output_sentences]
output_sentences_target = [sentence + '\n' for sentence in output_sentences]

input_tokenizer = Tokenizer()
input_tokenizer.fit_on_texts(input_sentences)
input_sequences = input_tokenizer.texts_to_sequences(input_sentences)

output_tokenizer = Tokenizer()
output_tokenizer.fit_on_texts(output_sentences_input + output_sentences_target)
output_sequences_input = output_tokenizer.texts_to_sequences(output_sentences_input)
output_sequences_target = output_tokenizer.texts_to_sequences(output_sentences_target)

max_input_length = max([len(seq) for seq in input_sequences])
max_output_length = max([len(seq) for seq in output_sequences_input])

encoder_input_data = pad_sequences(input_sequences, maxlen=max_input_length, padding='post')
decoder_input_data = pad_sequences(output_sequences_input, maxlen=max_output_length, padding='post')
decoder_target_data = pad_sequences(output_sequences_target, maxlen=max_output_length, padding='post')

encoder_input_data = np.array(encoder_input_data)
decoder_input_data = np.array(decoder_input_data)
decoder_target_data = np.array(decoder_target_data)


# 3. Model Building (Encoder-Decoder with Attention)

In [None]:
latent_dim = 256

encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=len(input_tokenizer.word_index) + 1, output_dim=latent_dim)(encoder_inputs)
encoder_lstm = Bidirectional(LSTM(latent_dim, return_sequences=True, return_state=True))  # set return_sequences=True
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)
encoder_state_h = Concatenate()([forward_h, backward_h])
encoder_state_c = Concatenate()([forward_c, backward_c])

decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=len(output_tokenizer.word_index) + 1, output_dim=latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim * 2, return_sequences=True, return_state=True)  # set return_sequences=True
decoder_lstm_output, decoder_state_h, decoder_state_c = decoder_lstm(decoder_embedding, initial_state=[encoder_state_h, encoder_state_c])

attention = Attention(use_scale=True)
attention_output = attention([decoder_lstm_output, encoder_outputs])

decoder_concat_input = Concatenate(axis=-1)([decoder_lstm_output, attention_output])

decoder_dense = Dense(len(output_tokenizer.word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

# 4.Model Training

In [None]:
model.fit(
    [encoder_input_data, decoder_input_data], 
    np.expand_dims(decoder_target_data, -1), 
    batch_size=64, 
    epochs=10,
    validation_split=0.2
)

Epoch 1/10
[1m1757/1757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m482s[0m 271ms/step - accuracy: 0.9206 - loss: 0.7368 - val_accuracy: 0.9876 - val_loss: 0.1100
Epoch 2/10
[1m1757/1757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m477s[0m 271ms/step - accuracy: 0.9904 - loss: 0.0790 - val_accuracy: 0.9939 - val_loss: 0.0577
Epoch 3/10
[1m1757/1757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m477s[0m 271ms/step - accuracy: 0.9959 - loss: 0.0288 - val_accuracy: 0.9957 - val_loss: 0.0440
Epoch 4/10
[1m1757/1757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m478s[0m 272ms/step - accuracy: 0.9984 - loss: 0.0092 - val_accuracy: 0.9961 - val_loss: 0.0408
Epoch 5/10
[1m1757/1757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m478s[0m 272ms/step - accuracy: 0.9995 - loss: 0.0024 - val_accuracy: 0.9963 - val_loss: 0.0402
Epoch 6/10
[1m1757/1757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m477s[0m 271ms/step - accuracy: 0.9999 - loss: 6.5978e-04 - val_accuracy: 0.9965 - val_l

<keras.src.callbacks.history.History at 0x7af09adb9cc0>