In [None]:
import pandas as pd
import pickle
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
import string


In [None]:
!ls

In [None]:
# Load the tab-separated sentence pairs. The separator is spelled as the '\t'
# escape instead of a literal tab character, which is invisible in the source
# and is easily turned into spaces by an editor.
# NOTE(review): the second column is called 'Hindi' although the file name and
# the later mar_* variables suggest Marathi; the name is kept because later
# cells reference it.
df = pd.read_csv('mar.txt', encoding='utf-8', sep='\t', names=['English', 'Hindi', 'Attribution'])

In [None]:
df.head()

In [None]:
# Discard the attribution column; later cells only use the two sentence columns.
df.drop(columns=['Attribution'], inplace=True)


In [None]:
df.info()


In [None]:
df.isna().sum()

In [None]:
# Collapse every run of whitespace to one space and trim both ends,
# for the English column first and then the Hindi column.
for col in ('English', 'Hindi'):
    df[col] = df[col].apply(lambda sentence: " ".join(sentence.split()))


In [None]:
# Lower-case the English side only.
df.English = df.English.apply(str.lower)


In [None]:
# Load the contraction -> expansion lookup that expand_contras iterates over.
# NOTE(review): despite the .txt extension the file is read as a pickle.
# pickle.load can execute arbitrary code, so only load a file you trust.
with open("contraction_expansion.txt", 'rb') as fp:
    contractions= pickle.load(fp)

In [None]:
def expand_contras(text, mapping=None):
    """Replace every contraction found in ``text`` with its expansion.

    Parameters
    ----------
    text : object
        Value to process. Anything that is not a ``str`` is returned unchanged.
    mapping : mapping, optional
        Contraction -> expansion pairs. When omitted, the notebook-level
        ``contractions`` object is used.

    Returns
    -------
    object
        The expanded string, or ``text`` itself when it is not a string.

    Notes
    -----
    Matching is plain, case-sensitive substring replacement, applied in the
    iteration order of the mapping.
    """
    if not isinstance(text, str):
        return text
    if mapping is None:
        # Resolved at call time so the global only has to exist when it is needed.
        mapping = contractions
    for short_form in mapping:
        text = text.replace(short_form, mapping[short_form])
    return text


In [None]:
df.sample(10)


In [None]:
xyz = "i'm don't he'll you'll"
expand_contras(xyz)


In [None]:
# Expand contractions in every English sentence.
df.English = df.English.apply(expand_contras)


In [None]:
df.shape

In [None]:
df.sample(5)


In [None]:
translator= str.maketrans('','', string.punctuation)


In [None]:
# Remove the characters listed in `translator` from both sentence columns.
for col in ('English', 'Hindi'):
    df[col] = df[col].apply(lambda sentence: sentence.translate(translator))


In [None]:
df.sample(5)


In [None]:
import re


In [None]:
# Delete every run of digits from both sentence columns.
digit_run = re.compile(r'[\d]+')
df.English = df.English.apply(lambda sentence: digit_run.sub('', sentence))
df.Hindi = df.Hindi.apply(lambda sentence: digit_run.sub('', sentence))


In [None]:
# Number of whitespace-separated tokens in each sentence.
df['en_word_count'] = df['English'].map(lambda sentence: len(sentence.split()))
df['mar_word_count'] = df['Hindi'].map(lambda sentence: len(sentence.split()))


In [None]:
# Number of non-whitespace characters in each sentence.
df['mar_char_count'] = df['Hindi'].map(lambda sentence: sum(len(tok) for tok in sentence.split()))
df['en_char_count'] = df['English'].map(lambda sentence: sum(len(tok) for tok in sentence.split()))


In [None]:
df.head()


In [None]:
# Density estimate of English sentence lengths, measured in words.
# `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
plt.figure(figsize=(15,10))
sns.kdeplot(x=df.en_word_count, fill=True, color='blue', label='Real')


In [None]:
max(df.en_word_count)


In [None]:
# Density estimate of target-side sentence lengths, measured in words.
# `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
plt.figure(figsize=(15,10))
sns.kdeplot(x=df.mar_word_count, fill=True, color='green', label='Real')


In [None]:
max(df.mar_word_count)


In [None]:
# Distribution of English sentence lengths, measured in characters.
# sns.distplot is deprecated; histplot with a KDE overlay on a density axis
# is the replacement seaborn recommends for it.
plt.figure(figsize=(10,8))
sns.histplot(x=df.en_char_count, kde=True, stat='density')


In [None]:
# Distribution of target-side sentence lengths, measured in characters.
# sns.distplot is deprecated; histplot with a KDE overlay on a density axis
# is the replacement seaborn recommends for it.
plt.figure(figsize=(10,8))
sns.histplot(x=df.mar_char_count, kde=True, stat='density')


In [None]:
def plot_word_cloud(data, figsize=(12, 12)):
    """Draw a word cloud built from every token in ``data``.

    Parameters
    ----------
    data : iterable
        Sentences. Each item is converted with ``str`` and lower-cased before
        being split on whitespace.
    figsize : tuple, optional
        Matplotlib figure size in inches. The cloud image is square (800x800),
        so the default is square too; the previous hard-coded ``(1, 12)``
        produced a figure only one inch wide.

    Returns
    -------
    None
        The cloud is drawn on a new matplotlib figure with the axes hidden.
    """
    # A single join avoids rebuilding the corpus string once per sentence.
    corpus = " ".join(
        token
        for sent in data
        for token in str(sent).lower().split()
    )
    plt.figure(figsize=figsize)
    cloud = WordCloud(width=800, height=800, background_color='aqua').generate(corpus)
    plt.imshow(cloud)
    plt.axis('off')


In [None]:
plot_word_cloud(df.English)


In [None]:
df.head()


In [None]:
# Persist the cleaned frame without the row index.
df.to_csv("cleaned.csv", index=False)


In [None]:
import pandas as pd
import numpy as np
import string

import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf

from sklearn.model_selection import train_test_split
import re
import os


In [None]:
df = pd.read_csv("cleaned.csv")

In [None]:
df.tail()


In [None]:
# Wrap every target sentence in the 'sos' / 'eos' marker words.
df['Hindi'] = df['Hindi'].map(lambda sentence: 'sos ' + sentence + ' eos')


In [None]:
df.head()


In [None]:
eng_texts = df.English.to_list()
mar_texts = df.Hindi.to_list()


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer


In [None]:
def tokenize_sent(text):
  '''
  Fit a new Tokenizer on a list of texts.

  Returns a (tokenizer, sequences) pair: the fitted tokenizer and the
  result of encoding the same texts with it.
  '''
  fitted = Tokenizer()
  fitted.fit_on_texts(text)
  encoded = fitted.texts_to_sequences(text)
  return fitted, encoded


In [None]:
eng_tokenizer, eng_encoded= tokenize_sent(text= eng_texts)
mar_tokenizer, mar_encoded= tokenize_sent(text= mar_texts)


In [None]:
eng_encoded[100:105]


In [None]:
# Lookup tables for the English vocabulary, named like their mar_* counterparts.
eng_index_word = eng_tokenizer.index_word
eng_word_index = eng_tokenizer.word_index
# The original misspelled name is kept as an alias so existing references still resolve.
eng_word_indec = eng_word_index


In [None]:
ENG_VOCAB_SIZE = len(eng_tokenizer.word_counts)+1
ENG_VOCAB_SIZE


In [None]:
mar_encoded[3000:3005]


In [None]:
mar_index_word = mar_tokenizer.index_word
mar_word_index= mar_tokenizer.word_index


In [None]:
MAR_VOCAB_SIZE=len(mar_tokenizer.word_counts)+1
MAR_VOCAB_SIZE


In [None]:
# Length of the longest encoded sentence on each side; used as the padding length.
# The target-side loop used to test len(eng_encoded[i]) while storing
# len(mar_encoded[i]), so max_mar_len was not the true maximum.
# default=0 keeps the previous result for an empty corpus.
max_eng_len = max((len(seq) for seq in eng_encoded), default=0)
max_mar_len = max((len(seq) for seq in mar_encoded), default=0)


In [None]:
print(max_eng_len)
max_mar_len


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
eng_padded = pad_sequences(eng_encoded, maxlen=max_eng_len, padding='post')
mar_padded = pad_sequences(mar_encoded, maxlen=max_mar_len, padding='post')


In [None]:
eng_padded


In [None]:
mar_padded.shape


In [None]:
eng_padded= np.array(eng_padded)
mar_padded= np.array(mar_padded)


In [None]:
from sklearn.model_selection import train_test_split


In [None]:
X_train, X_test, y_train, y_test = train_test_split(eng_padded, mar_padded, test_size=0.1, random_state=0)


In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape


In [None]:
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Concatenate, Dropout
from tensorflow.keras import Input, Model


In [None]:
from BahdanauAttention import AttentionLayer


In [None]:
from tensorflow.keras.layers import Layer

class AttentionLayer(Layer):
    """Dot-product attention of decoder steps over encoder steps.

    This definition replaces the ``AttentionLayer`` name imported from
    ``BahdanauAttention`` in the preceding cell. The layer has no trainable
    weights of its own.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs):
        """Return ``(context, weights)`` for ``inputs = [encoder_output, decoder_output]``.

        Assumes both tensors are (batch, steps, units) with equal ``units`` —
        TODO confirm against callers.
        """
        enc_seq, dec_seq = inputs
        # Score each decoder step against each encoder step, then normalise
        # over the last axis (the encoder steps).
        weights = tf.nn.softmax(
            tf.matmul(dec_seq, enc_seq, transpose_b=True),
            axis=-1,
        )
        # Weighted sum of the encoder outputs for each decoder step.
        context = tf.matmul(weights, enc_seq)
        return context, weights


In [None]:
# ---- Encoder ----
# Source token ids, padded to max_eng_len, embedded into 1024-dim vectors.
# NOTE(review): the Embedding is created without mask_zero, so padded positions
# are presumably treated like ordinary tokens downstream — confirm this is intended.
encoder_inputs = Input(shape=(max_eng_len,))
enc_emb = Embedding(ENG_VOCAB_SIZE, 1024)(encoder_inputs)

# Bidirectional LSTM (256 units per direction) returning the full output
# sequence plus the final h/c state of each direction.
enc_lstm1 = Bidirectional(LSTM(256,return_sequences=True,return_state=True))
encoder_outputs1, forw_state_h, forw_state_c, back_state_h, back_state_c = enc_lstm1(enc_emb)

# Join forward and backward states (256 + 256) so they match the 512-unit decoder LSTM.
final_enc_h = Concatenate()([forw_state_h,back_state_h])
final_enc_c = Concatenate()([forw_state_c,back_state_c])

encoder_states =[final_enc_h, final_enc_c]

# ---- Decoder ----
# Variable-length target token ids; the embedding layer object is kept in a
# variable because the inference decoder reuses it.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(MAR_VOCAB_SIZE, 1024)
dec_emb = dec_emb_layer(decoder_inputs)
# Decoder LSTM initialised with the concatenated encoder states.
decoder_lstm = LSTM(512, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

# Attention of the decoder outputs over the encoder output sequence.
attention_layer = AttentionLayer()
attention_result, attention_weights = attention_layer([encoder_outputs1, decoder_outputs])

# Concatenate the attention output with the decoder LSTM output on the feature axis.
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attention_result])

# Softmax over the target vocabulary at every decoder step.
decoder_dense = Dense(MAR_VOCAB_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)


# Training model: (source ids, target ids) -> per-step vocabulary distribution.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


In [None]:
model.summary()


In [None]:
from tensorflow.keras.utils import plot_model
plot_model(model,show_shapes=True)


In [None]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Keep only the best weights seen so far. Without save_best_only=True the
# `monitor` argument is ignored and the file is overwritten after every epoch.
checkpoint = ModelCheckpoint("checkpoint.weights.h5", monitor='val_accuracy',
                             save_weights_only=True, save_best_only=True)

# Stop once validation accuracy has not improved for 5 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5)

callbacks_list = [checkpoint, early_stopping]


In [None]:
# Teacher forcing: the decoder is fed the target without its last position and
# is trained to predict the target without its first position (a one-step shift).
encoder_input_data, encoder_input_test = X_train, X_test

# Training split
decoder_input_data, decoder_target_data = y_train[:, :-1], y_train[:, 1:]

# Testing split
decoder_input_test, decoder_target_test = y_test[:, :-1], y_test[:, 1:]


In [None]:
  EPOCHS= 50 #@param {type:'slider',min:10,max:100, step:10 }


In [None]:

# Fit the training model; the test split is also used as validation data.
history = model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    validation_data=([encoder_input_test, decoder_input_test], decoder_target_test),
    batch_size=128,
    epochs=EPOCHS,
    callbacks=callbacks_list,
)

In [None]:
model.save_weights("model.weights.h5")

In [None]:
model.load_weights("model.weights.h5")

In [None]:
# ---- Inference encoder ----
# Maps source ids to the encoder output sequence plus the concatenated final states.
encoder_model = Model(encoder_inputs, outputs = [encoder_outputs1, final_enc_h, final_enc_c])

# ---- Inference decoder ----
# The previous decoder states and the encoder output sequence are fed in
# explicitly so decoding can run one step at a time.
decoder_state_h = Input(shape=(512,))
decoder_state_c = Input(shape=(512,))
# The time dimension follows the encoder input length (max_eng_len) instead of
# a hard-coded 36, so it always matches what encoder_model produces.
decoder_hidden_state_input = Input(shape=(max_eng_len, 512))

dec_states = [decoder_state_h, decoder_state_c]

# Reuse the trained embedding, LSTM, attention and dense layers.
dec_emb2 = dec_emb_layer(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=dec_states)

# Attention of the current decoder output over the supplied encoder outputs.
attention_result_inf, attention_weights_inf = attention_layer([decoder_hidden_state_input, decoder_outputs2])

decoder_concat_input_inf = Concatenate(axis=-1, name='concat_layer')([decoder_outputs2, attention_result_inf])

dec_states2 = [state_h2, state_c2]

decoder_outputs2 = decoder_dense(decoder_concat_input_inf)

# Inputs: [target ids, encoder outputs, state h, state c]
# Outputs: [vocabulary distribution, new state h, new state c]
decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input, decoder_state_h, decoder_state_c],
    [decoder_outputs2] + dec_states2)

In [None]:
def get_predicted_sentence(input_seq, max_len=36):
    """Greedily decode a translation for one encoded source sentence.

    Parameters
    ----------
    input_seq : array-like
        A batch containing a single padded source sequence, passed straight
        to ``encoder_model.predict``.
    max_len : int, optional
        Maximum number of words to generate (default 36, the limit that was
        previously hard-coded).

    Returns
    -------
    str
        The decoded words, each preceded by a single space (so a non-empty
        result starts with a space). The 'sos' and 'eos' markers are not
        included.

    Notes
    -----
    Decoding stops at the first of: the padding index 0 being predicted, the
    'eos' token being predicted, or ``max_len`` words having been produced.
    The stop token used to be compared against 'end', which never matched, so
    'eos' was appended to the returned sentence.
    """
    # Encode the source once; the states are then updated by the decoder.
    enc_output, state_h, state_c = encoder_model.predict(input_seq)

    # The first decoder input is the start marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = mar_word_index['sos']

    decoded_words = []
    while len(decoded_words) < max_len:
        output_tokens, state_h, state_c = decoder_model.predict(
            [target_seq] + [enc_output, state_h, state_c])

        # Greedy choice: most probable token at the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        if sampled_token_index == 0:
            # Index 0 is the padding id and has no word attached.
            break

        sampled_word = mar_index_word[sampled_token_index]
        if sampled_word == 'eos':
            break
        decoded_words.append(sampled_word)

        # Feed the chosen token back in as the next decoder input.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

    return ''.join(' ' + word for word in decoded_words)


In [None]:
def get_hindi_sentence(input_sequence):
    """Decode target-side token ids into text via ``mar_index_word``.

    Ids equal to 0 are skipped. Every word is followed by one space, so a
    non-empty result ends with a trailing space.
    """
    return ''.join(
        mar_index_word[token_id] + ' '
        for token_id in input_sequence
        if token_id != 0
    )

def get_english_sentence(input_sequence):
    """Decode source-side token ids into text via ``eng_index_word``.

    Ids equal to 0 are skipped. Every word is followed by one space, so a
    non-empty result ends with a trailing space.
    """
    return ''.join(
        eng_index_word[token_id] + ' '
        for token_id in input_sequence
        if token_id != 0
    )


In [None]:
len(X_test)


In [None]:
# Translate 15 randomly chosen test sentences (indices drawn from 10..49).
for i in np.random.randint(10, 50, size=15):
    # Source sentence and reference translation; [4:-4] strips the leading
    # 'sos ' and the trailing 'eos ' from the decoded reference.
    print("English Sentence:", get_english_sentence(X_test[i]))
    print("Actual Hindi Sentence:", get_hindi_sentence(y_test[i])[4:-4])

    # Pad to the encoder's input length (max_eng_len) rather than a hard-coded 36,
    # so the batch always matches the shape the encoder was built with.
    padded_input = pad_sequences([X_test[i]], maxlen=max_eng_len, padding='post', truncating='post')

    print("Predicted Hindi Translation:", get_predicted_sentence(padded_input))  # Pass padded input directly
    print("\n\n\n\n")
