In [1]:
import string
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Embedding, RepeatVector
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras import optimizers
%matplotlib inline


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv('Dataset_English_Hindi.csv')

In [3]:
df.head()

Unnamed: 0,English,Hindi
0,Help!,बचाओ!
1,Jump.,उछलो.
2,Jump.,कूदो.
3,Jump.,छलांग.
4,Hello!,नमस्ते।


In [4]:
df['English'] = df['English'].astype(str)
df['Hindi'] = df['Hindi'].astype(str)
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

df['English'] = df['English'].apply(remove_punctuation)
df['Hindi'] = df['Hindi'].apply(remove_punctuation)

In [5]:
def count_words(sentence):
    words = sentence.split()
    return len(words)

sen = "Why do they not set deadlines for at least cases of vital public interest like the Uphaar cinema case in which more than 50 innocent citizens lost their lives because of the criminal negligence of municipal officials ?"
count_words(sen)

max_words = 10

df = df[(df['English'].apply(count_words) <= max_words) & (df['Hindi'].apply(count_words) <= max_words)]
df = df.sample(50000, random_state=42)

In [6]:
df.sample(5)

Unnamed: 0,English,Hindi
100307,at this point in human history,य वकई एक खतरनक भरम ह
116384,Proteins form nearly onesixth of the whole egg,अणड क लगभग छठ भग परटन ह हत ह
66684,Thereafter even that avenue was closed,उसक बद वह भ दरवज बद ह गय
25022,what exactly autism is,क औटसम कय ह
109207,Shrikhand,शरखड


In [7]:
df['English'] = df['English'].str.lower()
df

Unnamed: 0,English,Hindi
96199,2 the country gets political stability,2 दश क रजनतक सथईतव मलत ह
93355,his sister jodhabai was akbars queen,इनक बहन जधबई अकबर क पटरन थ
98405,there are some good points to be made here,इन म कछ अचछ तरक भ ह
63092,intercourse,सतभन
66879,a single brilliant idea,कछ अचछ यजनओ क
...,...,...
28350,and were going to be able to invent brainbased...,और हम दमग प आधरत हसतकषप क आवषकर कर सकग
77739,pavaktha news think portal,परवकत समचरवचर परटल
43337,come to empathize with you,आपस सहनभत दखयग
103367,i had an inkling,मझ हलक स अदश थ


In [8]:
tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(df['English'])
eng_len = len(tokenizer_eng.word_index)+1
tokenizer_hin = Tokenizer()
tokenizer_hin.fit_on_texts(df['Hindi'])
hin_len = len(tokenizer_hin.word_index)+1
eng_len


23362

In [9]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2, random_state=42)
train = train.reset_index(0).drop(['index'], axis=1)
test = test.reset_index(0).drop(['index'], axis=1)
train


Unnamed: 0,English,Hindi
0,for anyone to learn from their mistakes,कस क लए अपन गलतय स सखन
1,53 of agras population is male 47 are females,आगर क जनसखय क ५३ परष और ४७ महलए ह
2,bringing them to the real world,और वसतवक दनय म एक सथ लकर आय
3,art is not supposed to change the world,कल क कम दनय क बदलन नह ह
4,daily awakening,दनक जगरण यनकडत
...,...,...
39995,heres kallikuppam,यह Kallikuppam ह
39996,and youll see on the vertical,और हम ऊरधवधर पर दखग
39997,theyve begun to walk,उनहन चलन शर कय ह
39998,depending on the places i would go,हम जह ज रह ह उसक हसब स


In [10]:
def encode_sequence(tokenizer, len, lines):
    seq = tokenizer.texts_to_sequences(lines)
    seq = pad_sequences(seq, maxlen=len,padding='post')
    return seq


X_train = encode_sequence(tokenizer_eng,30,train['English'])
y_train = encode_sequence(tokenizer_hin,30,train['Hindi'])
X_test = encode_sequence(tokenizer_eng,30,test['English'])
y_test = encode_sequence(tokenizer_hin,30,test['Hindi'])


In [11]:
X_train

array([[  16,  943,    4, ...,    0,    0,    0],
       [2718,    2, 5840, ...,    0,    0,    0],
       [1619,   54,    4, ...,    0,    0,    0],
       ...,
       [1500, 3369,    4, ...,    0,    0,    0],
       [3200,   17,    1, ...,    0,    0,    0],
       [5032,  713,    1, ...,    0,    0,    0]])

In [12]:
def seq2seq(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
    model = Sequential()
    model.add(Embedding(in_vocab, units, mask_zero= True))
    model.add(LSTM(units,return_sequences=False))
    model.add(RepeatVector(out_timesteps))
    model.add(LSTM(units,return_sequences=True))
    model.add(Dense(out_vocab,activation='softmax'))
    return model

In [13]:
model = seq2seq(eng_len, hin_len, 8, 8, 512)
rms = optimizers.RMSprop(learning_rate= 0.001)
model.compile(optimizer = rms, loss = 'sparse_categorical_crossentropy')

In [14]:
y_train

array([[ 39,   1,  25, ...,   0,   0,   0],
       [559,   1, 657, ...,   0,   0,   0],
       [  4, 601, 117, ...,   0,   0,   0],
       ...,
       [ 85, 441, 113, ...,   0,   0,   0],
       [ 14, 152,  17, ...,   0,   0,   0],
       [  1,  59,   3, ...,   0,   0,   0]])

In [15]:
model.fit(X_train,y_train,epoches = 10, batch_size= 256,validation_split = 0.2)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\abhis\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\abhis\AppData\Local\Temp\ipykernel_12560\2689835177.py", line 1, in <module>
    model.fit(X_train,y_train,epoches = 10, batch_size= 256,validation_split = 0.2)
  File "C:\Users\abhis\AppData\Roaming\Python\Python312\site-packages\keras\src\utils\traceback_utils.py", line 122, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "C:\Users\abhis\AppData\Roaming\Python\Python312\site-packages\keras\src\utils\traceback_utils.py", line 119, in error_handler
    filtered_tb = _process_traceback_frames(e.__traceback__)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: TensorFlowTrainer.fit() got an unexpected keyword argument 'epoches'

During handling of the above exception, another exception occurred:

Traceback (most