In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LSTM
from tensorflow.keras.models import Model
import numpy as np

In [2]:
# Read the whole tab-separated corpus into memory, one string per line.
# The `with` block guarantees the handle is closed even if reading raises.
with open('hin.txt', encoding='UTF-8') as file:
    data = file.readlines()

In [3]:
len(data)

2952

In [4]:
data[0]

'Wow!\tवाह!\tCC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #6179147 (fastrizwaan)\n'

In [5]:
data[-1]

"When I was a kid, touching bugs didn't bother me a bit. Now I can hardly stand looking at pictures of them.\tजब मैं बच्चा था, मुझे कीड़ों को छूने से कोई परेशानी नहीं होती थी, पर अब मैं उनकी तस्वीरें देखना भी बर्दाश्त नहीं कर सकता।\tCC-BY 2.0 (France) Attribution: tatoeba.org #272157 (CM) & #485964 (minshirui)\n"

In [6]:
data[0].split("\t")

['Wow!',
 'वाह!',
 'CC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #6179147 (fastrizwaan)\n']

In [7]:
# STEPS TO BE FOLLOWED

# 1. Split each line into its English and Hindi text
# 2. English - Features, Hindi - Target
# 3. Encoder - Decoder
# 4. Dict - map each character to an integer index, e.g. {'f': 9}
# 5. Build and Train Model
# 6. Test Model

In [8]:
# Parallel lists of source sentences and tab-wrapped target sentences,
# plus the set of distinct characters seen on each side.
inputs = []
targets = []
input_set = set()
target_set = set()

# Number of lines that did not have exactly three tab-separated fields.
skipped_lines = 0

for line in data:
    fields = line.split('\t')
    if len(fields) != 3:
        # Malformed line: skip it, but keep count instead of hiding it behind
        # a blanket `except BaseException`, which would also swallow
        # KeyboardInterrupt and genuine programming errors.
        skipped_lines += 1
        continue

    # Third field is the attribution text and is not used.
    feature_text, target_text, _ = fields
    # Wrap the target in tab characters (presumably start/end-of-sequence
    # markers for the decoder -- confirm against the inference code).
    target_text = '\t' + target_text + '\t'

    inputs.append(feature_text)
    targets.append(target_text)

    # set.update adds every character of the string; duplicates are ignored.
    input_set.update(feature_text)
    target_set.update(target_text)

if skipped_lines:
    print(f"Skipped {skipped_lines} malformed line(s) out of {len(data)}")

In [9]:
print(inputs[:10])

['Wow!', 'Duck!', 'Duck!', 'Help!', 'Jump.', 'Jump.', 'Jump.', 'Hello!', 'Hello!', 'Cheers!']


In [10]:
len(inputs)

2952

In [11]:
print(targets[:10])

['\tवाह!\t', '\tझुको!\t', '\tबतख़!\t', '\tबचाओ!\t', '\tउछलो.\t', '\tकूदो.\t', '\tछलांग.\t', '\tनमस्ते।\t', '\tनमस्कार।\t', '\tवाह-वाह!\t']


In [12]:
len(targets)

2952

In [13]:
print(input_set)

{'B', 'N', 'M', 'y', 'q', ',', '2', 'k', '6', '8', ':', 'p', 'D', 'm', '-', '4', 'R', "'", '7', 'F', 'Y', 'a', 'x', '?', '3', 'n', 'b', '0', 'j', '!', 's', 'K', 'e', 'W', 'A', 'Q', 'w', 'U', 'T', 'G', 'C', 'J', 'V', 'z', 'l', 'S', 'P', ' ', 'i', 'f', 'h', 'o', 'H', 'O', 'v', 'c', 'r', 't', '9', 'd', 'g', 'I', '5', '1', '$', '"', 'E', '.', 'L', 'u'}


In [14]:
print(target_set)

{'ै', '्', 'अ', '१', 'क', 'ड', 'छ', 'फ', 'ओ', 'ऊ', 'ठ', 'ु', 'ध', 'A', 'ण', 'उ', 'ह', '\t', 'य', 'न', 'ऋ', 'ब', '४', 'ए', 'ं', 'आ', '"', 'श', ':', '\u200d', 'म', 'ष', '़', '!', 'ॅ', 'च', 'भ', 'ढ', 'औ', 'ल', 'ृ', '९', '.', 'घ', 'B', 'ऐ', 'ः', 'ॉ', 'प', 'ज', '(', 'ा', 'व', 'त', '।', ' ', 'स', 'झ', '०', 'ि', '७', 'ऑ', 'इ', 'े', 'द', '-', ')', '२', '|', '८', '५', '?', 'ट', 'ँ', 'ञ', 'ग', 'ो', 'ख', 'ू', 'ी', 'ई', 'थ', 'र', '६', 'I', ',', 'ौ'}


In [15]:
# Turn each character set into a sorted list so that every character gets a
# stable position (sorted() accepts any iterable and always returns a list).
input_set = sorted(input_set)
target_set = sorted(target_set)

In [16]:
print(input_set)

[' ', '!', '"', '$', "'", ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [17]:
print(target_set)

['\t', ' ', '!', '"', '(', ')', ',', '-', '.', ':', '?', 'A', 'B', 'I', '|', 'ँ', 'ं', 'ः', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ऑ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', '़', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॅ', 'े', 'ै', 'ॉ', 'ो', 'ौ', '्', '।', '०', '१', '२', '४', '५', '६', '७', '८', '९', '\u200d']


In [18]:
# Vocabulary sizes: number of distinct source / target characters.
len_encoder, len_decoder = len(input_set), len(target_set)

In [19]:
len_encoder, len_decoder

(70, 87)

In [20]:
# Length (in characters) of the longest source and target sentence.
max_encoder_len = max(map(len, inputs))
max_decoder_len = max(map(len, targets))

In [21]:
max_decoder_len, max_encoder_len

(123, 107)

In [22]:
# Character -> integer index lookup tables, following the sorted vocabulary order.
input_dict = {char: i for i, char in enumerate(input_set)}
target_dict = {char: i for i, char in enumerate(target_set)}

In [23]:
print(input_dict)   #{char : Index }

{' ': 0, '!': 1, '"': 2, '$': 3, "'": 4, ',': 5, '-': 6, '.': 7, '0': 8, '1': 9, '2': 10, '3': 11, '4': 12, '5': 13, '6': 14, '7': 15, '8': 16, '9': 17, ':': 18, '?': 19, 'A': 20, 'B': 21, 'C': 22, 'D': 23, 'E': 24, 'F': 25, 'G': 26, 'H': 27, 'I': 28, 'J': 29, 'K': 30, 'L': 31, 'M': 32, 'N': 33, 'O': 34, 'P': 35, 'Q': 36, 'R': 37, 'S': 38, 'T': 39, 'U': 40, 'V': 41, 'W': 42, 'Y': 43, 'a': 44, 'b': 45, 'c': 46, 'd': 47, 'e': 48, 'f': 49, 'g': 50, 'h': 51, 'i': 52, 'j': 53, 'k': 54, 'l': 55, 'm': 56, 'n': 57, 'o': 58, 'p': 59, 'q': 60, 'r': 61, 's': 62, 't': 63, 'u': 64, 'v': 65, 'w': 66, 'x': 67, 'y': 68, 'z': 69}


In [24]:
print(target_dict)

{'\t': 0, ' ': 1, '!': 2, '"': 3, '(': 4, ')': 5, ',': 6, '-': 7, '.': 8, ':': 9, '?': 10, 'A': 11, 'B': 12, 'I': 13, '|': 14, 'ँ': 15, 'ं': 16, 'ः': 17, 'अ': 18, 'आ': 19, 'इ': 20, 'ई': 21, 'उ': 22, 'ऊ': 23, 'ऋ': 24, 'ए': 25, 'ऐ': 26, 'ऑ': 27, 'ओ': 28, 'औ': 29, 'क': 30, 'ख': 31, 'ग': 32, 'घ': 33, 'च': 34, 'छ': 35, 'ज': 36, 'झ': 37, 'ञ': 38, 'ट': 39, 'ठ': 40, 'ड': 41, 'ढ': 42, 'ण': 43, 'त': 44, 'थ': 45, 'द': 46, 'ध': 47, 'न': 48, 'प': 49, 'फ': 50, 'ब': 51, 'भ': 52, 'म': 53, 'य': 54, 'र': 55, 'ल': 56, 'व': 57, 'श': 58, 'ष': 59, 'स': 60, 'ह': 61, '़': 62, 'ा': 63, 'ि': 64, 'ी': 65, 'ु': 66, 'ू': 67, 'ृ': 68, 'ॅ': 69, 'े': 70, 'ै': 71, 'ॉ': 72, 'ो': 73, 'ौ': 74, '्': 75, '।': 76, '०': 77, '१': 78, '२': 79, '४': 80, '५': 81, '६': 82, '७': 83, '८': 84, '९': 85, '\u200d': 86}


In [25]:
# Zero-initialised one-hot tensors laid out as
# (number of sentences, maximum sequence length, vocabulary size).
# float32 is requested explicitly: np.zeros defaults to float64, which would
# double the memory footprint of these large, mostly-zero arrays.
# (2952, 107, 70)
encoder_input_data = np.zeros((len(inputs), max_encoder_len, len_encoder), dtype='float32')
# (2952, 123, 87)
decoder_input_data = np.zeros((len(inputs), max_decoder_len, len_decoder), dtype='float32')

# (2952, 123, 87)
decoder_target_data = np.zeros((len(targets), max_decoder_len, len_decoder), dtype='float32')

In [26]:
encoder_input_data.shape

(2952, 107, 70)

In [27]:
decoder_input_data.shape 

(2952, 123, 87)

In [28]:
decoder_target_data.shape

(2952, 123, 87)

In [29]:
# Fill the one-hot tensors sentence by sentence.
for i, (input_text, target_text) in enumerate(zip(inputs, targets)):
    # Source side: mark the index of each character at its timestep.
    for j, char in enumerate(input_text):
        encoder_input_data[i, j, input_dict[char]] = 1.0

    # Target side: the decoder input holds the full target sequence, while the
    # decoder target holds the same sequence shifted one timestep earlier
    # (position 0 of the target text is therefore never written to it).
    for j, char in enumerate(target_text):
        token_index = target_dict[char]
        decoder_input_data[i, j, token_index] = 1.0
        if j >= 1:
            decoder_target_data[i, j - 1, token_index] = 1.0

In [30]:
# Encoder: input of shape (timesteps, len_encoder); None leaves the number of
# timesteps unspecified.
encoder_input = Input(shape=(None, len_encoder))
# return_state=True makes the LSTM return its final hidden and cell states in
# addition to its output; without it only encoder_output would be returned.
encoder = LSTM(256, return_state = True)
encoder_output, state_hidden , state_cell = encoder(encoder_input)
# Bundle the two final state tensors; encoder_output is not used in this cell.
encoder_states = [state_hidden, state_cell]

In [31]:
# Decoder: input of shape (timesteps, len_decoder); None leaves the number of
# timesteps unspecified.
decoder_input = Input(shape=(None, len_decoder))
# return_sequences=True yields one output per timestep (a 3D tensor), which the
# Dense layer below needs in order to predict a character at every position.
# return_state=True also returns the final hidden/cell states (unused here).
decoder = LSTM(256, return_state=True, return_sequences=True)
# Start the decoder from the encoder's final states. Without initial_state the
# encoder is disconnected from the output and the model ignores the source
# sentence entirely.
decoder_output, _, _ = decoder(decoder_input, initial_state=encoder_states)
# Per-timestep probability distribution over the target vocabulary.
decoder_output_layer = Dense(len_decoder, activation='softmax')
decoder_outputs = decoder_output_layer(decoder_output)

In [36]:
# Training model: takes the encoder and decoder inputs and produces the
# decoder's per-timestep softmax output over len_decoder target characters.
model = Model([encoder_input, decoder_input], decoder_outputs)

In [37]:
# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [38]:
# Train on (source sequence, target sequence) -> target sequence shifted one
# timestep earlier.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=32,
    epochs=5,
)

Epoch 1/5

KeyboardInterrupt: 