In [1]:
import string, re
from unicodedata import normalize
from numpy.random import shuffle
from numpy import argmax
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu
from pickle import load, dump
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, RepeatVector, TimeDistributed
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


## Load Dataset

In [2]:
# load doc into memory
def load_doc(filename):
    file = open(filename, mode='rt', encoding='utf-8')
    text = file.read()
    file.close()
    return text

In [3]:
filename = '/content/drive/My Drive/Bangla English Translation/ben.txt'
doc = load_doc(filename)

In [4]:
doc

'Go.\tযাও।\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #5545004 (tanay)\nGo.\tযান।\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #5545005 (tanay)\nGo.\tযা।\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #5545006 (tanay)\nRun!\tপালাও!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #5548781 (tanay)\nRun!\tপালান!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #5548783 (tanay)\nWho?\tকে?\tCC-BY 2.0 (France) Attribution: tatoeba.org #2083030 (CK) & #5548787 (tanay)\nFire!\tআগুন!\tCC-BY 2.0 (France) Attribution: tatoeba.org #1829639 (Spamster) & #3232240 (tanay)\nHelp!\tবাঁচাও!\tCC-BY 2.0 (France) Attribution: tatoeba.org #435084 (lukaszpp) & #5548780 (tanay)\nHelp!\tবাঁচান!\tCC-BY 2.0 (France) Attribution: tatoeba.org #435084 (lukaszpp) & #5548782 (tanay)\nStop!\tথামুন!\tCC-BY 2.0 (France) Attribution: tatoeba.org #448320 (FeuDRenais) & #5545000 (tanay)\nStop!\tথামো!\tCC-BY 2.0 (France) Attribution: tatoeba.

## Document into Sentence

In [5]:
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in lines]
    return pairs

In [6]:
pairs = to_pairs(doc)

In [7]:
#printing five list
for i in range(5):
  print(pairs[i])

['Go.', 'যাও।', 'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #5545004 (tanay)']
['Go.', 'যান।', 'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #5545005 (tanay)']
['Go.', 'যা।', 'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #5545006 (tanay)']
['Run!', 'পালাও!', 'CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #5548781 (tanay)']
['Run!', 'পালান!', 'CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #5548783 (tanay)']


In [8]:
print('Length of the dataset is :', len(pairs))

Length of the dataset is : 4349


## Clean Dataset

In [9]:
# clean a list of lines
def clean_pairs(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    
    for pair in lines:
        clean_pair = list()
        for line in pair:
            
            # normalize unicode characters
            #line = normalize('NFD', line).encode('ascii', 'ignore')
            #line = line.decode('UTF-8')
            line = line.split()
            
            # remove punctuation from each token
            line = [re_punc.sub('', w) for w in line]

            #line = line.replace('?','')
            line = [item.replace("?", "") for item in line]
            #line = line.replace('।','')
            line = [item.replace("।", "") for item in line]
            #line = line.replace('!','')
            line = [item.replace("!", "") for item in line]
            
            
            
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return np.array(cleaned)

In [10]:
clean_pairs = clean_pairs(pairs)

In [11]:
#clean data
clean_pairs

array([['Go', 'যাও',
        'CCBY 20 France Attribution tatoebaorg 2877272 CM  5545004 tanay'],
       ['Go', 'যান',
        'CCBY 20 France Attribution tatoebaorg 2877272 CM  5545005 tanay'],
       ['Go', 'যা',
        'CCBY 20 France Attribution tatoebaorg 2877272 CM  5545006 tanay'],
       ...,
       ['Tom told Mary that he was going to kill himself but he didnt have the courage to do it',
        'টম মেরিকে বললো যে ও নিজেকে হত্যা করতে চলেছিলো কিন্ত তা করার মতো সাহস ছিলো না',
        'CCBY 20 France Attribution tatoebaorg 1027913 CK  5692504 tanay'],
       ['Toms an irritating person to work with because hell never admit it when hes made a mistake',
        'টমের সঙ্গে কাজ করা খুব বিরক্তিকর কারণ ও কখনই মেনে নেয় না যে ও ভুল করেছে',
        'CCBY 20 France Attribution tatoebaorg 1023866 CK  5692494 tanay'],
       ['I thought doing this would be easy but weve been working all day and were still not finished',
        'আমি ভেবেছিলাম এটা করা সহজ হবে কিন্তু আমরা সারাদিন ধরে কাজ করেছ

## Saving Clean data

In [12]:
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

In [13]:
save_clean_data(clean_pairs, 'english-bangla.pkl')

Saved: english-bangla.pkl


## Printing Some Pair of Word

In [14]:
for i in range(100):
    print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))
    

[Go] => [যাও]
[Go] => [যান]
[Go] => [যা]
[Run] => [পালাও]
[Run] => [পালান]
[Who] => [কে]
[Fire] => [আগুন]
[Help] => [বাঁচাও]
[Help] => [বাঁচান]
[Stop] => [থামুন]
[Stop] => [থামো]
[Stop] => [থাম]
[Hello] => [নমস্কার]
[I see] => [বুঝলাম]
[I try] => [আমি চেষ্টা করি]
[Smile] => [একটু হাসুন]
[Smile] => [একটু হাসো]
[Attack] => [আক্রমণ]
[Get up] => [ওঠো]
[Get up] => [উঠুন]
[Got it] => [বুঝে গেছি]
[Got it] => [ধরেছি]
[Got it] => [বুঝেছো]
[Got it] => [বুঝেছেন]
[Got it] => [বুঝেছিস]
[I know] => [আমি জানি]
[I know] => [আমার জানা আছে]
[I lost] => [আমি হেরে গেছি]
[Im 19] => [আমার ১৯ বছর বয়স]
[Im OK] => [আমি ঠিক আছি]
[Listen] => [শোন]
[Listen] => [শুনুন]
[No way] => [কোন মতেই না]
[Really] => [সত্যি]
[Really] => [তাই নাক]
[Thanks] => [ধন্যবাদ]
[Try it] => [চেখে দেখুন]
[Try it] => [চেখে দেখো]
[We won] => [আমরা জিতে গেছে]
[Why me] => [আমিই কেন]
[Ask Tom] => [টমকে জিজ্ঞাসা করো]
[Ask Tom] => [টমকে জিজ্ঞাসা করুন]
[Ask Tom] => [টমকে জিজ্ঞাসা কর]
[Call me] => [আমাকে ডেক]
[Call me] => [আমাকে ডাকবেন]
[Call me

## Loading Dataset

In [15]:
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

In [16]:
raw_dataset = load_clean_sentences('english-bangla.pkl')

In [17]:
#loaded data
raw_dataset


array([['Go', 'যাও',
        'CCBY 20 France Attribution tatoebaorg 2877272 CM  5545004 tanay'],
       ['Go', 'যান',
        'CCBY 20 France Attribution tatoebaorg 2877272 CM  5545005 tanay'],
       ['Go', 'যা',
        'CCBY 20 France Attribution tatoebaorg 2877272 CM  5545006 tanay'],
       ...,
       ['Tom told Mary that he was going to kill himself but he didnt have the courage to do it',
        'টম মেরিকে বললো যে ও নিজেকে হত্যা করতে চলেছিলো কিন্ত তা করার মতো সাহস ছিলো না',
        'CCBY 20 France Attribution tatoebaorg 1027913 CK  5692504 tanay'],
       ['Toms an irritating person to work with because hell never admit it when hes made a mistake',
        'টমের সঙ্গে কাজ করা খুব বিরক্তিকর কারণ ও কখনই মেনে নেয় না যে ও ভুল করেছে',
        'CCBY 20 France Attribution tatoebaorg 1023866 CK  5692494 tanay'],
       ['I thought doing this would be easy but weve been working all day and were still not finished',
        'আমি ভেবেছিলাম এটা করা সহজ হবে কিন্তু আমরা সারাদিন ধরে কাজ করেছ

## Dividing Dataset

In [18]:
#name change
dataset = raw_dataset

# random shuffle
shuffle(dataset)

# split into train/test
train, test = dataset[:4000], dataset[4000:]


## Saving Divided dataset

In [19]:
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)
    

In [20]:
# save
save_clean_data(dataset, 'english-bangla-both.pkl')
save_clean_data(train, 'english-bangla-train.pkl')
save_clean_data(test, 'english-bangla-test.pkl')


Saved: english-bangla-both.pkl
Saved: english-bangla-train.pkl
Saved: english-bangla-test.pkl


## Loading Divided dataset

In [21]:
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))
    

In [22]:
# load datasets
dataset = load_clean_sentences('english-bangla-both.pkl')
train = load_clean_sentences('english-bangla-train.pkl')
test = load_clean_sentences('english-bangla-test.pkl')


## Tokenizer

In [23]:
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
    

## Maximum sentence Length

In [24]:
# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)
    

## English and Bangla Tokenizer

In [25]:
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

# prepare Bangla tokenizer
bang_tokenizer = create_tokenizer(dataset[:, 1])
bang_vocab_size = len(bang_tokenizer.word_index) + 1
bang_length = max_length(dataset[:, 1])
print('Bangla Vocabulary Size: %d' % bang_vocab_size)
print('Bangla Max Length: %d' % (bang_length))


English Vocabulary Size: 1876
English Max Length: 19
Bangla Vocabulary Size: 2951
Bangla Max Length: 18


## Encoding and pad Sequence

In [26]:
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X
    

## One-hot-encoding

In [27]:
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = np.array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y


In [28]:
# prepare training data
trainX = encode_sequences(bang_tokenizer, bang_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)

# prepare validation data
testX = encode_sequences(bang_tokenizer, bang_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)


## Model

In [29]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    
    # compile model
    model.compile(optimizer='adam', 
                  loss='categorical_crossentropy')
    
    # summarize defined model
    model.summary()
    plot_model(model, to_file='model.png', show_shapes=True)
    return model


In [30]:
# define model
model = define_model(bang_vocab_size, eng_vocab_size, bang_length, eng_length, 256)


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 18, 256)           755456    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 19, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 19, 256)           525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 19, 1876)          482132    
Total params: 2,288,212
Trainable params: 2,288,212
Non-trainable params: 0
_________________________________________________________________


In [31]:
#checkpoint
checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', verbose=1, save_best_only=True, mode='min')


In [32]:
# fit model
model.fit(trainX, trainY, epochs=100, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 4000 samples, validate on 349 samples
Epoch 1/100
 - 27s - loss: 2.9079 - val_loss: 1.7638

Epoch 00001: val_loss improved from inf to 1.76384, saving model to model.h5
Epoch 2/100
 - 26s - loss: 1.6930 - val_loss: 1.5584

Epoch 00002: val_loss improved from 1.76384 to 1.55840, saving model to model.h5
Epoch 3/100
 - 26s - loss: 1.5593 - val_loss: 1.5171

Epoch 00003: val_loss improved from 1.55840 to 1.51709, saving model to model.h5
Epoch 4/100
 - 26s - loss: 1.4912 - val_loss: 1.4625

Epoch 00004: val_loss improved from 1.51709 to 1.46248, saving model to model.h5
Epoch 5/100
 - 26s - loss: 1.4592 - val_loss: 1.4387

Epoch 00005: val_loss improved from 1.46248 to 1.43869, saving model to model.h5
Epoch 6/100
 - 26s - loss: 1.4152 - val_loss: 1.4332

Epoch 00006: val_loss improved from 1.43869 to 1.43325, saving model to model.h5
Epoch 7/100
 - 26s - loss: 1.3912 - val_loss: 1.4166

Epoch 00007: val_loss improved from 1.43325 to 1.41660, saving model to model.h5
Epoch 8/100


<keras.callbacks.callbacks.History at 0x7f2a50bf5cc0>

## Model Evaluation

##### Loading Data

In [33]:
def load_clean_sentences(filename):
  return load(open(filename, 'rb'))
  

In [34]:
dataset = load_clean_sentences('english-bangla-both.pkl')
train = load_clean_sentences('english-bangla-train.pkl')
test = load_clean_sentences('english-bangla-test.pkl')


##### Tokenizer and Max Length

In [35]:
def create_tokenizer(lines):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  return tokenizer


In [36]:
# max sentence length
def max_length(lines):
  return max(len(line.split()) for line in lines)
  

In [37]:
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])

# prepare bangla tokenizer
bang_tokenizer = create_tokenizer(dataset[:, 1])
bang_vocab_size = len(bang_tokenizer.word_index) + 1
bang_length = max_length(dataset[:, 1])


##### Sequence Encoding

In [38]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
  # integer encode sequences
  X = tokenizer.texts_to_sequences(lines)
  # pad sequences with 0 values
  X = pad_sequences(X, maxlen=length, padding= 'post')
  return X
  

In [39]:
# prepare data
trainX = encode_sequences(bang_tokenizer, bang_length, train[:, 1])
testX = encode_sequences(bang_tokenizer, bang_length, test[:, 1])


##### Generate Sequence

In [40]:
# map an integer to a word
def word_for_id(integer, tokenizer):
  for word, index in tokenizer.word_index.items():
    if index==integer:
      return word
  return None


# generate target given source sequence
def predict_sequence(model, tokenizer, source):
  prediction = model.predict(source, verbose=0)[0]
  integers = [argmax(vector) for vector in prediction]
  target = list()
  for i in integers:
    word = word_for_id(i, tokenizer)
    if word is None:
      break
    target.append(word)
  return ' '.join(target)


# evaluate the skill of the model
def evaluate_model(model, sources, raw_dataset):
  actual, predicted = list(), list()
  for i, source in enumerate(sources):
    # translate encoded source text
    source = source.reshape((1, source.shape[0]))
    translation = predict_sequence(model, eng_tokenizer, source)
    raw_target, raw_src = raw_dataset[i,:2]
    if i < 10:
      print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
    actual.append(raw_target.split())
    predicted.append(translation.split())

  # calculate BLEU score
  print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
  print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0))) 
  print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
  print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
  

In [41]:
# load model
model = load_model('model.h5')

# test on some training sequences
print('train')
evaluate_model(model, trainX, train)

# test on some test sequences
print('test')
evaluate_model(model, testX, test)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


train
src=[ও হাঁটে], target=[She walks], predicted=[she walks]
src=[অারো অাস্তে হাঁটো], target=[Walk more slowly], predicted=[walk more slowly]
src=[আপনি পিয়ানো বাজাতে পারেন], target=[Can you play the piano], predicted=[can you the the piano]
src=[বাইরে গিয়ে অপেক্ষা করো], target=[Go wait outside], predicted=[go wait outside]
src=[আমি এটা সামলে নিতে পারি], target=[I can manage it], predicted=[i can manage it]
src=[এখানে দাঁড়াও], target=[Stay here], predicted=[stay here]
src=[ওখানে দেখা হবে], target=[See you there], predicted=[see you there]
src=[তিনি ফরাসিতে কথা বলতে পারেন], target=[She can speak French], predicted=[she can speak french]
src=[টমকে মার], target=[Hit Tom], predicted=[hit tom]
src=[সেটা কোন তলায়], target=[Which floor is it on], predicted=[what floor is it on]


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.041064
BLEU-2: 0.202643
BLEU-3: 0.383742
BLEU-4: 0.450159
test
src=[কে দৌড়ালেন], target=[Who ran], predicted=[who was]
src=[আপনি ওখানে বসে আছেন কেন], target=[Why are you sitting there], predicted=[why are you sitting there]
src=[ও নাচতে চায়], target=[She wants to dance], predicted=[he cant dance dance]
src=[সময় শেষ], target=[Time is up], predicted=[whats is]
src=[আমি নিশ্চিত আমার বাবামা আমাকে একলা যেতে দেবে না], target=[Im sure my parents wont let me go by myself], predicted=[i dont want to i but i the of of]
src=[টম চেঁচাতে আরম্ভ করলো], target=[Tom started to scream], predicted=[tom began to shout]
src=[বাজারটা ডাক্তারখানার পাশে], target=[The market is next to the pharmacy], predicted=[is is a doctor]
src=[অনুগ্রহ করে আপনার পাসপোর্টটি দেখাবেন], target=[Show me your passport please], predicted=[you you show your your passport please]
src=[আমি যখন গিলতে যাই তখন আমার গলা ব্যাথা করে], target=[My throat hurts when I swallow], predicted=[i thought i when when when when i was the b

## How to get more accurate translated English Language

##### Note: How to increasse more accuracy/ reduce loss

1) Take more and more data for training

2) Hyperparameter tuning

3) Add more Layer and check different parameter

