In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle

In [2]:
def load_data(path):
    """Load a tab-separated transliteration lexicon into a two-column DataFrame.

    The file is read without a header row; its three columns are named
    ``hi``, ``en`` and ``""`` and only ``hi`` and ``en`` are returned.

    Parameters
    ----------
    path : str or path-like
        Location of the ``.tsv`` file.

    Returns
    -------
    pandas.DataFrame
        Columns ``['hi', 'en']``; rows where either field is empty are dropped.
        The original row index is kept (it is not reset).
    """
    # Decode explicitly as UTF-8: the platform default encoding is not
    # guaranteed to handle the non-ASCII text in the 'hi' column.
    with open(path, encoding="utf-8") as fil:
        data = pd.read_csv(
            fil,
            sep='\t',
            header=None,
            names=["hi", "en", ""],
            skip_blank_lines=True,
            index_col=None,
            # Only a truly empty field counts as missing. With pandas'
            # default NA list, legitimate words spelled "nan", "null",
            # "NA", ... would be parsed as NaN and silently discarded below.
            keep_default_na=False,
            na_values=[""],
        )
    data = data.dropna(subset=['hi', 'en'])
    return data[['hi', 'en']]

In [3]:
# Read the three dataset splits in one go; each is a DataFrame with 'hi'/'en' columns.
train, dev, test = (
    load_data("hi.translit.sampled.train.tsv"),
    load_data("hi.translit.sampled.dev.tsv"),
    load_data("hi.translit.sampled.test.tsv"),
)

In [4]:
# Preview the first rows of the cleaned training pairs.
train.head()

Unnamed: 0,hi,en
0,अं,an
1,अंकगणित,ankganit
2,अंकल,uncle
3,अंकुर,ankur
4,अंकुरण,ankuran


In [5]:
# Source side: romanised strings, used as-is.
x = train['en'].values
# Target side: every string is wrapped as "\t" + word + "\n"; the concatenation
# is applied element-wise over the array of strings.
y = '\t' + train['hi'].values + '\n'

In [6]:
# Character vocabularies of the training split, as sorted lists of unique
# characters (the target side includes the "\t" and "\n" wrappers added to y).
english_tokens = sorted({ch for word in x for ch in word})
hindi_tokens = sorted({ch for word in y for ch in word})

In [7]:
# Character -> integer id, numbered from 1 in sorted order (0 is left free).
eng_token_map = {ch: idx for idx, ch in enumerate(english_tokens, start=1)}
hin_token_map = {ch: idx for idx, ch in enumerate(hindi_tokens, start=1)}

In [8]:
# Map the space character to id 0; process() uses this id as the padding value.
# NOTE(review): if a literal space ever occurs in the data, its enumerated id is
# overwritten here and that id is left unused — confirm the data has no spaces.
hin_token_map[" "] = 0
eng_token_map[" "] = 0

In [9]:
# Longest source / target string in the training split; these fix the padded
# widths of the arrays built by process(). Target lengths include "\t" and "\n".
max_eng_len = max(map(len, x))
max_hin_len = max(map(len, y))

In [10]:
def process(data):
    """Encode a DataFrame of ('en', 'hi') pairs as padded arrays.

    Relies on the module-level ``eng_token_map``, ``hin_token_map``,
    ``hindi_tokens``, ``max_eng_len`` and ``max_hin_len``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide string columns ``'en'`` and ``'hi'``.

    Returns
    -------
    a : numpy.ndarray, float32, shape (n, max_eng_len)
        Source character ids, padded with the id of ``" "``.
    b : numpy.ndarray, float32, shape (n, max_hin_len)
        Target character ids of ``"\\t" + word + "\\n"``, padded likewise.
    c : numpy.ndarray, int, shape (n, max_hin_len, len(hindi_tokens) + 1)
        One-hot targets: ``b`` shifted left by one step, with every remaining
        position set to the padding class.

    Raises
    ------
    KeyError
        If a character is missing from the token maps.
    IndexError
        If a string is longer than ``max_eng_len`` / ``max_hin_len``.
    """
    x, y = data['en'].values, data['hi'].values
    y = "\t" + y + "\n"

    eng_pad = eng_token_map[" "]
    hin_pad = hin_token_map[" "]

    a = np.zeros((len(x), max_eng_len), dtype="float32")
    b = np.zeros((len(y), max_hin_len), dtype="float32")
    c = np.zeros((len(y), max_hin_len, len(hindi_tokens)+1), dtype="int")

    for i, (xx, yy) in enumerate(zip(x, y)):
        for j, ch in enumerate(xx):
            a[i, j] = eng_token_map[ch]
        # Slice from the string length rather than from the loop variable:
        # `j` is unbound (or stale from the previous row) when xx is empty.
        a[i, len(xx):] = eng_pad

        for j, ch in enumerate(yy):
            b[i, j] = hin_token_map[ch]
            if j > 0:
                # Target at step j-1 is the decoder input at step j.
                c[i, j-1, hin_token_map[ch]] = 1
        b[i, len(yy):] = hin_pad
        # The shifted target has len(yy) - 1 real steps; the rest is padding.
        c[i, len(yy)-1:, hin_pad] = 1

    return a, b, c

In [11]:
# Each call yields (source ids, target-input ids, one-hot shifted targets),
# all padded to the maximum lengths measured on the training split.
trainx, trainxx, trainy = process(train)
valx, valxx, valy = process(dev)
testx,testxx,testy = process(test)

In [12]:
# Seed NumPy's global random generator.
# NOTE(review): this does not seed TensorFlow/Keras — confirm whether
# tf.random.set_seed is also wanted for reproducible training runs.
np.random.seed(42)

In [13]:
# Inverse lookups (integer id -> character) for turning id sequences back into text.
reverse_eng_map = {idx: ch for ch, idx in eng_token_map.items()}
reverse_hin_map = {idx: ch for ch, idx in hin_token_map.items()}

In [14]:
import keras
import tensorflow as tf
from keras.layers import SimpleRNN,LSTM,GRU,Embedding,Dense,Dropout,Input
from keras.optimizers import Adam,Nadam
from keras import Model

In [15]:
# Enable on-demand GPU memory allocation instead of reserving it all up front.
# Looping (rather than indexing [0]) keeps the notebook runnable on CPU-only
# machines and applies the same setting to every visible GPU.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

In [16]:
def build_model(cell = "LSTM",nunits = 32, enc_layers = 1, dec_layers = 1,embed_dim = 32,dense_size=32,dropout=None):
    """Build the encoder-decoder model used for training.

    Both inputs are integer id sequences passed through embeddings with
    ``mask_zero=True``. The encoder is a stack of ``enc_layers`` recurrent
    layers; the final state(s) of its last layer initialise every one of the
    ``dec_layers`` decoder layers. The decoder output goes through a ReLU
    dense layer and a softmax over ``len(hindi_tokens) + 1`` classes.

    Layer names (``enc_embed``, ``dec_embed``, ``enc_{i}``, ``do_{i}``,
    ``dec_{i}``, ``dense1``, ``dense2``) are fixed because other code looks
    layers up by name.

    Parameters
    ----------
    cell : {"LSTM", "Simple", "GRU"}
        Recurrent layer type used in both encoder and decoder.
    nunits : int
        Hidden size of every recurrent layer.
    enc_layers, dec_layers : int
        Number of stacked encoder / decoder recurrent layers.
    embed_dim : int
        Output dimension of both embeddings.
    dense_size : int
        Width of the intermediate dense layer.
    dropout : float or None
        If not None, a Dropout layer with this rate follows each encoder
        layer except the last.

    Returns
    -------
    keras.Model
        Uncompiled model mapping ``[encoder_inputs, decoder_inputs]`` to
        per-step class probabilities.

    Raises
    ------
    ValueError
        If ``cell`` is not one of the supported names.
    """
    # Resolve the layer class first so an unsupported name fails with a clear
    # message instead of an unbound-variable error further down.
    if cell == "LSTM":
        rnn = LSTM
    elif cell == "Simple":
        rnn = SimpleRNN
    elif cell == "GRU":
        rnn = GRU
    else:
        raise ValueError(f"Unsupported cell {cell!r}; expected 'LSTM', 'Simple' or 'GRU'.")

    keras.backend.clear_session()
    encoder_inputs = Input(shape=(None,))
    encoder_embedding = Embedding(input_dim=len(english_tokens)+1,output_dim = embed_dim,mask_zero=True,name="enc_embed")
    encoder_context = encoder_embedding(encoder_inputs)
    decoder_inputs = Input(shape=(None,))
    decoder_embedding = Embedding(input_dim = len(hindi_tokens)+1,output_dim = embed_dim,mask_zero=True,name="dec_embed")
    decoder_context = decoder_embedding(decoder_inputs)

    # Encoder: all but the last layer return full sequences; the last layer
    # returns its final state(s).
    hidden = encoder_context
    for i in range(enc_layers-1):
        hidden = rnn(nunits,return_sequences=True,name=f"enc_{i}")(hidden)
        if dropout is not None:
            hidden = Dropout(dropout,name=f"do_{i}")(hidden)
    _, *encoder_states = rnn(nunits,return_state=True,name=f"enc_{enc_layers-1}")(hidden)

    # LSTM carries two state tensors (h, c); SimpleRNN and GRU carry one,
    # which is passed as a bare tensor.
    if len(encoder_states) > 1:
        initial_state = encoder_states
    else:
        initial_state = encoder_states[0]

    # Decoder: every layer starts from the encoder's final state(s).
    hidden = decoder_context
    for i in range(dec_layers):
        dec_layer = rnn(nunits,return_sequences=True,return_state=True,name=f"dec_{i}")
        hidden, *_ = dec_layer(hidden,initial_state=initial_state)

    pre_out = Dense(dense_size,activation='relu',name='dense1')(hidden)
    final_output = Dense(len(hindi_tokens)+1,activation = 'softmax',name='dense2')(pre_out)

    return Model([encoder_inputs,decoder_inputs],final_output)

In [17]:
def accuracy1(real,pred):
    """Fraction of sequences predicted entirely correctly.

    Both arguments are reduced with argmax over axis 2 to class ids.
    Positions whose true id is 0 are ignored; a sequence counts as correct
    only when every remaining position matches.

    Returns a scalar float32 tensor: the mean over axis-0 entries.
    """
    target_ids = tf.math.argmax(real,axis=2)
    predicted_ids = tf.math.argmax(pred,axis=2)

    # 1 where the true id is non-zero, 0 elsewhere.
    counted = tf.cast(tf.math.logical_not(tf.math.equal(target_ids, 0)), dtype='int32')
    # 1 where prediction and target agree.
    hits = tf.cast(tf.math.equal(target_ids,predicted_ids), dtype='int32')

    hits_per_seq = tf.reduce_sum(tf.math.multiply(hits,counted),axis=1)
    counted_per_seq = tf.reduce_sum(counted,axis=1)

    # Correct only if all counted positions were hit.
    exact = tf.cast(tf.math.equal(hits_per_seq,counted_per_seq), dtype='float32')
    return tf.reduce_mean(exact)

In [18]:
# Build and compile the training model.
# NOTE: this rebinds `train`, which previously held the training DataFrame;
# cells that expect the DataFrame must be run before this one.
train = build_model(nunits=256,
                    dense_size=512,
                    enc_layers=3,
                    dec_layers=1,
                    cell="LSTM",
                    dropout=0.2,
                    embed_dim=256)
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
train.compile(optimizer=Adam(learning_rate=1e-3),
              loss='categorical_crossentropy',
              metrics=[accuracy1])

In [19]:
# Checkpoint to 'best_model.h5' whenever the monitored validation metric
# ('val_accuracy1', higher is better) improves; only the best model is kept.
model_cb = tf.keras.callbacks.ModelCheckpoint('best_model.h5',monitor='val_accuracy1',mode='max',save_best_only=True,verbose=1)

In [20]:
# Train on [source ids, target-input ids] against the one-hot shifted targets,
# validating on the dev split each epoch and checkpointing through model_cb.
train.fit([trainx,trainxx],trainy,
             batch_size=64,
             validation_data = ([valx,valxx],valy),
             epochs=20,
             callbacks = [model_cb])

Epoch 1/20
Epoch 00001: val_accuracy1 improved from -inf to 0.07420, saving model to best_model.h5
Epoch 2/20
Epoch 00002: val_accuracy1 improved from 0.07420 to 0.27876, saving model to best_model.h5
Epoch 3/20
Epoch 00003: val_accuracy1 improved from 0.27876 to 0.31997, saving model to best_model.h5
Epoch 4/20
Epoch 00004: val_accuracy1 improved from 0.31997 to 0.35417, saving model to best_model.h5
Epoch 5/20
Epoch 00005: val_accuracy1 improved from 0.35417 to 0.38338, saving model to best_model.h5
Epoch 6/20
Epoch 00006: val_accuracy1 did not improve from 0.38338
Epoch 7/20
Epoch 00007: val_accuracy1 improved from 0.38338 to 0.38987, saving model to best_model.h5
Epoch 8/20
Epoch 00008: val_accuracy1 did not improve from 0.38987
Epoch 9/20
Epoch 00009: val_accuracy1 did not improve from 0.38987
Epoch 10/20
Epoch 00010: val_accuracy1 did not improve from 0.38987
Epoch 11/20
Epoch 00011: val_accuracy1 did not improve from 0.38987
Epoch 12/20
Epoch 00012: val_accuracy1 did not improve

<tensorflow.python.keras.callbacks.History at 0x7fe24476a460>

In [21]:
# Reload the checkpoint written by the ModelCheckpoint callback. The metric
# function is passed via custom_objects under the name 'accuracy1'.
model = keras.models.load_model('best_model.h5',custom_objects={'accuracy1':accuracy1})

In [22]:
def inference_models(model,nunits=32,enc_layers=1,dec_layers=1,cell='LSTM',dropout=None):
    """Derive separate encoder and decoder models from a trained model.

    Layers are fetched from ``model`` by name (``enc_embed``, ``dec_embed``,
    ``enc_{i}``, ``do_{i}``, ``dec_{i}``, ``dense1``, ``dense2``) and re-wired,
    so the returned models use the same layer objects as ``model``.

    Parameters
    ----------
    model : keras.Model
        Model with two inputs (encoder ids, decoder ids) and the layer names
        listed above.
    nunits : int
        Size of each recurrent state; used for the decoder state inputs.
    enc_layers, dec_layers : int
        Number of encoder / decoder recurrent layers in ``model``.
    cell : {"LSTM", "GRU", "Simple"}
        Recurrent layer type of ``model``.
    dropout : object or None
        Only compared against None. Any other value means the ``do_{i}``
        layers exist in ``model`` and are included in the encoder path.

    Returns
    -------
    (encoder_model, decoder_model)
        ``encoder_model`` maps the encoder input to the final encoder
        state(s): ``[h, c]`` for LSTM, a single tensor otherwise.
        ``decoder_model`` takes ``[decoder_inputs, *state_inputs]`` and
        returns ``[final_output, *state_outputs]``, with one group of state
        inputs/outputs per decoder layer, in layer order.

    Raises
    ------
    ValueError
        If ``cell`` is not one of the supported names.
    """
    # "Simple" has the same single-state layout as GRU; previously it was not
    # handled here at all.
    if cell not in ("LSTM", "GRU", "Simple"):
        raise ValueError(f"Unsupported cell {cell!r}; expected 'LSTM', 'GRU' or 'Simple'.")
    is_lstm = cell == "LSTM"

    encoder_inputs = model.input[0]
    encoder_context = model.get_layer('enc_embed')(encoder_inputs)
    decoder_inputs = model.input[1]
    decoder_context = model.get_layer('dec_embed')(decoder_inputs)

    # ---- encoder -----------------------------------------------------------
    hidden = encoder_context
    for i in range(enc_layers-1):
        hidden = model.get_layer(f'enc_{i}')(hidden)
        if dropout is not None:
            hidden = model.get_layer(f'do_{i}')(hidden)
    _, *final_states = model.get_layer(f'enc_{enc_layers-1}')(hidden)

    # LSTM exposes [h, c]; the single-state cells expose one bare tensor.
    encoder_states = final_states if is_lstm else final_states[0]
    encoder_model = keras.models.Model(encoder_inputs,encoder_states)

    # ---- decoder -----------------------------------------------------------
    state_inputs = []
    state_outputs = []
    hidden = decoder_context
    for i in range(dec_layers):
        dec_layer = model.get_layer(f'dec_{i}')
        if is_lstm:
            layer_inputs = [Input(shape=(nunits,),name=f'inputh{i}'),
                            Input(shape=(nunits,),name=f'inputc{i}')]
            initial_state = layer_inputs
        else:
            layer_inputs = [Input(shape=(nunits,),name=f'inputs{i}')]
            initial_state = layer_inputs[0]

        hidden, *layer_outputs = dec_layer(hidden,initial_state = initial_state)
        state_inputs += layer_inputs
        state_outputs += layer_outputs

    decoder_input_pass = [decoder_inputs] + state_inputs

    pre_out = model.get_layer('dense1')(hidden)
    final_output = model.get_layer('dense2')(pre_out)

    decoder_model = keras.models.Model(decoder_input_pass, [final_output]+state_outputs)

    return encoder_model,decoder_model

In [23]:
# Hyper-parameters repeat those given to build_model. `dropout` is only checked
# against None inside inference_models, so 'yes' simply flags that the do_{i}
# layers are present in the loaded model.
enc,dec = inference_models(model,nunits=256,enc_layers=3,dec_layers=1,cell="LSTM",dropout='yes')

In [24]:
# Write the stand-alone encoder and decoder models to disk.
enc.save('best_enc.h5')
dec.save('best_dec.h5')