In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle

In [2]:
def load_data(path):
    with open(path) as fil:
        data = pd.read_csv(fil,sep='\t',header=None,names=["hi","en",""],skip_blank_lines=True,index_col=None)
    data = data[data['hi'].notna()]
    data = data[data['en'].notna()]
    data = data[['hi','en']]
    return data

In [3]:
train = load_data("hi.translit.sampled.train.tsv")
dev = load_data("hi.translit.sampled.dev.tsv")
test = load_data("hi.translit.sampled.test.tsv")

In [4]:
x = train['en'].values
y = train['hi'].values
y = '\t'+y+'\n'

In [5]:
english_tokens = set()
hindi_tokens = set()

for xx,yy in zip(x,y):
    for ch in xx:
        english_tokens.add(ch)
    for ch in yy:
        hindi_tokens.add(ch)
    
english_tokens = sorted(list(english_tokens))
hindi_tokens = sorted(list(hindi_tokens))

In [6]:
eng_token_map = dict([(ch,i+1) for i,ch in enumerate(english_tokens)])
hin_token_map = dict([(ch,i+1) for i,ch in enumerate(hindi_tokens)])

In [7]:
hin_token_map[" "] = 0
eng_token_map[" "] = 0

In [8]:
max_eng_len = max([len(i) for i in x])
max_hin_len = max([len(i) for i in y])

In [9]:
def process(data):
    x,y = data['en'].values, data['hi'].values
    y = "\t" + y + "\n"
    
    a = np.zeros((len(x),max_eng_len),dtype="float32")
    b = np.zeros((len(y),max_hin_len),dtype="float32")
    c = np.zeros((len(y),max_hin_len,len(hindi_tokens)+1),dtype="int")
    
    
    for i,(xx,yy) in enumerate(zip(x,y)):
        for j,ch in enumerate(xx):
            a[i,j] = eng_token_map[ch]

        a[i,j+1:] = eng_token_map[" "]
        for j,ch in enumerate(yy):
            b[i,j] = hin_token_map[ch]

            if j>0:
                c[i,j-1,hin_token_map[ch]] = 1

        b[i,j+1:] = hin_token_map[" "]
        c[i,j:,hin_token_map[" "]] = 1
        
    return a,b,c

In [10]:
testx,testxx,testy = process(test)

In [11]:
np.random.seed(42)

In [12]:
reverse_eng_map = dict([(i,char) for char,i in eng_token_map.items()])
reverse_hin_map = dict([(i,char) for char,i in hin_token_map.items()])

In [14]:
import keras
import tensorflow as tf
from keras.layers import SimpleRNN,LSTM,GRU,Embedding,Dense,Dropout,Input
from keras.optimizers import Adam,Nadam
from keras import Model
tf.compat.v1.enable_eager_execution()

In [15]:
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

IndexError: list index out of range

In [16]:
tf.config.set_visible_devices([], 'GPU')

In [17]:
enc = keras.models.load_model('best_enc.h5')
dec = keras.models.load_model('best_dec.h5')



In [16]:
import time

In [19]:
def beam_search(inp,k,dec_layers,cell="LSTM"):
    statess = enc.predict(inp)
    print("encoder done")
    target_seq = np.zeros((inp.shape[0],1))
    target_seq[:,0] = hin_token_map["\t"]
    if cell == "LSTM":
        states = []
        for i in range(dec_layers):
            states += [statess[0],statess[1]]
    else:
        states = []
        for i in range(dec_layers):
            states += [statess]
            
    output = dec.predict([target_seq]+states)
    states = output[1:]
    
    stat1 = np.asarray(states).transpose([1,0,2])
    
    best_chars = np.argsort(output[0][:,-1,:],axis=-1)[:,-k:]
    scores = np.sort(output[0][:,-1,:],axis=-1)[:,-k:]
    sequences = [[([ch],-np.log(sc),stat1[i],0) for ch,sc in zip(best_chars[i],scores[i])] for i in range(inp.shape[0])]
    
    
    for t1 in range(max_hin_len-1):
        candidates = [[] for _ in range(inp.shape[0])]
        for j in range(k):
            target_seq[:,0] = [sequences[i][j][0][-1] for i in range(inp.shape[0])]
            states = list(np.asarray([sequences[i][j][2] for i in range(inp.shape[0])]).transpose([1,0,2]))
            output = dec.predict([target_seq]+states,batch_size=32)
            best_chars = np.argsort(output[0][:,-1,:],axis=-1)[:,-k:]
            scores = np.sort(output[0][:,-1,:],axis=-1)[:,-k:]
            
            stat1 = np.asarray(output[1:]).transpose([1,0,2])
            
            for i in range(inp.shape[0]):
                chk = 1 if (sequences[i][j][3]==1 or sequences[i][j][0][-1] == hin_token_map["\n"]) else 0
                if chk == 0:
                    candidates[i] += [(sequences[i][j][0]+[best_chars[i,rep]],
                                       sequences[i][j][1]-np.log(scores[i,rep]),
                                       stat1[i],chk)
                                      for rep in range(k)]
                else:
                    candidates[i] += [sequences[i][j]]
                    
        for i in range(inp.shape[0]):
            candidates[i] = sorted(candidates[i],key = lambda tup:tup[1]/len(tup[0]))
            sequences[i] = candidates[i][:k]
            
        print(f"decoder {t1} done")
            
        
    res = [list() for i in range(inp.shape[0])]
    for i in range(inp.shape[0]):
        for j in range(k):
            res[i].append(sequences[i][j][0])
        
    return res

In [20]:
start = time.time()
pred1 = beam_search(testx,5,3,cell="LSTM")
print(time.time()-start)

encoder done
decoder 0 done
decoder 1 done
decoder 2 done
decoder 3 done
decoder 4 done
decoder 5 done
decoder 6 done
decoder 7 done
decoder 8 done
decoder 9 done
decoder 10 done
decoder 11 done
decoder 12 done
decoder 13 done
decoder 14 done
decoder 15 done
decoder 16 done
decoder 17 done
decoder 18 done
decoder 19 done
38.40921425819397


In [21]:
def inference(inp,dec_layers,cell="LSTM"):
    statess = enc.predict(inp)
    target_seq = np.zeros((inp.shape[0],1))
    target_seq[:,0] = hin_token_map["\t"]
    
    states = []
    
    if cell == "LSTM":
        for c in range(dec_layers):
            states += [statess[0],statess[1]]
            
    else:
        for c in range(dec_layers):
            states += [statess]
            
    ans = np.zeros((inp.shape[0],max_hin_len))
    
    for i in range(max_hin_len):
        output = dec.predict([target_seq]+states,batch_size=64)
        ans[:,i] = np.argmax(output[0][:,-1,:],axis=1)
        target_seq[:,0] = ans[:,i]
        states = output[1:]
        
    return ans

In [22]:
pred = inference(testx,3,cell="LSTM")

In [25]:
for i in range(5):
    idx = np.random.choice(testx.shape[0])
    orig = ""
    for ch in testx[idx]:
        orig += reverse_eng_map[ch]
        if reverse_eng_map[ch] == "\n":
            break
    print("English word:",orig)
        
    deco = ""
    for ch in pred[idx]:
        deco += reverse_hin_map[ch]
        if reverse_hin_map[ch] == "\n":
            break
            
    print("Decoded word:", deco)
    
    for pr in pred1[idx]:
        deco1 = ""
        for ch in pr:
            deco1 += reverse_hin_map[ch]
            if reverse_hin_map[ch] == "\n":
                break
        
        print("Decoded beam word:", deco1)
            
    hind = ""
    for ch in testxx[idx]:
        hind += reverse_hin_map[ch]
        if reverse_hin_map[ch] == "\n":
            break
        
    print("Hindi original:",hind[1:])
    print("=========")

English word: purjon              
Decoded word: पुरजों

Decoded beam word: पुरजों

Decoded beam word: पुर्जों

Decoded beam word: पूरजों

Decoded beam word: पूर्जों

Decoded beam word: पर्जों

Hindi original: पुर्जों

English word: peratrupers         
Decoded word: पैरार्ड्रपर्स

Decoded beam word: पैरार्टपर्स

Decoded beam word: पैरारफर्स

Decoded beam word: पैराटरप्र्स

Decoded beam word: पैरार्ड्रपर्स

Decoded beam word: पैरार्डर्प्स

Hindi original: पैराट्रूपर

English word: chaupal             
Decoded word: चौपाल

Decoded beam word: चौपाल

Decoded beam word: चौपल

Decoded beam word: छौपाल

Decoded beam word: छौपल

Decoded beam word: चाउपल

Hindi original: चौपाल

English word: raak                
Decoded word: राक

Decoded beam word: राक

Decoded beam word: रक

Decoded beam word: राक़

Decoded beam word: रॉक

Decoded beam word: रैक

Hindi original: राक

English word: saleeb              
Decoded word: सलीब

Decoded beam word: सलीब

Decoded beam word: सालीब

Decoded beam word: स

In [26]:
acc = 0

for i,pr in enumerate(pred):
    fl = 1
    for j,ch in enumerate(pr):
        if ch != np.argmax(testy[i,j,:]):
            fl = 0
            break
        if ch == hin_token_map["\n"]:
            break
            
    if fl==1:
        acc+=1
        
        
print(acc/len(pred))

0.3858285206574856


In [27]:
acc = 0

for i,pre in enumerate(pred1):
    chk = 0
    
    for pr in pre:
        fl = 1
        for j,ch in enumerate(pr):
            if ch!=np.argmax(testy[i,j,:]):
                fl=0
                break
            if ch==hin_token_map["\n"]:
                break
        chk = chk or fl
        
    if chk==1:
        acc+=1
        
        
print(acc/len(pred1))

0.7145713016437139


In [55]:
ans = []
ans1 = []

for i,pre in enumerate(pred1):
    word = []
    word1 = []
    
    orig = ""
    for ch in testx[i]:
        if reverse_eng_map[ch] == " ":
            break
        orig += reverse_eng_map[ch]
    word.append(orig)
    word1.append(orig)
    
    hind = ""
    for ch in testxx[i,1:]:
        if reverse_hin_map[ch] == "\n":
            break
        hind += reverse_hin_map[ch]
    
    word.append(hind)
    word1.append(hind)
    
    for j,pr in enumerate(pre):
        deco1 = ""
        for ch in pr:
            if reverse_hin_map[ch] == "\n":
                break
            deco1 += reverse_hin_map[ch]
        word.append(deco1)
        if j==0:
            word1.append(deco1)
        
    ans.append(word)
    ans1.append(word1)

In [105]:
df = pd.DataFrame(ans,columns=['English','Hindi']+[f'Hindi_pred_{i}' for i in range(5)])
df.sample(n=5)

Unnamed: 0,English,Hindi,Hindi_pred_0,Hindi_pred_1,Hindi_pred_2,Hindi_pred_3,Hindi_pred_4
3910,shervasiyon,शहरवासियों,शेर्वासियों,शेरवासियों,क्षेरवाजियों,शर्वासियों,शेर्वायियोजन
2338,pigmentation,पिगमेंटेशन,पिक्सेनेशन,पिग्मेंटेशन,पिग्लेशनेशन,पिग्लेस्थेनियन,पिग्लेशनेटन
938,khinchakar,खिंचकर,खिंचाकार,खींचाकार,खींचकार,खींचकर,खिंचकर
3676,loge,लोगे,लोगे,लोज,लोग,लॉज,लॉग
83,adhyaynrat,अध्ययनरत,अध्ययंत्र,अध्ययंत्राव,अध्ययन्रत,अध्ययंत्राद,अध्ययंत्रा


In [98]:
df.to_csv('predictions_vanilla_beam.csv')

In [102]:
df1 = pd.DataFrame(ans1,columns=['English','Hindi','Hindi_pred'])
df1.sample(n=5)

Unnamed: 0,English,Hindi,Hindi_pred
2090,nigar,निगर,निगार
762,kaagaz,कागज़,काग़ज़
3955,shawn,शॉन,शावन
2896,bosten,बॉस्टन,बॉस्टन
859,colan,कोलन,कोलन


In [100]:
df1.to_csv('predictions_vanilla.csv')