# Download data

In [1]:
!wget https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
!tar -xvf  'dakshina_dataset_v1.0.tar'

--2021-05-17 11:57:42--  https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.142.128, 74.125.195.128, 142.250.107.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.142.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2008340480 (1.9G) [application/x-tar]
Saving to: ‘dakshina_dataset_v1.0.tar’


2021-05-17 11:57:53 (163 MB/s) - ‘dakshina_dataset_v1.0.tar’ saved [2008340480/2008340480]

dakshina_dataset_v1.0/bn/
dakshina_dataset_v1.0/bn/lexicons/
dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.test.tsv
dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.train.tsv
dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.dev.tsv
dakshina_dataset_v1.0/bn/native_script_wikipedia/
dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.valid.text.shuf.txt.gz
dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-full.info.sorted.tsv.

# Load and preprocess data

In [3]:
%pip install wandb -q
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
import wandb
from wandb.keras import WandbCallback

In [13]:
# Shared preprocessing state. The vocabulary sizes are fixed here; everything
# set to None is filled in by load_and_preprocess() and read by later cells.

# Vocabulary sizes handed to the tokenizers, the embeddings and the output layer.
num_encoder_tokens, num_decoder_tokens = 30, 70

# char -> integer id mappings produced by the fitted tokenizers.
input_token_index = target_token_index = None

# Longest input / target string seen in the training file (padding length).
MAX_LEN_input = MAX_LEN_target = None

# Fitted tokenizers, reused to encode the validation data.
input_tokenizer = target_tokenizer = None

In [5]:
def tokenize(data,vocab_size):
  """Fit a character-level tokenizer on `data` and encode `data` with it.

  Args:
    data: list of strings to fit on and to encode.
    vocab_size: forwarded to the Keras Tokenizer as `num_words`.

  Returns:
    (sequences, word_index, tokenizer): the integer-encoded texts, the
    tokenizer's char -> id dictionary, and the fitted tokenizer itself.
  """
  char_tokenizer = Tokenizer(num_words=vocab_size, char_level=True)
  char_tokenizer.fit_on_texts(data)
  sequences = char_tokenizer.texts_to_sequences(data)
  return sequences, char_tokenizer.word_index, char_tokenizer

In [17]:
def load_and_preprocess(data_path = 'dakshina_dataset_v1.0/gu/lexicons/gu.translit.sampled.train.tsv'):
  """Read the training lexicon, fit the tokenizers and build the training arrays.

  Each non-empty line of the TSV is split on tabs; column 1 is used as the
  input text and column 0 as the target text. Targets are wrapped as
  "\\t" + text + "\\n" (start / end markers) and inputs get a trailing "\\n".

  Side effects: sets the module-level `input_token_index`,
  `target_token_index`, `MAX_LEN_input`, `MAX_LEN_target`, `input_tokenizer`
  and `target_tokenizer`, which load_val_data() and the decoding cells read.

  Args:
    data_path: path of the tab-separated training file.

  Returns:
    (encoder_input_data, decoder_input_data, decoder_target_data):
      encoder_input_data  -- int32, (n_examples, MAX_LEN_input)
      decoder_input_data  -- int32, (n_examples, MAX_LEN_target)
      decoder_target_data -- float32 one-hot,
                             (n_examples, MAX_LEN_target, num_decoder_tokens);
                             the decoder input shifted left by one step.

  Raises:
    ValueError: if a line has fewer than two columns or the file holds no examples.
  """
  global input_token_index , target_token_index , MAX_LEN_input , MAX_LEN_target , input_tokenizer , target_tokenizer

  input_texts = []
  target_texts = []
  with open(data_path, "r", encoding="utf-8") as f:
    # Iterate line by line and skip blanks instead of dropping the last split
    # element: that silently lost the final example whenever the file did not
    # end with a newline.
    for line_no, raw_line in enumerate(f, start=1):
      line = raw_line.rstrip("\n")
      if not line:
        continue
      fields = line.split('\t')
      if len(fields) < 2:
        raise ValueError("{}:{}: expected at least 2 tab-separated columns, got {!r}".format(data_path, line_no, line))
      input_texts.append(fields[1] + "\n")
      target_texts.append("\t" + fields[0] + "\n")

  if not input_texts:
    raise ValueError("no examples found in {}".format(data_path))

  MAX_LEN_input = max(len(txt) for txt in input_texts)
  MAX_LEN_target = max(len(txt) for txt in target_texts)

  # tokenize
  encoder_input , input_token_index , input_tokenizer = tokenize(input_texts , num_encoder_tokens)
  decoder_input , target_token_index, target_tokenizer = tokenize(target_texts , num_decoder_tokens)

  # padding: sequences are right-padded with the id of the "\n" end marker
  encoder_input_data = pad_sequences(encoder_input, maxlen=MAX_LEN_input, dtype='int32', padding='post', truncating='post',value= input_token_index["\n"])
  decoder_input_data = pad_sequences(decoder_input, maxlen=MAX_LEN_target, dtype='int32', padding='post', truncating='post',value=target_token_index["\n"])

  # One-hot targets: position t holds the character the decoder must emit
  # after reading target_text[t], i.e. target_text[t + 1].
  end_index = target_token_index["\n"]
  decoder_target_data = np.zeros((len(target_texts), MAX_LEN_target, num_decoder_tokens), dtype="float32")
  for i, target_text in enumerate(target_texts):
    for t, char in enumerate(target_text[1:]):
      decoder_target_data[i, t, target_token_index[char]] = 1.0
    # Every remaining position is filled with the end marker, matching the padding.
    decoder_target_data[i, len(target_text) - 1:, end_index] = 1.0

  return encoder_input_data , decoder_input_data, decoder_target_data



In [7]:
def load_val_data(data_path = 'dakshina_dataset_v1.0/gu/lexicons/gu.translit.sampled.dev.tsv'):
  """Encode a validation TSV with the tokenizers fitted on the training data.

  Uses the module-level tokenizers, token indices and maximum lengths that
  load_and_preprocess() sets, so that function must have been run first.
  Each non-empty line is split on tabs; column 1 is the input text and
  column 0 the target text, wrapped exactly as in load_and_preprocess().

  Args:
    data_path: path of the tab-separated validation file.

  Returns:
    (encoder_input_data, decoder_input_data, decoder_target_data) with the
    same layout as load_and_preprocess(): two int32 id matrices padded /
    truncated to MAX_LEN_input / MAX_LEN_target and a float32 one-hot array of
    shape (n_examples, MAX_LEN_target, num_decoder_tokens).

  Raises:
    RuntimeError: if the tokenizers have not been fitted yet.
    ValueError: if a line has fewer than two columns.
    KeyError: if a target contains a character missing from the training vocabulary.
  """
  if input_tokenizer is None or target_tokenizer is None:
    raise RuntimeError("load_and_preprocess() must be called before load_val_data()")

  input_texts = []
  target_texts = []
  with open(data_path, "r", encoding="utf-8") as f:
    # Skip blank lines rather than dropping the last split element, which lost
    # the final example when the file had no trailing newline.
    for line_no, raw_line in enumerate(f, start=1):
      line = raw_line.rstrip("\n")
      if not line:
        continue
      fields = line.split('\t')
      if len(fields) < 2:
        raise ValueError("{}:{}: expected at least 2 tab-separated columns, got {!r}".format(data_path, line_no, line))
      # 'ૠ' is mapped to 'ઋ' BEFORE tokenising. Doing this only while building
      # the one-hot targets left the decoder input one character short (the
      # tokenizer drops characters it does not know), so inputs and targets of
      # such words were out of step.
      native_text = fields[0].replace('ૠ', 'ઋ')
      input_texts.append(fields[1] + "\n")
      target_texts.append("\t" + native_text + "\n")

  # tokenize
  encoder_input  = input_tokenizer.texts_to_sequences(input_texts)
  decoder_input  = target_tokenizer.texts_to_sequences(target_texts)

  # padding: right-pad with the id of the "\n" end marker, truncate to the training lengths
  encoder_input_data = pad_sequences(encoder_input, maxlen=MAX_LEN_input, dtype='int32', padding='post', truncating='post',value= input_token_index["\n"])
  decoder_input_data = pad_sequences(decoder_input, maxlen=MAX_LEN_target, dtype='int32', padding='post', truncating='post',value=target_token_index["\n"])

  # One-hot targets: position t holds target_text[t + 1]; the tail is the end marker.
  end_index = target_token_index["\n"]
  decoder_target_data = np.zeros((len(target_texts), MAX_LEN_target, num_decoder_tokens), dtype="float32")
  for i, target_text in enumerate(target_texts):
    for t, char in enumerate(target_text[1:]):
      decoder_target_data[i, t, target_token_index[char]] = 1.0
    decoder_target_data[i, len(target_text) - 1:, end_index] = 1.0

  return encoder_input_data , decoder_input_data, decoder_target_data

# load_val_data()

In [8]:
import tensorflow as tf
import os
from tensorflow.python.keras.layers import Layer, Concatenate
from tensorflow.python.keras import backend as K


class AttentionLayer(Layer):
    """
    Bahdanau (additive) attention (https://arxiv.org/pdf/1409.0473.pdf).

    The layer is called on ``[encoder_out_seq, decoder_out_seq]``, both indexed
    as (batch, time, hidden). It owns three trainable weights:
      W_a -- applied to the encoder outputs,
      U_a -- applied to one decoder output,
      V_a -- reduces the tanh activation to one score per encoder position.

    ``call`` returns ``(context_vectors, attention_weights)``.
    """

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create W_a, U_a and V_a from the [encoder, decoder] input shapes."""
        assert isinstance(input_shape, list)
        # Create a trainable weight variable for this layer.

        # (en_hidden, en_hidden)
        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((input_shape[0][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        # (de_hidden, en_hidden): projects a decoder output into the encoder space
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((input_shape[1][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        # (en_hidden, 1)
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((input_shape[0][2], 1)),
                                   initializer='uniform',
                                   trainable=True)

        super(AttentionLayer, self).build(input_shape)  # Be sure to call this at the end

    def call(self, inputs, verbose=False):
        """
        Compute attention weights and context vectors for every decoder step.

        inputs: [encoder_output_sequence, decoder_output_sequence]
        verbose: when True, print the shapes of the intermediate tensors.

        Returns (c_outputs, e_outputs):
          c_outputs -- attention-weighted sums of the encoder outputs, one per
                       decoder step,
          e_outputs -- softmax attention weights over the encoder positions,
                       one row per decoder step.
        """
        # isinstance (not an exact type comparison) so list subclasses are accepted too.
        assert isinstance(inputs, list)
        encoder_out_seq, decoder_out_seq = inputs
        if verbose:
            print('encoder_out_seq>', encoder_out_seq.shape)
            print('decoder_out_seq>', decoder_out_seq.shape)

        def energy_step(inputs, states):
            """ Step function computing the attention weights for one decoder step.

            Used as the step function of K.rnn over decoder_out_seq, so `inputs`
            is that sequence at a single time step (batch_size, de_hidden).
            `states` is required by the K.rnn contract but not read.
            """

            assert_msg = "States must be an iterable. Got {} of type {}".format(states, type(states))
            assert isinstance(states, list) or isinstance(states, tuple), assert_msg

            # S.Wa where S is the whole encoder sequence
            # <= batch_size, en_seq_len, en_hidden
            W_a_dot_s = K.dot(encoder_out_seq, self.W_a)

            # hj.Ua for the current decoder output
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)  # <= batch_size, 1, en_hidden
            if verbose:
                print('Ua.h>', U_a_dot_h.shape)

            # tanh(S.Wa + hj.Ua); the decoder term broadcasts over the encoder positions
            # <= batch_size, en_seq_len, en_hidden
            Ws_plus_Uh = K.tanh(W_a_dot_s + U_a_dot_h)
            if verbose:
                print('Ws+Uh>', Ws_plus_Uh.shape)

            # softmax(va.tanh(S.Wa + hj.Ua))
            # <= batch_size, en_seq_len
            e_i = K.squeeze(K.dot(Ws_plus_Uh, self.V_a), axis=-1)
            # <= batch_size, en_seq_len
            e_i = K.softmax(e_i)

            if verbose:
                print('ei>', e_i.shape)

            return e_i, [e_i]

        def context_step(inputs, states):
            """ Step function computing the context vector ci from the weights ei.

            Used as the step function of K.rnn over e_outputs, so `inputs` is
            one row of attention weights (batch_size, en_seq_len).
            """

            assert_msg = "States must be an iterable. Got {} of type {}".format(states, type(states))
            assert isinstance(states, list) or isinstance(states, tuple), assert_msg

            # Weighted sum of the encoder outputs over the encoder time axis.
            # <= batch_size, en_hidden
            c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)
            if verbose:
                print('ci>', c_i.shape)
            return c_i, [c_i]

        # Placeholder initial states for K.rnn; only their shapes matter because
        # neither step function reads `states`.
        fake_state_c = K.sum(encoder_out_seq, axis=1)  # <= (batch_size, en_hidden)
        fake_state_e = K.sum(encoder_out_seq, axis=2)  # <= (batch_size, en_seq_len)

        # Computing energy outputs
        # e_outputs => (batch_size, de_seq_len, en_seq_len)
        last_out, e_outputs, _ = K.rnn(
            energy_step, decoder_out_seq, [fake_state_e],
        )

        # Computing context vectors
        # c_outputs => (batch_size, de_seq_len, en_hidden)
        last_out, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c],
        )

        return c_outputs, e_outputs

    def compute_output_shape(self, input_shape):
        """ Shapes of the (context_vectors, attention_weights) pair returned by call. """
        return [
            # Context vectors are sums of encoder outputs, so their last
            # dimension is the encoder hidden size (the original reported the
            # decoder hidden size, which is only right when the two are equal).
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][2])),
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
        ]


In [9]:
from tensorflow import keras
from tensorflow.keras.layers import Dense,LSTM,Input,GRU,SimpleRNN,Dropout,Embedding

def create_model_attention(m_name="LSTM",n_e_layers=1,n_d_layers=1,latent_dim = 100,embedding_size = 16,dropout = 0 , recurrent_dropout = 0):
  """Build the (uncompiled) encoder-decoder training model with attention.

  Args:
    m_name: recurrent layer type, one of "LSTM", "GRU" or "SimpleRNN".
    n_e_layers: number of stacked encoder layers.
    n_d_layers: number of stacked decoder layers.
    latent_dim: units of every recurrent layer.
    embedding_size: output size of both character embeddings.
    dropout: input dropout of every recurrent layer.
    recurrent_dropout: recurrent dropout of every recurrent layer.

  Returns:
    A keras.Model mapping [encoder token ids, decoder token ids] to a softmax
    over `num_decoder_tokens` for every decoder position.

  Raises:
    ValueError: if `m_name` is not a supported layer type.

  Vocabulary sizes are read from the module-level `num_encoder_tokens` and
  `num_decoder_tokens`.
  """
  # Explicit table instead of repeated globals()[m_name] lookups, so a typo in
  # the name fails with a readable message.
  rnn_classes = {"LSTM": LSTM, "GRU": GRU, "SimpleRNN": SimpleRNN}
  if m_name not in rnn_classes:
    raise ValueError("m_name must be one of {}; got {!r}".format(sorted(rnn_classes), m_name))
  rnn_class = rnn_classes[m_name]

  def new_rnn():
    # Every recurrent layer returns the full sequence plus its final state(s).
    return rnn_class(latent_dim, dropout=dropout, recurrent_dropout=recurrent_dropout,
                     return_sequences=True, return_state=True)

  # Resets Keras' automatic layer-name counters. enc_dec_attention() looks the
  # layers up by these generated names, so the construction order below
  # (embedding, encoder RNNs, embedding, decoder RNNs, attention, dense) must
  # not change.
  keras.backend.clear_session()

  # Encoder: embedding followed by n_e_layers stacked recurrent layers.
  input1 = Input(shape=(None,),name= "input_1")
  encoder_inputs = Embedding(input_dim = num_encoder_tokens, output_dim = embedding_size)(input1)

  e_o = new_rnn()(encoder_inputs)
  for _ in range(1, n_e_layers):
    e_o = new_rnn()(e_o[0])
  encoder_states = e_o[1:]  # final state(s) of the last encoder layer

  # Decoder: every decoder layer starts from the last encoder layer's state(s).
  input2 = Input(shape=(None,),name="input_2")
  decoder_inputs = Embedding(input_dim = num_decoder_tokens, output_dim = embedding_size)(input2)
  d_l = new_rnn()(decoder_inputs, initial_state=encoder_states)
  for _ in range(1, n_d_layers):
    d_l = new_rnn()(d_l[0], initial_state=encoder_states)

  # Attention over the encoder outputs, concatenated with the decoder outputs.
  attn_layer = AttentionLayer(name="attention_layer")
  attn_op, attn_state = attn_layer([e_o[0], d_l[0]])
  decoder_concat_input = Concatenate(axis=-1)([d_l[0], attn_op])

  dec_dense = Dense(num_decoder_tokens, activation='softmax')
  final_output = dec_dense(decoder_concat_input)

  # Define the model that will turn
  # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
  model = keras.Model([input1,input2], final_output)

  return model


# Hyperparameter sweep

In [19]:
def train_attention():
  """Run one W&B sweep trial: build, train and score a single attention model.

  Hyperparameters come from the run's sweep config (`model`, `optimizer`,
  `hidden_size`, `embedding_size`, `dropout`). After training, the exact-match
  word accuracy on the validation and training sets is logged to W&B as
  `val_word_acc` and `train_word_acc`.
  """
  run = wandb.init()
  cfg = run.config

  # Run name that spells out the sampled hyperparameters.
  run_label = "".join([
      "model_", cfg.model,
      "_o_", cfg.optimizer,
      "_hs_", str(cfg.hidden_size),
      "_em_", str(cfg.embedding_size),
      "_d_", str(cfg.dropout),
  ])
  run.name = run_label
  print(run_label)

  train_enc, train_dec, train_target = load_and_preprocess()
  val_enc, val_dec, val_target = load_val_data()

  stopper = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)

  # A single encoder layer and a single decoder layer; recurrent dropout is 0.
  model = create_model_attention(cfg.model, 1, 1, cfg.hidden_size, cfg.embedding_size, cfg.dropout, 0)
  model.compile(optimizer=cfg.optimizer, loss="categorical_crossentropy", metrics=["accuracy"])
  model.fit(
    [train_enc, train_dec],
    train_target,
    batch_size=128,
    epochs=1,
    validation_data=([val_enc, val_dec], val_target),
    callbacks=[WandbCallback(), stopper],
  )

  def word_accuracy(enc_batch, dec_batch):
    # A word counts as correct only when every predicted position equals the
    # teacher-forcing input shifted left by one step.
    predicted_ids = model.predict([enc_batch, dec_batch]).argmax(axis=-1)
    exact_match = (predicted_ids[:, :-1] == dec_batch[:, 1:]).all(axis=-1)
    return sum(exact_match) / len(enc_batch)

  val_word_acc = word_accuracy(val_enc, val_dec)
  train_word_acc = word_accuracy(train_enc, train_dec)

  wandb.log({"val_word_acc" : round(val_word_acc,4) , "train_word_acc" : round(train_word_acc,4)})

In [11]:
# Random-search space for the attention model; trials are ranked by the
# `val_word_acc` value that train_attention() logs.
sweep_config_attention = dict(
    method='random',
    metric=dict(name='val_word_acc', goal='maximize'),
    parameters=dict(
        model=dict(values=['LSTM', 'GRU', 'SimpleRNN']),
        dropout=dict(values=[0.1, 0.2]),
        embedding_size=dict(values=[32, 64, 128]),
        hidden_size=dict(values=[128, 256, 512]),
        optimizer=dict(values=['rmsprop', 'adam']),
    ),
)


In [None]:

# Register the sweep under the given W&B project/entity and keep its id.
sweepid = wandb.sweep(sweep_config_attention,project="DL_A3_Q5_testing1",entity ="sonagara")
# Start an agent in this kernel that calls train_attention() once per trial.
# NOTE(review): no trial count is passed, so this presumably keeps running
# until the cell is interrupted -- confirm against the wandb.agent docs.
wandb.agent(sweepid,train_attention)

Create sweep with ID: vyq5gxxv
Sweep URL: https://wandb.ai/sonagara/DL_A3_Q5_testing1/sweeps/vyq5gxxv


[34m[1mwandb[0m: Agent Starting Run: cqxawoxc with config:
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 32
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	model: LSTM
[34m[1mwandb[0m: 	optimizer: adam


model_LSTM_o_adam_hs_128_em_32_d_0.2
131/823 [===>..........................] - ETA: 4:09 - loss: 2.2198 - accuracy: 0.6426

# Best model

In [None]:
def train_best():
  """Train the chosen attention configuration, print its validation word
  accuracy and save the model to "s2sa"."""
  train_enc, train_dec, train_target = load_and_preprocess()
  val_enc, val_dec, val_target = load_val_data()

  model = create_model_attention(
    m_name="LSTM",
    n_e_layers=1,
    n_d_layers=1,
    latent_dim=512,
    embedding_size=64,
    dropout=0.2,
    recurrent_dropout=0,  # kept at 0 so the faster cuDNN LSTM kernel can be used
  )
  model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

  # Stop once the validation loss has not improved for 3 epochs.
  stopper = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)

  model.fit(
    [train_enc, train_dec],
    train_target,
    batch_size=64,
    epochs=20,
    validation_data=([val_enc, val_dec], val_target),
    callbacks=[stopper],
  )

  # Exact-match word accuracy: every predicted position must equal the
  # teacher-forcing input shifted left by one step.
  predicted_ids = model.predict([val_enc, val_dec]).argmax(axis=-1)
  exact_match = (predicted_ids[:, :-1] == val_dec[:, 1:]).all(axis=-1)
  val_word_acc = sum(exact_match) / len(val_enc)
  print(val_word_acc)

  # Save model
  model.save("s2sa")

train_best()

In [None]:
def enc_dec_attention(m_name="LSTM",n_e_layers=1,n_d_layers=1,latent_dim = 100):
  """Split the saved training model ("s2sa") into inference-time pieces.

  Layers are looked up by the names Keras generated when the model was built
  ("lstm", "lstm_1", ..., "embedding_1", "dense"), so `m_name` and
  `n_e_layers` must match the configuration the model was trained with.

  Args:
    m_name: recurrent layer type used in training ("LSTM", "GRU", "SimpleRNN").
    n_e_layers: number of encoder layers used in training.
    n_d_layers: number of decoder layers used in training. NOTE(review): only
      the first decoder layer is wired into the returned decoder model, so
      models trained with n_d_layers > 1 are not decoded faithfully.
    latent_dim: recurrent state size; only used as a fallback when the loaded
      decoder layer does not expose `units`.

  Returns:
    (encoder_model, decoder_model, atten_layer, dense_layer):
      encoder_model -- token ids -> [encoder outputs, [final state(s)]]
      decoder_model -- [token ids, state(s)] -> [decoder outputs, state(s)]
      atten_layer   -- the trained attention layer
      dense_layer   -- the trained softmax output layer
  """
  model = keras.models.load_model("s2sa")

  # Keras' generated base name for the recurrent layers.
  base_name = "simple_rnn" if m_name == "SimpleRNN" else m_name.lower()

  # The first recurrent layer has no suffix, later ones are "_1", "_2", ...
  if n_e_layers == 1:
    last_encoder_name = base_name
  else:
    last_encoder_name = base_name + "_" + str(n_e_layers - 1)
  # The first decoder layer is created right after the last encoder layer.
  first_decoder_name = base_name + "_" + str(n_e_layers)

  # encoder
  encoder_inputs = model.input[0]
  encoder_outputs, *encoder_states = model.get_layer(last_encoder_name).output  # last encoding layer
  encoder_model = tf.keras.models.Model(encoder_inputs, [encoder_outputs,encoder_states])

  # decoder
  decoder_inputs = model.input[1]
  decoder_embed = model.get_layer("embedding_1")(decoder_inputs)
  decoder_layer = model.get_layer(first_decoder_name)

  # Take the state width from the trained layer when possible: a `latent_dim`
  # that disagrees with the saved model (e.g. the default 100 against a
  # 512-unit model) cannot be fed to the layer as an initial state.
  state_dim = getattr(decoder_layer, "units", latent_dim)
  # LSTM carries two state tensors (h, c); GRU and SimpleRNN carry one.
  n_states = 2 if m_name == "LSTM" else 1
  decoder_states_inputs = [tf.keras.layers.Input(shape=(state_dim,)) for _ in range(n_states)]

  decoder_outputs, *decoder_states = decoder_layer(decoder_embed, initial_state=decoder_states_inputs)
  decoder_model = tf.keras.models.Model([decoder_inputs, decoder_states_inputs],[decoder_outputs] + decoder_states)

  atten_layer = model.get_layer("attention_layer")
  dense_layer = model.get_layer("dense")

  return encoder_model , decoder_model ,atten_layer ,dense_layer 

# enc_dec_attention()

In [None]:

# Inverse vocabularies (integer id -> character), used to turn predicted ids
# back into text.
reverse_input_char_index = {index: char for char, index in input_token_index.items()}
reverse_target_char_index = {index: char for char, index in target_token_index.items()}


def decode_sequence_attention(input_seq,encoder_model,decoder_model,decoder_dense,atten_layer):
    """Greedily decode one encoded input sequence, one character at a time.

    Args:
        input_seq: encoder input ids for a single example (batch of size 1).
        encoder_model: model returning (encoder outputs, list of final states).
        decoder_model: model taking [token ids] + states and returning
            [decoder outputs] + new states.
        decoder_dense: softmax output layer of the trained model.
        atten_layer: attention layer of the trained model.

    Returns:
        The decoded string. It ends with "\\n" when the model emitted the end
        marker; otherwise decoding stopped at the length limit or on an id
        that has no character.

    Reads the module-level `target_token_index`, `reverse_target_char_index`
    and `MAX_LEN_target`.
    """
    # Encode the input once; the encoder outputs are reused at every step.
    enc_op, states_value = encoder_model.predict(input_seq)
    # Works for one state (GRU / SimpleRNN) as well as two (LSTM).
    if not isinstance(states_value, (list, tuple)):
        states_value = [states_value]
    states_value = list(states_value)

    # Target sequence of length 1, starting with the start character.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index["\t"]

    # Built once: the original created a new Concatenate layer on every step.
    concat = Concatenate(axis=-1)

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    decoded_sentence = ""
    while True:
        dec_outputs, *new_states = decoder_model.predict([target_seq] + states_value)

        atten_op, attn_state = atten_layer([enc_op, dec_outputs])
        token_probs = decoder_dense(concat([dec_outputs, atten_op]))

        sampled_token_index = int(np.argmax(token_probs[0, -1, :]))
        sampled_char = reverse_target_char_index.get(sampled_token_index)
        if sampled_char is None:
            # The id has no character (id 0 is never assigned by the tokenizer):
            # stop here instead of failing with a KeyError mid-evaluation.
            break
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if sampled_char == "\n" or len(decoded_sentence) > MAX_LEN_target:
            break

        # Feed the sampled character and the new states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = target_token_index[sampled_char]
        states_value = new_states

    return decoded_sentence


In [None]:
def test_acc_attention(m_name="LSTM" ,latent_dim = 100, n_e_layers = 1,n_d_layers = 1):
  """Decode every test example with the saved model and report word accuracy.

  Args:
    m_name, latent_dim, n_e_layers, n_d_layers: configuration the saved model
      was trained with; forwarded to enc_dec_attention().

  Returns:
    Fraction of test examples whose decoded string equals the reference
    exactly (0.0 when the test set is empty).

  NOTE(review): load_test_data() is not defined in the visible notebook; it is
  assumed to return (encoded inputs, reference strings) where the references
  use the same format decode_sequence_attention() returns -- confirm.
  """
  x_test , y_test = load_test_data()
  enc ,dec  , atten , dense = enc_dec_attention(m_name , n_e_layers ,n_d_layers, latent_dim)

  total = len(y_test)
  print(total)
  if total == 0:
    # Nothing to score; avoids a ZeroDivisionError below.
    return 0.0

  score = 0
  for seq_index in range(total):
    # Slice (not index) to keep the batch dimension of size 1.
    input_seq = x_test[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence_attention(input_seq,enc,dec,dense,atten)
    if (y_test[seq_index] == decoded_sentence):
      score += 1
    print("-")
    # NOTE(review): the label says "Input" but the value printed is the
    # reference string the prediction is compared with.
    print("Input sentence:", y_test[seq_index])
    print("Decoded sentence:", decoded_sentence)

  accuracy = score / total
  print(accuracy)
  return accuracy

# Evaluate with the configuration train_best() saved to "s2sa"
# (LSTM, 512 units, one encoder and one decoder layer).
test_acc_attention(m_name="LSTM", latent_dim=512, n_e_layers=1, n_d_layers=1)

(10373, 19)
10373
tf.Tensor(
[[[0.03299213 0.03293254 0.03743091 0.0370109  0.04104772 0.04129481
   0.04086666 0.04202858 0.04510744 0.05038547 0.05707331 0.06273546
   0.06598404 0.06765498 0.06856006 0.06901259 0.06922045 0.06931201
   0.06935   ]]], shape=(1, 1, 19), dtype=float32)
tf.Tensor(
[[[0.03760805 0.03796143 0.04199007 0.04221893 0.04535198 0.04603993
   0.04594486 0.04710242 0.04956936 0.05316092 0.0570058  0.05984121
   0.06133004 0.06203482 0.06238378 0.06254701 0.0626171  0.06264339
   0.06264894]]], shape=(1, 1, 19), dtype=float32)
tf.Tensor(
[[[0.04384023 0.04415859 0.04677034 0.04703319 0.04888453 0.04938265
   0.04935042 0.05003621 0.05142121 0.05331973 0.05522535 0.056564
   0.05725317 0.05758072 0.05774564 0.05782426 0.05785898 0.05787314
   0.05787772]]], shape=(1, 1, 19), dtype=float32)
tf.Tensor(
[[[0.04347179 0.04408089 0.04678016 0.04743042 0.04911705 0.04993662
   0.05005114 0.05086787 0.05227703 0.05398079 0.05548227 0.0564263
   0.05687838 0.05708118 0.05

KeyboardInterrupt: ignored