In [None]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import pandas as pd
import unicodedata
import re
import numpy as np
import os
from os import path
import io
import time
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, GRU,LSTM, Multiply
from keras.layers import RepeatVector,TimeDistributed, Dense, Activation, Lambda,BatchNormalization
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.models import load_model, Model
import keras.backend as K


In [None]:
%%bash
# Download the OpenSubtitles 2018 English monolingual dump and split it into
# 100,000-line shards named /tmp/data/lines-<3-letter suffix>.
set -euo pipefail     # stop at the first failing step instead of processing partial data
mkdir -p /tmp/data/   # -p keeps the cell re-runnable when the directory already exists
# The URL contains '?', which the shell treats as a glob character, so it is
# quoted. The output name is set explicitly so gunzip does not depend on how
# wget escapes the query string when it derives a local file name.
wget -O OpenSubtitles.raw.en.gz "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.en.gz"
gunzip -c OpenSubtitles.raw.en.gz > /tmp/lines
split -a 3 -l 100000 /tmp/lines /tmp/data/lines-

In [None]:
# Show the shard files written by the split step.
shard_dir = '/tmp/data/'
os.listdir(shard_dir)

In [None]:
def unicode_to_ascii(s):
  """Remove combining marks (Unicode category ``Mn``) from the string ``s``.

  The text is decomposed with NFD normalization first, so an accented
  character becomes a base character followed by its combining marks; the
  marks are then dropped. Characters that have no such decomposition are
  returned unchanged, so the result is not guaranteed to be pure ASCII.
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept = []
  for ch in decomposed:
    if unicodedata.category(ch) == 'Mn':
      continue
    kept.append(ch)
  return ''.join(kept)


def preprocess_sentence(w):
  """Normalize a sentence and wrap it in ``<start>`` / ``<end>`` markers.

  Steps, in order: lowercase and trim, strip accents via ``unicode_to_ascii``,
  surround the punctuation ``? . ! , ¿`` with spaces, collapse runs of spaces
  and double quotes, replace every run of characters other than letters and
  that punctuation with a single space, trim again, and add the markers.
  """
  text = unicode_to_ascii(w.lower().strip())
  substitutions = (
      (r"([?.!,¿])", r" \1 "),      # isolate punctuation as separate tokens
      (r'[" "]+', " "),             # squeeze runs of spaces / double quotes
      (r"[^a-zA-Z?.!,¿]+", " "),    # anything else becomes one space
  )
  for pattern, replacement in substitutions:
    text = re.sub(pattern, replacement, text)
  return '<start> ' + text.strip() + ' <end>'

In [None]:
def line_cleaner(line):
  """Decode one raw UTF-8 byte line and trim its edges.

  Spaces, dashes, periods, exclamation marks, question marks and newlines
  are removed from both ends of the decoded text; characters in the middle
  of the line are left untouched.
  """
  return line.decode("utf-8").strip(" -.!?\n")

In [None]:
# Load the first cleaned subtitle lines from one shard of the corpus.
# The cap matches the previous loop, which stopped on its 10,000th read
# before appending it and therefore kept 9,999 lines.
MAX_LINES = 9999
lines = []
with open('/tmp/data/lines-fli', 'rb') as f:  # the context manager closes the handle
  for raw_line in f:
    if len(lines) >= MAX_LINES:
      break
    lines.append(line_cleaner(raw_line))
# Report the count and a short sample instead of dumping every line.
print(len(lines), lines[:5])

In [None]:
# Hold out 10% of the lines for validation; the fixed seed keeps the split repeatable.
train_data, validation_data = train_test_split(
    lines,
    test_size=.1,
    random_state=1234,
)
print(len(train_data))

In [None]:
# Settings shared by the tokenizer, the padding step and the model.
Tx = 20                        # every sequence is padded / truncated to this length
vocab_size = 10000             # tokenizer word cap and size of the output softmax
padding = trunc_type = 'post'  # pad and truncate at the end of each sequence
oov_tok = '<OOV>'              # placeholder token for out-of-vocabulary words

In [None]:
# Fit the tokenizer on every loaded line, then encode each line as a
# fixed-length sequence of word ids.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(lines)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(lines)
padded = pad_sequences(
    sequences,
    maxlen=Tx,
    padding=padding,
    truncating=trunc_type,
)
print(padded[:10])

In [None]:
# # MODEL 2 tokenizer
# # Baseline: use a Tokenizer to turn each text into a sequence of keyword ids.
# # The Tokenizer is fitted on the training data and then reused to encode the validation data.
# max_length = 100
# max_features = 50000
# token = Tokenizer(num_words=max_features)
# token.fit_on_texts(list(np.asarray(train_data.question_text)))
# xtrain = token.texts_to_sequences(np.asarray(train_data.question_text))
# xvalidate = token.texts_to_sequences(np.asarray(validation_data.question_text))
# xtest = token.texts_to_sequences(np.asarray(test.question_text))

# # The Tokenizer splits each sentence into tokens, so some sequences are shorter than max_length;
# # pad the missing positions so that all sequences have the same length.
# xtrain = pad_sequences(xtrain, maxlen=max_length)
# xvalidate = pad_sequences(xvalidate, maxlen=max_length)
# xtest = pad_sequences(xtest, maxlen=max_length)

# ytrain = train_data.target
# yvaliate = validation_data.target

In [None]:
# Inverse vocabulary lookup (token id -> word), used later to turn predicted
# ids back into text.
reverse_word_index = {ind: word for word, ind in word_index.items()}
# Print the vocabulary size and a small sample of each mapping; dumping both
# full dictionaries floods the cell output.
print(len(word_index))
print(list(word_index.items())[:10])
print(list(reverse_word_index.items())[:10])

In [None]:
# Pair every encoded line (context) with the encoded line that follows it
# (response). Slicing the padded matrix replaces the per-row Python loop and
# keeps a 2-D shape even when there are fewer than two lines.
contexts = np.array(padded[:-1])
# Trailing axis of size 1: one integer class id per time step, the target
# layout used with the sparse categorical cross-entropy loss.
responses = np.expand_dims(np.array(padded[1:]), -1)
print(contexts[6])
print(responses[6])
print(contexts.shape, responses.shape)

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
from zipfile import ZipFile

# Look the archive up relative to the working directory: that is where the
# wget cell saves it and where glove.6B.100d.txt is read from afterwards.
# The previous absolute "/content/" path only matched when the notebook ran
# with that directory as its working directory.
file_name = "glove.6B.zip"

# Named glove_zip rather than zip so the builtin zip() is not shadowed.
with ZipFile(file_name, 'r') as glove_zip:
  glove_zip.extractall()
  print('Unzipped glove')


In [None]:
# Parse the 100-dimensional GloVe file into a {word: float32 vector} lookup.
# Each line holds a token followed by its whitespace-separated coefficients.
embeddings_index = {}
# The encoding is given explicitly so parsing does not depend on the
# platform default, and the context manager closes the file even if a line
# fails to parse.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Number of word vectors is %s.' % len(embeddings_index))

In [None]:
# Build the embedding weight matrix: row i holds the GloVe vector of the word
# with tokenizer id i. Rows stay all-zero for words without a GloVe vector
# and for ids at or above the vocabulary cap.
embed_dim = 100
num_words = vocab_size
embedding_matrix = np.zeros((len(word_index) + 1, embed_dim))
for word, i in word_index.items():
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is None or i >= num_words:
    continue
  embedding_matrix[i] = embedding_vector

In [None]:
from keras.layers import Embedding

# Embedding layer initialised from the GloVe matrix and frozen during training.
embedding_layer = Embedding(
    len(word_index) + 1,
    embed_dim,
    weights=[embedding_matrix],
    input_length=Tx,
    trainable=False,
)

In [None]:
#from faker import Faker
import random
from tqdm import tqdm
from babel.dates import format_date
#from nmt_utils import *
import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
def softmax(x, axis=1):
    """Softmax activation that can normalize along a chosen axis.

    # Arguments
        x: Tensor of rank 2 or higher.
        axis: Integer, axis along which the normalization is applied. It is
            only used for tensors of rank greater than 2; rank-2 tensors are
            passed to `K.softmax` as they are.
    # Returns
        Tensor, output of the softmax transformation.
    # Raises
        ValueError: If the rank of `x` is lower than 2.
    """
    rank = K.ndim(x)
    if rank < 2:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')
    if rank == 2:
        return K.softmax(x)
    # Subtract the per-axis maximum before exponentiating so the largest
    # exponent is exp(0).
    shifted = x - K.max(x, axis=axis, keepdims=True)
    exponentials = K.exp(shifted)
    return exponentials / K.sum(exponentials, axis=axis, keepdims=True)

In [None]:
# # Defined shared layers as global variables
# repeator = RepeatVector(Tx)
# concatenator = Concatenate(axis=-1)
# densor1 = Dense(10, activation = "tanh")
# densor2 = Dense(1, activation = "relu")
# activator = Activation(softmax, name='attention_weights') # We are using a custom softmax(axis = 1) loaded in this notebook
# dotor = Dot(axes = 1)

In [None]:
# n_a = 128 # number of units for the pre-attention, bi-directional LSTM's hidden state 'a'
# n_s = 128 # number of units for the post-attention LSTM's hidden state "s"

# # Please note, this is the post attention LSTM cell.  
# # For the purposes of passing the automatic grader
# # please do not modify this global variable.  This will be corrected once the automatic grader is also updated.
# post_activation_LSTM_cell = LSTM(n_s, return_state = True) # post-attention LSTM 
# output_layer = Dense(10000, activation=softmax)

In [None]:
# # GRADED FUNCTION: one_step_attention

# def one_step_attention(a, s_prev):
#     """
#     Performs one step of attention: Outputs a context vector computed as a dot product of the attention weights
#     "alphas" and the hidden states "a" of the Bi-LSTM.
    
#     Arguments:
#     a -- hidden state output of the Bi-LSTM, numpy-array of shape (m, Tx, 2*n_a)
#     s_prev -- previous hidden state of the (post-attention) LSTM, numpy-array of shape (m, n_s)
    
#     Returns:
#     context -- context vector, input of the next (post-attention) LSTM cell
#     """
    
#     ### START CODE HERE ###
#     # Use repeator to repeat s_prev to be of shape (m, Tx, n_s) so that you can concatenate it with all hidden states "a" (≈ 1 line)
#     s_prev = repeator(s_prev)
#     # Use concatenator to concatenate a and s_prev on the last axis (≈ 1 line)
#     # For grading purposes, please list 'a' first and 's_prev' second, in this order.
#     concat = concatenator([a,s_prev])
#     # Use densor1 to propagate concat through a small fully-connected neural network to compute the "intermediate energies" variable e. (≈1 lines)
#     e = densor1(concat)
#     # Use densor2 to propagate e through a small fully-connected neural network to compute the "energies" variable energies. (≈1 lines)
#     energies = densor2(e)
#     # Use "activator" on "energies" to compute the attention weights "alphas" (≈ 1 line)
#     alphas = activator(energies)
#     # Use dotor together with "alphas" and "a" to compute the context vector to be given to the next (post-attention) LSTM-cell (≈ 1 line)
#     context = dotor([alphas,a])
#     ### END CODE HERE ###
    
#     return context

In [None]:
# def model(Tx, Ty, n_a, n_s, human_vocab_size, machine_vocab_size):
#     """
#     Arguments:
#     Tx -- length of the input sequence
#     Ty -- length of the output sequence
#     n_a -- hidden state size of the Bi-LSTM
#     n_s -- hidden state size of the post-attention LSTM
#     human_vocab_size -- size of the python dictionary "human_vocab"
#     machine_vocab_size -- size of the python dictionary "machine_vocab"

#     Returns:
#     model -- Keras model instance
#     """
    
#     # Define the inputs of your model with a shape (Tx,)
#     # Define s0 (initial hidden state) and c0 (initial cell state)
#     # for the decoder LSTM with shape (n_s,)
#     X = Input(shape=(Tx, human_vocab_size))
#     s0 = Input(shape=(n_s,), name='s0')
#     c0 = Input(shape=(n_s,), name='c0')
#     s = s0
#     c = c0
    
#     # Initialize empty list of outputs
#     outputs = []
    

#     ### START CODE HERE ###
    
#     # Step 1: Define your pre-attention Bi-LSTM. (≈ 1 line)
#     a = Bidirectional(LSTM(units=n_a, return_sequences=True))(X)
    
#     # Step 2: Iterate for Ty steps
#     for t in range(Ty):
    
#         # Step 2.A: Perform one step of the attention mechanism to get back the context vector at step t (≈ 1 line)
#         context = one_step_attention(a,s)
        
#         # Step 2.B: Apply the post-attention LSTM cell to the "context" vector.
#         # Don't forget to pass: initial_state = [hidden state, cell state] (≈ 1 line)
#         s, _, c = post_activation_LSTM_cell(context, initial_state= [s,c])
        
#         # Step 2.C: Apply Dense layer to the hidden state output of the post-attention LSTM (≈ 1 line)
#         out = output_layer(inputs=s)
        
#         # Step 2.D: Append "out" to the "outputs" list (≈ 1 line)
#         outputs.append(out)
    
#     # Step 3: Create model instance taking three inputs and returning the list of outputs. (≈ 1 line)
#     model = Model(inputs=[X,s0,c0], outputs=outputs)
    
#     ### END CODE HERE ###
    
#     return model

In [None]:
# model = model(Tx, Tx, n_a, n_s, 10000, 10000)
# model.summary()

In [None]:
# opt = Adam(lr = 0.005,beta_1 =  0.9,beta_2 =  0.999)
# model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# s0=np.zeros((100,n_s))
# c0=np.zeros((100,n_s))
# model.fit([contexts], responses, epochs=20, batch_size=100)
# Does not work: fails with a dimension error.

In [None]:
# Sequence-to-sequence baseline: frozen GloVe embeddings -> batch norm ->
# LSTM that emits one state per time step -> per-step softmax over the vocabulary.
model = Sequential([
    embedding_layer,
    BatchNormalization(),
    LSTM(units=256, return_sequences=True),
    TimeDistributed(Dense(vocab_size, activation='softmax')),
])

In [None]:
# The sparse loss takes integer word ids as targets, so the responses do not
# need to be one-hot encoded.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
)
model.summary()

In [None]:
# Train the model to predict each response from the line that precedes it.
model.fit(
    contexts,
    responses,
    batch_size=10,
    epochs=10,
)

In [None]:
# Encode a sample question the same way as the training lines.
quest = ["what are you doing"]
seq = tokenizer.texts_to_sequences(quest)
pad = pad_sequences(
    seq,
    maxlen=Tx,
    padding=padding,
    truncating=trunc_type,
)


In [None]:
# Greedy decoding: keep the most probable word id at every time step.
pred = model.predict(pad)
pred = np.argmax(pred, axis=-1)
print(pred, pred.shape)
# Flatten the single-sample batch without hard-coding the sequence length,
# so the cell keeps working when Tx is changed.
pred = pred.reshape(-1)
for i in pred:
  if i != 0:  # id 0 is the padding value and has no entry in the vocabulary
    print(reverse_word_index[i])

In [None]:
import math
def beam_search_decoder(prediction, BW = 3):
    """Find the most likely index sequences with beam search.

    Arguments:
    prediction -- iterable with one entry per time step; each entry holds the
                  probability of every candidate index at that step. Entries
                  are flattened to 1-D, so any number of candidates works.
    BW -- beam width: how many partial sequences are kept after each step

    Returns:
    A list of at most BW ``(sequence, score)`` tuples ordered best first.
    ``sequence`` is the list of chosen indices and ``score`` is the sum of the
    natural logs of their probabilities. An empty ``prediction`` yields
    ``[([], 0)]``.
    """
    output_sequences = [([], 0)]

    for step_probs in prediction:
        # Flatten rather than reshape to a fixed 20 entries, which crashed
        # for every other number of candidates.
        step_probs = np.asarray(step_probs).reshape(-1)
        candidates = []
        for old_seq, old_score in output_sequences:
            for index, prob in enumerate(step_probs):
                if prob == 0:
                    # log(0) is undefined; such a candidate can never be chosen.
                    continue
                candidates.append((old_seq + [index], old_score + math.log(prob)))

        # sorted() is stable, so candidates with equal scores keep the order
        # in which they were generated.
        output_sequences = sorted(candidates, key=lambda val: val[1], reverse=True)[:BW]

    return output_sequences