In [None]:
from tensorflow.keras.layers import Input, LSTM, RepeatVector,Embedding,Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import sequence
import tensorflow.keras as keras
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
import string
import pickle


In [None]:

class TextProcesser:
    """Word-level tokenizer whose vocabulary grows as text is processed.

    Ids 0-4 are reserved for the special tokens ``<PAD>``, ``<START>``,
    ``<UNK>``, ``<UNUSED>`` and ``<EOS>``; every new word seen afterwards
    receives the next free integer id.

    Attributes are documented inline in ``__init__``.
    """

    # Characters stripped by get_word_list(). The full stop is intentionally
    # absent from this set: it survives tokenization and is later mapped to
    # <EOS> by get_seq_from_text().
    _PUNCTUATION_TABLE = str.maketrans('', '', '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~')

    def __init__(self):
        # token -> id, seeded with the reserved special tokens
        self.word_index = {
            "<PAD>": 0,     # Padding
            "<START>": 1,   # start
            "<UNK>": 2,     # unknown
            "<UNUSED>": 3,
            "<EOS>": 4,
        }
        # highest id handed out so far
        self.curr_index = 4

        # id -> token, the inverse of word_index
        self.rev_word_index = {index: token for token, index in self.word_index.items()}

    def get_word_list(self,sentence):
        """Lower-case, strip punctuation (except '.') and tokenize ``sentence``.

        Returns the list of tokens produced by ``nltk.tokenize.word_tokenize``.
        """
        sentence = sentence.lower()

        # Remove punctuation
        sentence = sentence.translate(self._PUNCTUATION_TABLE)

        # Remove leading/trailing whitespace
        sentence = sentence.strip()

        # Tokenize the sentence
        return nltk.tokenize.word_tokenize(sentence)

    def fit_on_text(self,sentence):
        """Add every unseen token of ``sentence`` to the vocabulary.

        Returns the token list of ``sentence`` (see ``get_word_list``).
        """
        word_list = self.get_word_list(sentence)

        for word in word_list:
            if word not in self.word_index:
                self.curr_index += 1
                self.word_index[word] = self.curr_index
                self.rev_word_index[self.curr_index] = word

        return word_list

    def get_seq_from_text(self,sentence,max_len=256):
        """Convert ``sentence`` into a list of integer ids.

        The vocabulary is extended with any new token first, so every token
        has an id. The sequence starts with ``<START>`` and each '.' token is
        emitted as ``<EOS>``.

        ``max_len`` is accepted for interface compatibility but is not used
        here; padding to a fixed length happens in ``proccess_text``.
        """
        word_list = self.fit_on_text(sentence)

        eos_id = self.word_index["<EOS>"]
        seq = [self.word_index["<START>"]]
        for word in word_list:
            # Compare by value: `is` on a string literal relies on interning
            # and triggers a SyntaxWarning on recent Python versions.
            if word == '.':
                seq.append(eos_id)
            else:
                seq.append(self.word_index[word])

        return seq

    def proccess_text(self,file_names=None,text=None,max_len=256):
        """Encode files or a single text and pad the result to ``max_len``.

        Parameters
        ----------
        file_names : iterable of paths, optional
            Each file is read completely and encoded as one sequence.
            Takes precedence over ``text`` when both are given.
        text : str, optional
            A single text to encode when ``file_names`` is not given.
        max_len : int
            Length passed to ``pad_sequences`` as ``maxlen``.

        Returns
        -------
        The value returned by ``sequence.pad_sequences`` (post-padded with the
        ``<PAD>`` id), one row per file or a single row for ``text``.
        """
        seqs = []

        if file_names:
            for file_name in file_names:
                # The original opened the undefined name `filename` here.
                with open(file_name,'r') as file:
                    seqs.append(self.get_seq_from_text(file.read()))
        elif text:
            seqs.append(self.get_seq_from_text(text))
        else:
            print("file_names = None and text = None")

        seqs = sequence.pad_sequences(seqs,value=self.word_index["<PAD>"],padding='post',maxlen=max_len)
        return seqs

    def decode_sequence(self,seq):
        """Turn a sequence of ids back into a space-joined string.

            seq : numpy array of shape = (seq_length,) (any iterable of ids works)

        Special ids 0-4 are dropped, except ``<EOS>`` which is rendered as '.'.
        Ids outside the vocabulary raise ``KeyError``.
        """
        eos_id = self.word_index["<EOS>"]
        line = []
        for index in seq:
            if 0 <= index <= 4:
                # Special values <START>, <PAD>, <UNK>, <UNUSED> are skipped;
                # only <EOS> has a textual form.
                if index == eos_id:
                    line.append('.')
            else:
                line.append(self.rev_word_index[index])

        return ' '.join(line)
            
            
        

        

In [None]:
def build_autoencoder(input_length,vocab_size,latent_dim):
    """Assemble the sentence autoencoder and its encoder sub-model.

    Parameters
    ----------
    input_length : int
        Passed to the embedding layer as ``input_length`` and used as the
        repeat count of the ``RepeatVector`` feeding the decoder.
    vocab_size : int
        Number of rows of the embedding table.
    latent_dim : int
        Embedding width and unit count of both LSTMs.

    Returns
    -------
    (autoencoder, encoder) : tuple of ``Model``
        Both models share the same input tensor and encoder layers.

    NOTE(review): neither LSTM sets ``return_sequences``, so the decoder
    presumably emits a single ``latent_dim``-wide vector per sample rather
    than one vector per timestep -- confirm this is the intended output.
    """
    token_ids = Input(shape=(None,))

    # Encoder: embedding followed by a bidirectional LSTM whose two
    # directions are summed.
    embedding_layer = Embedding(vocab_size, latent_dim, input_length=input_length, name="encoder_embedding")
    encoder_rnn = Bidirectional(LSTM(latent_dim), merge_mode="sum", name="encoder")
    latent = encoder_rnn(embedding_layer(token_ids))

    # Decoder: the latent vector is repeated input_length times and fed to a
    # second summed bidirectional LSTM.
    repeated = RepeatVector(input_length, name="repeater")(latent)
    decoder_rnn = Bidirectional(LSTM(latent_dim), merge_mode="sum", name="decoder")
    reconstruction = decoder_rnn(repeated)

    return Model(token_ids, reconstruction), Model(token_ids, latent)

    



In [None]:
# Notebook-wide tokenizer: load_dataset() encodes every line with it, and it
# is pickled further down in this cell once the vocabulary has been built.
tokenizer = TextProcesser()
def load_dataset(path='Data/desc.txt'):
    """Encode every line of a text file with the notebook-level ``tokenizer``.

    Parameters
    ----------
    path : str
        File to read, one sample per line. Defaults to ``'Data/desc.txt'``,
        the location that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from the list of per-line results of
        ``tokenizer.proccess_text(text=line)``, in file order.
    """
    with open(path,'r') as file:
        # One padded result per line; the tokenizer's vocabulary grows as a
        # side effect of encoding.
        encoded_lines = [tokenizer.proccess_text(text=line) for line in file]

    return np.array(encoded_lines)

# Encode the whole corpus; this also fills tokenizer.word_index.
data = load_dataset()
# Keep axes 0 and 2 only, giving one row per line of the corpus.
# NOTE(review): assumes load_dataset() returned a 3-D array whose middle axis
# has length 1 (one padded sequence per line) -- an empty file would fail here.
data = data.reshape(data.shape[0],data.shape[2])

# Persist the fitted tokenizer and the encoded corpus next to the notebook.
with open('tokenizer.pickle','wb') as tokenizerFile:
    pickle.dump(tokenizer,tokenizerFile)
np.save('data.npy',data)

In [None]:
# shuffle=False keeps the corpus order: the leading rows become train_data and
# the trailing rows test_data. No test_size is given, so scikit-learn's
# default split proportion applies.
train_data, test_data = train_test_split(data,shuffle=False)

In [None]:
# Same value as the default max_len=256 that TextProcesser.proccess_text pads to.
input_length = 256
# Number of entries in the fitted vocabulary, special tokens included.
vocab_size = len(tokenizer.word_index)
# Embedding width and LSTM unit count used by build_autoencoder.
latent_dim = 256

autoencoder,encoder = build_autoencoder(input_length,vocab_size,latent_dim)
# NOTE(review): no metrics are requested here, so only the loss is logged
# during training -- callbacks that monitor an accuracy key will not find it.
autoencoder.compile(optimizer="sgd", loss="mse")


In [None]:
# Print the layer-by-layer overview of the compiled autoencoder.
autoencoder.summary()

In [None]:
# Train the autoencoder to reproduce its own input, checkpointing the best
# weights, then save the encoder half for later use.
filepath="sentence_autoencoder.h5"
# The model is compiled without metrics, so 'val_acc' is never logged and a
# best-only checkpoint monitoring it would never be written. Validation loss
# is always available when validation_data is supplied, so track that instead
# (lower is better, hence mode='min').
checkpoint = keras.callbacks.ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
autoencoder.fit(train_data, train_data,
                epochs=100,
                batch_size=512,
                validation_data=(test_data, test_data),callbacks=callbacks_list)

# The encoder shares its layers with the autoencoder, so it is saved after
# training has finished.
encoder.save("sentence_encoder.h5")

In [None]:
# To load the saved encoder model and generate latent representations:
# encoder = keras.models.load_model("sentence_encoder.h5")
# encoder.predict(token_ids)  # token_ids: integer array of shape (batch_size, seq_length)
