<a href="https://colab.research.google.com/github/ilyanovak/Manhattan-LSTM/blob/main/ManhattanLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# TODO
1.   Change location of importing data
2.   Methods comments
3.   Data requirements check
4.   assertions for sanity checks, max_seq_length
5.   implement sample_size
6.   Custom vocabulary method
7.   Add comments to each description
8.   Fix save_model

In [2]:
import keras.backend as K
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import random
import re
import spacy
import spacy.cli
import time
from numpy import loadtxt
from keras.layers import Input, Embedding, LSTM, Lambda
from keras.models import Model
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tqdm import tqdm
spacy.cli.download("en_core_web_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [None]:
# Load a small random sample of the Quora question-pairs CSV and wrap it in the model class.
# NOTE: ManhatanLSTM is defined in a later cell of this notebook; that cell must be
# executed before this one, otherwise the last line raises NameError.
quora_source = 'https://raw.githubusercontent.com/ilyanovak/Manhattan-LSTM/main/data/quora.csv'

QUORA_TOTAL_ROWS = 404290  # number of data rows assumed in the CSV (header excluded) - TODO confirm
SAMPLE_SIZE = 100          # number of data rows to keep
SAMPLE_SEED = 42           # fixed seed so the same rows are drawn on every run

# Row 0 is the header and must never be skipped. The candidate range ends at
# QUORA_TOTAL_ROWS + 1 so that the last data row can be skipped like any other
# (with range(1, QUORA_TOTAL_ROWS) it would be part of every sample).
rows_to_skip = random.Random(SAMPLE_SEED).sample(range(1, QUORA_TOTAL_ROWS + 1),
                                                 QUORA_TOTAL_ROWS - SAMPLE_SIZE)
sample_training_data = pd.read_csv(quora_source, engine='python',
                                   usecols=['question1', 'question2', 'is_duplicate'],
                                   skiprows=rows_to_skip)
lstm = ManhatanLSTM(sample_training_data)

In [78]:
# Print the loss and binary accuracy of the trained model on the validation split.
# Requires lstm.fit() to have been run first (it creates the model and the split).
lstm.evaluate_model()

loss 1.0004427433013916
binary_accuracy: 65.00%


In [115]:
class ManhatanLSTM:
    '''
    Manhattan LSTM: scores the similarity of two text sequences.

    Both sequences are converted to word ids, embedded with frozen spaCy word
    vectors, encoded by one shared LSTM, and compared with
    exp(-sum(|left - right|)) of the two LSTM outputs. The model is trained
    against the 'is_duplicate' column of the training data.

    Typical use: ManhatanLSTM(df).fit(), then evaluate_model() /
    plot_model_accuracy_and_loss() / save_model().
    '''

    # Number of components in each row of the embedding matrix
    # (filled from the vectors of the spaCy model below).
    _EMBEDDING_DIM = 300

    # spaCy model used to look up word vectors.
    _SPACY_MODEL = "en_core_web_lg"


    def __init__ (self, training_data):
        '''
        training_data: pandas DataFrame. Its first two columns hold the two
        text sequences to compare; it must also have an 'is_duplicate' column
        with the labels.
        Raises TypeError if training_data is not a DataFrame and ValueError if
        the required columns are missing.
        '''

        # Verify training data
        if not isinstance(training_data, pd.DataFrame):
            raise TypeError('training_data must be a pandas DataFrame')

        columns = list(training_data.columns)
        if 'is_duplicate' not in columns:
            raise ValueError("training_data must contain an 'is_duplicate' column")
        if len(columns) < 3 or 'is_duplicate' in columns[:2]:
            raise ValueError("the first two columns of training_data must be the "
                             "text sequences, followed by 'is_duplicate'")

        # Work on a private copy with a clean 0..n-1 index: fit() rewrites the
        # text columns and adds new ones, which must not leak into the caller's frame.
        self._training_data = training_data.copy().reset_index(drop=True)


    def __require_attributes(self, *attributes):
        '''
        Raises AttributeError with an actionable message if any of the named
        instance attributes has not been set yet.
        '''

        missing = [name for name in attributes if getattr(self, name, None) is None]
        if missing:
            raise AttributeError(f'{", ".join(missing)} not set; call fit() '
                                 '(or load_model() for the model) before using this method')


    def __text_to_word_list(self, text):
        '''
        Helper method used to preprocess text.
        text: A sequence of words of string type
        Returns text with all characters removed except for letters of the
        alphabet and space character. All letters are converted to lower case.
        Missing values (None, NaN, pd.NA) are returned as an empty string
        instead of being turned into the words "none"/"nan".
        '''

        if text is None or text is pd.NA or (isinstance(text, float) and text != text):
            return ''

        return re.sub('[^a-z ]', '', str(text).lower())


    def __encode_tokens(self, text):
        '''
        Helper method that converts preprocessed text into a list of word ids.
        text: string as returned by __text_to_word_list
        Words not seen before are added to self._vocabulary. Ids start at 1;
        id 0 is never assigned because it is reserved for zero padding / the
        all-zero embedding row.
        Returns the list of ids, one per whitespace-separated word.
        '''

        word_ids = []

        # split() without an argument drops the empty strings that repeated,
        # leading or trailing spaces would otherwise add to the vocabulary
        for word in text.split():
            if word not in self._vocabulary:
                self._vocabulary[word] = len(self._vocabulary) + 1
            word_ids.append(self._vocabulary[word])

        return word_ids


    def __create_vocabulary(self):
        '''
        Builds self._vocabulary (word -> id) from the first two columns of the
        training data. Each of the two columns is replaced by its preprocessed
        text and a 'sequence1_vect' / 'sequence2_vect' column holding the
        list of word ids of every row is added.
        '''

        self._vocabulary = {}

        # Iterate through each sequence column
        for idx, sequence in enumerate(self._training_data.columns[:2]):

            cleaned_texts = []
            encoded_texts = []

            # Iterate through each row
            for text_old in tqdm(self._training_data[sequence]):
                text_new = self.__text_to_word_list(text_old)
                cleaned_texts.append(text_new)
                encoded_texts.append(self.__encode_tokens(text_new))

            self._training_data[sequence] = cleaned_texts

            # Wrap in an object Series so every cell holds one list of ids
            self._training_data[f'sequence{idx+1}_vect'] = pd.Series(
                encoded_texts, index=self._training_data.index, dtype=object)


    def __create_embeddings(self):
        '''
        Builds self._embeddings, a (vocabulary size + 1) x _EMBEDDING_DIM
        matrix whose row i is the spaCy vector of the word with id i.
        Row 0 stays all zeros and is used for padding.
        '''

        nlp = spacy.load(self._SPACY_MODEL)

        # Create embeddings matrix based on vocabulary
        self._embeddings = np.zeros((len(self._vocabulary) + 1, self._EMBEDDING_DIM))

        for word, index in tqdm(self._vocabulary.items()):
            self._embeddings[index] = nlp(word).vector


    def __create_train_validation(self):
        '''
        Splits the training data 80/20 into training and validation sets and
        zero-pads every id sequence (at the end) to the length of the longest one.
        Sets self._X_train / self._X_val (dicts with 'left' and 'right' arrays),
        self._Y_train / self._Y_val and self._max_seq_length.
        '''

        # Create training and validation data
        train, val = train_test_split(self._training_data, test_size=0.2)

        # Create dictionaries with left/right keys for two sets of sequences
        self._X_train = {'left': train['sequence1_vect'].tolist(),
                         'right': train['sequence2_vect'].tolist()}
        self._Y_train = train['is_duplicate'].values

        self._X_val = {'left': val['sequence1_vect'].tolist(),
                       'right': val['sequence2_vect'].tolist()}
        self._Y_val = val['is_duplicate'].values

        # Calculate maximum length of sequences; train and validation together
        # are exactly the rows of the full frame
        self._max_seq_length = int(max(self._training_data[column].map(len).max()
                                       for column in ('sequence1_vect', 'sequence2_vect')))

        # Zero padding. A local loop variable is used on purpose: looping with
        # an instance attribute would overwrite the training DataFrame.
        for split in (self._X_train, self._X_val):
            for side in ('left', 'right'):
                split[side] = pad_sequences(split[side], maxlen=self._max_seq_length, padding='post')

        # Sanity check
        assert(self._X_train['left'].shape == self._X_train['right'].shape)
        assert(self._X_val['left'].shape == self._X_val['right'].shape)


    def __exponent_neg_manhattan_distance(self, left, right):

        ''' Calculates manhattan similarity estimate of the LSTMs outputs'''
        return K.exp(-K.sum(K.abs(left-right), axis=1, keepdims=True))


    def __create_model(self, ):
        '''
        Builds self.model: two inputs of length self._max_seq_length that share
        one embedding layer and one LSTM, joined by the exp(-Manhattan distance)
        of the two LSTM outputs.
        '''

        # Left and right input layers
        left_input = Input(shape=(self._max_seq_length,))
        right_input = Input(shape=(self._max_seq_length,))

        # Embeddings layer
        embedding_layer = Embedding(input_dim=len(self._embeddings), # Size of vocabulary (+1 for the padding row)
                                    output_dim=self._embeddings.shape[1], # Dimension of dense embedding
                                    weights=[self._embeddings], # Start from the precomputed spaCy vectors
                                    trainable=False, # Keep those vectors fixed during training
                                    input_length=self._max_seq_length # Length of input sequences
                                    )

        left_embedding, right_embedding = embedding_layer(left_input), embedding_layer(right_input)

        # LSTM layer that is used on both sides
        lstm = LSTM(64)

        left_output, right_output = lstm(left_embedding), lstm(right_embedding)

        # Calculate manhattan distance for LSTM
        # NOTE(review): a Lambda layer wrapping a Python lambda may not round-trip
        # through save_model()/load_model() in every Keras version - confirm.
        manhat_dist = Lambda(function=lambda x: self.__exponent_neg_manhattan_distance(x[0], x[1]),
                            output_shape=lambda x: (x[0][0], 1))([left_output, right_output])

        # Setup model
        self.model = Model(inputs=[left_input, right_input], outputs=[manhat_dist])


    def __compile_model(self):
        ''' Compiles self.model with binary cross-entropy loss, the nadam optimizer and binary accuracy. '''

        # metrics is passed as a list, the form documented for Model.compile
        self.model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['binary_accuracy'])


    def __fit_model(self, batch_size=32, epochs=15):
        '''
        Trains self.model on the training split, validating on the validation
        split after each epoch. Stores the Keras history in self.model_history
        and prints the elapsed time in whole minutes.
        '''

        time_start = time.time()

        self.model_history = self.model.fit([self._X_train['left'], self._X_train['right']], self._Y_train,
                            batch_size=batch_size,
                            epochs=epochs,
                            validation_data=([self._X_val['left'], self._X_val['right']], self._Y_val))

        time_end = time.time()

        print(f'Training time is approximately {int((time_end - time_start) / 60)} minutes')


    def fit(self, batch_size=32, epochs=15):
        '''
        Runs the whole pipeline: vocabulary, embeddings, train/validation
        split, model creation, compilation and training.
        batch_size: number of samples per gradient update passed to Keras
        epochs: number of training epochs passed to Keras
        '''

        print('Creating Vocabulary...')
        print('--------------------\n')
        self.__create_vocabulary()

        print('\n\nCreating embeddings...')
        print('--------------------\n')
        self.__create_embeddings()

        print('\n\nCreating training and validation datasets...')
        print('--------------------\n')
        self.__create_train_validation()

        print('\n\nCreating model...')
        print('--------------------\n')
        self.__create_model()

        print('\n\nCompiling model...')
        print('--------------------\n')
        self.__compile_model()

        print('\n\nFitting model...')
        print('-------------------\n')
        self.__fit_model(batch_size=batch_size, epochs=epochs)


    def summarize_model(self):
        ''' Prints the Keras summary of the model. Raises AttributeError if no model exists yet. '''

        self.__require_attributes('model')
        self.model.summary()


    def evaluate_model(self):
        '''
        Evaluates the model on the validation split and prints its loss and
        its accuracy (as a percentage).
        Raises AttributeError if the model or the validation split is missing.
        '''

        self.__require_attributes('model', '_X_val', '_Y_val')

        score = self.model.evaluate([self._X_val['left'], self._X_val['right']], self._Y_val, verbose=0)
        print(self.model.metrics_names[0], score[0])
        print("%s: %.2f%%" % (self.model.metrics_names[1], score[1]*100))


    def __history_figure(self, metric, title, yaxis_title):
        '''
        Helper method that builds one plotly figure with a 'Train' and a
        'Validation' line for the given history metric, one point per epoch.
        metric: key in self.model_history.history ('val_' + metric is used for validation)
        title: figure title
        yaxis_title: label of the y axis
        '''

        history = self.model_history.history

        fig = go.Figure()
        for name, key in (('Train', metric), ('Validation', f'val_{metric}')):
            values = history[key]
            fig.add_trace(go.Scatter(x=list(range(1, len(values)+1)),
                                     y=values,
                                     name=name))

        fig.update_layout(title_text=title,
                        title_x=0.5,
                        xaxis_title_text='Epoch',
                        yaxis_title_text=yaxis_title,
                        height=400,
                        width=800,
                        plot_bgcolor ='#FFFFFF',
                        xaxis_linecolor='#000000',
                        xaxis_linewidth=2,
                        xaxis_mirror=True,
                        yaxis_linecolor='#000000',
                        yaxis_linewidth=2,
                        yaxis_mirror=True)

        return fig


    def plot_model_accuracy_and_loss(self):
        '''
        Shows two figures: training vs. validation accuracy and training vs.
        validation loss per epoch, taken from the history recorded by fit().
        Raises AttributeError if no training history exists yet.
        '''

        self.__require_attributes('model_history')

        fig = self.__history_figure('binary_accuracy', 'Model Accuracy', 'Accuracy')
        fig2 = self.__history_figure('loss', 'Model Loss', 'Loss')

        fig.show()
        fig2.show()


    def save_model(self, path="model.h5"):
        '''
        Saves model and architecture to a single file.
        path: destination file, defaults to "model.h5" in the working directory
        Raises AttributeError if no model exists yet.
        '''

        self.__require_attributes('model')
        self.model.save(path)
        print("Saved model to disk")


    def load_model(self, path):
        '''
        Loads a previously saved model from path into self.model.
        Only the model is restored; the vocabulary, the data splits and the
        training history are not.
        '''

        # `load_model` here resolves to the function imported from keras.models
        self.model = load_model(path)

In [116]:
# Load a small random sample of the Quora question-pairs CSV and wrap it in the model class.
quora_source = 'https://raw.githubusercontent.com/ilyanovak/Manhattan-LSTM/main/data/quora.csv'

QUORA_TOTAL_ROWS = 404290  # number of data rows assumed in the CSV (header excluded) - TODO confirm
SAMPLE_SIZE = 100          # number of data rows to keep
SAMPLE_SEED = 42           # fixed seed so the same rows are drawn on every run

# Row 0 is the header and must never be skipped. The candidate range ends at
# QUORA_TOTAL_ROWS + 1 so that the last data row can be skipped like any other
# (with range(1, QUORA_TOTAL_ROWS) it would be part of every sample).
rows_to_skip = random.Random(SAMPLE_SEED).sample(range(1, QUORA_TOTAL_ROWS + 1),
                                                 QUORA_TOTAL_ROWS - SAMPLE_SIZE)
sample_training_data = pd.read_csv(quora_source, engine='python',
                                   usecols=['question1', 'question2', 'is_duplicate'],
                                   skiprows=rows_to_skip)
lstm = ManhatanLSTM(sample_training_data)

In [117]:
# Run the full pipeline: build the vocabulary and embeddings, split into
# training/validation data, then create, compile and train the model.
lstm.fit()

100%|██████████| 100/100 [00:00<00:00, 11714.95it/s]
100%|██████████| 100/100 [00:00<00:00, 10178.62it/s]

Creating Vocabulary...
--------------------



Creating embeddings...
--------------------




100%|██████████| 727/727 [00:06<00:00, 109.44it/s]




Creating training and validation datasets...
--------------------

max_seq_length: 39


Creating model...
--------------------



Compiling model...
--------------------



Fitting model...
-------------------

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Training time is approximately 0 minutes


In [92]:
# Plot training vs. validation accuracy and loss per epoch from the history
# recorded during lstm.fit().
lstm.plot_model_accuracy_and_loss()

AttributeError: ignored

# TEST

In [None]:
# ################################
# # Import leafly strains and concatenate each strain's feelings, helps, negatives and description into a single column

# strains = pd.read_json('/content/leafly.json')
# strains['strain_text'] = pd.Series(dtype='str')
# strains = strains.replace({None:""})

# columns = ['feeling_1', 'feeling_2', 'feeling_3', 'feeling_4', 'feeling_5',
#            'helps_1', 'helps_2', 'helps_3', 'helps_4', 'helps_5',
#            'negative_1', 'negative_2', 'negative_3', 'negative_4', 'negative_5',
#            'description']

# for i in range(0, len(strains)):
#     concat = ""
#     for col in columns:
#         concat = concat + " " + strains.loc[i, col]
#     strains.loc[i, 'strain_text'] = concat

# ################################

# '''
# This is an example of fake text submitted by a user in the website's UI.
# It is a concatenation of the user's selected feelings, helps and negatives terms.
# It also concatenates the user's submitted description, which is written in free form.
# It is used to test the neural network model's predictive accuracy on marijuana strains.
# The text can be changed to test different scenarios.
# '''
# user_text = 'Headache Dizzy Nausea Happy Creative I am very sad every day feeling terrible and I need something that will cure my insomnia and anxiety and will help me get out of bed each day.'

# # Add user text to strains data
# print('user_text:', user_text)
# ################################

# strains[['strain', 'strain_text']].head()