## Neural Network with LSTMs for Text Similarity


In [7]:
import tensorflow as tf
import pandas as pd
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the observation reports and extract the free-text column that the
# rest of the notebook tokenizes.
df = pd.read_excel('SORs.xlsx')
df.head()  # NOTE: only the last expression of a cell is rendered, so this shows nothing
# Drop empty cells and force every entry to str: a blank spreadsheet cell is
# read as float NaN (and a purely numeric one as int/float), and the Keras
# Tokenizer lower-cases each entry with .lower(), so a single such value
# would crash fit_on_texts().
dataList = df['Observation Details'].dropna().astype(str).to_list()
print(len(dataList))

201


In [3]:
# --- Tokenizer settings ---
vocab_size = 10000       # handed to the Tokenizer as num_words
oov_tok = "<OOV>"        # placeholder token for out-of-vocabulary words

# --- Sequence shaping (pad_sequences) ---
max_length = 120         # every sequence is padded/truncated to this length
padding_type = "post"    # pad short sequences at the end
trunc_type = "post"      # cut over-long sequences at the end

# --- Embedding ---
embedding_dim = 100      # width of each word vector

In [4]:
# Pre-processing: fit the vocabulary on the corpus, encode each text as a
# list of word indices, then bring all of them to one common length.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(dataList)
word_index = tokenizer.word_index  # word -> integer index mapping

sequences = tokenizer.texts_to_sequences(dataList)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [None]:
# Sanity check: number of entries in the tokenizer's word index, and the
# integer index assigned to the word 'and'.
# NOTE(review): the second print raises KeyError if 'and' never occurs in
# the corpus.
print(len(word_index))
print(word_index['and'])

In [None]:
# Note this is the 100 dimension version of GloVe from Stanford
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt \
    -O /tmp/glove.6B.100d.txt
embeddings_index = {};
with open('/tmp/glove.6B.100d.txt') as f:
    for line in f:
        values = line.split();
        word = values[0];
        coefs = np.asarray(values[1:], dtype='float32');
        embeddings_index[word] = coefs;

embeddings_matrix = np.zeros((vocab_size+1, embedding_dim));
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word);
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector;

In [None]:
# Sanity check: number of rows in the embedding matrix and the length of one
# pretrained vector.
# NOTE(review): the second print raises KeyError if 'and' is missing from
# the loaded embeddings.
print(len(embeddings_matrix))
print(len(embeddings_index['and']))  # 100-dimensional embedding vector

In [None]:
# Embedding -> Conv1D -> MaxPooling1D -> LSTM -> Dense network that is fitted
# with the padded index sequences as both input and target.
model = tf.keras.Sequential([
    # Lookup table initialised from embeddings_matrix and kept frozen.
    tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length, weights=[embeddings_matrix], trainable=False),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.LSTM(64),
    # The target passed to fit() is `padded`, whose width is max_length, so
    # the output layer must have exactly that many units. Tie it to
    # max_length instead of repeating the literal 120, otherwise changing
    # max_length produces a shape mismatch at training time.
    tf.keras.layers.Dense(units=max_length, activation='relu')
])
# NOTE(review): 'accuracy' is a classification metric; with a mean-squared-
# error loss on integer index targets it is probably not informative.
# Consider 'mae' -- left unchanged here to keep the history keys stable.
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
model.summary()

num_epochs = 100
# NOTE(review): validation_data is the training data itself, so the val_*
# values do not measure generalisation. Hold out a split if that is needed.
history = model.fit(padded, padded, epochs=num_epochs,validation_data=(padded, padded), verbose=2)

In [None]:
import matplotlib.pyplot as plt
# Use matplotlib in notebook output
%matplotlib inline
loss=history.history['loss']
epochs=range(len(loss))

plt.plot(epochs, loss, 'r')
