# Importing the libraries that the program needs

In [1]:
import sys
print("Python:", sys.version)

import numpy as np
print("NumPy:", np.__version__)

import pandas as pd
print("Pandas:", pd.__version__)

import sklearn
print("Scikit-learn:", sklearn.__version__)

import spacy
from spacy.tokens import Doc
from spacy.attrs import IS_TITLE, LOWER, IS_ALPHA, IS_UPPER, IS_DIGIT
print("SpaCy:", spacy.__version__)

import nltk

Python: 3.7.10 (default, Feb 20 2021, 21:17:23) 
[GCC 7.5.0]
NumPy: 1.19.5
Pandas: 1.1.5
Scikit-learn: 0.22.2.post1
SpaCy: 2.2.4


# Downloading the brown corpus

In [2]:
# Fetch the Brown corpus into the local NLTK data directory, then import its reader.
import nltk
nltk.download('brown')
from nltk.corpus import brown

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [3]:
# Load spaCy's English pipeline; it is used later to tokenize custom user input.
# NOTE(review): "en" is a shortcut model name — presumably only valid on spaCy 2.x
# (2.2.4 is the version printed by the first cell); confirm before upgrading spaCy.
nlp = spacy.load("en")

# We load the words from the corpus and, while iterating over them, filter them with the `isalpha` method. This removes tokens containing characters like !#%&?, which we don't need during training because these characters don't carry much meaning in our case.

# We also transform the words into lower case format, so we can avoid having duplicates of the same word like "The" and "the". 

In [4]:
# Keep only the purely alphabetic corpus tokens and normalise them to lower case.
words = list(map(str.lower, filter(str.isalpha, brown.words())))
words[:10]

['the',
 'fulton',
 'county',
 'grand',
 'jury',
 'said',
 'friday',
 'an',
 'investigation',
 'of']

# In the next step we transform the list of words into partial sentences of 6 word length. Basically our model will learn to predict the next word based on the previous 5 words.

# While we are creating the partial sequences we also do another filtering where we skip those partial sequences where one of the words is a non frequent one. A word is frequent if it appears at least 10 times in the corpus. This will help us to train the model, because it's hard to learn a non frequent word, given that it doesn't appear often.

In [5]:
# Count every word, then keep the ones that occur at least 10 times.
frequency = nltk.FreqDist(words)
frequent_words = {word for word, count in frequency.items() if count >= 10}

In [6]:
def contains_rare_word(sent):
  """Return True if at least one word of `sent` is missing from `frequent_words`."""
  for word in sent:
    if word not in frequent_words:
      return True
  return False

In [7]:
# Slide a window of `length` words (5 context words + 1 target word) over the corpus.
length = 5 + 1
partial_sents = list()
# `i` is the exclusive end of the window, so it has to reach len(words) itself;
# stopping one short would silently drop the final window of the corpus.
for i in range(length, len(words) + 1):
  word_seq = words[i-length:i]
  # Skip windows that contain a word outside the frequent vocabulary.
  if contains_rare_word(word_seq):
    continue
  sentence = ' '.join(word_seq)
  partial_sents.append(sentence)
print("Total sentences: ", len(partial_sents))

Total sentences:  610583


# The word embedding layer expects integers as its input, so we need to transform our data.

# To do this we need to map our words to a unique integer and encode our input sequences. When the model makes a prediction, we can convert it back to a word using the same mapping.

# We use the keras Tokenizer to do this which we first train on the dataset, where it learns all of the unique words and assigns a unique integer to each of them, then we can use this trained Tokenizer to transform a list of words into a list of integers.

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Learn the word -> integer mapping from the filtered 6-word windows.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(partial_sents)
# Encode every window as a list of integer word ids using that mapping.
sents = tokenizer.texts_to_sequences(partial_sents)

In [9]:
sents[:3]

[[1, 4527, 767, 2762, 1514, 54],
 [4527, 767, 2762, 1514, 54, 1921],
 [767, 2762, 1514, 54, 1921, 33]]

# We can access the mapping with the word_index attribute.

In [10]:
# One more than the number of distinct words in the tokenizer's mapping; this value
# is later used as the Embedding input size and as the number of output classes.
# NOTE(review): the +1 presumably accounts for index 0, which the tokenizer does not
# assign to any word — confirm against the Keras Tokenizer documentation.
unique_words = len(tokenizer.word_index) + 1
print("Total words: ", len(words))
print("Unique words after filtering: ", unique_words)

Total words:  981716
Unique words after filtering:  8144


# Now that we have our encoded sentences, we need to split them into X and y, where X is the input and y is the expected output.

# We also need to one-hot encode the output words, because this way the model will learn to predict the probability distribution of the next word.

In [11]:
from tensorflow.keras.utils import to_categorical

# Stack the encoded windows into a 2-D array: one row per window.
temp_array = np.array(sents)
# Everything but the last id of a row is the model input ...
X = temp_array[:, :-1]
# ... and the last id is the target, expanded into a one-hot row.
y = to_categorical(temp_array[:, -1], num_classes=unique_words, dtype=np.uint8)

In [12]:
y.shape

(610583, 8144)

# Now we can define our model. We provide the number of unique words and the length of our input sentences to the embedding layer and choose 10 for the output dimension.

# We use a single LSTM layer; with more layers we might achieve better results, but the training time would increase.

# On the last LSTM layer we use the return_state=True, so it will return the cell state

# At the end we have a Dense layer with a softmax activation, which predicts the next word as a probability for each word in the vocabulary.

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow import keras

# Hyper-parameters of the network.
EMBEDDING_DIM = 10   # size of each learned word vector
LSTM_UNITS = 100     # number of units in the LSTM layer

# Input: one row of word ids per sample (X.shape[1] ids per row).
inputs = keras.Input(shape=(X.shape[1],))
m = Embedding(unique_words, EMBEDDING_DIM, input_length=X.shape[1])(inputs)
# With return_state=True the layer call yields three tensors; only the first one
# feeds the classifier, the other two (the LSTM states) are discarded here.
m, _, _ = LSTM(LSTM_UNITS, return_state=True)(m)
# One probability per vocabulary entry for the next word.
outputs = Dense(unique_words, activation='softmax')(m)

model = keras.Model(inputs=inputs, outputs=outputs, name="language_model")
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" to the cell output.
model.summary()

Model: "language_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 5)]               0         
_________________________________________________________________
embedding (Embedding)        (None, 5, 10)             81440     
_________________________________________________________________
lstm (LSTM)                  [(None, 100), (None, 100) 44400     
_________________________________________________________________
dense (Dense)                (None, 8144)              822544    
Total params: 948,384
Trainable params: 948,384
Non-trainable params: 0
_________________________________________________________________
None


# Because the model could be interpreted as learning a multiclass classification we use categorical cross entropy.

# For the optimizer we use Adam.

In [14]:
# The targets are one-hot rows, hence categorical cross-entropy as the loss.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train for 5 epochs in mini-batches of 64 samples.
model.fit(X, y, batch_size=64, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f9ed41a3ad0>

# Generating words

# Here we define a function that we can use to generate an arbitrarily long text. To do this we need a seed text as input, which we first encode using the Tokenizer and then truncate (or pad) to a length of 5 words, because our model was trained on 5-word inputs. After predicting the index of the next word using the model, we look up the corresponding word. Finally, we append it to the end of our seed text and repeat the process.

In [15]:
from keras.preprocessing.sequence import pad_sequences

def int_to_word(predicted):
  """Translate a predicted word id back into its word.

  Args:
    predicted: the word id, either a plain integer or a one-element NumPy
      array (e.g. the result of `np.argmax(..., axis=1)` for a single sample).

  Returns:
    The word the tokenizer assigned to that id, or "" when no word has that id.
  """
  # Collapse a one-element array to a Python scalar so it can serve as a dict key.
  index = np.asarray(predicted).item()
  # index_word is the tokenizer's reverse (id -> word) mapping: a single dict
  # lookup instead of scanning every (word, id) pair of word_index.
  return tokenizer.index_word.get(index, "")

def generate_text(model, seed_text, generate_num, test=False):
  """Generate `generate_num` words that continue `seed_text`.

  Each step encodes the current text with the tokenizer, keeps (or pads to) the
  last `length - 1` word ids, asks `model` for the next-word probabilities and
  appends the most probable word to the text.

  Args:
    model: object whose `predict` returns one row of per-word probabilities
      for one row of word ids.
    seed_text: text to continue.
    generate_num: number of words to generate.
    test: when True, print the text, the ids and the padded ids of every step.

  Returns:
    The generated words joined by single spaces (the seed text is not included).
  """
  generated = list()
  text = seed_text
  input_len = length - 1

  for _ in range(generate_num):
    if test:
      print("Text:", text)
    encoded = tokenizer.texts_to_sequences([text])[0]
    if test:
      print("Encoded:", encoded)
    # Keep only the most recent `input_len` ids; shorter inputs are padded.
    encoded = pad_sequences([encoded], maxlen=input_len, truncating='pre')
    if test:
      print("Padded:", encoded)
    pred_probability = model.predict(encoded, verbose=0)
    # argmax along axis 1 yields a one-element array for this batch of one;
    # hand a plain int to the lookup so it never depends on array truthiness.
    pred = int(np.argmax(pred_probability, axis=1)[0])
    pred_word = int_to_word(pred)
    text += ' ' + pred_word
    generated.append(pred_word)
  return ' '.join(generated)

In [29]:
from random import randint

# randint includes both end points, so the upper bound must be the last valid
# index; len(partial_sents) itself would occasionally raise an IndexError.
seed_text = partial_sents[randint(0, len(partial_sents) - 1)]

generated = generate_text(model, seed_text, 5)
print("Start text:", seed_text)
print("Generate text:", generated)

Start text: should be substantially reduced and ultimately
Generate text: as the result of the


# Custom input

In [17]:
def custom_input(text):
  """Predict the single next word for free-form `text`.

  The text is tokenized with spaCy, lower-cased and re-joined with spaces
  before it is handed to `generate_text`.
  """
  lowered_tokens = (token.text.lower() for token in nlp(text))
  return generate_text(model, ' '.join(lowered_tokens), 1)

In [31]:
# Keep predicting the next word for each entered line until an empty line is entered.
print("Exit with ''(enter)")
while True:
  start_text = input()
  if start_text == "":
    break
  next_word = custom_input(start_text)
  print("Next word:", next_word)

Exit with ''(enter)
 
Next word: and



# Search Engine Part

# Define the helper functions to get the LSTM cell state. The LSTM layer is the 3rd one, that's why we use that layer as our output. 

In [32]:
from keras import backend as K

# Backend function from the model's input tensor to the outputs of layer 2,
# which is the LSTM layer according to the model summary.
last_lstm = K.function([model.layers[0].input],
                       [model.layers[2].output])

def get_cell_state(word_id_vector):
  """Return the LSTM cell state for one padded row of word ids.

  NOTE(review): the [0][0][2][0] chain presumably unwraps the nested output
  list, selects the third LSTM output (the cell state) and then the single
  batch row — confirm against the backend function's actual return nesting.
  """
  return last_lstm([word_id_vector])[0][0][2][0]

def get_cell_state_from_sentence(sent):
  """Encode `sent` with the tokenizer and return its LSTM cell state."""
  # Derive the input length here rather than reading a global `input_len`:
  # that global is only assigned by a later cell, so calling this function
  # before that cell has run raised a NameError.
  context_len = length - 1
  encoded = tokenizer.texts_to_sequences([sent])[0]
  encoded = pad_sequences([encoded], maxlen=context_len, truncating='pre')
  return get_cell_state(encoded)

In [33]:
def cosine_similarity(v1, v2):
  """Return the cosine of the angle between two 1-D vectors.

  Args:
    v1, v2: equally long 1-D sequences or arrays of numbers.

  Returns:
    dot(v1, v2) / (|v1| * |v2|), or 0.0 when either vector has zero norm
    (the ratio is undefined there and would otherwise come out as NaN).
  """
  denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
  if denominator == 0:
    return 0.0
  return np.dot(v1, v2) / denominator

# Transforms a sentence into a vector by using the trained model and extracting the cell state, then compares it with another vector to get the cosine similarity

In [34]:
def sentence_cosine_similarity(a,b):
    """Return the cosine similarity of the LSTM cell states of sentences `a` and `b`."""
    input_len = length - 1

    def cell_state_of(sentence):
        # Encode, pad/truncate to the model's input length, then run the model.
        ids = tokenizer.texts_to_sequences([sentence])[0]
        padded = pad_sequences([ids], maxlen=input_len, truncating='pre')
        return get_cell_state(padded)

    a_cell_state = cell_state_of(a)
    b_cell_state = cell_state_of(b)
    return cosine_similarity(a_cell_state, b_cell_state)

# Custom sentence similarities (Cosine similarity)

In [37]:
# Read the two sentences to compare, one per input line.
a, b = input(), input()
similarity = sentence_cosine_similarity(a, b)
for label, value in (("Sentence 1:", a), ("Sentence 2:", b), ("Similarity:", similarity)):
  print(label, value)

Hello
Hi
Sentence 1: Hello
Sentence 2: Hi
Similarity: 0.92856324


# Mini search engine

In [38]:
!pip install annoy

Collecting annoy
[?25l  Downloading https://files.pythonhosted.org/packages/a1/5b/1c22129f608b3f438713b91cd880dc681d747a860afe3e8e0af86e921942/annoy-1.17.0.tar.gz (646kB)
[K     |████████████████████████████████| 655kB 8.4MB/s 
[?25hBuilding wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.0-cp37-cp37m-linux_x86_64.whl size=391640 sha256=85f8340d56071509a97b06cf844fa909cd64101b0d80f06cd8b7f3d9012c71ce
  Stored in directory: /root/.cache/pip/wheels/3a/c5/59/cce7e67b52c8e987389e53f917b6bb2a9d904a03246fadcb1e
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.0


In [39]:
from annoy import AnnoyIndex

In [40]:
len(brown.sents())

57340

# We use annoy to index all of the sentences in the brown corpus. First we transform the sentence into a vector by passing it through our model and taking the cell state representation of the lstm layer, then we pass this vector into annoy to store it.

In [68]:
import tqdm

# Length of the item vectors that will be indexed.
f = 100
t = AnnoyIndex(f, 'angular')
input_len = length - 1

# Turn every Brown sentence into its LSTM cell state and store it under the
# sentence's position in the corpus; the progress bar advances once per sentence.
with tqdm.tqdm(brown.sents(), total=len(brown.sents())) as pbar:
  for i, sent in enumerate(pbar):
    encoded = pad_sequences([tokenizer.texts_to_sequences([sent])[0]],
                            maxlen=input_len, truncating='pre')
    cell_state = get_cell_state(encoded)
    t.add_item(i, cell_state)

100%|██████████| 57340/57340 [04:14<00:00, 225.72it/s]


# We build the trees which will be used for querying. With more trees we can get a higher precision.

In [69]:
# Build the 10 trees that the nearest-neighbour queries below run against.
t.build(10)

True

# Testing with a sentence. Annoy returns the 5 closest neighbours.

In [72]:
def get_sentence(index):
  """Return the Brown corpus sentence stored at position `index`."""
  corpus_sentences = brown.sents()
  return corpus_sentences[index]

In [73]:
# Embed the query with the trained model, as was done for the indexed sentences.
search = "this is a test"
cell_state = get_cell_state_from_sentence(search)
# Ask the index for the 5 items closest to the query vector.
neighbours = t.get_nns_by_vector(cell_state, 5)
# The returned ids are corpus positions; print the sentence behind each one.
for nn in neighbours:
  print(get_sentence(nn))

['The', 'chase', 'in', 'itself', 'is', 'a', 'narrative', ';', ';']
['This', 'is', 'a', 'mistake', '.']
['And', 'the', '100,000', 'subscribers', 'became', 'a', 'reality', '.']
['The', 'question', 'becomes', ',', '``', 'What', 'is', 'a', 'dream', "''", '?', '?']
['``', 'There', 'must', 'be', 'a', 'line', "''", '!', '!']


# Custom input

In [75]:
# Interactive search: print the 5 nearest corpus sentences for every entered
# query until an empty line is entered.
print("Exit with ''(enter)")
while True:
  search = input()
  if search == "":
    break
  cell_state = get_cell_state_from_sentence(search)
  neighbours = t.get_nns_by_vector(cell_state, 5)
  for nn in neighbours:
    print(get_sentence(nn))

Exit with ''(enter)
Hello how are you?
['``', 'Hello', ',', 'Julie', ',', 'how', 'are', 'you', "''", '?', '?']
['If', 'he', 'bites', 'a', 'playmate', 'she', 'says', ',', '``', 'Danny', "won't", 'like', 'you', "''", '.']
['It', "wouldn't", 'matter', 'to', 'a', 'fool', 'like', 'you', '.']
['If', 'you', 'have', 'an', 'annual', 'or', 'regular', 'physical', 'examination', 'program', ',', 'is', 'it', 'worth', 'what', 'it', 'is', 'costing', 'you', '?', '?']
['``', "I'd", 'give', 'anything', 'if', 'I', 'could', 'have', 'found', 'a', 'girl', 'like', 'you', "''", '.']
Once
['Now', '!', '!']
['Now', '!', '!']
['``', 'Now', "''", '?', '?']
['Now', '!', '!']
['Then']
 
['(', '6', ')']
['Sec.', '2', '.']
['Mr.', 'Stratton', '.']
[')', '9', '.']
['2', '.']

