In [4]:
# essentials
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf

import config

## Loading models and vocabulary

In [5]:
model = tf.keras.models.load_model("models/nn-predictors/first_model.h5")

In [6]:
with open("vocabularies/vocabulary_small.pkl", 'rb') as file:
    vocabulary = pickle.load(file)

with open("models/word2vecs/the_smallest.pkl", 'rb') as file:
    word2vec = pickle.load(file)

## Evaluation 

In [7]:
'a' in vocabulary

True

In [8]:
""" Returns the first not belonging token in the vocabulary
"""
def belong_to_vocabulary(tokens: list, vocabulary: list):
    for token in tokens:
        if not token in vocabulary:
            return token
    
    return None

In [9]:
def vectorize(tokens: list, word2vec):
    embedded_tokens = np.array([])
    
    for token in tokens:
        embedded_tokens = np.append(embedded_tokens, word2vec.wv.get_vector(token))

    return embedded_tokens.reshape(1, 30)

In [10]:
# quick test

print(belong_to_vocabulary(['a', 'story', 'the'], vocabulary))
print(belong_to_vocabulary(['a', 'story', 'marvelous', 'the'], vocabulary))

None
marvelous


In [11]:
model.predict(np.random.randn(1, 30))



array([[3.8113226e-03, 2.9834694e-01, 7.8284580e-01, ..., 1.1835092e-03,
        2.2432350e-06, 1.3455294e-01]], dtype=float32)

In [12]:
test_tokens = ['a', 'story', 'about']
test_tokens = ['the', 'main', 'character']
test_tokens = ['my', 'feelings', 'were']

In [13]:
not_belonging = belong_to_vocabulary(test_tokens, vocabulary)

if not_belonging is None:
    y_pred = model.predict(vectorize(test_tokens, word2vec))
else:
    print("ERROR: Word \'{}\' does not belong to the vocabulary".format(not_belonging))



In [14]:
predicted_word = vocabulary[np.argmax(y_pred)]

In [15]:
predicted_word

'the'

In [16]:
num_proposed_word = 5

In [17]:
predicted_indices = np.argsort(y_pred).reshape(len(vocabulary))[-num_proposed_word:]

In [18]:
predicted_indices

array([185,  74,   2,   8,  27], dtype=int64)

In [20]:
file = open("data/reviews/neg/1_1.txt")
sample_review = file.read()

In [32]:
reviews_df = pd.read_csv("data/reviews_cleaned_sample.csv", converters={'tokenized': pd.eval})
reviews_df

Unnamed: 0,review,positive,preprocessed,tokenized
0,Story of a man who has unnatural feelings for ...,False,story of a man who has unnatural feelings for ...,"[story, of, a, man, who, has, unnatural, feeli..."
1,Airport '77 starts as a brand new luxury 747 p...,False,airport starts as a brand new luxury plane i...,"[airport, starts, as, a, brand, new, luxury, p..."
2,This film lacked something I couldn't put my f...,False,this film lacked something i couldnt put my fi...,"[this, film, lacked, something, i, couldnt, pu..."
3,"Sorry everyone,,, I know this is supposed to b...",False,"sorry everyone,,, i know this is supposed to b...","[sorry, everyone, ,, ,, ,, i, know, this, is, ..."
4,When I was little my parents took me along to ...,False,when i was little my parents took me along to ...,"[when, i, was, little, my, parents, took, me, ..."
...,...,...,...,...
495,B movie at best. Sound effects are pretty good...,False,"b movie at best, sound effects are pretty good...","[b, movie, at, best, ,, sound, effects, are, p..."
496,I chose to see this movie because it got a goo...,False,i chose to see this movie because it got a goo...,"[i, chose, to, see, this, movie, because, it, ..."
497,"Oh Dear Lord, How on Earth was any part of thi...",False,"oh dear lord, how on earth was any part of thi...","[oh, dear, lord, ,, how, on, earth, was, any, ..."
498,This is not a film you can really analyse sepa...,False,this is not a film you can really analyse sepa...,"[this, is, not, a, film, you, can, really, ana..."


In [34]:
sample_review = reviews_df.tokenized[0]
type(sample_review)

list

In [38]:
for index in range(config.PREVIOUS_WORDS_CONSIDERED, len(sample_review)):
    tokens = sample_review[index-config.PREVIOUS_WORDS_CONSIDERED:index]
    
    not_belonging = belong_to_vocabulary(tokens, vocabulary)

    if not_belonging is None:
        y_pred = model.predict(vectorize(test_tokens, word2vec), verbose=0)

        predicted_indices = np.argsort(y_pred).reshape(len(vocabulary))[-num_proposed_word:]

        for index in predicted_indices:
            print(vocabulary[index])
        print()
        # if vocabulary[np.argmax(y_pred)] != "the":
        #     print(tokens, vocabulary[np.argmax(y_pred)])
        #     print()


i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,
the

i
and
a
,


In [19]:
for predicted_index in predicted_indices:
    print("a story about ", vocabulary[predicted_index])

a story about  i
a story about  and
a story about  a
a story about  ,
a story about  the


In [None]:
np.random.randn(1, 30).shape

(1, 30)