In [1]:
# essentials
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf

import config

## Loading models and vocabulary

In [2]:
model = tf.keras.models.load_model("models/word2vecs/pwc3-win7-vec100-min5/predicor-lr0.001-batch128.h5")

In [3]:
with open("models/word2vecs/pwc3-win7-vec100-min5/vocabulary_small.pkl", 'rb') as file:
    vocabulary = pickle.load(file)

with open("models/word2vecs/pwc3-win7-vec100-min5/word2vec.pkl", 'rb') as file:
    word2vec = pickle.load(file)

In [12]:
input_size = 100 * 3

## Evaluation 

In [4]:
'a' in vocabulary

True

In [5]:
""" Returns the first not belonging token in the vocabulary
"""
def belong_to_vocabulary(tokens: list, vocabulary: list):
    for token in tokens:
        if not token in vocabulary:
            return token
    
    return None

In [16]:
def vectorize(tokens: list, word2vec):
    embedded_tokens = np.array([])
    
    for token in tokens:
        embedded_tokens = np.append(embedded_tokens, word2vec.wv.get_vector(token))

    return embedded_tokens.reshape(1, input_size)

In [7]:
# quick test

print(belong_to_vocabulary(['a', 'story', 'the'], vocabulary))
print(belong_to_vocabulary(['a', 'story', 'marvelous', 'the'], vocabulary))

None
marvelous


In [13]:
model.predict(np.random.randn(1, input_size))



array([[0.67988485, 0.20432276, 0.07957686, ..., 0.12773414, 0.12160717,
        0.6805879 ]], dtype=float32)

In [14]:
test_tokens = ['a', 'story', 'about']
test_tokens = ['the', 'main', 'character']
test_tokens = ['my', 'feelings', 'were']

In [17]:
not_belonging = belong_to_vocabulary(test_tokens, vocabulary)

if not_belonging is None:
    y_pred = model.predict(vectorize(test_tokens, word2vec))
else:
    print("ERROR: Word \'{}\' does not belong to the vocabulary".format(not_belonging))



In [18]:
predicted_word = vocabulary[np.argmax(y_pred)]

In [19]:
predicted_word

'the'

In [23]:
file = open("data/reviews/neg/1_1.txt")
sample_review = file.read()

In [24]:
reviews_df = pd.read_csv("data/reviews_cleaned_sample.csv", converters={'tokenized': pd.eval})
reviews_df

Unnamed: 0,review,positive,clean_text,tokenized
0,"I must admit, I was one of the skeptics who pr...",False,i must admit i was one of the skeptics who pre...,"[i, must, admit, i, was, one, of, the, skeptic..."
1,Even though an animated film it really bored e...,False,even though an animated film it really bored e...,"[even, though, an, animated, film, it, really,..."
2,"Bah. Another tired, desultory reworking of an ...",False,bah another tired desultory reworking of an ou...,"[bah, another, tired, desultory, reworking, of..."
3,I had the opportunity to see this film debut a...,False,i had the opportunity to see this film debut a...,"[i, had, the, opportunity, to, see, this, film..."
4,Was lucky enough to be an extra in this great ...,False,was lucky enough to be an extra in this great ...,"[was, lucky, enough, to, be, an, extra, in, th..."
...,...,...,...,...
1495,This is a quirky movie that the Brits do so we...,False,this is a quirky movie that the brits do so we...,"[this, is, a, quirky, movie, that, the, brits,..."
1496,For a long time it seemed like all the good Ca...,False,for a long time it seemed like all the good ca...,"[for, a, long, time, it, seemed, like, all, th..."
1497,This is a cartoon series where most of the act...,False,this is a cartoon series where most of the act...,"[this, is, a, cartoon, series, where, most, of..."
1498,"Sequel to ""The Kingdom"" is bloodier and even m...",False,sequel to the kingdom is bloodier and even mor...,"[sequel, to, the, kingdom, is, bloodier, and, ..."


In [25]:
sample_review = reviews_df.tokenized[0]
type(sample_review)

list

In [20]:
num_proposed_word = 5

In [21]:
predicted_indices = np.argsort(y_pred).reshape(len(vocabulary))[-num_proposed_word:]

In [28]:
for predicted_index in predicted_indices:
    print("a story about ", vocabulary[predicted_index])

a story about  in
a story about  to
a story about  and
a story about  a
a story about  the


In [27]:
for index in range(config.PREVIOUS_WORDS_CONSIDERED, len(sample_review)):
    tokens = sample_review[index-config.PREVIOUS_WORDS_CONSIDERED:index]
    
    not_belonging = belong_to_vocabulary(tokens, vocabulary)

    if not_belonging is None:
        y_pred = model.predict(vectorize(test_tokens, word2vec), verbose=0)

        predicted_indices = np.argsort(y_pred).reshape(len(vocabulary))[-num_proposed_word:]

        # for index in predicted_indices:
        #     print(vocabulary[index])
        # print()
        if vocabulary[np.argmax(y_pred)] != "the":
            print(tokens, vocabulary[np.argmax(y_pred)])
            print()
