In [5]:
# essentials
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import os

import tensorflow as tf

import config
from utils.DataGenerator import DataGenerator

## Loading models and vocabulary

In [6]:
# Restore the trained Keras network from the .h5 file stored in the
# configured model directory.
model_path = config.NN_MODEL_DIR + "dense-lr0.0001-batch128-acc18.h5"
model = tf.keras.models.load_model(model_path)

In [7]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so both artifacts below must come from a trusted source.

# Vocabulary that accompanies the network, stored next to the model file.
vocabulary_path = config.NN_MODEL_DIR + "vocabulary_small.pkl"
with open(vocabulary_path, "rb") as vocabulary_file:
    vocabulary = pickle.load(vocabulary_file)

# Word2vec object used later to vectorize the context words.
with open("models/word2vec-general.pkl", "rb") as word2vec_file:
    word2vec = pickle.load(word2vec_file)

## Evaluation 

In [8]:
# The 'tokenized' column is stored as text; the converter evaluates every
# cell while the CSV is being read.
# NOTE(review): pd.eval evaluates expressions - presumably each cell is a
# list literal of tokens; confirm against the file that produced this CSV.
reviews_df = pd.read_csv(
    "data/reviews_cleaned_sample.csv",
    converters={"tokenized": pd.eval},
)

In [9]:
# Position of the first review to inspect.
sample_index = 1

# Take that review and the one right after it (two rows in total).
sample_reviews = reviews_df["tokenized"].iloc[sample_index : sample_index + 2]

In [10]:
# Wrap the sampled reviews in the project's DataGenerator, handing it the
# vocabulary loaded above.
data_gen = DataGenerator(
    sample_reviews,
    vocabulary=vocabulary,
)

In [11]:
# create_dataset returns the context windows (X_words) and their targets
# (y_words) - presumably the word following each window; confirm in
# utils.DataGenerator.
context_size = config.PREVIOUS_WORDS_CONSIDERED
X_words, y_words = data_gen.create_dataset(context_size)

# Turn the context windows into the network's input using word2vec.
embedded_words = data_gen.vectorize(
    X_words,
    word2vec,
    input_size=config.INPUT_SIZE,
)

In [12]:
# Number of candidate words to report per context, and the number of entries
# in the vocabulary.
num_proposed_words = 5
vocabulary_size = len(vocabulary)

# Model output for every embedded context window.
y_preds = model.predict(embedded_words)



In [13]:
# For every context window, collect a tuple of
#   (context words, true target, {candidate word: score})
# where the dict holds the `num_proposed_words` highest-scoring vocabulary
# entries in ascending score order (the most likely word comes last).
pwc_and_predicted = []

# Start of the "top N" slice. Computed explicitly because slicing with
# `[-0:]` would return the whole vocabulary instead of nothing when zero
# words are requested.
top_start = max(vocabulary_size - num_proposed_words, 0)

for index in range(len(X_words)):
    y_pred = y_preds[index]

    # Flatten to a 1-D score vector; reshape raises if the model's output
    # size does not match the vocabulary size.
    scores = np.asarray(y_pred).reshape(vocabulary_size)

    # Sort once and reuse the ordering for both the indices and their scores
    # instead of running a second full sort over the same row.
    predicted_indices = np.argsort(scores)[top_start:]
    probabilities = scores[predicted_indices]

    # `word_index` is deliberately distinct from the outer loop's `index`.
    predicted_probability = {
        vocabulary[word_index]: prob
        for word_index, prob in zip(predicted_indices, probabilities)
    }

    pwc_and_predicted.append((X_words[index], y_words[index], predicted_probability))

In [18]:
# Print each context followed by its proposed next words. The true target
# (second tuple element) is not shown.
for tokens, _, predicted_words in pwc_and_predicted:
    text = " ".join(tokens)

    # Context line with a trailing ellipsis, then the section label.
    print(f"{text}...", "PREDICTED:", sep="\n")

    for word, probability in predicted_words.items():
        percent = int(probability*100)  # int() truncates the fractional part
        print(f"       {word}: {percent}%")

    # Blank line between contexts.
    print()

the favorite film...
PREDICTED:
       and: 84%
       ive: 86%
       ever: 89%
       i: 93%
       of: 99%

favorite film of...
PREDICTED:
       his: 84%
       its: 87%
       it: 94%
       this: 98%
       the: 98%

film of the...
PREDICTED:
       same: 75%
       time: 76%
       first: 77%
       movie: 81%
       film: 82%

actually more important...
PREDICTED:
       of: 61%
       films: 68%
       movies: 69%
       and: 84%
       than: 95%

more important than...
PREDICTED:
       that: 86%
       to: 87%
       it: 93%
       this: 94%
       the: 95%

and best film...
PREDICTED:
       ever: 79%
       i: 84%
       in: 84%
       to: 89%
       of: 93%

best film of...
PREDICTED:
       her: 81%
       his: 85%
       all: 90%
       this: 93%
       the: 97%

for a film...
PREDICTED:
       the: 80%
       about: 81%
       in: 83%
       i: 85%
       that: 90%

a film from...
PREDICTED:
       one: 73%
       it: 74%
       that: 78%
       a: 89%
       the: 92%
