In [1]:
# essentials
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import os

import tensorflow as tf

import config
from utils.DataGenerator import DataGenerator

## Loading models and vocabulary

In [2]:
model = tf.keras.models.load_model(config.NN_MODEL_DIR + "dense-lr0.0001-batch128-acc18.h5")

In [3]:
with open(config.NN_MODEL_DIR + "vocabulary_small.pkl", 'rb') as file:
    vocabulary = pickle.load(file)

with open("models/word2vec-general.pkl", 'rb') as file:
    word2vec = pickle.load(file)

## Evaluation 

In [4]:
reviews_df = pd.read_csv("data/reviews_cleaned_sample.csv", converters={'tokenized': pd.eval})

In [5]:
sample_index = 1

sample_reviews = reviews_df.tokenized.iloc[sample_index:sample_index+2]

In [6]:
data_gen = DataGenerator(sample_reviews, vocabulary=vocabulary)

In [7]:
X_words, y_words = data_gen.create_dataset(config.PREVIOUS_WORDS_CONSIDERED)
embedded_words = data_gen.vectorize(X_words, word2vec, input_size=config.INPUT_SIZE)

In [8]:
y_preds = model.predict(embedded_words)

num_proposed_words = 5
vocabulary_size = len(vocabulary)



In [9]:
pwc_and_predicted = []

for index in range(len(X_words)):
    y_pred = y_preds[index]

    predicted_indices = np.argsort(y_pred).reshape(vocabulary_size)[-num_proposed_words:]
    probabilities = np.sort(y_pred).reshape(vocabulary_size)[-num_proposed_words:]

    predicted_probability = {vocabulary[index]: prob for index, prob in zip(predicted_indices, probabilities)}
    
    pwc_and_predicted.append((X_words[index], y_words[index], predicted_probability))

In [10]:
for tokens, _, predicted_words in pwc_and_predicted:
    text = "".join()

[(['the', 'favorite', 'film'],
  'of',
  {'and': 0.84964985,
   'ive': 0.86608654,
   'ever': 0.8943275,
   'i': 0.9359342,
   'of': 0.99334085}),
 (['favorite', 'film', 'of'],
  'the',
  {'his': 0.84467286,
   'its': 0.87368935,
   'it': 0.94673645,
   'this': 0.9832628,
   'the': 0.98986405}),
 (['film', 'of', 'the'],
  'general',
  {'same': 0.7578436,
   'time': 0.76755905,
   'first': 0.7789799,
   'movie': 0.8179449,
   'film': 0.822566}),
 (['actually', 'more', 'important'],
  'than',
  {'of': 0.61270934,
   'films': 0.6868298,
   'movies': 0.69369113,
   'and': 0.84526026,
   'than': 0.9556844}),
 (['more', 'important', 'than'],
  'the',
  {'that': 0.8649687,
   'to': 0.87962246,
   'it': 0.93503654,
   'this': 0.9438496,
   'the': 0.9551344}),
 (['and', 'best', 'film'],
  'of',
  {'ever': 0.79778475,
   'i': 0.8439637,
   'in': 0.84909296,
   'to': 0.8927415,
   'of': 0.93691236}),
 (['best', 'film', 'of'],
  'another',
  {'her': 0.8135715,
   'his': 0.8575171,
   'all': 0.9058

In [11]:
# pwc_and_predicted = []

# for index in range(config.PREVIOUS_WORDS_CONSIDERED, len(sample_review)):
#     tokens = sample_review[index-config.PREVIOUS_WORDS_CONSIDERED:index]
    
#     not_belonging = belong_to_vocabulary(tokens, vocabulary)

#     if not_belonging is None:
#         y_pred = model.predict(vectorize(tokens, word2vec), verbose=0)

#         predicted_indices = np.argsort(y_pred).reshape(len(vocabulary))[-num_proposed_word:]
#         probabilities = np.sort(y_pred).reshape(len(vocabulary))[-num_proposed_word:]

#         predicted_probability = {vocabulary[index]: prob for index, prob in zip(predicted_indices, probabilities)}
        
#         pwc_and_predicted.append((tokens, predicted_probability))
#         # if vocabulary[np.argmax(y_pred)] != "the":
#         #     pwc_and_predicted.append((tokens, vocabulary[np.argmax(y_pred)]))