In [8]:
# essentials
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import os

import tensorflow as tf

import config
from utils.DataGenerator import DataGenerator

## Loading models and vocabulary

In [2]:
# Load the trained Keras model. os.path.join keeps the path valid whether or
# not config.NN_MODEL_DIR ends with a path separator.
model = tf.keras.models.load_model(
    os.path.join(config.NN_MODEL_DIR, "dense-lr0.0001-batch128-acc0.18.h5")
)

In [3]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load these files when they come from a trusted source.

# os.path.join keeps the path valid whether or not config.NN_MODEL_DIR ends
# with a path separator.
with open(os.path.join(config.NN_MODEL_DIR, "vocabulary_small.pkl"), 'rb') as file:
    vocabulary = pickle.load(file)

# FIXME(review): this path is hard-coded while the vocabulary path is built
# from config.NN_MODEL_DIR; confirm whether it should come from config too.
with open("models/word2vec-general.pkl", 'rb') as file:
    word2vec = pickle.load(file)

## Evaluation 

In [5]:
# Sanity check: is the token 'a' present in the loaded vocabulary?
'a' in vocabulary

True

In [6]:
def belong_to_vocabulary(tokens: list, vocabulary: list):
    """Find the first token that is missing from the vocabulary.

    Args:
        tokens: tokens to check, scanned in order.
        vocabulary: collection of known tokens (tested with ``in``).

    Returns:
        The first element of ``tokens`` that is not in ``vocabulary``, or
        ``None`` when every token is present (including when ``tokens`` is
        empty).
    """
    missing_tokens = (candidate for candidate in tokens if candidate not in vocabulary)
    return next(missing_tokens, None)

In [7]:
def vectorize(tokens: list, word2vec):
    """Turn a token window into a single model-input row.

    Each token is looked up with ``word2vec.wv.get_vector`` and the resulting
    vectors are laid end to end, in token order, in one flat float64 array
    that is then reshaped to ``(1, config.INPUT_SIZE)``.

    Args:
        tokens: tokens to embed, in order.
        word2vec: object exposing ``wv.get_vector(token)``
            (presumably a trained gensim Word2Vec model; verify against the
            pickle that is loaded).

    Returns:
        A numpy array of shape ``(1, config.INPUT_SIZE)``.

    Raises:
        ValueError: if the total number of embedding values differs from
            ``config.INPUT_SIZE`` (raised by the final reshape).
    """
    # Collect the per-token vectors first and concatenate once; calling
    # np.append inside the loop copies the whole array on every iteration.
    parts = [
        np.asarray(word2vec.wv.get_vector(token), dtype=np.float64).ravel()
        for token in tokens
    ]
    # np.concatenate rejects an empty list, so fall back to an empty array and
    # let the reshape below report the size mismatch.
    flat = np.concatenate(parts) if parts else np.empty(0)

    return flat.reshape(1, config.INPUT_SIZE)

In [8]:
# quick test: the first sample should print None, the second the first
# token that is missing from the vocabulary
for sample_tokens in (['a', 'story', 'the'], ['a', 'story', 'marvelous', 'the']):
    print(belong_to_vocabulary(sample_tokens, vocabulary))

None
marvelous


In [9]:
# Smoke test: run the model on a single random row of width config.INPUT_SIZE.
# The input comes from np.random.randn without a seed, so the output differs
# between runs.
model.predict(np.random.randn(1, config.INPUT_SIZE))



array([[0.95077896, 0.78511536, 0.28060228, ..., 0.04386942, 0.1251689 ,
        0.05517529]], dtype=float32)

In [10]:
# Prompt whose next word will be predicted. Alternative prompts are kept as
# comments so that exactly one assignment is live.
# test_tokens = ['a', 'story', 'about']
test_tokens = ['the', 'main', 'character']
# test_tokens = ['my', 'feelings', 'were']

In [11]:
# Predict only when every prompt token is known to the vocabulary.
not_belonging = belong_to_vocabulary(test_tokens, vocabulary)

if not_belonging is not None:
    # Fail fast: merely printing would leave y_pred undefined (or stale from
    # an earlier run) for the cells that read it afterwards.
    raise ValueError("Word '{}' does not belong to the vocabulary".format(not_belonging))

y_pred = model.predict(vectorize(test_tokens, word2vec))



In [12]:
# Most likely next word: the vocabulary entry at the position of the highest
# model output.
predicted_word = vocabulary[np.argmax(y_pred)]

In [13]:
# Display the top-1 predicted word.
predicted_word

'is'

In [14]:
# Number of highest-scoring candidate words to keep for each prediction.
num_proposed_word = 5

In [15]:
# np.argsort orders ascending, so the last num_proposed_word entries are the
# indices of the highest model outputs (best candidate last). The reshape
# flattens the result to one entry per vocabulary word.
predicted_indices = np.argsort(y_pred).reshape(len(vocabulary))[-num_proposed_word:]

In [18]:
# Show each candidate appended to the actual prompt. The prompt is derived
# from test_tokens so the printout stays correct when the prompt is changed.
prompt = " ".join(test_tokens)
for predicted_index in predicted_indices:
    print(prompt, vocabulary[predicted_index])

the main character who
the main character and
the main character was
the main character of
the main character is


In [4]:
# Load the cleaned review sample; every cell of the 'tokenized' column is
# passed through the converter while the CSV is read.
# NOTE(review): pd.eval evaluates each cell as an expression, so only use it
# on trusted CSV data; ast.literal_eval may be a safer converter (TODO confirm
# it parses this file).
reviews_df = pd.read_csv("data/reviews_cleaned_sample.csv", converters={'tokenized': pd.eval})
reviews_df

Unnamed: 0,review,positive,clean_text,tokenized
0,Flavia the Heretic is an undeniable work of ar...,False,flavia the heretic is an undeniable work of ar...,"[flavia, the, heretic, is, an, undeniable, wor..."
1,"ROUEN PRIZES AND THE TRIUMPH OF ""VILLA PARANOI...",False,rouen prizes and the triumph of villa paranoia...,"[rouen, prizes, and, the, triumph, of, villa, ..."
2,"I liked the movie, first of all because it tol...",False,i liked the movie first of all because it told...,"[i, liked, the, movie, first, of, all, because..."
3,Im watching it now on pink (Serbia TV station)...,False,im watching it now on pink serbia tv station a...,"[im, watching, it, now, on, pink, serbia, tv, ..."
4,"A warm, touching movie that has a fantasy-like...",False,a warm touching movie that has a fantasylike q...,"[a, warm, touching, movie, that, has, a, fanta..."
...,...,...,...,...
1495,I caught Evening in the cinema with a lady fri...,False,i caught evening in the cinema with a lady fri...,"[i, caught, evening, in, the, cinema, with, a,..."
1496,I originally scored Sarah's show with a nice f...,False,i originally scored sarahs show with a nice fa...,"[i, originally, scored, sarahs, show, with, a,..."
1497,Users who have rated this movie so highly simp...,False,users who have rated this movie so highly simp...,"[users, who, have, rated, this, movie, so, hig..."
1498,This is an exquisite film about the search for...,False,this is an exquisite film about the search for...,"[this, is, an, exquisite, film, about, the, se..."


In [5]:
# Choose which review to analyse and fetch its 'tokenized' entry.
sample_index = 1
sample_review = reviews_df["tokenized"][sample_index]

In [10]:
# FIXME(review): this call fails with
# "TypeError: __init__() got an unexpected keyword argument 'vocabulary'";
# check DataGenerator.__init__ in utils/DataGenerator.py for the accepted
# arguments. The variable is named data_gen (was misspelled data_den) to
# match its use in the following cell.
data_gen = DataGenerator(sample_review, vocabulary=vocabulary)

TypeError: __init__() got an unexpected keyword argument 'vocabulary'

In [None]:
# FIXME(review): incomplete expression left over from editing; it is a
# SyntaxError if run, so it is commented out. Finish or delete this cell.
# data_gen.

In [21]:
# For every window of config.PREVIOUS_WORDS_CONSIDERED consecutive tokens in
# the sample review, record the window together with the model's top
# num_proposed_word candidate next words and their scores.
pwc_and_predicted = []

for index in range(config.PREVIOUS_WORDS_CONSIDERED, len(sample_review)):
    tokens = sample_review[index - config.PREVIOUS_WORDS_CONSIDERED:index]

    # Windows that contain an out-of-vocabulary token are skipped.
    not_belonging = belong_to_vocabulary(tokens, vocabulary)
    if not_belonging is not None:
        continue

    y_pred = model.predict(vectorize(tokens, word2vec), verbose=0)
    # One score per vocabulary word; the reshape fails if the sizes disagree.
    scores = y_pred.reshape(len(vocabulary))

    # Sort once (ascending) and reuse the indices to look up the scores,
    # instead of sorting the same array a second time.
    predicted_indices = np.argsort(scores)[-num_proposed_word:]
    probabilities = scores[predicted_indices]

    # word_index is a separate name so it does not shadow the loop's index.
    predicted_probability = {
        vocabulary[word_index]: prob
        for word_index, prob in zip(predicted_indices, probabilities)
    }

    pwc_and_predicted.append((tokens, predicted_probability))

In [22]:
# Show the 'review' column entry of the sampled review, for comparison with
# the predictions.
reviews_df.review[sample_index]

'ROUEN PRIZES AND THE TRIUMPH OF "VILLA PARANOIA" The favorite film of the general public, actually more important than the jury prize, was Erik Clausen\'s brilliant bittersweet dramatic comedy, "Villa Paranoia", which was also selected by the European Youth Jury indicative of its appeal to cinephiles of all ages. The following day director-actor Clausen traveled to the remote Town of MAMERS, Pays de Loire, for a provincial festival of new European cinema, where "Villa Paranoia" picked up three more prizes -- Best film, Professional Jury; Best Film, Audience prize; and Best film of another youth jury composed of "lycéens", French high school students. Five prizes in a single weekend -- not a bad scoop for a film from a small country with unknown actors. In addition, "Villa" was awarded the Grand Prix, the MAVERICK SPIRIT AWARD, at San Jose, California, just a week ago, by distinguished British actor Sir Ben Kingsley ("Ghandi"), making for a grand total of six prizes in a single week. I

In [23]:
# Inspect the collected (token window, {candidate word: score}) pairs.
pwc_and_predicted

[(['the', 'favorite', 'film'],
  {'ive': 0.83264333,
   'and': 0.86457866,
   'ever': 0.93646973,
   'i': 0.9395511,
   'of': 0.98943293}),
 (['favorite', 'film', 'of'],
  {'his': 0.8163801,
   'all': 0.83452153,
   'my': 0.8371152,
   'this': 0.94962,
   'the': 0.96721977}),
 (['film', 'of', 'the'],
  {'same': 0.80845857,
   'film': 0.8216265,
   's': 0.82805264,
   'movie': 0.8327734,
   'first': 0.87102735}),
 (['of', 'the', 'general'],
  {'but': 0.74813604,
   'scenes': 0.75337756,
   'of': 0.8174693,
   'in': 0.817786,
   'and': 0.9334418}),
 (['actually', 'more', 'important'],
  {'scenes': 0.7624456,
   'films': 0.777586,
   'of': 0.8178852,
   'and': 0.876454,
   'than': 0.97628176}),
 (['more', 'important', 'than'],
  {'that': 0.86767685,
   'a': 0.9256853,
   'it': 0.9422558,
   'the': 0.96634954,
   'this': 0.96659064}),
 (['important', 'than', 'the'],
  {'same': 0.8008089,
   'most': 0.8506802,
   'best': 0.85811234,
   'worst': 0.86720806,
   'first': 0.89203227}),
 (['whic