In [2]:
# essentials
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import os

import tensorflow as tf

import config

## Loading models and vocabulary

In [3]:
model_dir = "models/word2vecs/pwc3-win7-vec200-min20/"

model = tf.keras.models.load_model(model_dir + "predicor-extralayer-lr0.001-batch128.h5")

In [5]:
with open(model_dir + "vocabulary_small.pkl", 'rb') as file:
    vocabulary = pickle.load(file)

with open(model_dir + "word2vec-big-dataset.pkl", 'rb') as file:
    word2vec = pickle.load(file)

In [6]:
input_size = 200 * 3

## Evaluation 

In [7]:
'a' in vocabulary

True

In [8]:
""" Returns the first not belonging token in the vocabulary
"""
def belong_to_vocabulary(tokens: list, vocabulary: list):
    for token in tokens:
        if not token in vocabulary:
            return token
    
    return None

In [9]:
def vectorize(tokens: list, word2vec):
    embedded_tokens = np.array([])
    
    for token in tokens:
        embedded_tokens = np.append(embedded_tokens, word2vec.wv.get_vector(token))

    return embedded_tokens.reshape(1, input_size)

In [10]:
# quick test

print(belong_to_vocabulary(['a', 'story', 'the'], vocabulary))
print(belong_to_vocabulary(['a', 'story', 'marvelous', 'the'], vocabulary))

None
marvelous


In [11]:
model.predict(np.random.randn(1, input_size))



array([[0.92905706, 0.8018987 , 0.5040532 , ..., 0.0461078 , 0.06307044,
        0.6458937 ]], dtype=float32)

In [12]:
test_tokens = ['a', 'story', 'about']
test_tokens = ['the', 'main', 'character']
# test_tokens = ['my', 'feelings', 'were']

In [13]:
not_belonging = belong_to_vocabulary(test_tokens, vocabulary)

if not_belonging is None:
    y_pred = model.predict(vectorize(test_tokens, word2vec))
else:
    print("ERROR: Word \'{}\' does not belong to the vocabulary".format(not_belonging))



In [14]:
predicted_word = vocabulary[np.argmax(y_pred)]

In [15]:
predicted_word

'is'

In [16]:
num_proposed_word = 5

In [17]:
predicted_indices = np.argsort(y_pred).reshape(len(vocabulary))[-num_proposed_word:]

In [18]:
for predicted_index in predicted_indices:
    print("a story about ", vocabulary[predicted_index])

a story about  to
a story about  in
a story about  of
a story about  and
a story about  is


In [19]:
reviews_df = pd.read_csv("data/reviews_cleaned_sample.csv", converters={'tokenized': pd.eval})
reviews_df

Unnamed: 0,review,positive,clean_text,tokenized
0,Flavia the Heretic is an undeniable work of ar...,False,flavia the heretic is an undeniable work of ar...,"[flavia, the, heretic, is, an, undeniable, wor..."
1,"ROUEN PRIZES AND THE TRIUMPH OF ""VILLA PARANOI...",False,rouen prizes and the triumph of villa paranoia...,"[rouen, prizes, and, the, triumph, of, villa, ..."
2,"I liked the movie, first of all because it tol...",False,i liked the movie first of all because it told...,"[i, liked, the, movie, first, of, all, because..."
3,Im watching it now on pink (Serbia TV station)...,False,im watching it now on pink serbia tv station a...,"[im, watching, it, now, on, pink, serbia, tv, ..."
4,"A warm, touching movie that has a fantasy-like...",False,a warm touching movie that has a fantasylike q...,"[a, warm, touching, movie, that, has, a, fanta..."
...,...,...,...,...
1495,I caught Evening in the cinema with a lady fri...,False,i caught evening in the cinema with a lady fri...,"[i, caught, evening, in, the, cinema, with, a,..."
1496,I originally scored Sarah's show with a nice f...,False,i originally scored sarahs show with a nice fa...,"[i, originally, scored, sarahs, show, with, a,..."
1497,Users who have rated this movie so highly simp...,False,users who have rated this movie so highly simp...,"[users, who, have, rated, this, movie, so, hig..."
1498,This is an exquisite film about the search for...,False,this is an exquisite film about the search for...,"[this, is, an, exquisite, film, about, the, se..."


In [26]:
sample_index = 1

sample_review = reviews_df.tokenized[sample_index]

In [21]:
pwc_and_predicted = []

for index in range(config.PREVIOUS_WORDS_CONSIDERED, len(sample_review)):
    tokens = sample_review[index-config.PREVIOUS_WORDS_CONSIDERED:index]
    
    not_belonging = belong_to_vocabulary(tokens, vocabulary)

    if not_belonging is None:
        y_pred = model.predict(vectorize(tokens, word2vec), verbose=0)

        predicted_indices = np.argsort(y_pred).reshape(len(vocabulary))[-num_proposed_word:]
        probabilities = np.sort(y_pred).reshape(len(vocabulary))[-num_proposed_word:]

        predicted_probability = {vocabulary[index]: prob for index, prob in zip(predicted_indices, probabilities)}
        
        pwc_and_predicted.append((tokens, predicted_probability))
        # if vocabulary[np.argmax(y_pred)] != "the":
        #     pwc_and_predicted.append((tokens, vocabulary[np.argmax(y_pred)]))

In [27]:
reviews_df.review[sample_index]

'ROUEN PRIZES AND THE TRIUMPH OF "VILLA PARANOIA" The favorite film of the general public, actually more important than the jury prize, was Erik Clausen\'s brilliant bittersweet dramatic comedy, "Villa Paranoia", which was also selected by the European Youth Jury indicative of its appeal to cinephiles of all ages. The following day director-actor Clausen traveled to the remote Town of MAMERS, Pays de Loire, for a provincial festival of new European cinema, where "Villa Paranoia" picked up three more prizes -- Best film, Professional Jury; Best Film, Audience prize; and Best film of another youth jury composed of "lycéens", French high school students. Five prizes in a single weekend -- not a bad scoop for a film from a small country with unknown actors. In addition, "Villa" was awarded the Grand Prix, the MAVERICK SPIRIT AWARD, at San Jose, California, just a week ago, by distinguished British actor Sir Ben Kingsley ("Ghandi"), making for a grand total of six prizes in a single week. I

In [24]:
pwc_and_predicted

[(['the', 'favorite', 'film'],
  {'to': 0.08033809,
   'that': 0.091807626,
   'and': 0.097708546,
   'i': 0.12499924,
   'of': 0.864169}),
 (['favorite', 'film', 'of'],
  {'all': 0.37627825,
   'this': 0.52601856,
   'his': 0.5465943,
   'a': 0.69181037,
   'the': 0.92827797}),
 (['film', 'of', 'the'],
  {'end': 0.51332116,
   'film': 0.56404805,
   'and': 0.5950996,
   'movie': 0.59991795,
   'of': 0.67791873}),
 (['of', 'the', 'general'],
  {'is': 0.10040785,
   'as': 0.11007168,
   'the': 0.11981049,
   'in': 0.1488824,
   'and': 0.38661638}),
 (['actually', 'more', 'important'],
  {'if': 0.09449345,
   'to': 0.1265785,
   '[END]': 0.16708659,
   'and': 0.19387765,
   'than': 0.42175585}),
 (['more', 'important', 'than'],
  {'his': 0.0983838,
   'a': 0.19368914,
   'it': 0.20898777,
   'this': 0.38417178,
   'the': 0.7674739}),
 (['important', 'than', 'the'],
  {'and': 0.3899348,
   'story': 0.40128,
   'of': 0.5361946,
   'film': 0.6207638,
   'movie': 0.636231}),
 (['which', 'was