In [2]:
# essentials
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import os

import tensorflow as tf

import config

## Loading models and vocabulary

In [33]:
model_dir = "models/word2vecs/pwc3-win7-vec200-min50/"

model = tf.keras.models.load_model(model_dir + "predicor-extralayer-lr0.001-batch128.h5")

In [34]:
with open(model_dir + "vocabulary_small.pkl", 'rb') as file:
    vocabulary = pickle.load(file)

with open(model_dir + "word2vec.pkl", 'rb') as file:
    word2vec = pickle.load(file)

In [35]:
input_size = 200 * 3

## Evaluation 

In [36]:
'a' in vocabulary

True

In [37]:
""" Returns the first not belonging token in the vocabulary
"""
def belong_to_vocabulary(tokens: list, vocabulary: list):
    for token in tokens:
        if not token in vocabulary:
            return token
    
    return None

In [38]:
def vectorize(tokens: list, word2vec):
    embedded_tokens = np.array([])
    
    for token in tokens:
        embedded_tokens = np.append(embedded_tokens, word2vec.wv.get_vector(token))

    return embedded_tokens.reshape(1, input_size)

In [39]:
# quick test

print(belong_to_vocabulary(['a', 'story', 'the'], vocabulary))
print(belong_to_vocabulary(['a', 'story', 'marvelous', 'the'], vocabulary))

None
marvelous


In [40]:
model.predict(np.random.randn(1, input_size))



array([[0.8287111 , 0.5412161 , 0.20005795, 0.10574338, 0.35401824,
        0.45367306, 0.09602261, 0.22088313, 0.40025556, 0.5392873 ,
        0.5635784 , 0.43980128, 0.23014396, 0.5074685 , 0.16855712,
        0.08047201, 0.22627427, 0.08021531, 0.2636618 , 0.19950962,
        0.31991604, 0.16677734, 0.09795438, 0.14400737, 0.12036452,
        0.2531241 , 0.34096292, 0.6339765 , 0.09568626, 0.0693303 ,
        0.09909873, 0.35216552, 0.11913209, 0.19364113, 0.15176456,
        0.17687114, 0.34218678, 0.30802083, 0.07646267, 0.1413412 ,
        0.09426312, 0.09740797, 0.02743375, 0.04345704, 0.04709029,
        0.1244511 , 0.6080524 , 0.45993158, 0.32360363, 0.08730534,
        0.09894819, 0.19628564, 0.06592494, 0.20557976, 0.09030134,
        0.09901462, 0.22819465, 0.11821279, 0.1211023 , 0.04136219,
        0.101469  , 0.41000432, 0.07166714, 0.0555346 , 0.09031345,
        0.09812423, 0.06894501, 0.08771562, 0.06199967, 0.21627213,
        0.22747362, 0.13074657, 0.14340149, 0.20

In [41]:
test_tokens = ['a', 'story', 'about']
test_tokens = ['the', 'main', 'character']
# test_tokens = ['my', 'feelings', 'were']

In [42]:
not_belonging = belong_to_vocabulary(test_tokens, vocabulary)

if not_belonging is None:
    y_pred = model.predict(vectorize(test_tokens, word2vec))
else:
    print("ERROR: Word \'{}\' does not belong to the vocabulary".format(not_belonging))



In [43]:
predicted_word = vocabulary[np.argmax(y_pred)]

In [44]:
predicted_word

'the'

In [45]:
num_proposed_word = 5

In [46]:
predicted_indices = np.argsort(y_pred).reshape(len(vocabulary))[-num_proposed_word:]

In [47]:
for predicted_index in predicted_indices:
    print("a story about ", vocabulary[predicted_index])

a story about  a
a story about  of
a story about  is
a story about  and
a story about  the


In [48]:
reviews_df = pd.read_csv("data/reviews_cleaned_sample.csv", converters={'tokenized': pd.eval})
reviews_df

Unnamed: 0,review,positive,clean_text,tokenized
0,Flavia the Heretic is an undeniable work of ar...,False,flavia the heretic is an undeniable work of ar...,"[flavia, the, heretic, is, an, undeniable, wor..."
1,"ROUEN PRIZES AND THE TRIUMPH OF ""VILLA PARANOI...",False,rouen prizes and the triumph of villa paranoia...,"[rouen, prizes, and, the, triumph, of, villa, ..."
2,"I liked the movie, first of all because it tol...",False,i liked the movie first of all because it told...,"[i, liked, the, movie, first, of, all, because..."
3,Im watching it now on pink (Serbia TV station)...,False,im watching it now on pink serbia tv station a...,"[im, watching, it, now, on, pink, serbia, tv, ..."
4,"A warm, touching movie that has a fantasy-like...",False,a warm touching movie that has a fantasylike q...,"[a, warm, touching, movie, that, has, a, fanta..."
...,...,...,...,...
1495,I caught Evening in the cinema with a lady fri...,False,i caught evening in the cinema with a lady fri...,"[i, caught, evening, in, the, cinema, with, a,..."
1496,I originally scored Sarah's show with a nice f...,False,i originally scored sarahs show with a nice fa...,"[i, originally, scored, sarahs, show, with, a,..."
1497,Users who have rated this movie so highly simp...,False,users who have rated this movie so highly simp...,"[users, who, have, rated, this, movie, so, hig..."
1498,This is an exquisite film about the search for...,False,this is an exquisite film about the search for...,"[this, is, an, exquisite, film, about, the, se..."


In [49]:
sample_review = reviews_df.tokenized[1]
type(sample_review)

list

In [50]:
pwc_and_predicted = []

for index in range(config.PREVIOUS_WORDS_CONSIDERED, len(sample_review)):
    tokens = sample_review[index-config.PREVIOUS_WORDS_CONSIDERED:index]
    
    not_belonging = belong_to_vocabulary(tokens, vocabulary)

    if not_belonging is None:
        y_pred = model.predict(vectorize(tokens, word2vec), verbose=0)

        predicted_indices = np.argsort(y_pred).reshape(len(vocabulary))[-num_proposed_word:]
        probabilities = np.sort(y_pred).reshape(len(vocabulary))[-num_proposed_word:]

        predicted_probability = {vocabulary[index]: prob for index, prob in zip(predicted_indices, probabilities)}
        
        pwc_and_predicted.append((tokens, predicted_probability))
        # if vocabulary[np.argmax(y_pred)] != "the":
        #     pwc_and_predicted.append((tokens, vocabulary[np.argmax(y_pred)]))

In [55]:
pwc_and_predicted

[(['the', 'favorite', 'film'],
  {'i': 0.5995596,
   'the': 0.6090629,
   'that': 0.6522759,
   'is': 0.67576826,
   'and': 0.7038415}),
 (['favorite', 'film', 'of'],
  {'and': 0.5412394,
   'this': 0.5544267,
   'his': 0.5647545,
   'a': 0.5832474,
   'the': 0.93613875}),
 (['film', 'of', 'the'],
  {'best': 0.7498231,
   'most': 0.763092,
   'worst': 0.77621627,
   'film': 0.79021233,
   'movie': 0.8514735}),
 (['which', 'was', 'also'],
  {'to': 0.5856237,
   'in': 0.6298851,
   'and': 0.64227915,
   'the': 0.7358801,
   'a': 0.7587391}),
 (['up', 'three', 'more'],
  {'the': 0.5864205,
   'to': 0.660791,
   'and': 0.718182,
   'of': 0.72854763,
   'than': 0.88650584}),
 (['best', 'film', 'audience'],
  {'a': 0.56604636,
   'of': 0.61137503,
   'in': 0.6415446,
   'the': 0.7455796,
   'and': 0.80563545}),
 (['and', 'best', 'film'],
  {'i': 0.6144221,
   'the': 0.61739063,
   'that': 0.679921,
   'is': 0.69340104,
   'and': 0.70720917}),
 (['best', 'film', 'of'],
  {'his': 0.50653636,
 