In [None]:
import pickle
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

from src.predictor import Predictor
from src.textvis import RGB, TextPainter
from src.embedding import TextIdCoverter
from src.model import get_model
from src.preprocessing import clean_doc

# load data

In [None]:
# load config
import yaml
import pickle

# yaml.safe_load only constructs plain Python objects. A bare yaml.load(f)
# without a Loader argument is deprecated since PyYAML 5.1 and raises a
# TypeError in PyYAML >= 6, so it would break this cell on a fresh install.
with open('config.yaml', 'r') as f:
    conf = yaml.safe_load(f)

# Settings pulled out of the config for use by the rest of the notebook.
MAX_SEQUENCE_LENGTH = conf["EMBEDDING"]["MAX_SEQUENCE_LENGTH"]
T = conf["PREDICT"]["T"]

In [12]:
# load model
# Restore the saved Keras model from the file in the working directory.
model_file = 'rnn_v0.h5'
model = tf.keras.models.load_model(model_file)

In [13]:
# load data
# Validation inputs and labels, read in the same order as before.
X_val, y_val = (np.load(npy_path) for npy_path in ('data/X_val.npy', 'data/y_val.npy'))

## load word_index
# NOTE: unpickling can execute arbitrary code; only load a word_index.pkl
# that was produced by a trusted source.
with open('data/word_index.pkl', 'rb') as index_file:
    word_index = pickle.load(index_file)

# Converter between text and id sequences, built on the loaded word index.
conv = TextIdCoverter(word_index)

# text example

In [14]:
# Example input: one sentence per literal, joined into a single string.
raw_text = (
    "It was an excellent performance by the actors and a great setting. "
    "Unfortunately, the plot was terrible. "
    "I hope that the actors find new projects."
)
raw_text

'It was an excellent performance by the actors and a great setting. Unfortunately, the plot was terrible. I hope that the actors find new projects.'

# predict

In [15]:
# Predictor wraps the loaded model; T comes from the PREDICT section of the config.
p = Predictor(model, T)

# Encode the example text as ids and shape it as a single-row batch.
e_i = conv.text2id(raw_text)
e_i = e_i.reshape(1, MAX_SEQUENCE_LENGTH)

# l = number of space-separated tokens in the text decoded back from the ids.
decoded_tokens = conv.id2text(e_i).split(' ')
l = len(decoded_tokens)

# Four outputs; presumably the prediction, two uncertainty sequences and the
# per-input predictions - TODO confirm against Predictor.predict_with_uncertainty.
p_seq, u_e_seq, u_a_seq, p_input = p.predict_with_uncertainty(e_i)

# define colour scheme

In [16]:
# Helpers from src.textvis: colour-list builder and text painter.
rgb = RGB()
tp = TextPainter()

_WHITE = (255, 255, 255)

# Anchor colours for each gradient, as (R, G, B) tuples.
# green -> white -> white -> red
colors = [(51, 153, 51), _WHITE, _WHITE, (255, 0, 0)]
# black -> light grey
colors2 = [(0, 0, 0), (220, 220, 220)]
# blue -> white -> white -> orange
colors3 = [(0, 153, 255), _WHITE, _WHITE, (255, 153, 0)]

# Colour lists indexed by the visualisation cells below.
WORD_RELEVANCE = rgb.get_rgb_list(100, colors)
WORD_UNCERTAINTY = rgb.get_rgb_list(5, colors2)
SEQ_UNCERTAINTY = rgb.get_rgb_list(100, colors3)

# highlight relevant words

In [17]:
# compute word relevance
# Average the first element of p_input over its leading axis, keep column 1,
# then keep only the last l positions.
mean_scores = np.array(p_input)[0].mean(axis=0)
relevances = mean_scores[:, 1][-l:]

# Change from one position to the next; the leading 0 keeps one value per position.
step_changes = np.diff(relevances)
relevances_diff = list(np.insert(step_changes, 0, 0))

# normalize
# Largest magnitude among the changes, used later as the scaling denominator.
max_rel = max(relevances_diff)
min_rel = -1 * min(relevances_diff)
abs_max_rel = max(max_rel, min_rel, 0)

In [18]:
from numpy import linalg as LA

# Rebuild the diff list from `relevances` before padding it. The padding below
# mutates the list in place, so without this reset every re-run of the cell
# would insert the zeros again and shift all colours.
relevances_diff = list(np.insert(np.diff(relevances), 0, 0))

# fill missing words
# Raw words that clean_doc reduces to '' get a neutral 0 inserted at their
# position so the values line up one-to-one with the raw words.
raw_split = raw_text.split(' ')
removed_word_index = [i for i, word in enumerate(raw_split) if clean_doc(word) == '']
for i in removed_word_index:
    relevances_diff.insert(i, 0)

# visualize words
# Each diff is scaled from [-abs_max_rel, abs_max_rel] to an index 0..100
# (50 = no change). If every diff is 0 there is nothing to scale, so all words
# get the neutral index instead of triggering a division by zero.
last_colour = len(WORD_RELEVANCE) - 1
text = []
for diff, word in zip(relevances_diff, raw_split):
    scaled = (float(diff) / abs_max_rel) * 50 if abs_max_rel else 0
    # Saturate at the last palette entry rather than raising IndexError.
    colour_idx = min(int(round(scaled)) + 50, last_colour)
    text.append(tp.colour_background(word, WORD_RELEVANCE[colour_idx]))
print(' '.join(text))

[48;2;255;255;255mIt[0m [48;2;255;255;255mwas[0m [48;2;255;255;255man[0m [48;2;51;153;51mexcellent[0m [48;2;255;255;255mperformance[0m [48;2;255;255;255mby[0m [48;2;255;255;255mthe[0m [48;2;255;255;255mactors[0m [48;2;255;255;255mand[0m [48;2;255;255;255ma[0m [48;2;167;211;167mgreat[0m [48;2;255;255;255msetting.[0m [48;2;255;130;130mUnfortunately,[0m [48;2;255;255;255mthe[0m [48;2;255;252;252mplot[0m [48;2;255;255;255mwas[0m [48;2;255;7;7mterrible.[0m [48;2;255;255;255mI[0m [48;2;255;255;255mhope[0m [48;2;255;255;255mthat[0m [48;2;255;255;255mthe[0m [48;2;255;255;255mactors[0m [48;2;255;255;255mfind[0m [48;2;255;255;255mnew[0m [48;2;246;250;246mprojects.[0m


# highlight uncertain words

In [19]:
# compute word uncertainty
# Lay both uncertainty outputs out as (MAX_SEQUENCE_LENGTH, 2).
seq_shape = (MAX_SEQUENCE_LENGTH, 2)
u_e_seq = u_e_seq.reshape(seq_shape)
u_a_seq = u_a_seq.reshape(seq_shape)

# Column 1 of each array, restricted to the last l positions.
u_e = u_e_seq[-l:, 1]
u_a = u_a_seq[-l:, 1]

# Total uncertainty per position is the sum of the two parts
# (presumably epistemic + aleatoric - TODO confirm against Predictor).
total_uncertainty = u_e + u_a

# Change from one position to the next; the leading 0 keeps one value per position.
uncertainty_diff = list(np.insert(np.diff(total_uncertainty), 0, 0))
u_t = list(total_uncertainty)

# normalize
# Largest magnitude among the changes, used later as the scaling denominator.
max_u = max(uncertainty_diff)
min_u = -1 * min(uncertainty_diff)
abs_max_u = max(max_u, min_u, 0)

In [20]:
# Rebuild both lists from u_e / u_a before padding them. The padding below
# mutates the lists in place, so without this reset every re-run of the cell
# would insert the filler values again and misalign words and colours.
total_uncertainty = u_e + u_a
uncertainty_diff = list(np.insert(np.diff(total_uncertainty), 0, 0))
u_t = list(total_uncertainty)

# fill missing words
# Raw words that clean_doc reduces to '' get a 0 diff and inherit the total
# uncertainty of the preceding entry (or of the first entry when at position 0).
raw_split = raw_text.split(' ')
removed_word_index = [i for i, word in enumerate(raw_split) if clean_doc(word) == '']
for i in removed_word_index:
    uncertainty_diff.insert(i, 0)
    u_t.insert(i, u_t[max(i - 1, 0)])

# visualize words
# Background: diff scaled from [-abs_max_u, abs_max_u] to an index 0..100
# (50 = no change); an all-zero diff maps to the neutral index instead of
# dividing by zero.
# Font: total uncertainty times 4*5, rounded, used as an index into
# WORD_UNCERTAINTY (presumably assumes uncertainty <= 0.25 and 5 colour steps -
# TODO confirm). Both indices saturate at the last palette entry rather than
# raising IndexError.
last_background = len(SEQ_UNCERTAINTY) - 1
last_font = len(WORD_UNCERTAINTY) - 1
text = []
for diff, word, total in zip(uncertainty_diff, raw_split, u_t):
    scaled = (float(diff) / abs_max_u) * 50 if abs_max_u else 0
    background_idx = min(int(round(scaled)) + 50, last_background)
    font_idx = min(int(round(total * 4 * 5)), last_font)
    painted = tp.colour_background(word, SEQ_UNCERTAINTY[background_idx])
    text.append(tp.color_font(painted, WORD_UNCERTAINTY[font_idx]))

print(' '.join(text))

[38;2;220;220;220m[48;2;255;255;255mIt[0m[0m [38;2;220;220;220m[48;2;255;255;255mwas[0m[0m [38;2;220;220;220m[48;2;255;255;255man[0m[0m [38;2;43;43;43m[48;2;0;153;255mexcellent[0m[0m [38;2;88;88;88m[48;2;255;255;255mperformance[0m[0m [38;2;88;88;88m[48;2;255;255;255mby[0m[0m [38;2;88;88;88m[48;2;255;255;255mthe[0m[0m [38;2;132;132;132m[48;2;255;255;255mactors[0m[0m [38;2;88;88;88m[48;2;255;255;255mand[0m[0m [38;2;88;88;88m[48;2;255;255;255ma[0m[0m [38;2;43;43;43m[48;2;229;244;255mgreat[0m[0m [38;2;43;43;43m[48;2;255;255;255msetting.[0m[0m [38;2;88;88;88m[48;2;255;244;229mUnfortunately,[0m[0m [38;2;88;88;88m[48;2;255;255;255mthe[0m[0m [38;2;132;132;132m[48;2;255;255;255mplot[0m[0m [38;2;132;132;132m[48;2;255;255;255mwas[0m[0m [38;2;220;220;220m[48;2;255;250;244mterrible.[0m[0m [38;2;220;220;220m[48;2;255;255;255mI[0m[0m [38;2;220;220;220m[48;2;255;255;255mhope[0m[0m [38;2;220;220;220m[48;2;255;255;255mthat[0m