### Laoding Liblaries and data

In [82]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
import warnings
warnings.filterwarnings("ignore")


In [2]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [71]:
filename = 'deu.txt'
doc = load_doc(filename)
# doc

In [3]:
# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in lines]
    return pairs

In [4]:
# clean a list of lines
def clean_pairs(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [re_punc.sub('', w) for w in line]
            # remove non-printable chars form each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return array(cleaned)

In [5]:
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)


In [6]:
# load dataset
filename = 'deu.txt'
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences
clean_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(clean_pairs, 'english-german.pkl')
# spot check
for i in range(100):
    print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

Saved: english-german.pkl
[go] => [geh]
[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[duck] => [kopf runter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[hide] => [versteck dich]
[hide] => [versteckt euch]
[stay] => [bleib]
[stop] => [stopp]
[stop] => [anhalten]
[wait] => [warte]
[wait] => [warte]
[begin] => [fang an]
[do it] => [mache es]
[do it] => [tue es]
[go on] => [mach weiter]
[hello] => [hallo]
[hello] => [sers]
[hello] => [hallo]
[hurry] => [beeil dich]
[hurry] => [schnell]
[i hid] => [ich versteckte mich]
[i hid] => [ich habe mich versteckt]
[i ran] => [ich rannte]
[i see] => [ich verstehe]
[i see] => [aha]
[i try] => [ich versuche es]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[i won] => [ich habe gewonnen]
[oh no] => [oh nein]
[relax] => [entspann dich]
[shoot] => [feuer]
[shoot] => [schie]
[smile] => [lacheln]
[sorry] => [entschuldigung]
[ask me] =

### Split Text

In [7]:
from pickle import load
from pickle import dump
from numpy.random import shuffle


In [8]:
# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))


In [9]:
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

In [10]:
# load dataset
raw_dataset = load_clean_sentences('english-german.pkl')

# reduce dataset size
n_sentences = 10000
dataset = raw_dataset[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:9000], dataset[9000:]
# save
save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')

Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl


In [43]:
raw_dataset.shape


(277891, 3)

In [76]:
dataset

array([['tom can walk', 'tom kann laufen',
        'ccby france attribution tatoebaorg ck yorwba'],
       ['i like art', 'ich mag kunst',
        'ccby france attribution tatoebaorg ck yorwba'],
       ['well ask tom', 'wir werden tom fragen',
        'ccby france attribution tatoebaorg ck felixjp'],
       ...,
       ['watch my back', 'gib mir ruckendeckung',
        'ccby france attribution tatoebaorg spamster pfirsichbaeumchen'],
       ['im a doctor', 'ich bin arzt',
        'ccby france attribution tatoebaorg ck muiriel'],
       ['its a fad', 'das ist eine vorubergehende laune',
        'ccby france attribution tatoebaorg spamster pfirsichbaeumchen']],
      dtype='<U527')

### Train Neural Translation Model

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
# from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

In [11]:
# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))
# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')


In [12]:
print(train.shape)
print(test.shape)

(9000, 3)
(1000, 3)


In [13]:
train

array([['tom can walk', 'tom kann laufen',
        'ccby france attribution tatoebaorg ck yorwba'],
       ['i like art', 'ich mag kunst',
        'ccby france attribution tatoebaorg ck yorwba'],
       ['well ask tom', 'wir werden tom fragen',
        'ccby france attribution tatoebaorg ck felixjp'],
       ...,
       ['he came', 'er kam',
        'ccby france attribution tatoebaorg piksea espi'],
       ['im a colonel', 'ich bin oberst',
        'ccby france attribution tatoebaorg ck pfirsichbaeumchen'],
       ['i never worry', 'ich mache mir nie sorgen',
        'ccby france attribution tatoebaorg ck mayabe']], dtype='<U527')

In [14]:
test

array([['tom is hated', 'tom wird gehasst',
        'ccby france attribution tatoebaorg ck wolfgangth'],
       ['see you again', 'auf wiedersehen',
        'ccby france attribution tatoebaorg ck muiriel'],
       ['everybody lies', 'alle lugen',
        'ccby france attribution tatoebaorg goracykabanos muiriel'],
       ...,
       ['watch my back', 'gib mir ruckendeckung',
        'ccby france attribution tatoebaorg spamster pfirsichbaeumchen'],
       ['im a doctor', 'ich bin arzt',
        'ccby france attribution tatoebaorg ck muiriel'],
       ['its a fad', 'das ist eine vorubergehende laune',
        'ccby france attribution tatoebaorg spamster pfirsichbaeumchen']],
      dtype='<U527')

In [18]:
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer


In [19]:
# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)


In [20]:
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))


English Vocabulary Size: 2172
English Max Length: 5
German Vocabulary Size: 3578
German Max Length: 8


In [77]:
eng_tokenizer

<keras.src.legacy.preprocessing.text.Tokenizer at 0x212990941d0>

In [78]:
eng_tokenizer.word_index

{'i': 1,
 'tom': 2,
 'it': 3,
 'you': 4,
 'is': 5,
 'a': 6,
 'im': 7,
 'was': 8,
 'me': 9,
 'he': 10,
 'we': 11,
 'go': 12,
 'its': 13,
 'do': 14,
 'can': 15,
 'dont': 16,
 'are': 17,
 'ill': 18,
 'that': 19,
 'this': 20,
 'get': 21,
 'were': 22,
 'to': 23,
 'come': 24,
 'have': 25,
 'youre': 26,
 'be': 27,
 'up': 28,
 'the': 29,
 'like': 30,
 'my': 31,
 'in': 32,
 'who': 33,
 'toms': 34,
 'love': 35,
 'here': 36,
 'they': 37,
 'did': 38,
 'on': 39,
 'am': 40,
 'keep': 41,
 'she': 42,
 'take': 43,
 'us': 44,
 'need': 45,
 'stop': 46,
 'not': 47,
 'him': 48,
 'thats': 49,
 'want': 50,
 'now': 51,
 'how': 52,
 'no': 53,
 'well': 54,
 'let': 55,
 'home': 56,
 'hes': 57,
 'try': 58,
 'away': 59,
 'know': 60,
 'lost': 61,
 'saw': 62,
 'help': 63,
 'got': 64,
 'out': 65,
 'see': 66,
 'one': 67,
 'has': 68,
 'will': 69,
 'theyre': 70,
 'eat': 71,
 'down': 72,
 'too': 73,
 'look': 74,
 'hate': 75,
 'there': 76,
 'cant': 77,
 'lets': 78,
 'wait': 79,
 'just': 80,
 'what': 81,
 'so': 82,
 'stay'

In [79]:
ger_tokenizer.word_index

{'ich': 1,
 'tom': 2,
 'ist': 3,
 'sie': 4,
 'es': 5,
 'das': 6,
 'bin': 7,
 'wir': 8,
 'nicht': 9,
 'habe': 10,
 'er': 11,
 'du': 12,
 'hat': 13,
 'mich': 14,
 'zu': 15,
 'ein': 16,
 'war': 17,
 'mir': 18,
 'sind': 19,
 'dich': 20,
 'auf': 21,
 'uns': 22,
 'wer': 23,
 'werde': 24,
 'kann': 25,
 'die': 26,
 'eine': 27,
 'an': 28,
 'sich': 29,
 'ihr': 30,
 'haben': 31,
 'bist': 32,
 'gehen': 33,
 'hier': 34,
 'einen': 35,
 'mal': 36,
 'wird': 37,
 'was': 38,
 'wie': 39,
 'mag': 40,
 'ihn': 41,
 'in': 42,
 'liebe': 43,
 'jetzt': 44,
 'gut': 45,
 'mach': 46,
 'euch': 47,
 'aus': 48,
 'dir': 49,
 'lass': 50,
 'komm': 51,
 'mit': 52,
 'geht': 53,
 'den': 54,
 'geh': 55,
 'konnen': 56,
 'nach': 57,
 'da': 58,
 'hause': 59,
 'wurde': 60,
 'mein': 61,
 'will': 62,
 'werden': 63,
 'der': 64,
 'seid': 65,
 'kommen': 66,
 'so': 67,
 'sei': 68,
 'hast': 69,
 'brauche': 70,
 'weiter': 71,
 'bitte': 72,
 'fur': 73,
 'schon': 74,
 'hatte': 75,
 'gesehen': 76,
 'hasse': 77,
 'weg': 78,
 'noch': 79,
 '

In [21]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

In [22]:
# one hot encode target sequence
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

In [23]:
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    # compile model
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    # summarize defined model
    model.summary()
#     plot_model(model, to_file='model.png', show_shapes=True)
    return model


In [37]:
# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

# prepare training data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)

# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
# fit model
#checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', verbose=1,save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY)
          , verbose=2)
model.save('model.h5')  # Save as an HDF5 file

English Vocabulary Size: 2172
English Max Length: 5
German Vocabulary Size: 3578
German Max Length: 8


Epoch 1/30
141/141 - 21s - 150ms/step - loss: 4.0528 - val_loss: 3.3117
Epoch 2/30
141/141 - 12s - 89ms/step - loss: 3.1780 - val_loss: 3.1548
Epoch 3/30
141/141 - 12s - 88ms/step - loss: 3.0032 - val_loss: 3.0114
Epoch 4/30
141/141 - 12s - 88ms/step - loss: 2.8351 - val_loss: 2.9130
Epoch 5/30
141/141 - 13s - 92ms/step - loss: 2.7012 - val_loss: 2.8121
Epoch 6/30
141/141 - 13s - 96ms/step - loss: 2.5586 - val_loss: 2.6999
Epoch 7/30
141/141 - 13s - 91ms/step - loss: 2.4130 - val_loss: 2.6056
Epoch 8/30
141/141 - 13s - 91ms/step - loss: 2.2769 - val_loss: 2.4946
Epoch 9/30
141/141 - 12s - 89ms/step - loss: 2.1401 - val_loss: 2.3930
Epoch 10/30
141/141 - 13s - 90ms/step - loss: 2.0033 - val_loss: 2.3306
Epoch 11/30
141/141 - 14s - 102ms/step - loss: 1.8789 - val_loss: 2.2526
Epoch 12/30
141/141 - 13s - 92ms/step - loss: 1.7562 - val_loss: 2.1852
Epoch 13/30
141/141 - 13s - 91ms/step - loss: 1.6442 - val_loss: 2.1280
Epoch 14/30
141/141 - 13s - 92ms/step - loss: 1.5407 - val_loss: 2.0857



### Evaluate Neural Translation Model

In [39]:
from numpy import argmax

from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

In [31]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [32]:
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

In [72]:
# evaluate the skill of the model
def evaluate_model(model, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, eng_tokenizer, source)
        raw_target, raw_src, desc= raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        actual.append(raw_target.split())
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [80]:
# evaluate_model(model, trainX, train)
# source = source.reshape((1, source.shape[0]))
# trainX.reshape((1, trainX.shape[0]))
print('Shape of the data is',trainX.shape[0])
print(trainX)
print('\n')

for i, source in enumerate(trainX):
    print(f'Index{i} :souurce{source}')

Shape of the data is 9000
[[   2   25  235 ...    0    0    0]
 [   1   40 1300 ...    0    0    0]
 [   8   63    2 ...    0    0    0]
 ...
 [  11  141    0 ...    0    0    0]
 [   1    7 3405 ...    0    0    0]
 [   1  199   18 ...    0    0    0]]


Index0 :souurce[  2  25 235   0   0   0   0   0]
Index1 :souurce[   1   40 1300    0    0    0    0    0]
Index2 :souurce[  8  63   2 201   0   0   0   0]
Index3 :souurce[   1 1891    6    0    0    0    0    0]
Index4 :souurce[  1  43 959   0   0   0   0   0]
Index5 :souurce[480  71   0   0   0   0   0   0]
Index6 :souurce[ 99   1  14 664   0   0   0   0]
Index7 :souurce[   1  481 1892    0    0    0    0    0]
Index8 :souurce[  4  13  41 386   0   0   0   0]
Index9 :souurce[19  4 34  0  0  0  0  0]
Index10 :souurce[  2   3  16 236   0   0   0   0]
Index11 :souurce[  1  17 482 560   0   0   0   0]
Index12 :souurce[  90   49   35 1301    0    0    0    0]
Index13 :souurce[ 12  32  64 665   0   0   0   0]
Index14 :souurce[278  75  11  

Index4250 :souurce[   1  431   52  124 1704  296  203    0]
Index4251 :souurce[  1 463 149   0   0   0   0   0]
Index4252 :souurce[   1    7  134 2625    0    0    0    0]
Index4253 :souurce[  1  17   9 266   0   0   0   0]
Index4254 :souurce[  1  25 100 507   0   0   0   0]
Index4255 :souurce[   1 1238    5    0    0    0    0    0]
Index4256 :souurce[   5   17 2626    0    0    0    0    0]
Index4257 :souurce[ 12  69 337   0   0   0   0   0]
Index4258 :souurce[23  3  4  0  0  0  0  0]
Index4259 :souurce[157  18  44  15   0   0   0   0]
Index4260 :souurce[  1  24 637   0   0   0   0   0]
Index4261 :souurce[   1   24 2627    0    0    0    0    0]
Index4262 :souurce[1702    4   22    0    0    0    0    0]
Index4263 :souurce[ 23  13 673   0   0   0   0   0]
Index4264 :souurce[   2 1558    0    0    0    0    0    0]
Index4265 :souurce[ 72 477   0   0   0   0   0   0]
Index4266 :souurce[784   4 101   0   0   0   0   0]
Index4267 :souurce[ 11 771   0   0   0   0   0   0]
Index4268 :souur

Index7723 :souurce[   1   17   21 1581    0    0    0    0]
Index7724 :souurce[   6   37 3196    0    0    0    0    0]
Index7725 :souurce[  4  13  35 167   0   0   0   0]
Index7726 :souurce[46 21  0  0  0  0  0  0]
Index7727 :souurce[ 12  69  18 699   0   0   0   0]
Index7728 :souurce[   1   40 1673    0    0    0    0    0]
Index7729 :souurce[211  20   0   0   0   0   0   0]
Index7730 :souurce[  1 455 292   0   0   0   0   0]
Index7731 :souurce[ 2 25 33  0  0  0  0  0]
Index7732 :souurce[   8  156 1400    0    0    0    0    0]
Index7733 :souurce[  2  17 393   0   0   0   0   0]
Index7734 :souurce[127   4  29   6  36  28   0   0]
Index7735 :souurce[ 2  3 91  0  0  0  0  0]
Index7736 :souurce[ 44 210 177   0   0   0   0   0]
Index7737 :souurce[  1   7  67 247   0   0   0   0]
Index7738 :souurce[  1 263  14 839   0   0   0   0]
Index7739 :souurce[3197    4    2    0    0    0    0    0]
Index7740 :souurce[ 30  65 901   0   0   0   0   0]
Index7741 :souurce[  1  10  58  67  27 953   0  

In [74]:
# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

In [83]:
# load model
model = load_model('model.h5')
# test on some training sequences
print('train')
evaluate_model(model, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, testX, test)




train
src=[tom kann laufen], target=[tom can walk], predicted=[tom can win]
src=[ich mag kunst], target=[i like art], predicted=[i like art]
src=[wir werden tom fragen], target=[well ask tom], predicted=[well ask tom]
src=[ich verwende das], target=[i use this], predicted=[i use this]
src=[ich liebe schnee], target=[i love snow], predicted=[i love snow]
src=[lies weiter], target=[keep reading], predicted=[keep reading]
src=[darf ich mich setzen], target=[can i sit down], predicted=[may i sit]
src=[ich gebe franzosischunterricht], target=[i teach french], predicted=[i cried french]
src=[sie hat ihn geschlagen], target=[she hit him], predicted=[she hit him]
src=[sind sie hier], target=[are you here], predicted=[are they here]
BLEU-1: 0.088656
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000
test
src=[tom wird gehasst], target=[tom is hated], predicted=[tom will pay]
src=[auf wiedersehen], target=[see you again], predicted=[goodbye]
src=[alle lugen], target=[everybody lies], predicted=[