# Imports and Dependencies

In [1]:
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.datasets.base import Bunch
import sklearn.metrics
import sys

import numpy as np
import regex 
import string 
from tqdm import tqdm_notebook
from embedding import load_embedding
from utils import normalize_questions, perf_measure, printTrace



from keras.layers import LSTM, GRU, Dense, Dropout, Flatten, MaxPooling1D, Convolution1D, Input, concatenate, Lambda, Bidirectional, Input, Dense, Embedding, Conv2D, MaxPool2D, Reshape, Flatten, Dropout, Concatenate
from keras.layers.convolutional import Conv1D
from keras.models import Sequential, load_model, model_from_config
from keras.models import Model
import keras.backend as K
print('Available GPUs:' + str(K.tensorflow_backend._get_available_gpus()))


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Available GPUs:['/job:localhost/replica:0/task:0/device:GPU:0']


# Constants

In [2]:
Embedding_path = 'Embeddings/esTech_enTech.enesJ.vec'
Dataset_path = 'Dataset/'
Save_path = 'Weights/'
BATCH_SIZE = 1024
EMB_DIMS = 300
MAX_WORDS = 25
EPOCHS = 128

# Prepare Data

In [3]:
def pad_sentence(sentence,dims):
    try:
        if len(sentence) >= MAX_WORDS:
            return sentence[:MAX_WORDS]
        else:
            while(len(sentence) < MAX_WORDS):
                sentence = np.concatenate((sentence,[np.zeros(dims)]),axis=0)            
            return sentence
    except:
        print('Error in sentence: ' + str(sentence))
        raise ValueError('ReadingError')

def sentence_to_vectors(sentence,emb,default_vec,prefix=None):
    words = normalize_questions(sentence)
    if len(sentence)>0:
        vectors = []
        for word in words:
            try:
                if prefix:
                    vectors.append(emb.word_to_vector(prefix+'/'+word))
                else:
                    vectors.append(emb.word_to_vector(word))
            except KeyError as r:
                vectors.append(default_vec)
                #print('Missing word: ' + str(word))
        return pad_sentence(vectors, emb.dims)
    
    else:
        print('Error in sentence: ' + str(sentence))
        raise ValueError('ReadingError')

def dataset_to_vectors(dataset_path, emb, default_vec, prefix = None):
    datasetA = []
    datasetB = []
    golds = []

    with open(dataset_path,'r') as file:
        for line in tqdm_notebook(file):
            line = line.rstrip()
            try:
                q1,q2,g = line.split('\t')
                #print(q1)
                #print(q2)
                #print(g)
                qA = sentence_to_vectors(q1, emb, default_vec, prefix)
                qB = sentence_to_vectors(q2, emb, default_vec, prefix)
                datasetA.append(qA)
                datasetB.append(qB)
                golds.append(int(g))
            except:
                print('Error in line: ' + line)
                continue
            
    return Bunch(X=[np.asarray(datasetA),np.asarray(datasetB)], y=golds)


emb = load_embedding(Embedding_path, normalize_dimensionwise=False, length_normalize=False, lower=False)
EMB_DIMS = emb.dims
default_vec = np.mean(emb.vectors, axis=0)

train_set = dataset_to_vectors(Dataset_path+'train_set.csv', emb, default_vec, 'en')
dev_set = dataset_to_vectors(Dataset_path+'dev_set.csv', emb, default_vec, 'en')
test_set = dataset_to_vectors(Dataset_path+'test_set_corr.csv', emb, default_vec, 'es')

<2019-06-18 02:11:39.275751>  Loading embedding Embeddings/esTech_enTech.enesJ.vec: 99%


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Error in sentence: []
Error in line: ?	Given that PACER makes money from usage, why does the legal profession tolerate its absurd awfulness?	0
Error in sentence: []
Error in line: What is this?	o	0
Error in line: Mathematical Puzzles: What is () + () + () = 30 using 1,3,5,7,9,11,13,15?	How do I Simplify the following matrices:
Error in line: 1. -5 [-7 0 0 5]; 2. 3 [6 4 5 -5 3 1]?	0
Error in sentence: []
Error in line: FROM WHICH website CAN WE DOWNLOAD PORNOGRAPHY SAFELY AND free?	مين عايز هامبورجر و مين عايز سوسيس؟?	0



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Error in sentence: []
Error in line: …..	What do I do with this guy?	0
Error in sentence: []
Error in line: no	I have a BS and MPH and hate my job. I found that I love working with website design but is it worth going back to school?	0
Error in sentence: []
Error in line: What is the psychological explanation to the existence of Rudalis in time of Rajas in India?	ماذا علي ان افعل لكي اشعر بالسعادة و الرضى ؟	0



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




# EVALUATION

In [4]:
def evaluate(model, dataset):
    
    gold_scores = dataset.y
    scores = [0 if x <0.5 else 1 for x in model.predict(dataset.X, batch_size=BATCH_SIZE)]
    
    result = sklearn.metrics.log_loss(gold_scores, scores)
    TP, FP, TN, FN = perf_measure(gold_scores, scores)
    acc = np.sum(np.array(gold_scores) == np.array(scores)) / len(gold_scores)
    #print('======================================')
    print('Log Loss: ' + str(result))
    print('Acc: ' + str(acc))
    print('TP: ' + str(TP) + '\tFP: ' + str(FP) + '\tTN: ' + str(TN) + '\tFN: ' + str(FN))
    #print('======================================')
    
    return result

# MODEL

In [5]:
def build_model_concat_LSTM():
    
    inputA = Input(shape=(MAX_WORDS,EMB_DIMS))
    inputB = Input(shape=(MAX_WORDS,EMB_DIMS))
    A = Model(inputs=inputA, outputs=inputA)
    B = Model(inputs=inputB, outputs=inputB)
    
    AB = concatenate([A.output, B.output])
    
    NN = Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2))(AB)
    NN = Dense(1, activation='sigmoid')(NN)
    
    model = Model(inputs=[A.input, B.input], outputs=NN)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    
    return model


In [6]:
def build_model_LSTM_concat():
    
    inputA = Input(shape=(MAX_WORDS,EMB_DIMS))
    inputB = Input(shape=(MAX_WORDS,EMB_DIMS))
    
    A = Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2))(inputA)
    B = Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2))(inputB)
    
    A = Model(inputs=inputA, outputs=A)
    B = Model(inputs=inputB, outputs=B)
    
    AB = concatenate([A.output, B.output])
    
    NN = Dense(256, activation='sigmoid')(AB)
    NN = Dense(1, activation='sigmoid')(NN)
    
    model = Model(inputs=[A.input, B.input], outputs=NN)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    
    return model


In [7]:
def build_model_perceptron():
    
    inputA = Input(shape=(MAX_WORDS,EMB_DIMS))
    inputB = Input(shape=(MAX_WORDS,EMB_DIMS))
    A = Model(inputs=inputA, outputs=inputA)
    B = Model(inputs=inputB, outputs=inputB)
    
    AB = concatenate([A.output, B.output])
    
    NN = Dense(300, activation='sigmoid')(AB)
    NN = Dense(1, activation='sigmoid')(NN)
    
    model = Model(inputs=[A.input, B.input], outputs=NN)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    
    return model

# RUN BASIC PERCEPTRON

In [8]:
model = build_model_concat_LSTM()

bestEN = sys.float_info.max
bestES = sys.float_info.max

for epoch in range(EPOCHS):
    printTrace('Epoch ' +str(epoch+1) + ' of ' +str(EPOCHS))
    model.fit(train_set.X, train_set.y, batch_size=BATCH_SIZE, epochs=1, verbose=1)
    print("DEVELOPMENT SET (ENGLISH)")
    
    result_en = evaluate(model,dev_set)
    
    if result_en < bestEN:
        bestEN = result_en
        model.save(Save_path+'bestEN_perceptron.h5')
    
    print("DEVELOPMENT SET (SPANISH)")
    result_es = evaluate(model,test_set)
    if result_es < bestES:
        bestEN = result_en
        model.save(Save_path+'bestES_perceptron.h5')

    
model.save(Save_path+'final_perceptron.h5')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 25, 300)      0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 25, 300)      0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 25, 600)      0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 256)          746496      concatenate_1[0][0]              
__________

DEVELOPMENT SET (ENGLISH)
Log Loss: 8.501790134860672
Acc: 0.7538499597996166
TP: 3390	FP: 1394	TN: 8799	FN: 2586
DEVELOPMENT SET (SPANISH)
Log Loss: 15.887845137633217
Acc: 0.54
TP: 5	FP: 1	TN: 49	FN: 45
<2019-06-18 02:19:09.034627>  Epoch 21 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.501788849095743
Acc: 0.7538499597996166
TP: 3364	FP: 1368	TN: 8825	FN: 2612
DEVELOPMENT SET (SPANISH)
Log Loss: 15.887845137633217
Acc: 0.54
TP: 5	FP: 1	TN: 49	FN: 45
<2019-06-18 02:19:15.951783>  Epoch 22 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.456954011102434
Acc: 0.7551487414187643
TP: 3860	FP: 1843	TN: 8350	FN: 2116
DEVELOPMENT SET (SPANISH)
Log Loss: 16.233224905608022
Acc: 0.53
TP: 3	FP: 0	TN: 50	FN: 47
<2019-06-18 02:19:22.869290>  Epoch 23 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.488994536501826
Acc: 0.7542210402622302
TP: 3822	FP: 1820	TN: 8373	FN: 2154
DEVELOPMENT SET (SPANISH)
Log Loss: 16.233224905608022
Acc: 0.53
TP: 3	FP: 0	TN: 50	FN: 47
<2019-06-18 

DEVELOPMENT SET (ENGLISH)
Log Loss: 8.412117491724217
Acc: 0.7564475230379121
TP: 4322	FP: 2284	TN: 7909	FN: 1654
DEVELOPMENT SET (SPANISH)
Log Loss: 15.887853133607518
Acc: 0.54
TP: 6	FP: 2	TN: 48	FN: 44
<2019-06-18 02:21:55.775844>  Epoch 45 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.315964216475217
Acc: 0.7592306265075144
TP: 3795	FP: 1712	TN: 8481	FN: 2181
DEVELOPMENT SET (SPANISH)
Log Loss: 17.269396193429642
Acc: 0.5
TP: 1	FP: 1	TN: 49	FN: 49
<2019-06-18 02:22:02.738881>  Epoch 46 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.281796481671565
Acc: 0.7602201744078174
TP: 4014	FP: 1915	TN: 8278	FN: 1962
DEVELOPMENT SET (SPANISH)
Log Loss: 16.233240897556627
Acc: 0.53
TP: 5	FP: 2	TN: 48	FN: 45
<2019-06-18 02:22:09.732136>  Epoch 47 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.313830726603788
Acc: 0.7592924732512834
TP: 3849	FP: 1765	TN: 8428	FN: 2127
DEVELOPMENT SET (SPANISH)
Log Loss: 15.54245737368411
Acc: 0.55
TP: 6	FP: 1	TN: 49	FN: 44
<2019-06-18 02

DEVELOPMENT SET (ENGLISH)
Log Loss: 8.37151793442294
Acc: 0.7576226111695219
TP: 4069	FP: 2012	TN: 8181	FN: 1907
DEVELOPMENT SET (SPANISH)
Log Loss: 15.887901109453328
Acc: 0.54
TP: 12	FP: 8	TN: 42	FN: 38
<2019-06-18 02:24:42.337872>  Epoch 69 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.46979421561392
Acc: 0.7547776609561506
TP: 4330	FP: 2319	TN: 7874	FN: 1646
DEVELOPMENT SET (SPANISH)
Log Loss: 15.197109589606512
Acc: 0.56
TP: 12	FP: 6	TN: 44	FN: 38
<2019-06-18 02:24:49.255841>  Epoch 70 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.326662969263197
Acc: 0.7589213927886697
TP: 4158	FP: 2080	TN: 8113	FN: 1818
DEVELOPMENT SET (SPANISH)
Log Loss: 15.88790910542763
Acc: 0.54
TP: 13	FP: 9	TN: 41	FN: 37
<2019-06-18 02:24:56.210815>  Epoch 71 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.480468934488174
Acc: 0.754468427237306
TP: 4207	FP: 2201	TN: 7992	FN: 1769
DEVELOPMENT SET (SPANISH)
Log Loss: 13.815550537835783
Acc: 0.6
TP: 15	FP: 5	TN: 45	FN: 35
<2019-06-18 0

DEVELOPMENT SET (ENGLISH)
Log Loss: 8.30956607112831
Acc: 0.7594161667388212
TP: 4004	FP: 1918	TN: 8275	FN: 1972
DEVELOPMENT SET (SPANISH)
Log Loss: 15.542497353555618
Acc: 0.55
TP: 11	FP: 6	TN: 44	FN: 39
<2019-06-18 02:27:29.152222>  Epoch 93 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.362965430250712
Acc: 0.7578699981445977
TP: 3910	FP: 1849	TN: 8344	FN: 2066
DEVELOPMENT SET (SPANISH)
Log Loss: 15.197157565452324
Acc: 0.56
TP: 18	FP: 12	TN: 38	FN: 32
<2019-06-18 02:27:36.081472>  Epoch 94 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.307434460451786
Acc: 0.7594780134825901
TP: 4096	FP: 2009	TN: 8184	FN: 1880
DEVELOPMENT SET (SPANISH)
Log Loss: 14.160946297759192
Acc: 0.59
TP: 15	FP: 6	TN: 44	FN: 35
<2019-06-18 02:27:43.042559>  Epoch 95 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.431325873369193
Acc: 0.7558909023439916
TP: 3977	FP: 1948	TN: 8245	FN: 1999
DEVELOPMENT SET (SPANISH)
Log Loss: 14.851769801503215
Acc: 0.57
TP: 19	FP: 12	TN: 38	FN: 31
<2019-0

Log Loss: 8.324515286525022
Acc: 0.7589832395324386
TP: 3925	FP: 1846	TN: 8347	FN: 2051
DEVELOPMENT SET (SPANISH)
Log Loss: 14.851777797477519
Acc: 0.57
TP: 20	FP: 13	TN: 37	FN: 30
<2019-06-18 02:30:08.969307>  Epoch 116 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.322379966911193
Acc: 0.7590450862762076
TP: 3942	FP: 1862	TN: 8331	FN: 2034
DEVELOPMENT SET (SPANISH)
Log Loss: 13.815606509655895
Acc: 0.6
TP: 22	FP: 12	TN: 38	FN: 28
<2019-06-18 02:30:15.899981>  Epoch 117 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.570195134679295
Acc: 0.7518708639990105
TP: 4358	FP: 2394	TN: 7799	FN: 1618
DEVELOPMENT SET (SPANISH)
Log Loss: 16.233288873402433
Acc: 0.53
TP: 11	FP: 8	TN: 42	FN: 39
<2019-06-18 02:30:22.828835>  Epoch 118 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.26470902896368
Acc: 0.7607149483579689
TP: 4051	FP: 1944	TN: 8249	FN: 1925
DEVELOPMENT SET (SPANISH)
Log Loss: 14.851729821631707
Acc: 0.57
TP: 14	FP: 7	TN: 43	FN: 36
<2019-06-18 02:30:29.770306>  E

In [9]:
print(" ===== LAST: ====")
mode = load_model(Save_path+'final_perceptron.h5')
print("DEVELOPMENT SET (ENGLISH)")
evaluate(model,dev_set)
print("DEVELOPMENT SET (SPANISH)")
evaluate(model,test_set)

print()
print()
print()

print(" ===== BEST IN DEV SET: ====")
mode = load_model(Save_path+'bestEN_perceptron.h5')
print("DEVELOPMENT SET (ENGLISH)")
evaluate(model,dev_set)
print("DEVELOPMENT SET (SPANISH)")
evaluate(model,test_set)

print()
print()
print()

print(" ===== BEST IN TEST SET: ====")
mode = load_model(Save_path+'bestES_perceptron.h5')
print("DEVELOPMENT SET (ENGLISH)")
evaluate(model,dev_set)
print("DEVELOPMENT SET (SPANISH)")
evaluate(model,test_set)

 ===== LAST: ====
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.476196119350627
Acc: 0.7545921207248438
TP: 4197	FP: 2189	TN: 8004	FN: 1779
DEVELOPMENT SET (SPANISH)
Log Loss: 15.887925097376232
Acc: 0.54
TP: 15	FP: 11	TN: 39	FN: 35



 ===== BEST IN DEV SET: ====
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.476196119350627
Acc: 0.7545921207248438
TP: 4197	FP: 2189	TN: 8004	FN: 1779
DEVELOPMENT SET (SPANISH)
Log Loss: 15.887925097376232
Acc: 0.54
TP: 15	FP: 11	TN: 39	FN: 35



 ===== BEST IN TEST SET: ====
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.476196119350627
Acc: 0.7545921207248438
TP: 4197	FP: 2189	TN: 8004	FN: 1779
DEVELOPMENT SET (SPANISH)
Log Loss: 15.887925097376232
Acc: 0.54
TP: 15	FP: 11	TN: 39	FN: 35


15.887925097376232

# RUN CONCAT LSTM MODEL

In [10]:
model = build_model_concat_LSTM()

bestEN = sys.float_info.max
bestES = sys.float_info.max

for epoch in range(EPOCHS):
    printTrace('Epoch ' +str(epoch+1) + ' of ' +str(EPOCHS))
    model.fit(train_set.X, train_set.y, batch_size=BATCH_SIZE, epochs=1, verbose=1)
    print("DEVELOPMENT SET (ENGLISH)")
    
    result_en = evaluate(model,dev_set)
    
    if result_en < bestEN:
        bestEN = result_en
        model.save(Save_path+'bestEN_concat_LSTM.h5')
    
    print("DEVELOPMENT SET (SPANISH)")
    result_es = evaluate(model,test_set)
    if result_es < bestES:
        bestEN = result_en
        model.save(Save_path+'bestES_concat_LSTM.h5')

    
model.save(Save_path+'final_concat_LSTM.h5')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 25, 300)      0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 25, 300)      0                                            
__________________________________________________________________________________________________
concatenate_2 (Concatenate)     (None, 25, 600)      0           input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 256)          746496      concatenate_2[0][0]              
__________

DEVELOPMENT SET (ENGLISH)
Log Loss: 8.657761486817886
Acc: 0.7493351475044839
TP: 4030	FP: 2107	TN: 8086	FN: 1946
DEVELOPMENT SET (SPANISH)
Log Loss: 13.124775009937569
Acc: 0.62
TP: 17	FP: 5	TN: 45	FN: 33
<2019-06-18 02:34:13.012239>  Epoch 21 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.702619072959989
Acc: 0.7480363658853362
TP: 3994	FP: 2092	TN: 8101	FN: 1982
DEVELOPMENT SET (SPANISH)
Log Loss: 14.851681845785896
Acc: 0.57
TP: 8	FP: 1	TN: 49	FN: 42
<2019-06-18 02:34:19.956087>  Epoch 22 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.446251104304672
Acc: 0.755457975137609
TP: 3413	FP: 1391	TN: 8802	FN: 2563
DEVELOPMENT SET (SPANISH)
Log Loss: 14.851681845785896
Acc: 0.57
TP: 8	FP: 1	TN: 49	FN: 42
<2019-06-18 02:34:26.882866>  Epoch 23 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.538094672853077
Acc: 0.7527985651555446
TP: 3184	FP: 1205	TN: 8988	FN: 2792
DEVELOPMENT SET (SPANISH)
Log Loss: 12.779355262091256
Acc: 0.63
TP: 14	FP: 1	TN: 49	FN: 36
<2019-06-18

DEVELOPMENT SET (ENGLISH)
Log Loss: 8.160030448416014
Acc: 0.763745438802647
TP: 3915	FP: 1759	TN: 8434	FN: 2061
DEVELOPMENT SET (SPANISH)
Log Loss: 12.088611718090249
Acc: 0.65
TP: 20	FP: 5	TN: 45	FN: 30
<2019-06-18 02:36:59.888206>  Epoch 45 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.405709406425336
Acc: 0.7566330632692189
TP: 4330	FP: 2289	TN: 7904	FN: 1646
DEVELOPMENT SET (SPANISH)
Log Loss: 12.088619714064551
Acc: 0.65
TP: 21	FP: 6	TN: 44	FN: 29
<2019-06-18 02:37:06.814895>  Epoch 46 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.117307786267762
Acc: 0.7649823736780259
TP: 3926	FP: 1750	TN: 8443	FN: 2050
DEVELOPMENT SET (SPANISH)
Log Loss: 12.433967498142149
Acc: 0.64
TP: 15	FP: 1	TN: 49	FN: 35
<2019-06-18 02:37:13.768598>  Epoch 47 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.200614823800592
Acc: 0.7625703506710372
TP: 3861	FP: 1724	TN: 8469	FN: 2115
DEVELOPMENT SET (SPANISH)
Log Loss: 11.743191970243934
Acc: 0.66
TP: 17	FP: 1	TN: 49	FN: 33
<2019-06-

DEVELOPMENT SET (ENGLISH)
Log Loss: 8.384336083120594
Acc: 0.7572515307069083
TP: 4093	FP: 2042	TN: 8151	FN: 1883
DEVELOPMENT SET (SPANISH)
Log Loss: 12.088603722115947
Acc: 0.65
TP: 19	FP: 4	TN: 46	FN: 31
<2019-06-18 02:39:46.570131>  Epoch 69 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.275389533780125
Acc: 0.7604057146391242
TP: 4045	FP: 1943	TN: 8250	FN: 1931
DEVELOPMENT SET (SPANISH)
Log Loss: 14.506318069759695
Acc: 0.58
TP: 12	FP: 4	TN: 46	FN: 38
<2019-06-18 02:39:53.498206>  Epoch 70 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.499689530900005
Acc: 0.7539118065433855
TP: 4109	FP: 2112	TN: 8081	FN: 1867
DEVELOPMENT SET (SPANISH)
Log Loss: 12.779379250014161
Acc: 0.63
TP: 17	FP: 4	TN: 46	FN: 33
<2019-06-18 02:40:00.443579>  Epoch 71 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.279656068450505
Acc: 0.7602820211515864
TP: 3928	FP: 1828	TN: 8365	FN: 2048
DEVELOPMENT SET (SPANISH)
Log Loss: 12.779363258065557
Acc: 0.63
TP: 15	FP: 2	TN: 48	FN: 35
<2019-06

DEVELOPMENT SET (ENGLISH)
Log Loss: 8.439884658008591
Acc: 0.7556435153689158
TP: 4263	FP: 2238	TN: 7955	FN: 1713
DEVELOPMENT SET (SPANISH)
Log Loss: 14.160906317887685
Acc: 0.59
TP: 10	FP: 1	TN: 49	FN: 40
<2019-06-18 02:42:33.317225>  Epoch 93 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.324511676492714
Acc: 0.7589832395324386
TP: 3852	FP: 1773	TN: 8420	FN: 2124
DEVELOPMENT SET (SPANISH)
Log Loss: 14.851681845785897
Acc: 0.57
TP: 8	FP: 1	TN: 49	FN: 42
<2019-06-18 02:42:40.286341>  Epoch 94 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.217712216460452
Acc: 0.7620755767208857
TP: 4025	FP: 1896	TN: 8297	FN: 1951
DEVELOPMENT SET (SPANISH)
Log Loss: 14.506318069759695
Acc: 0.58
TP: 12	FP: 4	TN: 46	FN: 38
<2019-06-18 02:42:47.238419>  Epoch 95 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.215572940646831
Acc: 0.7621374234646546
TP: 3962	FP: 1832	TN: 8361	FN: 2014
DEVELOPMENT SET (SPANISH)
Log Loss: 13.815558533810085
Acc: 0.6
TP: 16	FP: 6	TN: 44	FN: 34
<2019-06-1

DEVELOPMENT SET (ENGLISH)
Log Loss: 8.230521710971065
Acc: 0.761704496258272
TP: 3874	FP: 1751	TN: 8442	FN: 2102
DEVELOPMENT SET (SPANISH)
Log Loss: 14.851689841760198
Acc: 0.57
TP: 9	FP: 2	TN: 48	FN: 41
<2019-06-18 02:45:20.225664>  Epoch 117 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.362975073487702
Acc: 0.7578699981445977
TP: 4105	FP: 2044	TN: 8149	FN: 1871
DEVELOPMENT SET (SPANISH)
Log Loss: 13.815534545887179
Acc: 0.6
TP: 13	FP: 3	TN: 47	FN: 37
<2019-06-18 02:45:27.146584>  Epoch 118 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.322383725300991
Acc: 0.7590450862762076
TP: 4018	FP: 1938	TN: 8255	FN: 1958
DEVELOPMENT SET (SPANISH)
Log Loss: 14.160922309836288
Acc: 0.59
TP: 12	FP: 3	TN: 47	FN: 38
<2019-06-18 02:45:34.101537>  Epoch 119 of 128
Epoch 1/1
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.29889001703663
Acc: 0.7597254004576659
TP: 4100	FP: 2009	TN: 8184	FN: 1876
DEVELOPMENT SET (SPANISH)
Log Loss: 14.851713829683103
Acc: 0.57
TP: 12	FP: 5	TN: 45	FN: 38
<2019-06-

In [11]:
print(" ===== LAST: ====")
mode = load_model(Save_path+'final_concat_LSTM.h5')
print("DEVELOPMENT SET (ENGLISH)")
evaluate(model,dev_set)
print("DEVELOPMENT SET (SPANISH)")
evaluate(model,test_set)

print()
print()
print()

print(" ===== BEST IN DEV SET: ====")
mode = load_model(Save_path+'bestEN_concat_LSTM.h5')
print("DEVELOPMENT SET (ENGLISH)")
evaluate(model,dev_set)
print("DEVELOPMENT SET (SPANISH)")
evaluate(model,test_set)

print()
print()
print()

print(" ===== BEST IN TEST SET: ====")
mode = load_model(Save_path+'bestES_concat_LSTM.h5')
print("DEVELOPMENT SET (ENGLISH)")
evaluate(model,dev_set)
print("DEVELOPMENT SET (SPANISH)")
evaluate(model,test_set)

 ===== LAST: ====
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.414235107343984
Acc: 0.7563856762941431
TP: 3947	FP: 1910	TN: 8283	FN: 2029
DEVELOPMENT SET (SPANISH)
Log Loss: 14.506318069759695
Acc: 0.58
TP: 12	FP: 4	TN: 46	FN: 38



 ===== BEST IN DEV SET: ====
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.414235107343984
Acc: 0.7563856762941431
TP: 3947	FP: 1910	TN: 8283	FN: 2029
DEVELOPMENT SET (SPANISH)
Log Loss: 14.506318069759695
Acc: 0.58
TP: 12	FP: 4	TN: 46	FN: 38



 ===== BEST IN TEST SET: ====
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.414235107343984
Acc: 0.7563856762941431
TP: 3947	FP: 1910	TN: 8283	FN: 2029
DEVELOPMENT SET (SPANISH)
Log Loss: 14.506318069759695
Acc: 0.58
TP: 12	FP: 4	TN: 46	FN: 38


14.506318069759695

# RUN LSTM CONCAT

In [12]:
model = build_model_LSTM_concat()

bestEN = sys.float_info.max
bestES = sys.float_info.max

for epoch in range(EPOCHS):
    printTrace('Epoch ' +str(epoch+1) + ' of ' +str(EPOCHS))
    model.fit(train_set.X, train_set.y, batch_size=BATCH_SIZE, epochs=1, verbose=2)
    print("DEVELOPMENT SET (ENGLISH)")
    
    result_en = evaluate(model,dev_set)
    
    if result_en < bestEN:
        bestEN = result_en
        model.save(Save_path+'bestEN_LSTM_concat.h5')
    
    print("DEVELOPMENT SET (SPANISH)")
    result_es = evaluate(model,test_set)
    if result_es < bestES:
        bestEN = result_en
        model.save(Save_path+'bestES_LSTM_concat.h5')
    
    
model.save(Save_path+'final_LSTM_concat.h5')


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 25, 300)      0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 25, 300)      0                                            
__________________________________________________________________________________________________
bidirectional_3 (Bidirectional) (None, 256)          439296      input_5[0][0]                    
__________________________________________________________________________________________________
bidirectional_4 (Bidirectional) (None, 256)          439296      input_6[0][0]                    
__________________________________________________________________________________________________
concatenat

 - 9s - loss: 0.5145 - acc: 0.7422
DEVELOPMENT SET (ENGLISH)
Log Loss: 9.38828054045528
Acc: 0.7281835611355062
TP: 3064	FP: 1483	TN: 8710	FN: 2912
DEVELOPMENT SET (SPANISH)
Log Loss: 17.269388197455342
Acc: 0.5
TP: 0	FP: 0	TN: 50	FN: 50
<2019-06-18 02:50:41.672258>  Epoch 23 of 128
Epoch 1/1
 - 9s - loss: 0.5112 - acc: 0.7446
DEVELOPMENT SET (ENGLISH)
Log Loss: 9.435281111348308
Acc: 0.7268229327725895
TP: 3166	FP: 1607	TN: 8586	FN: 2810
DEVELOPMENT SET (SPANISH)
Log Loss: 17.269388197455342
Acc: 0.5
TP: 0	FP: 0	TN: 50	FN: 50
<2019-06-18 02:50:51.505778>  Epoch 24 of 128
Epoch 1/1
 - 9s - loss: 0.5067 - acc: 0.7460
DEVELOPMENT SET (ENGLISH)
Log Loss: 9.501516372614924
Acc: 0.7249056837157524
TP: 3455	FP: 1927	TN: 8266	FN: 2521
DEVELOPMENT SET (SPANISH)
Log Loss: 17.269388197455342
Acc: 0.5
TP: 0	FP: 0	TN: 50	FN: 50
<2019-06-18 02:51:01.379678>  Epoch 25 of 128
Epoch 1/1
 - 9s - loss: 0.5030 - acc: 0.7473
DEVELOPMENT SET (ENGLISH)
Log Loss: 9.324205226146193
Acc: 0.7300389634485744
TP:

 - 9s - loss: 0.4003 - acc: 0.8080
DEVELOPMENT SET (ENGLISH)
Log Loss: 9.332777362959883
Acc: 0.7297915764734987
TP: 3812	FP: 2205	TN: 7988	FN: 2164
DEVELOPMENT SET (SPANISH)
Log Loss: 17.614791953353052
Acc: 0.49
TP: 1	FP: 2	TN: 48	FN: 49
<2019-06-18 02:55:18.131358>  Epoch 51 of 128
Epoch 1/1
 - 9s - loss: 0.3931 - acc: 0.8142
DEVELOPMENT SET (ENGLISH)
Log Loss: 9.008058149350598
Acc: 0.7391922815263776
TP: 3350	FP: 1591	TN: 8602	FN: 2626
DEVELOPMENT SET (SPANISH)
Log Loss: 17.269396193429642
Acc: 0.5
TP: 1	FP: 1	TN: 49	FN: 49
<2019-06-18 02:55:28.023890>  Epoch 52 of 128
Epoch 1/1
 - 9s - loss: 0.3917 - acc: 0.8138
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.982424176222665
Acc: 0.7399344424516049
TP: 3349	FP: 1578	TN: 8615	FN: 2627
DEVELOPMENT SET (SPANISH)
Log Loss: 16.924000433506237
Acc: 0.51
TP: 1	FP: 0	TN: 50	FN: 49
<2019-06-18 02:55:37.915701>  Epoch 53 of 128
Epoch 1/1
 - 9s - loss: 0.3850 - acc: 0.8180
DEVELOPMENT SET (ENGLISH)
Log Loss: 9.492999573145807
Acc: 0.7251530706908281


 - 9s - loss: 0.2856 - acc: 0.8725
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.858528807105465
Acc: 0.7435215535902034
TP: 3388	FP: 1559	TN: 8634	FN: 2588
DEVELOPMENT SET (SPANISH)
Log Loss: 16.92402442142914
Acc: 0.51
TP: 4	FP: 3	TN: 47	FN: 46
<2019-06-18 02:59:54.582506>  Epoch 79 of 128
Epoch 1/1
 - 9s - loss: 0.2814 - acc: 0.8743
DEVELOPMENT SET (ENGLISH)
Log Loss: 9.012337838385278
Acc: 0.7390685880388398
TP: 3499	FP: 1742	TN: 8451	FN: 2477
DEVELOPMENT SET (SPANISH)
Log Loss: 16.233248893530927
Acc: 0.53
TP: 6	FP: 3	TN: 47	FN: 44
<2019-06-18 03:00:04.416896>  Epoch 80 of 128
Epoch 1/1
 - 9s - loss: 0.2773 - acc: 0.8756
DEVELOPMENT SET (ENGLISH)
Log Loss: 9.11488475880392
Acc: 0.7360999443379306
TP: 3726	FP: 2017	TN: 8176	FN: 2250
DEVELOPMENT SET (SPANISH)
Log Loss: 16.924016425454838
Acc: 0.51
TP: 3	FP: 2	TN: 48	FN: 47
<2019-06-18 03:00:14.269388>  Epoch 81 of 128
Epoch 1/1
 - 9s - loss: 0.2739 - acc: 0.8770
DEVELOPMENT SET (ENGLISH)
Log Loss: 9.023023832428935
Acc: 0.7387593543199951
T

 - 9s - loss: 0.2000 - acc: 0.9150
DEVELOPMENT SET (ENGLISH)
Log Loss: 9.283643401100422
Acc: 0.7312140515801843
TP: 3766	FP: 2136	TN: 8057	FN: 2210
DEVELOPMENT SET (SPANISH)
Log Loss: 17.269412185378247
Acc: 0.5
TP: 3	FP: 3	TN: 47	FN: 47
<2019-06-18 03:04:30.816941>  Epoch 107 of 128
Epoch 1/1
 - 9s - loss: 0.2008 - acc: 0.9150
DEVELOPMENT SET (ENGLISH)
Log Loss: 9.253734486377557
Acc: 0.7320799059929495
TP: 3712	FP: 2068	TN: 8125	FN: 2264
DEVELOPMENT SET (SPANISH)
Log Loss: 16.924056405326347
Acc: 0.51
TP: 8	FP: 7	TN: 43	FN: 42
<2019-06-18 03:04:40.715995>  Epoch 108 of 128
Epoch 1/1
 - 9s - loss: 0.1974 - acc: 0.9145
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.783752316836026
Acc: 0.7456861896221164
TP: 3168	FP: 1304	TN: 8889	FN: 2808
DEVELOPMENT SET (SPANISH)
Log Loss: 17.269412185378247
Acc: 0.5
TP: 3	FP: 3	TN: 47	FN: 47
<2019-06-18 03:04:50.612218>  Epoch 109 of 128
Epoch 1/1
 - 9s - loss: 0.1959 - acc: 0.9163
DEVELOPMENT SET (ENGLISH)
Log Loss: 9.225965836518261
Acc: 0.732883913661945

In [13]:
print(" ===== LAST: ====")
mode = load_model(Save_path+'final_LSTM_concat.h5')
print("DEVELOPMENT SET (ENGLISH)")
evaluate(model,dev_set)
print("DEVELOPMENT SET (SPANISH)")
evaluate(model,test_set)

print()
print()
print()

print(" ===== BEST IN DEV SET: ====")
mode = load_model(Save_path+'bestEN_LSTM_concat.h5')
print("DEVELOPMENT SET (ENGLISH)")
evaluate(model,dev_set)
print("DEVELOPMENT SET (SPANISH)")
evaluate(model,test_set)

print()
print()
print()

print(" ===== BEST IN TEST SET: ====")
mode = load_model(Save_path+'bestES_LSTM_concat.h5')
print("DEVELOPMENT SET (ENGLISH)")
evaluate(model,dev_set)
print("DEVELOPMENT SET (SPANISH)")
evaluate(model,test_set)

 ===== LAST: ====
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.99737897975158
Acc: 0.7395015152452223
TP: 3383	FP: 1619	TN: 8574	FN: 2593
DEVELOPMENT SET (SPANISH)
Log Loss: 16.92402442142914
Acc: 0.51
TP: 4	FP: 3	TN: 47	FN: 46



 ===== BEST IN DEV SET: ====
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.99737897975158
Acc: 0.7395015152452223
TP: 3383	FP: 1619	TN: 8574	FN: 2593
DEVELOPMENT SET (SPANISH)
Log Loss: 16.92402442142914
Acc: 0.51
TP: 4	FP: 3	TN: 47	FN: 46



 ===== BEST IN TEST SET: ====
DEVELOPMENT SET (ENGLISH)
Log Loss: 8.99737897975158
Acc: 0.7395015152452223
TP: 3383	FP: 1619	TN: 8574	FN: 2593
DEVELOPMENT SET (SPANISH)
Log Loss: 16.92402442142914
Acc: 0.51
TP: 4	FP: 3	TN: 47	FN: 46


16.92402442142914

In [14]:
[0 if x <0.3 else 1 for x in model.predict(test_set.X, batch_size=BATCH_SIZE)]

[0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [15]:
test_set.X[0][1]

array([[ 0.0317035 ,  0.0260565 , -0.15104701, ..., -0.0778832 ,
        -0.0101717 , -0.01387   ],
       [-0.16914999, -0.0610012 , -0.0664161 , ...,  0.0448085 ,
         0.149555  ,  0.076156  ],
       [-0.15206701, -0.0355937 , -0.0981824 , ..., -0.0969076 ,
         0.191608  ,  0.0768526 ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])