In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from data_preprocessing import get_input, get_input_share_tokenizer

from keras.models import Model
from keras.layers import Input, LSTM, Dense, Activation, Bidirectional, Dropout
from keras.layers.embeddings import Embedding

from tensorflow.keras import regularizers
from io import BytesIO

In [3]:
def create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units, l2_kernel, l2_recurrent, l2_activity, dropout):
    """Assemble an uncompiled binary sequence classifier.

    Architecture: Input -> Embedding -> LSTM -> Dropout -> Dense(1) -> sigmoid.

    Parameters
    ----------
    vocabulary_length : size of the embedding table (Embedding ``input_dim``).
    max_length_sequence : length of every input sequence (Input ``shape``).
    emb_dim : size of each embedding vector (Embedding ``output_dim``).
    lstm_units : number of units in the LSTM layer.
    l2_kernel, l2_recurrent, l2_activity : L2 factors for the LSTM kernel,
        recurrent kernel and output activity regularizers, respectively.
    dropout : dropout rate applied to the LSTM output.

    Returns
    -------
    A ``Model`` from the input sequence to a single sigmoid output. The model
    is NOT compiled here; callers are expected to call ``compile`` themselves.
    """
    token_ids = Input(shape = (max_length_sequence, ))

    # Trainable embedding; mask_zero = True is requested so that Keras can
    # mask index 0 (presumably the padding value produced upstream -- confirm).
    embedded = Embedding(
        input_dim = vocabulary_length,
        output_dim = emb_dim,
        trainable = True,
        mask_zero = True,
    )(token_ids)

    # One L2 penalty per regularizable part of the LSTM.
    kernel_penalty = regularizers.l2(l2_kernel)
    recurrent_penalty = regularizers.l2(l2_recurrent)
    activity_penalty = regularizers.l2(l2_activity)

    # return_sequences = False: the LSTM emits one vector per sequence.
    encoded = LSTM(
        units = lstm_units,
        return_sequences = False,
        kernel_regularizer = kernel_penalty,
        recurrent_regularizer = recurrent_penalty,
        activity_regularizer = activity_penalty,
    )(embedded)

    regularized = Dropout(rate = dropout)(encoded)
    logit = Dense(units = 1)(regularized)
    probability = Activation('sigmoid')(logit)

    return Model(inputs = token_ids, outputs = probability)

In [138]:
# Optional sanity check (disabled): build one model and print its layer summary.
#model = create_model(10000, 500, 100, lstm_units = 128, l2_kernel = 0.01, l2_recurrent = 0.01, l2_activity = 0.01, dropout = 0.5)
#model.summary()

In [139]:
def grid_search(X, dataset, epochs, models, random_state = None):
    """Select the best candidate model by repeated hold-out validation, then test it.

    The data is split 80/20 into a training part and a test part. Every model
    in ``models`` is trained and scored on 5 random 80/20 train/dev splits of
    the training part; each split starts from the model's initial weights and
    a freshly created optimizer. The model with the highest mean dev accuracy
    is refit on the whole training part and scored on the test part.

    Parameters
    ----------
    X : array of model inputs; must support indexing with integer index arrays.
    dataset : object exposing the labels as ``dataset.label.values``
        (same row order as ``X``).
    epochs : number of epochs used for every ``fit`` call.
    models : non-empty list of Keras models; they are compiled here.
    random_state : optional seed forwarded to ``train_test_split`` and
        ``ShuffleSplit``. With an integer, the splits are reproducible and every
        candidate is scored on the same dev splits. ``None`` (the default)
        keeps the splits random.

    Returns
    -------
    (best_model_index, test_accuracy), the accuracy rounded to 3 decimals.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError('grid_search needs at least one model to evaluate')

    Y = dataset.label.values

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size = 0.2, shuffle = True, random_state = random_state)

    iterations = 5
    # Unrounded means are stored so that model selection is not decided by rounding.
    mean_dev_accuracy = np.zeros(len(models))

    for i, model in enumerate(models):

        dev_accuracy_shuffle_split = np.zeros(iterations)
        shuffle = ShuffleSplit(n_splits = iterations, test_size = 0.2, random_state = random_state)

        # Keep the initial weights in memory so every split starts from the same point.
        initial_weights = model.get_weights()

        for j, (train, dev) in enumerate(shuffle.split(X_train, y_train)):
            Xtrain, Ytrain = X_train[train], y_train[train]
            Xdev, Ydev = X_train[dev], y_train[dev]

            # Restoring the weights alone would leave the Adam state of the previous
            # split in place; compiling again creates a fresh optimizer.
            model.set_weights(initial_weights)
            model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy']) # Compile model

            model.fit(Xtrain, Ytrain, epochs = epochs, batch_size = 32, shuffle = True) # Fit model

            loss, accuracy_val = model.evaluate(Xdev, Ydev) # Validate model
            dev_accuracy_shuffle_split[j] = accuracy_val

        # Leave the model at its initial weights, so a later refit starts untrained.
        model.set_weights(initial_weights)

        mean_dev_accuracy[i] = np.mean(dev_accuracy_shuffle_split)
        mean = round(mean_dev_accuracy[i], 3)
        std = round(np.std(dev_accuracy_shuffle_split), 3)
        print('Model ' + str(i) +' --> dev_acc: ' + str(mean) + ' +- ' + str(std))

    # test
    best_model_index = np.argmax(mean_dev_accuracy)
    best_model = models[best_model_index]
    best_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy']) # Compile best model
    best_model.fit(X_train, y_train, epochs = epochs, batch_size = 32, shuffle = True) # Fit best model
    # Threshold the sigmoid outputs at 0.5 and flatten the (n, 1) column to 1-D labels.
    y_pred = (best_model.predict(X_test) > 0.5).astype(int).ravel()
    accuracy_test = accuracy_score(y_test, y_pred)

    return best_model_index, round(accuracy_test, 3)

In [81]:
# Load the merged Spanish corpus (columns: text, label) and preview only the first rows.
dataset = pd.read_csv('../data/Merged/spanish_dataset.csv')
dataset.head(10)

Unnamed: 0,text,label
0,"RAE INCLUIRÁ LA PALABRA ""LADY"" EN EL DICCIONAR...",1
1,"La palabra ""haiga"", aceptada por la RAE La Rea...",1
2,YORDI ROSADO ESCRIBIRÁ Y DISEÑARÁ LOS NUEVOS L...,1
3,UNAM capacitará a maestros para aprobar prueba...,0
4,Alerta: pretenden aprobar libros escolares con...,1
5,Un paso más cerca de hacer los exámenes 'onlin...,0
6,UNAM REALIZARÁ PRUEBAS ANTIDOPING A ESTUDIANTE...,1
7,Niño de *NUMBER* años se prepara para entrar a...,0
8,*NUMBER* palabras que creíamos inaceptables y ...,0
9,LIMITARÁN EL TIEMPO DE EGRESO EN FILOSOFÍA Y S...,1


#### Models

In [82]:
# Settings shared by preprocessing, model construction and the grid search below.
vocabulary_length = 10000  # passed to get_input and used as the Embedding input_dim
max_length_sequence = 500  # passed to get_input and used as the model input length
emb_dim = 100  # Embedding output_dim
language = 'spanish'  # forwarded to get_input
epochs = 7  # epochs for every fit call inside grid_search

In [83]:
# Preprocess the Spanish texts; X is the model input, df is the frame returned by get_input.
X, df = get_input(
    dataset,
    stemming = False,
    remove_stopwords = True,
    vocabulary_length = vocabulary_length,
    max_length_sequence = max_length_sequence,
    language = language,
)
print(X.shape)

(2571, 500)


In [74]:
# Candidates 0-2: light L2 (0.01) with 32/64/128 LSTM units.
# Candidates 3-5: heavy L2 (1, plus activity 0.1) with 32/64/128 LSTM units.
models = [
    create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = 32, l2_kernel = 0.01, l2_recurrent = 0.01, l2_activity = 0, dropout = 0),
    create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = 64, l2_kernel = 0.01, l2_recurrent = 0.01, l2_activity = 0, dropout = 0),
    create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = 128, l2_kernel = 0.01, l2_recurrent = 0.01, l2_activity = 0, dropout = 0),
    create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = 32, l2_kernel = 1, l2_recurrent = 1, l2_activity = 0.1, dropout = 0),
    create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = 64, l2_kernel = 1, l2_recurrent = 1, l2_activity = 0.1, dropout = 0),
    create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = 128, l2_kernel = 1, l2_recurrent = 1, l2_activity = 0.1, dropout = 0),
]
# Keep the individual names available as well.
model_0, model_1, model_2, model_3, model_4, model_5 = models

In [76]:
# Score every candidate in `models` with grid_search, then report the index of the
# winner and its accuracy on the held-out test split.
best_model_index, test_acc = grid_search(X, df, epochs, models)
print('best_model_index: ' + str(best_model_index), 'test acc: ' + str(test_acc))

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Model 0 --> dev_acc: 0.731 +- 0.017
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Model 1 --> dev_acc: 0.728 +- 0.014
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Model 4 --> dev_acc: 0.569 +- 0.054
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Model 5 --> dev_acc: 0.524 +- 0.075
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
best_model_index: 0 test acc: 0.74


#### New Iterations (additional hyperparameter grids on the Spanish dataset)

In [98]:
# Very small LSTMs (2/6/8 units) with dropout 0.5 and varying L2 strength.
epochs = 7
models = [
    create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = 2, l2_kernel = 0.5, l2_recurrent = 0.5, l2_activity = 0, dropout = 0.5),
    create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = 6, l2_kernel = 0, l2_recurrent = 0, l2_activity = 0, dropout = 0.5),
    create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = 8, l2_kernel = 0.1, l2_recurrent = 0.1, l2_activity = 0, dropout = 0.5),
]
# Keep the individual names available as well.
model_0, model_1, model_2 = models

In [99]:
# Score every candidate in `models` with grid_search, then report the index of the
# winner and its accuracy on the held-out test split.
best_model_index, test_acc = grid_search(X, df, epochs, models)
print('best_model_index: ' + str(best_model_index), 'test acc: ' + str(test_acc))

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Model 0 --> dev_acc: 0.585 +- 0.055
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Model 1 --> dev_acc: 0.706 +- 0.024
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/

In [100]:
# Longer training (25 epochs) with strong L2 (1) and no dropout; 8/16/32 LSTM units.
epochs = 25
models = [
    create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = 8, l2_kernel = 1, l2_recurrent = 1, l2_activity = 0, dropout = 0),
    create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = 16, l2_kernel = 1, l2_recurrent = 1, l2_activity = 0, dropout = 0),
    create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = 32, l2_kernel = 1, l2_recurrent = 1, l2_activity = 0, dropout = 0),
]
# Keep the individual names available as well.
model_0, model_1, model_2 = models

In [101]:
# Score every candidate in `models` with grid_search, then report the index of the
# winner and its accuracy on the held-out test split.
best_model_index, test_acc = grid_search(X, df, epochs, models)
print('best_model_index: ' + str(best_model_index), 'test acc: ' + str(test_acc))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoc

Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
E

Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Model 2 --> dev_acc: 0.692 +- 0.037
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25

In [102]:
# 10 epochs, dropout 0.5: 16/32/64 units with L2 0.01, plus 64 units with L2 0.1.
epochs = 10
models = [
    create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = 16, l2_kernel = 0.01, l2_recurrent = 0.01, l2_activity = 0, dropout = 0.5),
    create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = 32, l2_kernel = 0.01, l2_recurrent = 0.01, l2_activity = 0, dropout = 0.5),
    create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = 64, l2_kernel = 0.01, l2_recurrent = 0.01, l2_activity = 0, dropout = 0.5),
    create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = 64, l2_kernel = 0.1, l2_recurrent = 0.1, l2_activity = 0, dropout = 0.5),
]
# Keep the individual names available as well.
model_0, model_1, model_2, model_3 = models

In [103]:
# Score every candidate in `models` with grid_search, then report the index of the
# winner and its accuracy on the held-out test split.
best_model_index, test_acc = grid_search(X, df, epochs, models)
print('best_model_index: ' + str(best_model_index), 'test acc: ' + str(test_acc))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model 0 --> dev_acc: 0.731 +- 0.028
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

Epoch 10/10
Model 2 --> dev_acc: 0.72 +- 0.027
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model 3 --> dev_acc: 0.713 +- 0.047
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
best_model_index: 0 test acc: 0.713


#### Test Model with English Dataset

In [127]:
# Load the merged English corpus (columns: text, label) and preview only the first rows.
dataset = pd.read_csv('../data/Merged/english_dataset.csv')
dataset.head(10)

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,1
1,House Intelligence Committee Chairman Devin Nu...,1
2,"On Friday, it was revealed that former Milwauk...",1
3,"On Christmas day, Donald Trump announced that ...",1
4,Pope Francis used his annual Christmas Day mes...,1
5,The number of cases of cops brutalizing and ki...,1
6,Donald Trump spent a good portion of his day a...,1
7,In the wake of yet another court decision that...,1
8,Many people have raised the alarm regarding th...,1
9,Just when you might have thought we d get a br...,1


In [140]:
# Settings for the English experiment (longer sequences, fewer epochs).
vocabulary_length = 10000  # passed to get_input and used as the Embedding input_dim
max_length_sequence = 1500  # passed to get_input and used as the model input length
emb_dim = 100  # Embedding output_dim
language = 'english'  # forwarded to get_input
epochs = 4  # epochs for every fit call inside grid_search

In [141]:
# Preprocess the English texts; X is the model input, df is the frame returned by get_input.
X, df = get_input(
    dataset,
    stemming = False,
    remove_stopwords = True,
    vocabulary_length = vocabulary_length,
    max_length_sequence = max_length_sequence,
    language = language,
)
print(X.shape)

(51233, 1500)


In [142]:
# Tiny LSTMs (2/4/8 units) for the English corpus, with L2 or dropout as the regularizer.
models = [
    create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = 2, l2_kernel = 0.01, l2_recurrent = 0.01, l2_activity = 0, dropout = 0),
    create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = 4, l2_kernel = 0.01, l2_recurrent = 0.01, l2_activity = 0, dropout = 0),
    create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = 8, l2_kernel = 0.1, l2_recurrent = 0.1, l2_activity = 0, dropout = 0),
    create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = 8, l2_kernel = 0, l2_recurrent = 0, l2_activity = 0, dropout = 0.5),
]
# Keep the individual names available as well.
model_0, model_1, model_2, model_3 = models

In [143]:
# Score every candidate in `models` with grid_search, then report the index of the
# winner and its accuracy on the held-out test split.
best_model_index, test_acc = grid_search(X, df, epochs, models)
print('best_model_index: ' + str(best_model_index), 'test acc: ' + str(test_acc))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Model 0 --> dev_acc: 0.948 +- 0.009
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Model 1 --> dev_acc: 0.95 +- 0.02
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Model 2 --> dev_acc: 0.93 +- 0.055
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Model 3 --> dev_acc: 0.949 +- 0.014
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
best_model_index: 1

#### Train Model with English Dataset and Evaluate with Translated Dataset

In [146]:
# Training corpus for the cross-lingual experiment; preview only the first rows.
english_dataset = pd.read_csv('../data/Merged/english_dataset.csv')
english_dataset.head(10)

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,1
1,House Intelligence Committee Chairman Devin Nu...,1
2,"On Friday, it was revealed that former Milwauk...",1
3,"On Christmas day, Donald Trump announced that ...",1
4,Pope Francis used his annual Christmas Day mes...,1
5,The number of cases of cops brutalizing and ki...,1
6,Donald Trump spent a good portion of his day a...,1
7,In the wake of yet another court decision that...,1
8,Many people have raised the alarm regarding th...,1
9,Just when you might have thought we d get a br...,1


In [147]:
# Spanish corpus translated into English, used as the evaluation set; preview only the first rows.
translated_dataset = pd.read_csv('../data/Merged/spanish_t_dataset.csv')
translated_dataset.head(10)

Unnamed: 0,text,label
0,"RAE WILL INCLUDE THE WORD ""LADY"" IN THE SPANIS...",1
1,"The word ""haiga"", accepted by the RAE The Roya...",1
2,YORDI ROSADO WILL WRITE AND DESIGN THE NEW SEP...,1
3,UNAM will train teachers to pass the Pisa test...,0
4,Alert: they intend to approve school books wit...,1
5,One step closer to taking exams 'online'\nAbou...,0
6,UNAM WILL PERFORM ANTI-DOPING TESTS TO STUDENT...,1
7,* NUMBER * year-old boy prepares to enter coll...,0
8,* NUMBER * words that we thought unacceptable ...,0
9,THEY WILL LIMIT THE TIME OF EXIT IN PHILOSOPHY...,1


In [148]:
# Settings for training on English and evaluating on the translated Spanish corpus.
vocabulary_length = 10000  # passed to get_input and used as the Embedding input_dim
max_length_sequence = 1500  # passed to get_input and used as the model input length
emb_dim = 100  # Embedding output_dim
language = 'english'  # forwarded to get_input for both corpora
epochs = 7  # epochs for the single model.fit call below

In [149]:
# Preprocess the English training texts on their own (separate get_input call from the test set).
X_train, df = get_input(
    english_dataset,
    stemming = False,
    remove_stopwords = True,
    vocabulary_length = vocabulary_length,
    max_length_sequence = max_length_sequence,
    language = language,
)
print(X_train.shape)

(51233, 1500)


In [150]:
# Preprocess the translated test texts with their own get_input call.
# NOTE(review): this rebinds `df`, replacing the frame returned for the English corpus.
X_test, df = get_input(
    translated_dataset,
    stemming = False,
    remove_stopwords = True,
    vocabulary_length = vocabulary_length,
    max_length_sequence = max_length_sequence,
    language = language,
)
print(X_test.shape)

(2571, 1500)


In [151]:
# Single model: 4 LSTM units, L2 0.01 on kernel and recurrent weights, no dropout.
model = create_model(
    vocabulary_length,
    max_length_sequence,
    emb_dim,
    lstm_units = 4,
    l2_kernel = 0.01,
    l2_recurrent = 0.01,
    l2_activity = 0,
    dropout = 0,
)
# Compile model
model.compile(
    loss = 'binary_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy'],
)

In [152]:
# Labels are read from the raw English frame.
# NOTE(review): assumes get_input kept every row in its original order -- confirm.
Y_train = english_dataset.label.values
# Keep the History object instead of letting the cell echo its repr.
history = model.fit(X_train, Y_train, epochs = epochs, batch_size = 32, shuffle = True) # Fit model

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.callbacks.History at 0x1c42661290>

In [153]:
# Evaluate the English-trained model on the translated corpus.
# Labels are read from the raw translated frame.
Y_test = translated_dataset.label.values
# evaluate returns the loss followed by the compiled metric (accuracy).
loss, acc = model.evaluate(X_test, Y_test)
print(loss, round(acc, 3))

1.766238255057526 0.532


##### Sharing Tokenizer

In [4]:
# Settings for the shared-tokenizer variant of the cross-lingual experiment.
vocabulary_length = 10000  # passed to get_input_share_tokenizer and used as the Embedding input_dim
max_length_sequence = 1500  # passed to get_input_share_tokenizer and used as the model input length
emb_dim = 100  # Embedding output_dim
language = 'english'  # forwarded to get_input_share_tokenizer
epochs = 7  # epochs for the single model.fit call below

In [5]:
# Load the English training corpus and the translated Spanish evaluation corpus.
english_dataset = pd.read_csv('../data/Merged/english_dataset.csv')
translated_dataset = pd.read_csv('../data/Merged/spanish_t_dataset.csv')

In [6]:
# Preprocess both corpora in one call so that they go through get_input_share_tokenizer together.
X_train, X_test, df1, df2 = get_input_share_tokenizer(
    english_dataset,
    translated_dataset,
    stemming = False,
    remove_stopwords = True,
    vocabulary_length = vocabulary_length,
    max_length_sequence = max_length_sequence,
    language = language,
)
print(X_train.shape, X_test.shape)

(51233, 1500) (2571, 1500)


In [7]:
# Single model: 4 LSTM units, L2 0.01 on kernel and recurrent weights, no dropout.
model = create_model(
    vocabulary_length,
    max_length_sequence,
    emb_dim,
    lstm_units = 4,
    l2_kernel = 0.01,
    l2_recurrent = 0.01,
    l2_activity = 0,
    dropout = 0,
)
# Compile model
model.compile(
    loss = 'binary_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy'],
)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [8]:
# Train on the English corpus, then evaluate on the translated corpus.
# Labels are read from the raw frames.
# NOTE(review): assumes preprocessing kept every row in its original order -- confirm.
Y_train = english_dataset.label.values
model.fit(X_train, Y_train, epochs = epochs, batch_size = 32, shuffle = True) # Fit model
Y_test = translated_dataset.label.values
# evaluate returns the loss followed by the compiled metric (accuracy).
loss, acc = model.evaluate(X_test, Y_test)
print(loss, round(acc, 3))


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
1.4473208038318208 0.567
