In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from data_preprocessing import get_input_RNN

from keras.models import Model
from keras.layers import Input, LSTM, Dense, Activation, Bidirectional
from keras.layers.embeddings import Embedding

#from keras.wrappers.scikit_learn import KerasClassifier

In [3]:
def create_model_one_hidden_layer(vocab_length, max_length_sequence, emb_dim, bidirectional, lstm_units = 128):
    """Build an uncompiled binary classifier with a single recurrent layer.

    Architecture: integer token input -> trainable embedding (index 0 is
    masked) -> LSTM, optionally wrapped in Bidirectional, returning only its
    final output -> Dense(1) -> sigmoid.

    Parameters
    ----------
    vocab_length : int
        Passed as ``input_dim`` of the embedding layer.
        NOTE(review): with ``mask_zero = True`` index 0 is reserved for
        padding, so token ids are presumably expected in
        ``1 .. vocab_length - 1`` -- confirm against the preprocessing code.
    max_length_sequence : int
        Length of every (padded) input sequence.
    emb_dim : int
        Dimensionality of the learned word embeddings.
    bidirectional : bool
        If true, the LSTM is wrapped in a ``Bidirectional`` layer.
    lstm_units : int, optional
        Number of units of the LSTM layer. Defaults to 128, the value that
        was previously hard-coded.

    Returns
    -------
    keras.models.Model
        The model; the caller is responsible for compiling it.
    """
    X_input = Input(shape = (max_length_sequence, ))
    embedding_layer = Embedding(input_dim = vocab_length, output_dim = emb_dim,
                                trainable = True, mask_zero = True)(X_input)

    # Build the recurrent layer once and only wrap it when requested, so the
    # LSTM configuration cannot drift between the two variants.
    recurrent_layer = LSTM(units = lstm_units, return_sequences = False)
    if bidirectional:
        recurrent_layer = Bidirectional(recurrent_layer)

    X = recurrent_layer(embedding_layer)
    X = Dense(units = 1)(X)
    X = Activation('sigmoid')(X)

    return Model(inputs = X_input, outputs = X)

In [4]:
def execute_model(X, dataset, epochs, vocabulary_length, max_length_sequence, emb_dim, bidirectional,
                  iterations = 5, random_state = None):
    """Train and evaluate the one-hidden-layer recurrent classifier.

    The data is split 80/20 into train/test. The train part is then split
    ``iterations`` times (shuffle-split, 80/20) into train/dev; a fresh model
    is trained on every split and scored on its dev part. The model with the
    highest dev accuracy (the first one in case of ties) is finally evaluated
    on the held-out test part.

    Parameters
    ----------
    X : array-like
        Model inputs; must support integer-array indexing (``X[indices]``).
    dataset : object with a ``label`` attribute exposing ``.values``
        Source of the binary target vector.
    epochs : int
        Number of training epochs for every model.
    vocabulary_length, max_length_sequence, emb_dim, bidirectional
        Forwarded unchanged to ``create_model_one_hidden_layer``.
    iterations : int, optional
        Number of train/dev shuffle splits (default 5). Must be >= 1.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split`` and ``ShuffleSplit`` so the splits
        can be reproduced. It does not seed the Keras weight initialisation
        or batch shuffling. The default ``None`` keeps the previous,
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(mean dev accuracy, std of dev accuracy, test accuracy)``, each
        rounded to three decimals.

    Raises
    ------
    ValueError
        If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError('iterations must be >= 1, got ' + str(iterations))

    Y = dataset.label.values

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, shuffle = True,
                                                        random_state = random_state)

    dev_accuracy = np.zeros(iterations)
    shuffle = ShuffleSplit(n_splits = iterations, test_size = 0.2, random_state = random_state)

    # Only the best model is kept referenced; the others are not needed once
    # their dev accuracy has been recorded.
    best_model = None
    best_accuracy = -np.inf

    for j, (train, dev) in enumerate(shuffle.split(X_train, y_train)):
        Xtrain, Ytrain = X_train[train], y_train[train]
        Xdev, Ydev = X_train[dev], y_train[dev]

        model = create_model_one_hidden_layer(vocabulary_length, max_length_sequence, emb_dim, bidirectional) # Create model
        model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy']) # Compile model
        model.fit(Xtrain, Ytrain, epochs = epochs, batch_size = 32, shuffle = True) # Fit model

        loss, accuracy_val = model.evaluate(Xdev, Ydev) # Validate model
        dev_accuracy[j] = accuracy_val

        # Strict comparison keeps the first model on ties, like np.argmax.
        if best_model is None or accuracy_val > best_accuracy:
            best_model = model
            best_accuracy = accuracy_val

    y_pred = best_model.predict(X_test) # Test best model
    # Threshold the sigmoid output and flatten the (n, 1) column so it has the
    # same 1-D shape as y_test.
    y_pred = (y_pred > 0.5).ravel()
    accuracy_test = accuracy_score(y_test, y_pred)

    return round(np.mean(dev_accuracy), 3), round(np.std(dev_accuracy), 3), round(accuracy_test, 3)

In [5]:
# Read the merged Spanish dataset and preview its first rows.
dataset = pd.read_csv(filepath_or_buffer = '../data/Merged/spanish_dataset.csv')
dataset.head()

Unnamed: 0,text,label
0,"RAE INCLUIRÁ LA PALABRA ""LADY"" EN EL DICCIONAR...",1
1,"La palabra ""haiga"", aceptada por la RAE La Rea...",1
2,YORDI ROSADO ESCRIBIRÁ Y DISEÑARÁ LOS NUEVOS L...,1
3,UNAM capacitará a maestros para aprobar prueba...,0
4,Alerta: pretenden aprobar libros escolares con...,1


In [6]:
#model = create_model_one_hidden_layer(vocab_length = 40000, max_length_sequence = 2900, emb_dim = 50, bidirectional = True)
#model.summary()

In [None]:
# Settings shared by every experiment in this cell.
language = 'spanish'
vocabulary_length, max_length_sequence, emb_dim = 40000, 2900, 50
epochs = 10

# Paired experiment settings: entry k of each list describes experiment k.
# The second flag switches stemming and stop-word removal on or off together.
bidirectional = [False, False, True, True]
stemming_stopwords = [False, True, False, True]

for use_brnn, preprocess in zip(bidirectional, stemming_stopwords):
    # Rebuild the network input for the current preprocessing setting.
    X, df = get_input_RNN(dataset,
                          stemming = preprocess,
                          remove_stopwords = preprocess,
                          vocabulary_length = vocabulary_length,
                          max_length_sequence = max_length_sequence,
                          language = language)

    dev_accuracy, std_dev_accuracy, test_accuracy = execute_model(
        X, df, epochs, vocabulary_length, max_length_sequence, emb_dim, use_brnn)

    # Report: mean dev accuracy, its standard deviation, test accuracy.
    print('BRNN: ' + str(use_brnn), 'Stemming_Remove_StopWords: ' + str(preprocess), dev_accuracy, std_dev_accuracy, test_accuracy)
    print('\n')

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
BRNN: False Stemming_Remove_StopWords: False 0.706 0.016 0.732


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
BRNN: False Stemming_Remove_StopWords: True 0.71 0.02 0.738


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
BRNN: True Stemming_Remove_StopWords: False 0.717 0.017 0.701


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10