In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from data_preprocessing import get_input_RNN

from keras.models import Model
from keras.layers import Input, LSTM, Dense, Activation, Bidirectional
from keras.layers.embeddings import Embedding

#from keras.wrappers.scikit_learn import KerasClassifier

In [16]:
def create_model(vocab_length, max_length_sequence, emb_dim, bidirectional):
    """Build an (optionally bidirectional) LSTM network with a sigmoid output.

    Layer stack: Input -> trainable Embedding (mask_zero enabled) ->
    128-unit LSTM that returns only its last output -> Dense(1) -> sigmoid.

    Arguments:
        vocab_length: value passed as the Embedding layer's ``input_dim``.
        max_length_sequence: length of each input sequence (Input shape).
        emb_dim: value passed as the Embedding layer's ``output_dim``.
        bidirectional: when truthy, the LSTM is wrapped in ``Bidirectional``.

    Returns:
        The assembled Keras ``Model``. It is NOT compiled here.
    """
    token_ids = Input(shape = (max_length_sequence, ))
    embedded = Embedding(
        input_dim = vocab_length,
        output_dim = emb_dim,
        trainable = True,
        mask_zero = True,
    )(token_ids)

    # Create the recurrent layer once and wrap it only when requested.
    recurrent = LSTM(units = 128, return_sequences = False)
    if bidirectional:
        recurrent = Bidirectional(recurrent)
    encoded = recurrent(embedded)

    # Single output unit squashed to (0, 1) by a separate sigmoid layer.
    logit = Dense(units = 1)(encoded)
    probability = Activation('sigmoid')(logit)

    return Model(inputs = token_ids, outputs = probability)

In [17]:
def execute_model(model, X, dataset, epochs, random_state = None):
    """Train and score `model` on repeated random train/dev splits.

    20% of the samples are first set aside as a test set. The remaining 80%
    is then re-split 5 times (ShuffleSplit, 20% dev each time). For every
    split the model is restored to the weights it had when this function was
    called and is recompiled (fresh optimizer), so that each split is trained
    independently. Without that reset, training would accumulate across
    splits and rows used for validation in one split would already have been
    seen as training rows in an earlier one.

    Parameters
    ----------
    model : Keras-style model
        Must provide compile / fit / evaluate / predict / get_weights /
        set_weights. It is left trained on the last split only.
    X : array-like
        Model inputs; must support indexing with integer index arrays and
        have one row per entry of ``dataset.label``.
    dataset : object with a ``label`` attribute
        ``dataset.label.values`` is used as the target vector.
    epochs : int
        Number of epochs used for every fit.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split`` and ``ShuffleSplit``. The
        default (None) keeps the original unseeded behaviour.

    Returns
    -------
    tuple of float
        (mean dev accuracy, std of dev accuracy, max test accuracy) over the
        splits. Per-split accuracies are rounded to 3 decimals before being
        aggregated.
        NOTE(review): the third value is the *maximum* over splits, which is
        an optimistic estimate; kept as-is for backward compatibility.
    """
    Y = dataset.label.values

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size = 0.2, shuffle = True, random_state = random_state)

    iterations = 5
    dev_accuracy = np.zeros(iterations)
    test_accuracy = np.zeros(iterations)
    splitter = ShuffleSplit(n_splits = iterations, test_size = 0.2, random_state = random_state)

    # Snapshot of the untrained weights, restored at the start of every split.
    initial_weights = model.get_weights()

    for j, (train, dev) in enumerate(splitter.split(X_train, y_train)):
        # Start every split from the same state: original weights and a
        # freshly created optimizer (compile with a string builds a new one).
        model.set_weights(initial_weights)
        model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

        Xtrain, Ytrain = X_train[train], y_train[train]
        Xdev, Ydev = X_train[dev], y_train[dev]

        model.fit(Xtrain, Ytrain, epochs = epochs, batch_size = 32, shuffle = True) # Fit model

        loss, accuracy_val = model.evaluate(Xdev, Ydev) # Validate model
        dev_accuracy[j] = round(accuracy_val, 3)

        # Threshold the sigmoid outputs at 0.5 and flatten the (n, 1)
        # prediction column so it has the same 1-D shape as y_test.
        y_pred = (np.asarray(model.predict(X_test)) > 0.5).ravel().astype(int)
        accuracy_test = accuracy_score(y_test, y_pred)
        test_accuracy[j] = round(accuracy_test, 3)

    return np.mean(dev_accuracy), np.std(dev_accuracy), np.max(test_accuracy)

In [18]:
# Read the merged Spanish corpus and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer = '../data/Merged/spanish_dataset.csv')
dataset.head(n = 5)

Unnamed: 0,text,label
0,"RAE INCLUIRÁ LA PALABRA ""LADY"" EN EL DICCIONAR...",1
1,"La palabra ""haiga"", aceptada por la RAE La Rea...",1
2,YORDI ROSADO ESCRIBIRÁ Y DISEÑARÁ LOS NUEVOS L...,1
3,UNAM capacitará a maestros para aprobar prueba...,0
4,Alerta: pretenden aprobar libros escolares con...,1


In [19]:
# Optional sanity check (disabled): uncomment to build a single model and inspect its layers.
#model = create_model(vocab_length = 40000, max_length_sequence = 2900, emb_dim = 50, bidirectional = True)
#model.summary()

In [20]:
# Settings shared by every run; they are forwarded unchanged to
# create_model / get_input_RNN / execute_model below.
vocabulary_length = 40000
max_length_sequence = 2900
emb_dim = 50
language = 'spanish'
epochs = 10

# One entry per run: whether the LSTM is bidirectional, and whether stemming
# and stop-word removal are applied (both are toggled by the same flag).
bidirectional = [False, False, True, True]
stemming_stopwords = [False, True, False, True]

for i in range(len(bidirectional)):
    # A new model is created for every configuration.
    model = create_model(vocab_length = vocabulary_length, max_length_sequence = max_length_sequence,
                         emb_dim = emb_dim, bidirectional = bidirectional[i])

    X, df = get_input_RNN(dataset, stemming = stemming_stopwords[i], remove_stopwords = stemming_stopwords[i],
                           vocabulary_length = vocabulary_length, max_length_sequence = max_length_sequence, language = language)

    dev_accuracy, std_dev_accuracy, test_accuracy = execute_model(model, X, df, epochs)
    # Report: bidirectional flag, preprocessing flag, mean dev accuracy,
    # std of dev accuracy, test accuracy. (The std was previously computed but
    # the mean was printed twice in its place.)
    print(bidirectional[i], 'Stemming_Remove_StopWords: ' + str(stemming_stopwords[i]), dev_accuracy, std_dev_accuracy, test_accuracy)
    print('\n')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

KeyboardInterrupt: 