In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from data_preprocessing import get_input_RNN

from keras.models import Model
from keras.layers import Input, LSTM, Dense, Activation, Bidirectional
from keras.layers.embeddings import Embedding

from tensorflow.keras import regularizers

In [None]:
def create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units, l2_kernel, l2_bias, l2_activity):
    """Build an (uncompiled) Embedding -> LSTM -> Dense(1) -> sigmoid Keras model.

    Parameters
    ----------
    vocabulary_length : int
        Used as ``input_dim`` of the embedding layer.
    max_length_sequence : int
        Length of each input sequence; the model input has shape
        ``(max_length_sequence,)``.
    emb_dim : int
        Used as ``output_dim`` of the embedding layer.
    lstm_units : int
        Number of units of the LSTM layer.
    l2_kernel, l2_bias, l2_activity : float
        L2 factors applied to the LSTM kernel, bias and output activity.

    Returns
    -------
    Model
        Functional model mapping the sequence input to a single sigmoid output.
        The caller is responsible for compiling it.
    """
    sequence_input = Input(shape = (max_length_sequence, ))

    # The embedding is trained with the rest of the network; mask_zero = True
    # asks Keras to mask positions holding index 0.
    embedded = Embedding(
        input_dim = vocabulary_length,
        output_dim = emb_dim,
        trainable = True,
        mask_zero = True,
    )(sequence_input)

    # return_sequences = False: only the last LSTM output is passed on.
    recurrent_layer = LSTM(
        units = lstm_units,
        return_sequences = False,
        kernel_regularizer = regularizers.l2(l2_kernel),
        bias_regularizer = regularizers.l2(l2_bias),
        activity_regularizer = regularizers.l2(l2_activity),
    )
    encoded = recurrent_layer(embedded)

    # Single linear unit followed by a separate sigmoid activation layer.
    score = Dense(units = 1)(encoded)
    probability = Activation('sigmoid')(score)

    return Model(inputs = sequence_input, outputs = probability)

In [None]:
# Optional sanity check (kept disabled): uncomment to build one sample model and print its layer summary.
#model = create_model(10000, 500, 100, lstm_units = 64, l2_kernel = 0.01, l2_bias = 0.01, l2_activity = 0.01)
#model.summary()

In [None]:
def grid_search(X, dataset, epochs, vocabulary_length, max_length_sequence, emb_dim, 
                lstm_units, l2_kernel, l2_bias, l2_activity,
                n_splits = 5, test_size = 0.2, dev_size = 0.2, batch_size = 32, random_state = None):
    """Pick the LSTM size with the best mean dev accuracy, then score it on a held-out test split.

    The data is first split into train/test. For every candidate in
    ``lstm_units`` a fresh model is trained and evaluated on ``n_splits``
    random train/dev splits of the training part. The candidate with the
    highest mean dev accuracy is retrained on the whole training part and
    evaluated once on the test part.

    Parameters
    ----------
    X : array-like
        Model inputs; must support indexing with integer index arrays.
    dataset : object with a ``label`` attribute exposing ``.values``
        Source of the target labels (``dataset.label.values``).
    epochs : int
        Number of training epochs for every fit.
    vocabulary_length, max_length_sequence, emb_dim : int
        Forwarded unchanged to ``create_model``.
    lstm_units : sequence of int
        Candidate LSTM sizes to compare; must not be empty.
    l2_kernel, l2_bias, l2_activity : float
        Forwarded unchanged to ``create_model``.
    n_splits : int, default 5
        Number of random train/dev splits per candidate.
    test_size : float, default 0.2
        Fraction of the data held out as the final test set.
    dev_size : float, default 0.2
        Fraction of the training part used as dev set in each split.
    batch_size : int, default 32
        Batch size used for every fit.
    random_state : int or None, default None
        Seed passed to ``train_test_split`` and ``ShuffleSplit``. It only makes
        the data splits repeatable (and identical for every candidate); Keras
        weight initialisation and batch shuffling are not seeded here.

    Returns
    -------
    tuple
        ``(best_lstm_units, test_accuracy)`` where the accuracy is rounded to
        3 decimals.

    Raises
    ------
    ValueError
        If ``lstm_units`` is empty.
    """
    n_candidates = np.size(lstm_units)
    if n_candidates == 0:
        # Fail early with a clear message instead of an opaque argmax error after the split.
        raise ValueError('lstm_units must contain at least one candidate value')

    Y = dataset.label.values

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = test_size, shuffle = True,
                                                        random_state = random_state)

    # Unrounded means are kept for model selection; rounding happens only when printing.
    mean_dev_accuracy = np.zeros(n_candidates)

    # With random_state = None every call to split() draws new splits (as before);
    # with a seed, all candidates are compared on the same splits.
    splitter = ShuffleSplit(n_splits = n_splits, test_size = dev_size, random_state = random_state)

    # NOTE(review): a new Keras model is built for every split and never released explicitly;
    # if memory grows across candidates, consider clearing the Keras session between fits.
    for i in range(n_candidates):

        dev_accuracy_shuffle_split = np.zeros(n_splits)

        for j, (train, dev) in enumerate(splitter.split(X_train, y_train)):
            model = create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = lstm_units[i], 
                                 l2_kernel = l2_kernel, l2_bias = l2_bias, l2_activity = l2_activity) # Create model

            model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy']) # Compile model
            model.fit(X_train[train], y_train[train], epochs = epochs, batch_size = batch_size, shuffle = True) # Fit model

            # evaluate() returns [loss, accuracy] because 'accuracy' is the only compiled metric.
            _, accuracy_val = model.evaluate(X_train[dev], y_train[dev]) # Validate model
            dev_accuracy_shuffle_split[j] = accuracy_val

        mean_dev_accuracy[i] = np.mean(dev_accuracy_shuffle_split)
        std = round(np.std(dev_accuracy_shuffle_split), 3)
        print('LSTM: ' + str(lstm_units[i]) +' --> dev_acc: ' + str(round(mean_dev_accuracy[i], 3)) + ' +- ' + str(std))

    # test
    best_model_index = np.argmax(mean_dev_accuracy)

    best_model = create_model(vocabulary_length, max_length_sequence, emb_dim, lstm_units = lstm_units[best_model_index], 
                                 l2_kernel = l2_kernel, l2_bias = l2_bias, l2_activity = l2_activity) # Create best model

    best_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy']) # Compile best model
    best_model.fit(X_train, y_train, epochs = epochs, batch_size = batch_size, shuffle = True) # Fit best model

    # The model ends in a single sigmoid unit, so predict() yields one probability per sample
    # in a column; threshold at 0.5 and flatten so the labels line up with the 1-D y_test.
    y_prob = np.asarray(best_model.predict(X_test)) # Test best model
    y_pred = (y_prob > 0.5).astype(int).ravel()
    accuracy_test = accuracy_score(y_test, y_pred)

    return lstm_units[best_model_index], round(accuracy_test, 3)

In [None]:
# Read the merged Spanish dataset from its CSV file (path is relative to the notebook);
# the bare name on the last line makes the notebook display the loaded frame.
dataset = pd.read_csv(filepath_or_buffer = '../data/Merged/spanish_dataset.csv')
dataset

#### Models

In [None]:
# Settings shared by the preprocessing and grid-search cells below.
language = 'spanish'                                 # forwarded to get_input_RNN
vocabulary_length, max_length_sequence = 10000, 500  # forwarded to get_input_RNN and grid_search
emb_dim = 100                                        # embedding output dimension
epochs = 5                                           # training epochs for every fit

In [None]:
# Candidate LSTM sizes compared by grid_search.
lstm_units = [16, 32, 64]

# The same L2 factor is used for the LSTM kernel, bias and activity regularizers.
l2_kernel = l2_bias = l2_activity = 0.1

In [None]:
# Turn the raw dataset into model inputs with the project helper. `X` is the input
# array and `df` the accompanying frame; grid_search later reads its `label` column.
X, df = get_input_RNN(
    dataset,
    stemming = False,
    remove_stopwords = True,
    vocabulary_length = vocabulary_length,
    max_length_sequence = max_length_sequence,
    language = language,
)
print(X.shape)

In [None]:
# Run the search over the candidate LSTM sizes and report the result.
# Note: `best_model` holds the selected number of LSTM units, not a Keras model object.
best_model, test_acc = grid_search(
    X, df, epochs, vocabulary_length, max_length_sequence, emb_dim,
    lstm_units = lstm_units,
    l2_kernel = l2_kernel,
    l2_bias = l2_bias,
    l2_activity = l2_activity,
)
print('LSTM :' + str(best_model), 'test_acc: ' + str(test_acc))