In [20]:
import tensorflow as tf
# Record the TensorFlow version used for this run alongside the results.
print("TF version: ", tf.__version__)

TF version:  2.3.1


In [21]:
from transformers import AutoConfig, AutoModel, TFAutoModel, AutoTokenizer

In [22]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from data_preprocessing import remove_stop_words, text_normalization, add_data_portion

from keras.models import Model
from keras.layers import Input, Dense, LSTM, Dropout, Flatten, Conv1D, MaxPooling1D
from tensorflow.keras import regularizers

#### Load Dataset

In [23]:
# Load the merged Spanish dataset; later cells read its 'text' and 'label' columns.
spanish_dataset = pd.read_csv('../data/Merged/spanish_dataset.csv')
# Quick sanity check of (rows, columns).
print(spanish_dataset.shape)

(2571, 2)


In [24]:
# Fixed number of token positions per example, including [CLS] and [SEP]:
# longer texts are truncated and shorter ones zero-padded to this width.
max_length_sequence = 200

#### Load BETO Model

In [25]:
# Build the model configuration from the local BETO config file.
config = AutoConfig.from_pretrained('../data/bert_beto/config.json')
# Alternative kept for reference: load from the TensorFlow checkpoint index instead.
#model = AutoModel.from_pretrained('../data/bert_beto/model.ckpt-2000000.index', from_tf=True, config=config)
# Load the PyTorch weights file into a TensorFlow model (from_pt=True).
beto = TFAutoModel.from_pretrained('../data/bert_beto/pytorch_model.bin', from_pt=True, config=config)
# Show the resolved configuration and the model object that was created.
print(config, beto)

{
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 31002
}
 <transformers.modeling_tf_bert.TFBertModel object at 0x0000021EBA71FE50>


In [26]:
#tokenizer = AutoTokenizer.from_pretrained('../data/bert_beto/vocab.txt')
# Load the tokenizer from the local BETO vocabulary file.
# NOTE(review): add_special_tokens, max_length and pad_to_max_length look like
# encode-time options; confirm whether they have any effect when passed at load
# time. The helper functions in this notebook add [CLS]/[SEP] and the padding
# manually, so they do not rely on these settings.
tokenizer = AutoTokenizer.from_pretrained('../data/bert_beto/vocab.txt', do_lower_case=True, add_special_tokens=True,
                                          max_length=max_length_sequence, pad_to_max_length=True)
# Display the tokenizer object to confirm which class was instantiated.
tokenizer

<transformers.tokenization_bert.BertTokenizer at 0x21ebbf42e80>

#### Prepare Data

In [27]:
def get_ids(tokens, tokenizer, max_seq_length):
    """Map an already-tokenized sequence to vocabulary ids, padded to a fixed length.

    Args:
        tokens: list of token strings that already contains the special
            tokens (e.g. '[CLS]' ... '[SEP]').
        tokenizer: object exposing ``convert_tokens_to_ids``; its
            ``pad_token_id`` attribute is used for padding when present.
        max_seq_length: length of the returned list.

    Returns:
        A list of ``max_seq_length`` ints: the token ids followed by padding.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length`` (same
            contract as ``get_masks`` / ``get_segments``).
    """
    if len(tokens) > max_seq_length:
        raise IndexError("Token length more than max seq length!")

    # Plain vocabulary lookup. encode() may additionally wrap the sequence in
    # special tokens depending on the library version, which would duplicate
    # the [CLS]/[SEP] already present in `tokens` and overflow max_seq_length.
    token_ids = list(tokenizer.convert_tokens_to_ids(tokens))

    # Id 0 is not guaranteed to be the padding token in every vocabulary, so
    # prefer the tokenizer's own pad id and only fall back to 0 if it has none.
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 0

    return token_ids + [pad_id] * (max_seq_length - len(token_ids))

def get_masks(tokens, max_seq_length):
    """Attention mask: 1 for every real token position, 0 for padding.

    Raises IndexError when ``tokens`` has more entries than ``max_seq_length``.
    """
    n_tokens = len(tokens)
    if n_tokens > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    # Positions below n_tokens hold real tokens; the remainder is padding.
    return [1 if position < n_tokens else 0 for position in range(max_seq_length)]

def get_segments(tokens, max_seq_length):
    """Segment ids: 0 up to and including the first "[SEP]", 1 afterwards, 0 for padding.

    Raises IndexError when ``tokens`` has more entries than ``max_seq_length``.
    """
    n_tokens = len(tokens)
    if n_tokens > max_seq_length:
        raise IndexError("Token length more than max seq length!")

    # Start from an all-zero row so padding positions need no further work.
    segment_ids = [0] * max_seq_length
    passed_separator = False
    for position, piece in enumerate(tokens):
        if passed_separator:
            segment_ids[position] = 1
        elif piece == "[SEP]":
            # The separator itself still belongs to the first segment.
            passed_separator = True
    return segment_ids

In [28]:
def normalize_and_tokenize_data(df, max_length_sequence):
    """Clean the 'text' column of ``df`` and split every text into BERT tokens.

    Each returned entry is '[CLS]' + tokens + '[SEP]', where the tokens are
    cut so that the entry has at most ``max_length_sequence`` elements.
    Tokenization uses the notebook-level ``tokenizer``.

    Returns:
        A list with one token list per row of ``df``.
    """
    # Both helpers are called only for their effect on df (return values are
    # discarded) -- presumably they rewrite df['text'] in place; verify in
    # data_preprocessing.
    text_normalization(df) # Normalize text
    remove_stop_words(df, language = 'spanish', get_tokenize = False) # Remove stop words [and Tokenize texts]

    # Two positions are reserved for the [CLS] and [SEP] markers.
    token_budget = max_length_sequence - 2

    tokenized_texts = []
    for sentence in df['text'].values:
        pieces = tokenizer.tokenize(sentence)
        if token_budget >= 0:
            pieces = pieces[:token_budget]
        tokenized_texts.append(['[CLS]'] + pieces + ['[SEP]'])

    return tokenized_texts

In [29]:
def get_inputs(spanish_dataset, max_length_sequence):
    """Turn the dataset into the three fixed-width arrays fed to the BERT encoder.

    Returns:
        Tuple ``(input_ids, input_masks, input_segments)``, each of shape
        (number of rows, max_length_sequence). Only ``input_ids`` is cast to
        int; the mask and segment arrays keep the float dtype of np.zeros.
    """
    token_lists = normalize_and_tokenize_data(spanish_dataset, max_length_sequence)
    n_rows = len(token_lists)

    input_ids = np.zeros((n_rows, max_length_sequence))
    input_masks = np.zeros((n_rows, max_length_sequence))
    input_segments = np.zeros((n_rows, max_length_sequence))

    # Fill one row per example; every helper returns max_length_sequence values.
    for row, row_tokens in enumerate(token_lists):
        input_ids[row] = get_ids(row_tokens, tokenizer, max_length_sequence)
        input_masks[row] = get_masks(row_tokens, max_length_sequence)
        input_segments[row] = get_segments(row_tokens, max_length_sequence)

    return (input_ids.astype(int), input_masks, input_segments)

In [30]:
# Build the id / mask / segment arrays for the whole dataset in one pass.
input_ids, input_masks, input_segments = get_inputs(spanish_dataset, max_length_sequence)
# All three arrays should share the shape (n_examples, max_length_sequence).
print(input_ids.shape, input_masks.shape, input_segments.shape)

(2571, 200) (2571, 200) (2571, 200)


In [31]:
# Inspect the first encoded example: token ids, attention mask and segment ids.
print(input_ids[0])
print(input_masks[0])
print(input_segments[0])

[    4 29047  6249 30956  3269 10296 18116  6949 10406  1083  4543  1105
  1626  2322  1498 10406 30956  2078  2570  7105 10406  1519 29047  1785
  1130 27832  1637 30957  3180  4898  3120 14825 30957  1935  6249 30956
 17576 10296  2035  3269 18116  6949 10406  1083  1785  1130 24861  1092
 10296  5868 30956  9869  1626  2322  1498  1626  3756  2397  3382 30957
  3269  7288  8173  6949  7035 14327  8636  4293  3603  1688  3469  6949
  7035  8173 10406  1083  8636  5508  3464  1105  1845 11469  5181 17576
 30958 30380 11712  8173  8636  2570  2338  2053  2749  1497 10296 18972
 30956  1665  6949  4500  8168  7035  2652  3758 30956  9869  1626  2322
  1498  9179 30956 10194  2397 25065 30958  1973  1556 30956  5334 10296
 15067 30956 29310  1524 26635 30957 25719  4412  1207  1524  1916  2322
 17576 15903 10799  3269 12630  6949  7035  1894  6584 18116  4002 10406
  1519 13981  1170  3269  3207 14327  8636  9620  8362 30957 27832 12126
  3308  4543  1105  3207  8581 30956 18116 30958 24

#### Models

In [32]:
# Mini-batch size passed to fit() when training the models below.
batch_size = 32

In [62]:
def create_model_CNN(max_length_sequence, filters, kernel_size, dense_units, l2_kernel):
    """Build a binary classifier: frozen BETO encoder -> Conv1D -> max-pool -> dense head.

    Args:
        max_length_sequence: number of token positions in each of the three inputs.
        filters: number of Conv1D filters.
        kernel_size: Conv1D kernel width (in token positions).
        dense_units: width of the hidden Dense layer.
        l2_kernel: L2 regularization factor applied to the Conv1D kernel.

    Returns:
        The uncompiled Keras model taking [input_word_ids, input_mask,
        segment_ids] and producing one sigmoid output per example.

    Note:
        The notebook-level ``beto`` encoder is shared by every model built
        here and is marked non-trainable as a side effect.
    """
    input_word_ids = Input(shape=(max_length_sequence, ), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_length_sequence, ), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_length_sequence, ), dtype=tf.int32, name="segment_ids")

    embedding_layer = beto(input_word_ids, attention_mask=input_mask, token_type_ids=segment_ids)

    # Element 0 of the encoder output is the per-token sequence output; the
    # convolution runs along the token axis.
    X = Conv1D(filters = filters, kernel_size = kernel_size, activation = 'relu',
              kernel_regularizer = regularizers.l2(l2_kernel))(embedding_layer[0])
    X = MaxPooling1D(pool_size = 2)(X)
    X = Flatten()(X)
    X = Dense(units = dense_units, activation = 'relu')(X)
    X = Dense(units = 1, activation = 'sigmoid')(X)

    model = Model(inputs = [input_word_ids, input_mask, segment_ids], outputs = X)

    # Freeze the encoder by reference rather than by position in model.layers,
    # so the right layer is frozen regardless of how Keras orders the layers.
    # Only the convolutional / dense head is trained.
    beto.trainable = False

    return model

In [33]:
def create_model_RNN(max_length_sequence, lstm_units, l2_kernel, l2_recurrent, l2_activity, dropout):
    """Build a binary classifier: frozen BETO encoder -> LSTM -> dropout -> sigmoid.

    Args:
        max_length_sequence: number of token positions in each of the three inputs.
        lstm_units: size of the LSTM state.
        l2_kernel: L2 factor for the LSTM input kernel.
        l2_recurrent: L2 factor for the LSTM recurrent kernel.
        l2_activity: L2 factor for the LSTM output (activity regularizer).
        dropout: dropout rate applied to the LSTM output.

    Returns:
        The uncompiled Keras model taking [input_word_ids, input_mask,
        segment_ids] and producing one sigmoid output per example.

    Note:
        The notebook-level ``beto`` encoder is shared by every model built
        here and is marked non-trainable as a side effect.
    """
    input_word_ids = Input(shape=(max_length_sequence, ), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_length_sequence, ), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_length_sequence, ), dtype=tf.int32, name="segment_ids")

    embedding_layer = beto(input_word_ids, attention_mask=input_mask, token_type_ids=segment_ids)

    # Element 0 of the encoder output is the per-token sequence output; with
    # return_sequences=False the LSTM keeps only its final state.
    X = LSTM(units = lstm_units, return_sequences = False,
            kernel_regularizer = regularizers.l2(l2_kernel),
            recurrent_regularizer = regularizers.l2(l2_recurrent),
            activity_regularizer = regularizers.l2(l2_activity))(embedding_layer[0])

    X = Dropout(rate = dropout)(X)
    X = Dense(units = 1, activation = 'sigmoid')(X)

    model = Model(inputs = [input_word_ids, input_mask, segment_ids], outputs = X)

    # Freeze the encoder by reference rather than by position in model.layers,
    # so the right layer is frozen regardless of how Keras orders the layers.
    # Only the recurrent / dense head is trained.
    beto.trainable = False

    return model

#### Execute Models

In [34]:
# Number of passes over the training set.
epochs = 30

# The id / mask / segment arrays are already padded to a fixed width, so the
# models must be built with exactly that width. Hard-coding a different value
# here (previously 50) makes the Input shapes disagree with the data on a
# fresh Restart & Run All.
max_length_sequence = input_ids.shape[1]

In [35]:
# Fraction of the examples held out for evaluation.
test_size = 0.1
Y = spanish_dataset.label.values

# Split all three input arrays and the labels with the same shuffled indices.
# A fixed random_state keeps the held-out set identical between runs, so the
# reported test metrics are comparable and reproducible.
input_ids_tr, input_ids_te, input_masks_tr, input_masks_te, input_segments_tr, input_segments_te, y_tr, y_te = train_test_split(
    input_ids, input_masks, input_segments, Y, test_size = test_size, shuffle = True,
    random_state = 42)

In [36]:
# Sanity check: every train / test pair must have matching lengths across
# ids, masks, segments and labels.
print(len(input_ids_tr), len(input_ids_te))
print(len(input_masks_tr), len(input_masks_te))
print(len(input_segments_tr), len(input_segments_te))
print(len(y_tr), len(y_te))

2313 258
2313 258
2313 258
2313 258


RNN

In [37]:
# Build the LSTM classifier on top of BETO; all L2 factors are 0 here, so the
# only regularization in this run is the 0.5 dropout.
RNN = create_model_RNN(max_length_sequence = max_length_sequence, lstm_units = 8, l2_kernel = 0, l2_recurrent = 0, l2_activity = 0, dropout = 0.5)
RNN.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy']) # Compile model
# Print the layer table to confirm input shapes and parameter counts.
RNN.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 200)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 200)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 200)]        0                                            
__________________________________________________________________________________________________
tf_bert_model_1 (TFBertModel)   ((None, 200, 768), ( 109850880   input_word_ids[0][0]             
                                                                 input_mask[0][0]      

In [38]:
# Train on the training split only; no validation data is passed to fit().
RNN.fit([input_ids_tr, input_masks_tr, input_segments_tr], y_tr, epochs = epochs, batch_size = batch_size, shuffle = True) # Fit model
# Measure loss and accuracy on the held-out split.
loss, acc = RNN.evaluate([input_ids_te, input_masks_te, input_segments_te], y_te) # Evaluate model
print(loss, round(acc, 3))
# Persist the trained weights (weights only, not the full model).
RNN.save_weights('../data/Weights/BETO_RNN.h5')

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
0.6316079497337341 0.74
