In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import bert
print("TF version: ", tf.__version__)

TF version:  2.3.1


In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from data_preprocessing import remove_stop_words, text_normalization

from keras.models import Model
from keras.layers import Input, Dense, LSTM, Dropout, Flatten, Conv1D, MaxPooling1D
from tensorflow.keras import regularizers

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aalvarez\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aalvarez\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Load datasets

In [3]:
english_dataset = pd.read_csv('../data/Merged/english_dataset.csv')
#english_dataset = pd.read_csv('../data/English_1/small_english_dataset.csv')
translated_dataset = pd.read_csv('../data/Merged/spanish_t_dataset.csv')
print(english_dataset.shape, translated_dataset.shape)

(51233, 2) (2571, 2)


#### Get BERT model from TensorFlow Hub

In [4]:
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
                            trainable = False) # 386.84 MB
bert_layer

<tensorflow_hub.keras_layer.KerasLayer at 0x218a1eac7c0>

In [5]:
FullTokenizer = bert.bert_tokenization.FullTokenizer

vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

#### Utility Functions

In [6]:
def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

def get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))


def get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

In [7]:
def normalize_and_tokenize_data(df, max_length_sequence):
    text_normalization(df) # Normalize text
    remove_stop_words(df, language = 'english', get_tokenize = False) # Remove stop words [and Tokenize texts]
    
    all_sentences = df['text'].values
    all_words = []
    for sent in all_sentences:
        temp = []
        temp.append('[CLS]')
        i = 0
        for w in tokenizer.tokenize(sent):
            i+=1
            if i == (max_length_sequence - 1): break
            temp.append(w)
        temp.append('[SEP]')
        all_words.append(temp)

    return all_words

#### Prepare data

In [8]:
batch_size = 32
max_length_sequence = 50

In [9]:
all_words = normalize_and_tokenize_data(english_dataset, max_length_sequence)

input_ids = np.zeros((len(all_words), max_length_sequence))
input_masks = np.zeros((len(all_words), max_length_sequence))
input_segments = np.zeros((len(all_words), max_length_sequence))

for i in range(len(all_words)):
    input_ids[i,:] = np.array(get_ids(all_words[i], tokenizer, max_length_sequence)).reshape(1,-1)
    input_masks[i,:] = np.array(get_masks(all_words[i], max_length_sequence)).reshape(1,-1)
    input_segments[i,:] = np.array(get_segments(all_words[i], max_length_sequence)).reshape(1,-1)

In [10]:
print(input_ids.shape, input_masks.shape, input_segments.shape)

(51233, 50) (51233, 50) (51233, 50)


In [11]:
print(input_ids[0])
print(input_masks[0])
print(input_segments[0])

[  101.  6221.  8398.  4299.  4841.  3407.  2047.  2095.  2681.  2612.
  2507. 11245.  6716.  5223.  2869.  9841. 21821.  2102.  8275.  2739.
  2865.  2280.  4507.  2265.  2732.  2028.  3105.  2406.  5901.  7502.
  6428. 25670.  2215.  4299.  2814.  6793.  6716.  5223.  2869.  2130.
  9841. 21821.  2102.  8275.  2739.  2865.  3407.  7965.  2047.   102.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]


#### Models

In [12]:
def create_model_CNN(max_length_sequence, filters, kernel_size, dense_units, l2_kernel):
    
    input_word_ids = Input(shape=(max_length_sequence, ), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_length_sequence, ), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_length_sequence, ), dtype=tf.int32, name="segment_ids")
    
    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    
    X = Conv1D(filters = filters, kernel_size = kernel_size, activation = 'relu',
              kernel_regularizer = regularizers.l2(l2_kernel))(sequence_output)
    X = MaxPooling1D(pool_size = 2)(X)
    X = Flatten()(X)
    X = Dense(units = dense_units, activation = 'relu')(X)
    X = Dense(units = 1, activation = 'sigmoid')(X)
                          
    model = Model(inputs = [input_word_ids, input_mask, segment_ids], outputs = X)
                          
    return(model)

#### Execute Models

#### Train and Validation with English Dataset

In [14]:
test_size = 0.2
Y = english_dataset.label.values

input_ids_tr, input_ids_te, input_masks_tr, input_masks_te, input_segments_tr, input_segments_te, y_tr, y_te = train_test_split(
    input_ids, input_masks, input_segments, Y, test_size = test_size, shuffle = True)

In [15]:
print(len(input_ids_tr), len(input_ids_te))
print(len(input_masks_tr), len(input_masks_te))
print(len(input_segments_tr), len(input_segments_te))
print(len(y_tr), len(y_te))

40986 10247
40986 10247
40986 10247
40986 10247


In [16]:
CNN = create_model_CNN(max_length_sequence = max_length_sequence, filters = 16, kernel_size = 10, dense_units = 4, l2_kernel = 0)
CNN.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy']) # Compile model
CNN.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 50)]         0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 50)]         0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]      

In [13]:
epochs = 7

In [18]:
CNN.fit([input_ids_tr, input_masks_tr, input_segments_tr], y_tr, epochs = epochs, batch_size = batch_size, shuffle = True) # Fit model
loss, acc = CNN.evaluate([input_ids_te, input_masks_te, input_segments_te], y_te) # Evaluate model
print(loss, round(acc, 3))
CNN.save_weights('../data/Weights/BERT_CNN.h5')

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
0.20074966549873352 0.957


#### Train with English Dataset and Evaluate with Translated Dataset

In [14]:
input_ids_tr = input_ids
input_masks_tr = input_masks
input_segments_tr = input_segments
y_tr = english_dataset.label.values

In [15]:
all_words = normalize_and_tokenize_data(translated_dataset, max_length_sequence)

input_ids = np.zeros((len(all_words), max_length_sequence))
input_masks = np.zeros((len(all_words), max_length_sequence))
input_segments = np.zeros((len(all_words), max_length_sequence))

for i in range(len(all_words)):
    input_ids[i,:] = np.array(get_ids(all_words[i], tokenizer, max_length_sequence)).reshape(1,-1)
    input_masks[i,:] = np.array(get_masks(all_words[i], max_length_sequence)).reshape(1,-1)
    input_segments[i,:] = np.array(get_segments(all_words[i], max_length_sequence)).reshape(1,-1)

In [16]:
print(input_ids.shape, input_masks.shape, input_segments.shape)

(2571, 50) (2571, 50) (2571, 50)


In [17]:
input_ids_te = input_ids
input_masks_te = input_masks
input_segments_te = input_segments
y_te = translated_dataset.label.values

In [18]:
print(len(input_ids_tr), len(input_ids_te))
print(len(input_masks_tr), len(input_masks_te))
print(len(input_segments_tr), len(input_segments_te))
print(len(y_tr), len(y_te))

51233 2571
51233 2571
51233 2571
51233 2571


In [19]:
CNN = create_model_CNN(max_length_sequence = max_length_sequence, filters = 16, kernel_size = 10, dense_units = 4, l2_kernel = 0)
CNN.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy']) # Compile model

In [20]:
CNN.fit([input_ids_tr, input_masks_tr, input_segments_tr], y_tr, epochs = epochs, batch_size = batch_size, shuffle = True) # Fit model
loss, acc = CNN.evaluate([input_ids_te, input_masks_te, input_segments_te], y_te) # Evaluate model
print(round(loss, 3), round(acc, 3))
CNN.save_weights('../data/Weights/BERT_CNN_Val_TData.h5')

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
3.872 0.53
