In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from data_preprocessing import remove_stop_words, text_normalization

In [3]:
import sys
if 'google.colab' in sys.modules:
    print ("setting tensorflow version in colab")
    %tensorflow_version 1.x
import tensorflow as tf
tf.__version__

'1.15.0'

In [4]:
import tensorflow_hub as hub
from tensorflow.keras.preprocessing.text import Tokenizer

from keras.models import Model
from keras.layers import Input, Lambda, Dense, Activation
from keras.layers import LSTM, Dropout
from keras.layers import Flatten, Conv1D, MaxPooling1D
from tensorflow.keras import regularizers

from keras import backend as K

Using TensorFlow backend.


In [5]:
# Load the two labelled corpora; later cells read their `text` and `label` columns.
# NOTE(review): `translated_dataset` is presumably the Spanish corpus translated
# into English (file name `spanish_t_dataset.csv`) -- confirm.
english_dataset= pd.read_csv('../data/Merged/english_dataset.csv')
translated_dataset = pd.read_csv('../data/Merged/spanish_t_dataset.csv')
print(english_dataset.shape, translated_dataset.shape)

(51233, 2) (2571, 2)


In [6]:
def get_X(df, max_length = None, lang = None, pad_token = "PADword"):
    """Normalize, tokenize and pad the `text` column of `df` to fixed-length lists.

    `df` is handed to `text_normalization` and `remove_stop_words`, whose return
    values are discarded -- they presumably rewrite `df.text` in place (TODO
    confirm in data_preprocessing), so the caller's frame is modified.

    Parameters
    ----------
    df : DataFrame with a `text` column.
    max_length : int, optional
        Length of every returned sequence. Defaults to the notebook-level
        `max_length_sequence`.
    lang : str, optional
        Language forwarded to `remove_stop_words`. Defaults to the
        notebook-level `language`.
    pad_token : str
        Filler appended to sequences shorter than `max_length`.

    Returns
    -------
    list of list
        One list of exactly `max_length` items per row of `df.text`: the first
        `max_length` items of the row followed by `pad_token` fillers.
    """
    # Fall back to the notebook configuration so existing `get_X(df)` calls keep working.
    if max_length is None:
        max_length = max_length_sequence
    if lang is None:
        lang = language

    text_normalization(df) # Normalize text
    remove_stop_words(df, lang, get_tokenize = True) # Remove stop words [and Tokenize texts]

    # Padding text
    new_X = []
    for seq in df.text:
        try:
            tokens = list(seq[:max_length])
        except TypeError:
            # Row is not a sequence (e.g. a NaN cell): emit padding only, which is
            # what the previous bare `except:` produced -- but without hiding
            # unrelated errors such as KeyboardInterrupt.
            tokens = []
        tokens.extend([pad_token] * (max_length - len(tokens)))
        new_X.append(tokens)

    return new_X

In [7]:
# Preprocessing configuration read by get_X (and by the model-building cells).
max_length_sequence = 50  # number of tokens every sequence is truncated / padded to
language = 'english'  # language passed to remove_stop_words

In [7]:
# Padded token sequences first (english, then translated), labels afterwards.
X_english, X_translated = [get_X(frame) for frame in (english_dataset, translated_dataset)]
Y_english, Y_translated = [frame.label.values for frame in (english_dataset, translated_dataset)]

In [8]:
# Sanity check: number of samples and tokens per sample for each corpus.
print(*map(len, (X_english, X_english[0])))
print(*map(len, (X_translated, X_translated[0])))

51233 50
2571 50


#### Get ELMo from TensorFlow Hub

In [8]:
# Instantiate the ELMo v3 module from TensorFlow Hub once; ElmoEmbedding calls it
# inside the Keras Lambda layers. trainable = False: the module is not fine-tuned.
elmo_model = hub.Module("https://tfhub.dev/google/elmo/3", trainable = False)
elmo_model

<tensorflow_hub.module.Module at 0x2df764f3b00>

#### Models

In [9]:
# Mini-batch size passed to fit(); fit_batch_size() trims datasets to a multiple of it.
batch_size = 32

In [10]:
def ElmoEmbedding(x):
    """Embed a batch of padded token sequences with the shared ELMo module.

    Parameters
    ----------
    x : string tensor of tokens, one row per sample; the model builders feed it
        with shape (batch, max_length_sequence).

    Returns
    -------
    The module's "elmo" output; the Lambda layers that wrap this function
    declare it as (max_length_sequence, 1024) per sample.
    """
    tokens = tf.cast(x, tf.string)
    # Build one length entry per row that is actually in the batch. The previous
    # constant `batch_size*[max_length_sequence]` only worked when every batch
    # held exactly `batch_size` rows, and the axis-less tf.squeeze would have
    # collapsed a batch containing a single sample.
    sequence_len = tf.fill(tf.shape(tokens)[:1], max_length_sequence)
    return elmo_model(inputs={
                            "tokens": tokens,
                            "sequence_len": sequence_len
                      },
                      signature="tokens",
                      as_dict=True)["elmo"]

In [22]:
def create_model_RNN(max_length_sequence, lstm_units, l2_kernel, l2_recurrent, l2_activity, dropout):
    """Build (without compiling) the ELMo -> LSTM -> Dropout -> sigmoid classifier.

    Parameters
    ----------
    max_length_sequence : tokens per input sample. ElmoEmbedding reads the
        notebook-level variable of the same name, so the two must agree.
    lstm_units : size of the LSTM state.
    l2_kernel, l2_recurrent, l2_activity : L2 factors for the LSTM kernel,
        recurrent kernel and output activity.
    dropout : dropout rate applied to the LSTM output.

    Returns
    -------
    keras Model mapping string tokens to a single sigmoid unit.
    """
    tokens_in = Input(shape = (max_length_sequence, ), dtype = tf.string)
    elmo_vectors = Lambda(ElmoEmbedding, output_shape = (max_length_sequence, 1024))(tokens_in)

    # Only the final LSTM state is kept (return_sequences = False).
    recurrent = LSTM(
        units = lstm_units,
        return_sequences = False,
        kernel_regularizer = regularizers.l2(l2_kernel),
        recurrent_regularizer = regularizers.l2(l2_recurrent),
        activity_regularizer = regularizers.l2(l2_activity),
    )
    hidden = recurrent(elmo_vectors)
    hidden = Dropout(rate = dropout)(hidden)
    prediction = Dense(units = 1, activation = 'sigmoid')(hidden)

    return Model(inputs = tokens_in, outputs = prediction)

In [11]:
def create_model_CNN(max_length_sequence, filters, kernel_size, dense_units, l2_kernel):
    """Build (without compiling) the ELMo -> Conv1D -> MaxPool -> Dense classifier.

    Parameters
    ----------
    max_length_sequence : tokens per input sample. ElmoEmbedding reads the
        notebook-level variable of the same name, so the two must agree.
    filters, kernel_size : Conv1D configuration.
    dense_units : width of the hidden dense layer.
    l2_kernel : L2 factor applied to the convolution kernel.

    Returns
    -------
    keras Model mapping string tokens to a single sigmoid unit.
    """
    tokens_in = Input(shape = (max_length_sequence, ), dtype = tf.string)
    elmo_vectors = Lambda(ElmoEmbedding, output_shape = (max_length_sequence, 1024))(tokens_in)

    convolution = Conv1D(
        filters = filters,
        kernel_size = kernel_size,
        activation = 'relu',
        kernel_regularizer = regularizers.l2(l2_kernel),
    )
    features = convolution(elmo_vectors)
    features = MaxPooling1D(pool_size = 2)(features)
    features = Flatten()(features)
    features = Dense(units = dense_units, activation = 'relu')(features)
    prediction = Dense(units = 1, activation = 'sigmoid')(features)

    return Model(inputs = tokens_in, outputs = prediction)

#### Execute Models

#### Train and Validate on the English Dataset

In [12]:
def fit_batch_size(X_train, Y_train, X_test, Y_test, multiple_of = None):
    """Trim the train and test sets so each holds a whole number of batches.

    Trailing samples that do not fill a complete batch are dropped. The length
    of each X decides how many items are kept from that X and its matching Y.

    Parameters
    ----------
    X_train, Y_train, X_test, Y_test : sliceable sequences (lists or arrays).
    multiple_of : int, optional
        Batch size to align to. Defaults to the notebook-level `batch_size`.

    Returns
    -------
    tuple
        (X_train, X_test, Y_train, Y_test) -- note that this order differs from
        the parameter order.

    Raises
    ------
    ValueError
        If the batch size is smaller than 1.
    """
    if multiple_of is None:
        multiple_of = batch_size  # notebook-level configuration
    if multiple_of < 1:
        raise ValueError("batch size must be a positive integer, got %r" % (multiple_of,))

    def _trim(X, Y):
        # Floor division keeps the arithmetic in integers (no float round trip).
        keep = (len(X) // multiple_of) * multiple_of
        return X[:keep], Y[:keep]

    X_train, Y_train = _trim(X_train, Y_train)
    X_test, Y_test = _trim(X_test, Y_test)

    return (X_train, X_test, Y_train, Y_test)

In [23]:
epochs = 7
test_size = 0.2
# Fixed seed so the shuffled 80/20 split is the same on every Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_english, Y_english, test_size = test_size, shuffle = True, random_state = 42)

In [42]:
# Drop the trailing samples that do not fill a whole batch (returns X, X, Y, Y).
X_train, X_test, Y_train, Y_test = fit_batch_size(
    X_train = X_train, Y_train = Y_train, X_test = X_test, Y_test = Y_test)
print(*map(len, (X_train, Y_train, X_test, Y_test)))

40960 40960 10240 10240


RNN

In [26]:
with tf.Session() as session:
    # Hand the session to Keras and initialise the variables / lookup tables that
    # already exist in the graph before the model is built.
    K.set_session(session)
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    RNN = create_model_RNN(max_length_sequence = max_length_sequence, lstm_units = 8, l2_kernel = 0, l2_recurrent = 0, l2_activity = 0, dropout = 0.5)
    RNN.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy']) # Compile model
    RNN.summary()
    RNN.fit(np.array(X_train), Y_train, epochs = epochs, batch_size = batch_size, shuffle = True) # Fit model
    # Pass batch_size explicitly: evaluate() otherwise falls back to the Keras
    # default, which matches the configured value only by coincidence.
    loss, acc = RNN.evaluate(np.array(X_test), Y_test, batch_size = batch_size) # Evaluate model
    print(loss, round(acc, 3))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 50)                0         
_________________________________________________________________
lambda_3 (Lambda)            (None, 50, 1024)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 8)                 33056     
_________________________________________________________________
dropout_1 (Dropout)          (None, 8)                 0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 9         
Total params: 33,065
Trainable params: 33,065
Non-trainable params: 0
_________________________________________________________________
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
0.08423896026215516 0.973


CNN

In [17]:
# Number of training epochs used by the CNN cell below.
epochs = 5

In [18]:
with tf.Session() as session:
    # Hand the session to Keras and initialise the variables / lookup tables that
    # already exist in the graph before the model is built.
    K.set_session(session)
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    CNN = create_model_CNN(max_length_sequence = max_length_sequence, filters = 16, kernel_size = 10, dense_units = 4, l2_kernel = 0)
    CNN.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy']) # Compile model
    CNN.summary()
    CNN.fit(np.array(X_train), Y_train, epochs = epochs, batch_size = batch_size, shuffle = True) # Fit model
    # Pass batch_size explicitly: evaluate() otherwise falls back to the Keras
    # default, which matches the configured value only by coincidence.
    loss, acc = CNN.evaluate(np.array(X_test), Y_test, batch_size = batch_size) # Evaluate model
    print(loss, round(acc, 3))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.








Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 50)                0         
_________________________________________________________________
lambda_1 (Lambda)            (None, 50, 1024)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 41, 16)            163856    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 20, 16)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 320)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 1284      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 5   










Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.1422757276566699 0.957


#### Train on the English Dataset, Validate on the Translated Dataset

In [13]:
# Number of training epochs used by the training cells that follow.
epochs = 7

In [None]:
# Train on the full English corpus, evaluate on the full translated corpus.
X_train, Y_train = X_english, Y_english
X_test, Y_test = X_translated, Y_translated
print(*map(len, (X_train, Y_train, X_test, Y_test)))

In [45]:
# Drop the trailing samples that do not fill a whole batch (returns X, X, Y, Y).
X_train, X_test, Y_train, Y_test = fit_batch_size(
    X_train = X_train, Y_train = Y_train, X_test = X_test, Y_test = Y_test)
print(*map(len, (X_train, Y_train, X_test, Y_test)))

51232 51232 2560 2560


In [46]:
with tf.Session() as session:
    # Hand the session to Keras and initialise the variables / lookup tables that
    # already exist in the graph before the model is built.
    K.set_session(session)
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    CNN = create_model_CNN(max_length_sequence = max_length_sequence, filters = 16, kernel_size = 10, dense_units = 4, l2_kernel = 0)
    CNN.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy']) # Compile model
    CNN.summary()
    CNN.fit(np.array(X_train), Y_train, epochs = epochs, batch_size = batch_size, shuffle = True) # Fit model
    # Pass batch_size explicitly: evaluate() otherwise falls back to the Keras
    # default, which matches the configured value only by coincidence.
    loss, acc = CNN.evaluate(np.array(X_test), Y_test, batch_size = batch_size) # Evaluate model
    print(round(loss, 3), round(acc, 3))
    # Weights are written while the session that holds them is still open.
    CNN.save_weights('../data/Weights/ELMo_CNN.h5')

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 50)                0         
_________________________________________________________________
lambda_4 (Lambda)            (None, 50, 1024)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 41, 16)            163856    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 20, 16)            0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 320)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 4)                 1284      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 5   

#### Train on the English Dataset plus 2500 Translated Samples, Validate on the Remaining Translated Samples

In [15]:
def add_data_portion(data_1, data_2, n_samples, random_state = None):
    """Move `n_samples` randomly chosen rows of `data_2` onto the end of `data_1`.

    Neither input frame is modified; both results are new frames with a fresh
    0..n-1 index.

    Parameters
    ----------
    data_1 : DataFrame that receives the sampled rows.
    data_2 : DataFrame the rows are sampled from (index labels are assumed to
        be unique, since the remainder is selected by index label).
    n_samples : int
        Number of rows to sample from `data_2` without replacement.
    random_state : optional
        Forwarded to `DataFrame.sample`; pass a fixed value for a reproducible
        split. The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    (df_extended, df_rest)
        `df_extended` is `data_1` followed by the sampled rows; `df_rest` holds
        the rows of `data_2` that were not sampled.
    """
    df_slice = data_2.sample(n_samples, random_state = random_state)

    # reset_index(drop = True) returns a new frame instead of mutating a slice of
    # data_2 in place and deleting the helper 'index' column afterwards.
    df_rest = data_2.loc[~data_2.index.isin(df_slice.index)].reset_index(drop = True)

    # pd.concat replaces DataFrame.append, which newer pandas releases removed;
    # ignore_index = True already renumbers the rows.
    df_extended = pd.concat([data_1, df_slice], ignore_index = True)

    return df_extended, df_rest

In [16]:
# Read both corpora from disk again for the mixed-data experiment.
# NOTE(review): presumably reloaded because the earlier get_X calls modified the
# frames loaded at the top of the notebook -- confirm.
english_dataset= pd.read_csv('../data/Merged/english_dataset.csv')
translated_dataset = pd.read_csv('../data/Merged/spanish_t_dataset.csv')
print(english_dataset.shape, translated_dataset.shape)

(51233, 2) (2571, 2)


In [17]:
# Move 2500 translated samples into the English training set; the translated
# rows that are left over become the test set.
train_dataset, test_dataset = add_data_portion(
    data_1 = english_dataset,
    data_2 = translated_dataset,
    n_samples = 2500,
)
print(train_dataset.shape, test_dataset.shape)

(53733, 2) (71, 2)


In [18]:
# Padded token sequences first (train, then test), labels afterwards.
X_train, X_test = [get_X(frame) for frame in (train_dataset, test_dataset)]
Y_train, Y_test = [frame.label.values for frame in (train_dataset, test_dataset)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply((lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x)))


In [19]:
# Sanity check: number of samples and tokens per sample for each split.
print(*map(len, (X_train, X_train[0])))
print(*map(len, (X_test, X_test[0])))

53733 50
71 50


In [20]:
# Drop the trailing samples that do not fill a whole batch (returns X, X, Y, Y).
X_train, X_test, Y_train, Y_test = fit_batch_size(
    X_train = X_train, Y_train = Y_train, X_test = X_test, Y_test = Y_test)
print(*map(len, (X_train, Y_train, X_test, Y_test)))

53728 53728 64 64


In [21]:
with tf.Session() as session:
    # Hand the session to Keras and initialise the variables / lookup tables that
    # already exist in the graph before the model is built.
    K.set_session(session)
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    CNN = create_model_CNN(max_length_sequence = max_length_sequence, filters = 16, kernel_size = 10, dense_units = 4, l2_kernel = 0)
    CNN.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy']) # Compile model
    CNN.summary()
    CNN.fit(np.array(X_train), Y_train, epochs = epochs, batch_size = batch_size, shuffle = True) # Fit model
    # Pass batch_size explicitly: evaluate() otherwise falls back to the Keras
    # default, which matches the configured value only by coincidence.
    loss, acc = CNN.evaluate(np.array(X_test), Y_test, batch_size = batch_size) # Evaluate model
    print(round(loss, 3), round(acc, 3))
    # Weights are written while the session that holds them is still open.
    CNN.save_weights('../data/Weights/ELMo_CNN_Plus_2500.h5')

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.








Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 50)                0         
_________________________________________________________________
lambda_1 (Lambda)            (None, 50, 1024)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 41, 16)            163856    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 20, 16)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 320)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 1284      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 5   










Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
1.283 0.703
