In [None]:
# Colab-only setup: select TensorFlow 1.x before TensorFlow is imported.
# Later cells use TF1-style APIs (tf.Session, hub.Module).
import sys
if 'google.colab' in sys.modules:
    print ("setting tensorflow version in colab")
    # IPython line magic; only valid inside a notebook kernel that provides it.
    %tensorflow_version 1.x
import tensorflow as tf
# Bare last expression: the cell displays the TensorFlow version that was loaded.
tf.__version__

setting tensorflow version in colab
TensorFlow 1.x selected.


'1.15.2'

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from data_preprocessing import remove_stop_words, text_normalization

import tensorflow_hub as hub
from tensorflow.keras.preprocessing.text import Tokenizer

from keras.models import Model
from keras.layers import Input, Lambda, Dense, Activation
from keras.layers import LSTM, Dropout
from keras.layers import Flatten, Conv1D, MaxPooling1D
from tensorflow.keras import regularizers

from keras import backend as K

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Using TensorFlow backend.


In [None]:
# Load the two corpora from CSV files located in the working directory.
# Later cells read the `text` and `label` columns of both frames.
english_dataset = pd.read_csv('small_english_dataset.csv')
translated_dataset = pd.read_csv('spanish_t_dataset.csv')
# Quick check of (rows, columns) for each frame.
print(english_dataset.shape, translated_dataset.shape)

(6335, 2) (2571, 2)


In [None]:
def get_X(df, max_length=None, lang=None, pad_token="PADword"):
    """Normalize, tokenize and pad/truncate every text in ``df`` to a fixed length.

    Args:
        df: DataFrame with a ``text`` column. It is handed to
            ``text_normalization`` and ``remove_stop_words``, whose return
            values are ignored.
            NOTE(review): this presumably means the helpers rewrite ``df.text``
            in place (leaving token lists behind) — confirm in data_preprocessing.
        max_length: Number of tokens kept per text. Defaults to the notebook
            global ``max_length_sequence`` (looked up at call time).
        lang: Language passed to ``remove_stop_words``. Defaults to the
            notebook global ``language`` (looked up at call time).
        pad_token: Token appended to texts shorter than ``max_length``.

    Returns:
        A list with one entry per row of ``df.text``; each entry is a list of
        exactly ``max_length`` items (the first tokens of the text followed by
        ``pad_token`` as needed).
    """
    # Fall back to the notebook-level configuration so existing calls keep working.
    if max_length is None:
        max_length = max_length_sequence
    if lang is None:
        lang = language

    text_normalization(df)  # Normalize text
    remove_stop_words(df, lang, get_tokenize = True)  # Remove stop words [and tokenize texts]

    # Padding / truncating text
    new_X = []
    for seq in df.text:
        try:
            tokens = list(seq[:max_length])
        except TypeError:
            # Non-subscriptable entry (e.g. a NaN for a missing text): emit an
            # all-padding row, which is what the old bare `except` produced,
            # but without hiding unrelated errors or KeyboardInterrupt.
            tokens = []
        tokens.extend([pad_token] * (max_length - len(tokens)))
        new_X.append(tokens)

    return new_X

In [None]:
# Notebook-wide configuration, read as globals by functions defined in other cells.
max_length_sequence = 100  # number of tokens per text after padding/truncation in get_X()
language = 'english'  # passed to remove_stop_words() inside get_X()

In [None]:
# Build the fixed-length token lists for both corpora.
# NOTE(review): get_X() ignores the return values of its preprocessing helpers, so it
# presumably modifies the DataFrames in place — re-running this cell may then
# re-process already processed text. Confirm before re-executing.
X_english = get_X(english_dataset)
X_translated = get_X(translated_dataset)

# Labels taken from the `label` column, in the same row order as the X lists.
Y_english = english_dataset.label.values
Y_translated = translated_dataset.label.values

In [None]:
# Sanity check: number of texts and the token count of the first text, per corpus.
print(len(X_english), len(X_english[0]))
print(len(X_translated), len(X_translated[0]))

6335 100
2571 100


Get ELMo from TensorFlow Hub

In [None]:
# Load the ELMo v3 module from TensorFlow Hub; trainable = False keeps its weights out of training.
# NOTE(review): the module is created at notebook scope, before and outside the
# `with tf.Session()` blocks that later run it.
elmo_model = hub.Module("https://tfhub.dev/google/elmo/3", trainable = False)
# Bare last expression: display the module object to confirm that it loaded.
elmo_model

<tensorflow_hub.module.Module at 0x7f7e67ebcb38>

Models

In [None]:
# Mini-batch size: used by fit() in the training cells and by the cell that
# truncates the train/test data to a whole number of batches.
batch_size = 32

In [None]:
def ElmoEmbedding(x):
    """Embed a batch of token rows with the frozen ELMo module.

    Intended to be wrapped in a Keras ``Lambda`` layer.

    Args:
        x: Tensor of tokens, one row per text. The ``Input`` layers in this
            notebook give it the shape (batch, max_length_sequence). It is cast
            to ``tf.string`` and fed to the module's "tokens" signature.

    Returns:
        The "elmo" entry of the module's output dict. The ``Lambda`` layers in
        this notebook declare it as (max_length_sequence, 1024) per text.
    """
    # No tf.squeeze here: with the (batch, max_length_sequence) input it was a
    # no-op, except that it also removed the batch axis when a batch held a
    # single row.
    tokens = tf.cast(x, tf.string)

    # One length entry per row actually present in the batch. The previous
    # tf.constant(batch_size * [...]) only matched batches of exactly
    # `batch_size` rows, so a final partial batch could not be fed.
    # NOTE(review): every row is declared full-length, so "PADword" padding is
    # embedded like a real token — same as before; confirm this is intended.
    sequence_len = tf.fill(tf.shape(tokens)[:1], max_length_sequence)

    return elmo_model(inputs={
                            "tokens": tokens,
                            "sequence_len": sequence_len
                      },
                      signature="tokens",
                      as_dict=True)["elmo"]

In [None]:
def create_model_RNN(max_length_sequence, lstm_units, l2_kernel, l2_recurrent, l2_activity, dropout):
    """Assemble the (uncompiled) ELMo -> LSTM -> Dropout -> sigmoid classifier.

    Args:
        max_length_sequence: Number of string tokens per input row. Note that
            this parameter shadows the notebook global of the same name, while
            ``ElmoEmbedding`` reads the global.
        lstm_units: Size of the LSTM output vector.
        l2_kernel: L2 factor for the LSTM input kernel.
        l2_recurrent: L2 factor for the LSTM recurrent kernel.
        l2_activity: L2 factor applied to the LSTM output.
        dropout: Drop rate applied after the LSTM.

    Returns:
        A Keras ``Model`` mapping a string tensor of shape
        (batch, max_length_sequence) to a single sigmoid unit per row.
    """
    tokens_in = Input(shape=(max_length_sequence,), dtype=tf.string)

    # ELMo runs inside a Lambda layer; its output shape is declared explicitly.
    elmo_out = Lambda(ElmoEmbedding, output_shape=(max_length_sequence, 1024))(tokens_in)

    # Only the last LSTM state is kept (return_sequences=False).
    recurrent_layer = LSTM(
        units=lstm_units,
        return_sequences=False,
        kernel_regularizer=regularizers.l2(l2_kernel),
        recurrent_regularizer=regularizers.l2(l2_recurrent),
        activity_regularizer=regularizers.l2(l2_activity),
    )
    encoded = recurrent_layer(elmo_out)

    encoded = Dropout(rate=dropout)(encoded)
    probability = Dense(units=1, activation='sigmoid')(encoded)

    return Model(inputs=tokens_in, outputs=probability)

In [None]:
def create_model_CNN(max_length_sequence, filters, kernel_size, dense_units, l2_kernel):
    """Assemble the (uncompiled) ELMo -> Conv1D -> MaxPool -> Dense -> sigmoid classifier.

    Args:
        max_length_sequence: Number of string tokens per input row. Note that
            this parameter shadows the notebook global of the same name, while
            ``ElmoEmbedding`` reads the global.
        filters: Number of convolution filters.
        kernel_size: Width of the 1-D convolution window.
        dense_units: Size of the hidden dense layer.
        l2_kernel: L2 factor for the convolution kernel.

    Returns:
        A Keras ``Model`` mapping a string tensor of shape
        (batch, max_length_sequence) to a single sigmoid unit per row.
    """
    tokens_in = Input(shape=(max_length_sequence,), dtype=tf.string)

    # ELMo runs inside a Lambda layer; its output shape is declared explicitly.
    elmo_out = Lambda(ElmoEmbedding, output_shape=(max_length_sequence, 1024))(tokens_in)

    convolution = Conv1D(
        filters=filters,
        kernel_size=kernel_size,
        activation='relu',
        kernel_regularizer=regularizers.l2(l2_kernel),
    )
    features = convolution(elmo_out)
    features = MaxPooling1D(pool_size=2)(features)
    features = Flatten()(features)

    hidden = Dense(units=dense_units, activation='relu')(features)
    probability = Dense(units=1, activation='sigmoid')(hidden)

    return Model(inputs=tokens_in, outputs=probability)

Execute Models

Training and validation on the English dataset

In [None]:
# Training configuration for the English-only experiment.
epochs = 7  # passes over the training data in fit()
test_size = 0.1  # fraction of the English corpus held out for evaluation

# Fixed seed so that the shuffled split (and therefore the metrics reported
# below) is the same on every Restart & Run All.
split_seed = 42
X_train, X_test, Y_train, Y_test = train_test_split(X_english, Y_english, test_size = test_size, shuffle = True, random_state = split_seed)

In [None]:
# Sizes of the train / test partitions produced by train_test_split.
print(len(X_train), len(X_test))

5701 634


In [None]:
# Keep only as many training rows as fill a whole number of `batch_size` batches.
fit_batch_size_train = len(X_train) // batch_size * batch_size
X_train = X_train[:fit_batch_size_train]
Y_train = Y_train[:fit_batch_size_train]
print(len(X_train), len(Y_train))

# Same truncation for the held-out partition.
# Bug fix: slice X_test / Y_test themselves. The previous code sliced
# X_train / Y_train here, so the "test" set consisted of training rows and
# evaluate() reported training accuracy instead of held-out accuracy.
fit_batch_size_test = len(X_test) // batch_size * batch_size
X_test = X_test[:fit_batch_size_test]
Y_test = Y_test[:fit_batch_size_test]
print(len(X_test), len(Y_test))

5696 5696
608 608


RNN

In [None]:
# Build, train and evaluate the LSTM classifier inside its own TF1 session.
with tf.Session() as session:
  K.set_session(session) # Register this session with the Keras backend
  # Run the variable and lookup-table initializers for what already exists in the graph
  # (the ELMo module was created in an earlier cell).
  # NOTE(review): these run before the Keras layers below are created; presumably Keras
  # initializes its own weights afterwards — confirm.
  session.run(tf.global_variables_initializer())
  session.run(tf.tables_initializer())
  RNN = create_model_RNN(max_length_sequence = max_length_sequence, lstm_units = 8, l2_kernel = 0, l2_recurrent = 0, l2_activity = 0, dropout = 0.5)
  RNN.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy']) # Compile model
  RNN.summary() 
  # X_train / X_test are Python lists of token lists; np.array() turns them into 2-D string arrays.
  RNN.fit(np.array(X_train), Y_train, epochs = epochs, batch_size = batch_size, shuffle = True) # Fit model
  loss, acc = RNN.evaluate(np.array(X_test), Y_test) # Evaluate model
  print(loss, round(acc, 3))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
lambda_1 (Lambda)            (None, 100, 1024)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 8)                 33056     
_________________________________________________________________
dropout_1 (Dropout)          (None, 8)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 9         
Total params: 33,065
Trainable params: 33,065
Non-trainable params: 0
_________________________________________________________________












Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
0.12808730864995405 0.954


CNN

In [None]:
# Build, train and evaluate the Conv1D classifier inside its own TF1 session.
# NOTE(review): the saved output of this cell ends with `InvalidArgumentError`; the cause
# is not visible in the notebook and should be investigated before trusting this cell.
with tf.Session() as session:
  K.set_session(session) # Register this session with the Keras backend
  # Run the variable and lookup-table initializers for what already exists in the graph
  # (the ELMo module was created in an earlier cell).
  # NOTE(review): these run before the Keras layers below are created; presumably Keras
  # initializes its own weights afterwards — confirm.
  session.run(tf.global_variables_initializer())
  session.run(tf.tables_initializer())
  CNN = create_model_CNN(max_length_sequence = max_length_sequence, filters = 16, kernel_size = 10, dense_units = 4, l2_kernel = 0)
  CNN.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy']) # Compile model
  CNN.summary() 
  # X_train / X_test are Python lists of token lists; np.array() turns them into 2-D string arrays.
  CNN.fit(np.array(X_train), Y_train, epochs = epochs, batch_size = batch_size, shuffle = True) # Fit model
  loss, acc = CNN.evaluate(np.array(X_test), Y_test) # Evaluate model
  print(loss, round(acc, 3))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore








Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
lambda_2 (Lambda)            (None, 100, 1024)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 91, 16)            163856    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 45, 16)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 720)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 2884      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 5   

InvalidArgumentError: ignored

Training on the English dataset and validation on the translated dataset.