In [2]:
import numpy as np
import pandas as pd

from pretrained_embedding import get_input_plus_embedding_vectors
from grid_search_three_subsets import grid_search

from keras.models import Model
from keras.layers import Input, LSTM, Dense, Activation, Bidirectional, Dropout
from keras.layers.embeddings import Embedding

from tensorflow.keras import regularizers

In [3]:
def create_model(vocabulary_length, max_length_sequence, emb_dim, embedding_vectors, lstm_units,
                 l2_kernel, l2_recurrent, l2_activity, dropout):
    """Build an uncompiled single-LSTM binary classifier on top of frozen pretrained embeddings.

    Architecture: Input -> Embedding (frozen, zero-masked) -> LSTM -> Dropout
    -> Dense(1) -> sigmoid.

    Parameters
    ----------
    vocabulary_length : int
        Number of rows of the embedding table (``input_dim`` of the Embedding layer).
    max_length_sequence : int
        Length of each input sequence of token indices.
    emb_dim : int
        Size of each embedding vector (``output_dim`` of the Embedding layer).
    embedding_vectors : array-like
        Initial weights for the Embedding layer; must be compatible with an
        embedding table of ``vocabulary_length`` x ``emb_dim``.
    lstm_units : int
        Number of units in the LSTM layer.
    l2_kernel, l2_recurrent, l2_activity : float
        L2 factors applied to the LSTM's input kernel, recurrent kernel and
        output activity respectively (0 disables the penalty).
    dropout : float
        Dropout rate applied to the LSTM output before the final Dense layer.

    Returns
    -------
    keras.models.Model
        The assembled model. It is NOT compiled here; the caller chooses the
        optimizer, loss and metrics.
    """
    token_ids = Input(shape = (max_length_sequence, ))

    # Pretrained vectors are kept fixed (trainable = False); mask_zero = True makes
    # index 0 act as padding that the recurrent layer skips.
    features = Embedding(input_dim = vocabulary_length, output_dim = emb_dim,
                         weights = [embedding_vectors], trainable = False,
                         mask_zero = True)(token_ids)

    lstm_penalties = dict(kernel_regularizer = regularizers.l2(l2_kernel),
                          recurrent_regularizer = regularizers.l2(l2_recurrent),
                          activity_regularizer = regularizers.l2(l2_activity))

    # return_sequences = False: only the last time step feeds the classifier head.
    features = LSTM(units = lstm_units, return_sequences = False, **lstm_penalties)(features)
    features = Dropout(rate = dropout)(features)

    logit = Dense(units = 1)(features)
    probability = Activation('sigmoid')(logit)

    return Model(inputs = token_ids, outputs = probability)

In [4]:
# Load the merged English dataset; the rendered output below shows `text` and `label` columns.
# NOTE(review): the 'Unnamed: 0' column in the output suggests the CSV was saved together with
# its index. Reading with index_col=0 would drop it -- confirm the preprocessing helper does not
# rely on that column before changing this.
english_dataset = pd.read_csv('../data/Merged/english_dataset.csv')
# Bare expression so the notebook renders the frame (pandas truncates the display).
english_dataset

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,1
1,House Intelligence Committee Chairman Devin Nu...,1
2,"On Friday, it was revealed that former Milwauk...",1
3,"On Christmas day, Donald Trump announced that ...",1
4,Pope Francis used his annual Christmas Day mes...,1
...,...,...
51228,The State Department told the Republican Natio...,0
51229,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,1
51230,Anti-Trump Protesters Are Tools of the Oligar...,1
51231,"ADDIS ABABA, Ethiopia —President Obama convene...",0


In [5]:
# Preprocessing / embedding configuration shared by the cells below.
# Used as the Embedding layer's input_dim in create_model.
vocabulary_length = 10000
# Length of the model's input sequences (shape of the Input layer in create_model).
max_length_sequence = 1500
# Embedding output_dim; presumably must match the 300-dimensional vectors file below.
emb_dim = 300
# Forwarded as-is to get_input_plus_embedding_vectors.
language = 'english'
# Path to the pretrained embedding file, relative to the notebook's working directory.
embedding_file_path = '../data/GloVe_Embedding/glove.6B.300d.txt'

In [6]:
# Run the project preprocessing helper. In this notebook its three results are used as:
#   X                 -> model inputs handed to grid_search
#   df                -> frame whose `label` column supplies the targets
#   embedding_vectors -> initial (frozen) weights of the Embedding layer in create_model
# NOTE(review): the helper's internals are not visible here; presumably it tokenizes/pads the
# text and builds the embedding matrix from the file -- confirm in pretrained_embedding.py.
X, df, embedding_vectors = get_input_plus_embedding_vectors(english_dataset, embedding_file_path, 
                                                           vocabulary_length, max_length_sequence, emb_dim, language)

In [10]:
# Build one example configuration purely to inspect its layer layout and parameter counts.
# Hyperparameters are passed by keyword so each value is self-describing.
model = create_model(vocabulary_length, max_length_sequence, emb_dim, embedding_vectors,
                     lstm_units = 8,
                     l2_kernel = 0.01,
                     l2_recurrent = 0.01,
                     l2_activity = 0,
                     dropout = 0.5)
model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 1500)              0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 1500, 300)         3000000   
_________________________________________________________________
lstm_5 (LSTM)                (None, 8)                 9888      
_________________________________________________________________
dropout_5 (Dropout)          (None, 8)                 0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 9         
_________________________________________________________________
activation_5 (Activation)    (None, 1)                 0         
Total params: 3,009,897
Trainable params: 9,897
Non-trainable params: 3,000,000
_____________________________________________

In [7]:
# Hyperparameter settings of the candidate models, one dict per candidate.
# The position in this list is the model's index in `models`.
candidate_hyperparameters = [
    dict(lstm_units = 2, l2_kernel = 0.01, l2_recurrent = 0.01, l2_activity = 0, dropout = 0),
    dict(lstm_units = 4, l2_kernel = 0.01, l2_recurrent = 0.01, l2_activity = 0, dropout = 0),
    dict(lstm_units = 8, l2_kernel = 0.1, l2_recurrent = 0.1, l2_activity = 0, dropout = 0),
    dict(lstm_units = 8, l2_kernel = 0, l2_recurrent = 0, l2_activity = 0, dropout = 0.5),
]

# Candidates are built in list order, exactly one model per entry.
models = [
    create_model(vocabulary_length, max_length_sequence, emb_dim, embedding_vectors, **hyperparameters)
    for hyperparameters in candidate_hyperparameters
]

# Keep the individual names bound for any other cell that refers to them.
model_0, model_1, model_2, model_3 = models

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [8]:
# Targets: the `label` column of the preprocessed frame, as an array.
Y = df.label.values
# Settings forwarded to grid_search.
# NOTE(review): grid_search is defined in grid_search_three_subsets.py and is not visible here;
# the meanings below are inferred from the names and the training log -- confirm.
# Epochs per training run (the log shows "Epoch 1/7" ... "Epoch 7/7").
epochs = 7
batch_size = 32
# Presumably the number of repeated runs per candidate (the log shows 5 runs before each dev_acc line).
iterations = 5
# Presumably the fraction of the data held out for evaluation.
test_size = 0.2

In [11]:
# Evaluate every candidate in `models`; grid_search returns the index of the selected
# candidate together with its test accuracy.
best_model_index, test_acc = grid_search(X, Y, models, epochs, batch_size, iterations, test_size)
print('best_model_index: {} test acc: {}'.format(best_model_index, test_acc))

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Model 0 --> dev_acc: 0.927 +- 0.008
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Model 1 --> dev_acc: 0.935 +- 0.006
Epoch 1/7


Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Model 2 --> dev_acc: 0.866 +- 0.03
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Model 3 --> dev_acc: 0.962 +- 0.006
Epoch 1/7


Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
best_model_index: 3 test acc: 0.924


#### Train Model with English Dataset and Evaluate with Translated Dataset