In [11]:
import random as python_random
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers import Embedding, LSTM, Dropout, Bidirectional, MaxPooling1D, Conv1D
from keras.initializers import Constant
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.layers import TextVectorization
import tensorflow as tf
import keras
import pickle
import keras.backend as K
import pandas as pd

In [12]:
def read_corpus(corpus_file):
    '''Read a tab-separated corpus file and return its documents and labels.

    Every non-blank line is stripped of surrounding whitespace and split on
    tabs: the first column is the document text, the second column is its
    label (binary problem: NOT, OFF). Blank lines, such as a trailing empty
    line at the end of the file, are skipped instead of aborting the read.

    Args:
        corpus_file: path of the UTF-8 encoded corpus file.

    Returns:
        A (documents, labels) tuple of two equally long lists of strings.

    Raises:
        IndexError: if a non-blank line has no second tab-separated column.
    '''
    documents = []
    labels = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Nothing to parse on an empty line
                continue
            # Split once and reuse the columns for both the text and the label
            fields = stripped.split("\t")
            documents.append(fields[0])
            labels.append(fields[1])
    return documents, labels

def read_word_emb(embeddings_file):
    '''Load the pickled embeddings object stored in embeddings_file and return it.

    NOTE: unpickling can execute arbitrary code, so only load embedding files
    that come from a trusted source.
    '''
    with open(embeddings_file, 'rb') as handle:
        return pickle.load(handle)

def get_emb_matrix(voc, emb):
    '''Build the embedding matrix for a vocabulary from pretrained vectors.

    Row i of the result holds the pretrained vector of the word at position i
    of voc. Words that have no entry in emb keep an all-zero row. The matrix
    has two more rows than the vocabulary has words; those rows stay zero.

    Args:
        voc: sequence of vocabulary words; a word's position is its row index.
        emb: dictionary mapping a word to its embedding vector.

    Returns:
        A numpy array of shape (len(voc) + 2, embedding_dim).

    Raises:
        ValueError: if emb contains no vectors, so the dimension is unknown.
    '''
    if len(emb) == 0:
        raise ValueError("Cannot build an embedding matrix from empty embeddings")
    # Read the dimension from the word "the" when it exists (as before), and
    # otherwise from any available vector so other vocabularies work as well
    reference_vector = emb["the"] if "the" in emb else next(iter(emb.values()))
    embedding_dim = len(reference_vector)
    num_tokens = len(voc) + 2
    word_index = {word: i for i, word in enumerate(voc)}
    # Prepare embedding matrix to the correct size; words not found in the
    # embeddings keep their all-zero row
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = emb.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    # Final matrix with pretrained embeddings that we can feed to embedding layer
    return embedding_matrix

def test_set_predict(model, X_test, Y_test, ident):
    '''Predict binary labels for a held-out split and print the accuracy.

    The flattened output of model.predict is thresholded at 0.5: values above
    it become 1, all others 0. The accuracy against the flattened gold labels
    is printed rounded to three decimals, with ident naming the split.

    Returns the array of predicted 0/1 labels.
    '''
    scores = model.predict(X_test).flatten()
    # Turn the boolean mask into integer labels
    Y_pred = (scores > 0.5) * 1
    gold = Y_test.flatten()
    accuracy = round(accuracy_score(gold, Y_pred), 3)
    print('Accuracy on own {1} set: {0}'.format(accuracy, ident))
    return Y_pred

def get_f1(y_true, y_pred): #taken from old keras source code
    '''F1 metric computed with Keras backend ops, for monitoring during training.

    Inputs are clipped to [0, 1] and rounded before counting, and K.epsilon()
    is added to every denominator so that none of them can be zero.
    '''
    eps = K.epsilon()

    def _rounded_total(values):
        # Sum of the entries after clipping to [0, 1] and rounding
        return K.sum(K.round(K.clip(values, 0, 1)))

    hits = _rounded_total(y_true * y_pred)
    gold_positives = _rounded_total(y_true)
    guessed_positives = _rounded_total(y_pred)
    precision = hits / (guessed_positives + eps)
    recall = hits / (gold_positives + eps)
    return 2*(precision*recall)/(precision+recall+eps)

def scheduler(epoch, lr):
    '''Learning rate schedule: keep lr for the first epochs, then decay it.

    Args:
        epoch: epoch index handed in by the LearningRateScheduler callback.
        lr: learning rate currently in use.

    Returns:
        lr unchanged while epoch < 7; afterwards lr * exp(-0.1) as a plain
        Python float.
    '''
    if epoch < 7:
        return lr
    # Compute the decay with numpy and cast to float so the callback receives
    # a plain number rather than a TensorFlow tensor
    return float(lr * np.exp(-0.1))
    
def train_model(model, X_train, Y_train, X_dev, Y_dev, batch_size, epochs,
                patience=3, class_weight=None):
    '''Fit the model on the training data, validating on the dev data.

    Training uses two callbacks: early stopping on 'val_loss' and the learning
    rate schedule defined by scheduler().

    Args:
        model: compiled Keras model to train.
        X_train, Y_train: training inputs and labels.
        X_dev, Y_dev: validation inputs and labels.
        batch_size: batch size passed to model.fit.
        epochs: maximum number of epochs passed to model.fit.
        patience: early-stopping patience on 'val_loss' (default 3).
        class_weight: per-class loss weights; defaults to {0: 1., 1: 2.},
            which weights class 1 twice as much to counter class imbalance.

    Returns:
        The same model object after fitting.
    '''
    if class_weight is None:
        # Assigning class weights for imbalanced classification
        class_weight = {0: 1., 1: 2.}
    # Early stopping
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience)
    # Learning rate scheduler using function
    lr_schedule = tf.keras.callbacks.LearningRateScheduler(scheduler)
    # Finally fit the model to our data
    model.fit(X_train, Y_train, verbose=1, epochs=epochs,
              callbacks=[early_stopping, lr_schedule], batch_size=batch_size,
              validation_data=(X_dev, Y_dev), class_weight=class_weight)
    return model

In [13]:
# Give Python's random module, NumPy and TensorFlow the same fixed seed
# for reproducibility
python_random.seed(1234)
np.random.seed(1234)
tf.random.set_seed(1234)

In [14]:
# Read in the data
X_train, Y_train = read_corpus("datasets/train_preprocessed.txt")
X_dev, Y_dev = read_corpus("datasets/val_preprocessed.txt")

# Transform words to indices using a vectorizer; no text standardization is
# applied and the output sequence length is fixed to 50
vectorizer = TextVectorization(standardize=None, output_sequence_length=50)
# Use train and dev to create vocab - could also do just train
text_ds = tf.data.Dataset.from_tensor_slices(X_train + X_dev)
with tf.device('/cpu:0'):
    vectorizer.adapt(text_ds)

# Vocabulary of the vectorizer (position in the list = word index)
voc = vectorizer.get_vocabulary()

# Changing labels to binary. The encoder is fitted on the training labels only
# and the dev labels are transformed with that same mapping, so both splits
# are guaranteed to share one label -> 0/1 assignment
encoder = LabelBinarizer()
Y_train_bin = encoder.fit_transform(Y_train)  # Use encoder.classes_ to find mapping back
Y_dev_bin = encoder.transform(Y_dev)

# Transform input to vectorized input
X_train_vect = vectorizer(np.array([[s] for s in X_train])).numpy()
X_dev_vect = vectorizer(np.array([[s] for s in X_dev])).numpy()

2022-11-03 13:41:21.322639: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


## Model architecture

In [9]:
## Best model architecture used after experimenting
def create_model(Y_train, emb_matrix, lr):
    '''Build and compile the LSTM classifier on top of frozen pretrained embeddings.

    Args:
        Y_train: not used by this function.
        emb_matrix: embedding matrix; its number of rows and the length of
            its first row give the vocabulary size and embedding dimension.
        lr: learning rate for the Adam optimizer.

    Returns:
        A compiled Sequential model (binary cross-entropy loss, get_f1 metric).
    '''
    optimizer = Adam(learning_rate=lr)

    # Vocabulary size and vector length both come from the matrix itself
    vocab_rows = len(emb_matrix)
    vector_size = len(emb_matrix[0])

    # Embedding weights start from the pretrained matrix and are not trained
    frozen_embeddings = Embedding(vocab_rows, vector_size,
                                  embeddings_initializer=Constant(emb_matrix),
                                  trainable=False)
    network = Sequential()
    for layer in (frozen_embeddings,
                  LSTM(vector_size, dropout=0.2),
                  Dense(units=1, activation="sigmoid")):
        network.add(layer)
    # Compile with binary cross-entropy and track F1 during training
    network.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[get_f1])
    return network

## GloVe 200d

In [400]:
# Read the pickled GloVe 200d embeddings
embeddings_ft = read_word_emb("embeddings/glove_200d.pkl")
# Build the embedding matrix for the vectorizer vocabulary
emb_matrix = get_emb_matrix(voc, embeddings_ft)
# Create model
model = create_model(Y_train, emb_matrix, lr=0.00001)
# Train the model (batch size 32, at most 50 epochs)
model = train_model(model, X_train_vect, Y_train_bin, X_dev_vect, Y_dev_bin, 32, 50)
# Predict on the dev set and report the scores
y_preds=test_set_predict(model, X_dev_vect, Y_dev_bin, "dev")
print("F1 score on dev set (macro):",f1_score(Y_dev_bin.flatten(),y_preds,average='macro'))
# NOTE: this is plain accuracy; no macro averaging is involved despite the label
print("Accuracy on dev set (macro):",accuracy_score(Y_dev_bin.flatten(),y_preds))
# NOTE: prints sklearn's classification report, not a confusion matrix
print("Conf Matrix: ", classification_report(Y_dev_bin.flatten(), y_preds))

Epoch 1/50


2022-10-30 17:37:47.041678: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 17:37:47.244434: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 17:37:47.740464: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-10-30 17:37:59.883037: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 17:37:59.962119: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50


2022-10-30 17:40:50.922663: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 17:40:50.973813: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Accuracy on own dev set: 0.718
F1 score on dev set (macro): 0.7022489705416535
Accuracy on dev set (macro): 0.718
Conf Matrix:                precision    recall  f1-score   support

           0       0.81      0.73      0.77       648
           1       0.58      0.69      0.63       352

    accuracy                           0.72      1000
   macro avg       0.70      0.71      0.70      1000
weighted avg       0.73      0.72      0.72      1000



## Other Embeddings and Dimensions

In [405]:
# Pickled embedding files to compare against each other
emb_file=["embeddings/glove_100d.pkl","embeddings/glove_50d.pkl",
          "embeddings/glove_25d.pkl","embeddings/fasttext.pkl"]

# Train and evaluate one fresh model per embedding file
for emb in emb_file:
    embeddings = read_word_emb(emb)
    emb_matrix = get_emb_matrix(voc, embeddings)

    # Create model (learning rate 0.0001)
    model = create_model(Y_train, emb_matrix,0.0001)
    # Train the model (batch size 32, at most 50 epochs)
    model = train_model(model, X_train_vect, Y_train_bin, X_dev_vect, Y_dev_bin, 32, 50)
    y_preds=test_set_predict(model, X_dev_vect, Y_dev_bin, "dev")
    # Print the embedding file name so the scores below can be attributed
    print(emb)
    print("F1 score on dev set (macro):",f1_score(Y_dev_bin.flatten(),y_preds,average='macro'))
    # NOTE: this is plain accuracy; no macro averaging is involved despite the label
    print("Accuracy on dev set (macro):",accuracy_score(Y_dev_bin.flatten(),y_preds))
    # NOTE: prints sklearn's classification report, not a confusion matrix
    print("Conf Matrix: ", classification_report(Y_dev_bin.flatten(), y_preds))
    print("\n\n")

Epoch 1/50


2022-10-30 17:51:27.283999: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 17:51:27.482309: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 17:51:27.952570: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-10-30 17:51:37.264567: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 17:51:37.346724: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50


2022-10-30 17:53:14.798648: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 17:53:14.852226: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Accuracy on own dev set: 0.732
embeddings/glove_100d.pkl
F1 score on dev set (macro): 0.7132045750291076
Accuracy on dev set (macro): 0.732
Conf Matrix:                precision    recall  f1-score   support

           0       0.81      0.76      0.79       648
           1       0.61      0.68      0.64       352

    accuracy                           0.73      1000
   macro avg       0.71      0.72      0.71      1000
weighted avg       0.74      0.73      0.73      1000




Epoch 1/50


2022-10-30 17:53:16.408459: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 17:53:16.600289: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 17:53:17.087753: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-10-30 17:53:25.624323: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 17:53:25.705831: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50


2022-10-30 17:54:54.495170: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 17:54:54.546098: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Accuracy on own dev set: 0.715
embeddings/glove_50d.pkl
F1 score on dev set (macro): 0.6980391616719519
Accuracy on dev set (macro): 0.715
Conf Matrix:                precision    recall  f1-score   support

           0       0.81      0.73      0.77       648
           1       0.58      0.68      0.63       352

    accuracy                           0.71      1000
   macro avg       0.69      0.71      0.70      1000
weighted avg       0.73      0.71      0.72      1000




Epoch 1/50


2022-10-30 17:54:56.611253: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 17:54:56.840667: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 17:54:57.364878: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-10-30 17:55:06.421804: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 17:55:06.510506: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50


2022-10-30 17:56:35.126189: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 17:56:35.179386: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Accuracy on own dev set: 0.615
embeddings/glove_25d.pkl
F1 score on dev set (macro): 0.6144135229684349
Accuracy on dev set (macro): 0.615
Conf Matrix:                precision    recall  f1-score   support

           0       0.84      0.50      0.63       648
           1       0.47      0.82      0.60       352

    accuracy                           0.61      1000
   macro avg       0.65      0.66      0.61      1000
weighted avg       0.71      0.61      0.62      1000




Epoch 1/50


2022-10-30 17:56:36.830564: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 17:56:37.051252: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 17:56:37.579725: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-10-30 17:56:50.080373: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 17:56:50.166298: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


2022-10-30 17:57:43.518486: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 17:57:43.572252: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Accuracy on own dev set: 0.745
embeddings/fasttext.pkl
F1 score on dev set (macro): 0.7295517240282158
Accuracy on dev set (macro): 0.745
Conf Matrix:                precision    recall  f1-score   support

           0       0.83      0.76      0.79       648
           1       0.62      0.72      0.66       352

    accuracy                           0.74      1000
   macro avg       0.73      0.74      0.73      1000
weighted avg       0.76      0.74      0.75      1000






## Optimizing Learning Rate

In [36]:
# Learning rates to try, one training run each
lrs=[0.01,0.05,0.001,0.005,0.0001,0.0005,0.00001,0.00005]
# All runs of this sweep use the fastText embeddings
emb_file="embeddings/fasttext.pkl"

# Load the embeddings and build the matrix once; it is shared by every run
embeddings = read_word_emb(emb_file)
emb_matrix = get_emb_matrix(voc, embeddings)

for l_r in lrs:
    # Create model
    model = create_model(Y_train, emb_matrix,lr=l_r)
    # Train the model (batch size 32, at most 50 epochs)
    model = train_model(model, X_train_vect, Y_train_bin, X_dev_vect, Y_dev_bin, 32, 50)
    y_preds=test_set_predict(model, X_dev_vect, Y_dev_bin, "dev")
    # Print the learning rate so the scores below can be attributed
    print(l_r)
    print("F1 score on dev set (macro):",f1_score(Y_dev_bin.flatten(),y_preds,average='macro'))
    # NOTE: this is plain accuracy; no macro averaging is involved despite the label
    print("Accuracy on dev set (macro):",accuracy_score(Y_dev_bin.flatten(),y_preds))
    # NOTE: prints sklearn's classification report, not a confusion matrix
    print("Conf Matrix: ", classification_report(Y_dev_bin.flatten(), y_preds))
    print("\n\n")

Epoch 1/50


2022-10-30 20:28:11.521539: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:28:11.645868: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


  1/383 [..............................] - ETA: 7:47 - loss: 0.9329 - get_f1: 0.2000

2022-10-30 20:28:11.839234: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-10-30 20:28:21.600657: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:28:21.640659: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50

2022-10-30 20:29:01.445360: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:29:01.472094: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Accuracy on own dev set: 0.532
0.01
F1 score on dev set (macro): 0.5023732965855152
Accuracy on dev set (macro): 0.532
Conf Matrix:                precision    recall  f1-score   support

           0       0.65      0.60      0.62       648
           1       0.36      0.41      0.38       352

    accuracy                           0.53      1000
   macro avg       0.50      0.50      0.50      1000
weighted avg       0.55      0.53      0.54      1000




Epoch 1/50


2022-10-30 20:29:02.632472: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:29:02.726573: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


  3/383 [..............................] - ETA: 11s - loss: 3.2759 - get_f1: 0.2374 

2022-10-30 20:29:02.852774: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-10-30 20:29:12.231341: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:29:12.272312: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50

2022-10-30 20:29:41.291960: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:29:41.318838: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Accuracy on own dev set: 0.37
0.05
F1 score on dev set (macro): 0.30011198208286677
Accuracy on dev set (macro): 0.37
Conf Matrix:                precision    recall  f1-score   support

           0       0.75      0.04      0.08       648
           1       0.36      0.97      0.52       352

    accuracy                           0.37      1000
   macro avg       0.55      0.51      0.30      1000
weighted avg       0.61      0.37      0.23      1000




Epoch 1/50


2022-10-30 20:29:42.462672: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:29:42.560069: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


  3/383 [..............................] - ETA: 12s - loss: 0.9368 - get_f1: 0.3525 

2022-10-30 20:29:42.692430: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-10-30 20:29:53.010428: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:29:53.051523: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
 7/32 [=====>........................] - ETA: 0s

2022-10-30 20:30:23.497623: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:30:23.524037: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Accuracy on own dev set: 0.47
0.001
F1 score on dev set (macro): 0.46808510638297873
Accuracy on dev set (macro): 0.47
Conf Matrix:                precision    recall  f1-score   support

           0       0.64      0.41      0.50       648
           1       0.35      0.58      0.44       352

    accuracy                           0.47      1000
   macro avg       0.50      0.50      0.47      1000
weighted avg       0.54      0.47      0.48      1000




Epoch 1/50


2022-10-30 20:30:24.664617: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:30:24.759210: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


  3/383 [..............................] - ETA: 11s - loss: 0.9480 - get_f1: 0.4498 

2022-10-30 20:30:24.891469: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-10-30 20:30:34.932994: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:30:34.974553: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
 6/32 [====>.........................] - ETA: 0s 

2022-10-30 20:32:32.667577: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:32:32.695175: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Accuracy on own dev set: 0.718
0.005
F1 score on dev set (macro): 0.7089855359619576
Accuracy on dev set (macro): 0.718
Conf Matrix:                precision    recall  f1-score   support

           0       0.85      0.69      0.76       648
           1       0.57      0.77      0.66       352

    accuracy                           0.72      1000
   macro avg       0.71      0.73      0.71      1000
weighted avg       0.75      0.72      0.72      1000




Epoch 1/50


2022-10-30 20:32:33.911648: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:32:34.008144: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


  3/383 [..............................] - ETA: 11s - loss: 0.9325 - get_f1: 0.3402 

2022-10-30 20:32:34.145940: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-10-30 20:32:43.719330: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:32:43.759259: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50

2022-10-30 20:33:32.541247: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:33:32.567593: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Accuracy on own dev set: 0.739
0.0001
F1 score on dev set (macro): 0.724820263350171
Accuracy on dev set (macro): 0.739
Conf Matrix:                precision    recall  f1-score   support

           0       0.83      0.75      0.79       648
           1       0.61      0.73      0.66       352

    accuracy                           0.74      1000
   macro avg       0.72      0.74      0.72      1000
weighted avg       0.75      0.74      0.74      1000




Epoch 1/50


2022-10-30 20:33:33.895958: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:33:33.994685: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


  1/383 [..............................] - ETA: 6:20 - loss: 0.9316 - get_f1: 0.2105

2022-10-30 20:33:34.140514: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-10-30 20:33:43.683758: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:33:43.726693: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
 7/32 [=====>........................] - ETA: 0s

2022-10-30 20:34:23.601672: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:34:23.629665: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Accuracy on own dev set: 0.737
0.0005
F1 score on dev set (macro): 0.7234952305555642
Accuracy on dev set (macro): 0.737
Conf Matrix:                precision    recall  f1-score   support

           0       0.84      0.74      0.78       648
           1       0.60      0.73      0.66       352

    accuracy                           0.74      1000
   macro avg       0.72      0.74      0.72      1000
weighted avg       0.75      0.74      0.74      1000




Epoch 1/50


2022-10-30 20:34:24.992973: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:34:25.093609: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


  1/383 [..............................] - ETA: 7:35 - loss: 0.9330 - get_f1: 0.1176

2022-10-30 20:34:25.247200: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-10-30 20:34:34.996225: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:34:35.038595: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
 6/32 [====>.........................] - ETA: 0s

2022-10-30 20:36:32.517419: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:36:32.545217: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Accuracy on own dev set: 0.675
1e-05
F1 score on dev set (macro): 0.6695001520299301
Accuracy on dev set (macro): 0.675
Conf Matrix:                precision    recall  f1-score   support

           0       0.84      0.62      0.71       648
           1       0.53      0.78      0.63       352

    accuracy                           0.68      1000
   macro avg       0.68      0.70      0.67      1000
weighted avg       0.73      0.68      0.68      1000




Epoch 1/50


2022-10-30 20:36:33.871431: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:36:33.975734: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


  1/383 [..............................] - ETA: 7:17 - loss: 0.9324 - get_f1: 0.4444

2022-10-30 20:36:34.137016: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-10-30 20:36:44.240222: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:36:44.283283: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
 6/32 [====>.........................] - ETA: 0s

2022-10-30 20:37:34.853697: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:37:34.881614: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Accuracy on own dev set: 0.738
5e-05
F1 score on dev set (macro): 0.7233660648294795
Accuracy on dev set (macro): 0.738
Conf Matrix:                precision    recall  f1-score   support

           0       0.83      0.75      0.79       648
           1       0.61      0.72      0.66       352

    accuracy                           0.74      1000
   macro avg       0.72      0.73      0.72      1000
weighted avg       0.75      0.74      0.74      1000






## Adam vs SGD Optimizer

In [37]:
def create_model(Y_train, emb_matrix, lr):
    '''Build and compile the LSTM classifier, this time optimized with SGD.

    Y_train is not used. The embedding layer is initialized from emb_matrix
    and kept frozen; lr is the learning rate handed to the SGD optimizer.
    Returns the compiled Sequential model.
    '''
    sgd = SGD(learning_rate=lr)

    # Matrix rows = vocabulary size, length of a row = embedding dimension
    n_rows, n_dims = len(emb_matrix), len(emb_matrix[0])

    # Same stack as the Adam variant: frozen embeddings -> LSTM -> sigmoid unit
    classifier = Sequential([
        Embedding(n_rows, n_dims, embeddings_initializer=Constant(emb_matrix), trainable=False),
        LSTM(n_dims, dropout=0.2),
        Dense(units=1, activation="sigmoid"),
    ])
    # Binary cross-entropy loss, F1 reported as the training metric
    classifier.compile(loss='binary_crossentropy', optimizer=sgd, metrics=[get_f1])
    return classifier

def train_model(model, X_train, Y_train, X_dev, Y_dev, batch_size, epochs, patience=5):
    '''Fit the model on the training data, using the dev data for validation.

    model: compiled Keras model to train.
    X_train, Y_train: training inputs and targets passed to model.fit.
    X_dev, Y_dev: passed to model.fit as validation_data.
    batch_size, epochs: forwarded unchanged to model.fit.
    patience: EarlyStopping patience on val_loss (defaults to 5, the value
        that used to be hard-coded here).
    Returns the same model object after fitting.
    '''
    # Stop training once val_loss has gone `patience` epochs without improving
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience)
    # Learning-rate schedule; `scheduler` is defined elsewhere in the notebook
    lr_schedule = tf.keras.callbacks.LearningRateScheduler(scheduler)
    # Class 1 counts twice as much as class 0 in the loss (class imbalance)
    class_weight = {0: 1., 1: 2.}
    model.fit(
        X_train, Y_train,
        verbose=1,
        epochs=epochs,
        callbacks=[early_stopping, lr_schedule],
        batch_size=batch_size,
        validation_data=(X_dev, Y_dev),
        class_weight=class_weight,
    )
    return model

# Load the pickled embeddings and build the embedding matrix for our vocabulary
emb_file = "embeddings/fasttext.pkl"
embeddings = read_word_emb(emb_file)
emb_matrix = get_emb_matrix(voc, embeddings)

# Build the SGD model, then train with batch size 32 for at most 50 epochs
model = create_model(Y_train, emb_matrix, lr=0.001)
model = train_model(
    model,
    X_train_vect, Y_train_bin,
    X_dev_vect, Y_dev_bin,
    batch_size=32,
    epochs=50,
)

# Predict on the dev set and report the scores for this optimizer
y_preds = test_set_predict(model, X_dev_vect, Y_dev_bin, "dev")
print("SGD")
print(
    "F1 score on dev set (macro):",
    f1_score(Y_dev_bin.flatten(), y_preds, average='macro'),
)
print(
    "Accuracy on dev set (macro):",
    accuracy_score(Y_dev_bin.flatten(), y_preds),
)
# Note: despite the label, this prints sklearn's classification report
print(
    "Conf Matrix: ",
    classification_report(Y_dev_bin.flatten(), y_preds),
)
print("\n\n")

Epoch 1/50


2022-10-30 20:37:56.404725: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:37:56.498215: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


  1/383 [..............................] - ETA: 6:19 - loss: 0.9314 - get_f1: 0.1176

2022-10-30 20:37:56.662290: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-10-30 20:38:06.366874: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:38:06.409477: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
 6/32 [====>.........................] - ETA: 0s

2022-10-30 20:40:29.067591: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-10-30 20:40:29.095541: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Accuracy on own dev set: 0.607
SGD
F1 score on dev set (macro): 0.46514239052771256
Accuracy on dev set (macro): 0.607
Conf Matrix:                precision    recall  f1-score   support

           0       0.65      0.87      0.74       648
           1       0.35      0.13      0.19       352

    accuracy                           0.61      1000
   macro avg       0.50      0.50      0.47      1000
weighted avg       0.54      0.61      0.55      1000






## Test Set Scores

In [15]:
def create_model(Y_train, emb_matrix, lr):
    '''Build and compile the LSTM classifier trained with Adam.

    Y_train is not referenced in the body (kept so existing calls still work).
    emb_matrix provides the embedding weights and determines the layer sizes.
    lr is the learning rate handed to the Adam optimizer.
    Returns the compiled, untrained Keras model.
    '''
    adam = Adam(learning_rate=lr)

    # Number of rows = vocabulary size, length of a row = embedding width
    vocab_size = len(emb_matrix)
    emb_dim = len(emb_matrix[0])

    # Frozen embeddings initialised from emb_matrix -> LSTM -> one sigmoid unit
    model = Sequential([
        Embedding(vocab_size, emb_dim,
                  embeddings_initializer=Constant(emb_matrix),
                  trainable=False),
        LSTM(emb_dim, dropout=0.2),
        Dense(units=1, activation="sigmoid"),
    ])

    # Binary cross-entropy loss; get_f1 (defined elsewhere in the notebook)
    # is the metric reported while training
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=[get_f1])
    return model

def train_model(model, X_train, Y_train, X_dev, Y_dev, batch_size, epochs, patience=3):
    '''Fit the model on the training data, using the dev data for validation.

    model: compiled Keras model to train.
    X_train, Y_train: training inputs and targets passed to model.fit.
    X_dev, Y_dev: passed to model.fit as validation_data.
    batch_size, epochs: forwarded unchanged to model.fit.
    patience: EarlyStopping patience on val_loss (defaults to 3, the value
        that used to be hard-coded here).
    Returns the same model object after fitting.
    '''
    # Stop training once val_loss has gone `patience` epochs without improving
    # (this cell uses 3; the old comment claiming 5 was stale)
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience)
    # Learning-rate schedule; `scheduler` is defined elsewhere in the notebook
    lr_schedule = tf.keras.callbacks.LearningRateScheduler(scheduler)
    # Class 1 counts twice as much as class 0 in the loss (class imbalance)
    class_weight = {0: 1., 1: 2.}
    model.fit(
        X_train, Y_train,
        verbose=1,
        epochs=epochs,
        callbacks=[early_stopping, lr_schedule],
        batch_size=batch_size,
        validation_data=(X_dev, Y_dev),
        class_weight=class_weight,
    )
    return model

# Load the pickled embeddings and build the embedding matrix for our vocabulary
emb_file = "embeddings/fasttext.pkl"
embeddings = read_word_emb(emb_file)
emb_matrix = get_emb_matrix(voc, embeddings)

# Build the Adam model, then train with batch size 32 for at most 50 epochs
model = create_model(Y_train, emb_matrix, lr=0.0001)
model = train_model(
    model,
    X_train_vect, Y_train_bin,
    X_dev_vect, Y_dev_bin,
    batch_size=32,
    epochs=50,
)

# Predict on the dev set and report the scores
y_preds = test_set_predict(model, X_dev_vect, Y_dev_bin, "dev")
print(
    "F1 score on dev set (macro):",
    f1_score(Y_dev_bin.flatten(), y_preds, average='macro'),
)
print(
    "Accuracy on dev set (macro):",
    accuracy_score(Y_dev_bin.flatten(), y_preds),
)
# Note: despite the label, this prints sklearn's classification report
print(
    "Conf Matrix: ",
    classification_report(Y_dev_bin.flatten(), y_preds),
)
print("\n\n")

Epoch 1/50


2022-11-03 13:45:44.636305: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-03 13:45:44.789702: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


  1/383 [..............................] - ETA: 8:41 - loss: 0.9329 - get_f1: 0.2000

2022-11-03 13:45:45.004406: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-11-03 13:45:55.370216: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-03 13:45:55.421328: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
 4/32 [==>...........................] - ETA: 0s

2022-11-03 13:46:49.231714: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-03 13:46:49.260091: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Accuracy on own dev set: 0.747
F1 score on dev set (macro): 0.7322113730782462
Accuracy on dev set (macro): 0.747
Conf Matrix:                precision    recall  f1-score   support

           0       0.84      0.76      0.80       648
           1       0.62      0.73      0.67       352

    accuracy                           0.75      1000
   macro avg       0.73      0.74      0.73      1000
weighted avg       0.76      0.75      0.75      1000






In [16]:
# Read in the test set, then encode labels and vectorize text
X_test, Y_test = read_corpus("datasets/test_preprocessed.txt")
# Use transform (not fit_transform): the test labels must go through the label
# mapping the encoder already has, rather than refitting the encoder on test data.
# NOTE(review): assumes `encoder` was fitted on the training labels in an
# earlier cell — confirm.
Y_test_bin = encoder.transform(Y_test)
# Each document is wrapped in its own one-element list before vectorizing
X_test_vect = vectorizer(np.array([[s] for s in X_test])).numpy()

In [17]:
# Predict on the test set with the model trained above and report macro F1
y_preds = test_set_predict(model, X_test_vect, Y_test_bin, "test")
print(
    "F1 score on test set (macro):",
    f1_score(Y_test_bin.flatten(), y_preds, average='macro'),
)

Accuracy on own test set: 0.78
F1 score on test set (macro): 0.7325901442999782
