In [37]:
import random as python_random
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers import Embedding, LSTM, Dropout, Bidirectional, MaxPooling1D, Conv1D
from keras.initializers import Constant
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.layers import TextVectorization
import tensorflow as tf
import keras
import pickle
import keras.backend as K
import pandas as pd
import sys
import re
import io

In [38]:
"""
preprocess-twitter.py
python preprocess-twitter.py "Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)"
Script for preprocessing tweets by Romain Paulus
with small modifications by Jeffrey Pennington
with translation to Python by Motoki Wu
Translation of Ruby script to create features for GloVe vectors for Twitter data.
http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
"""

FLAGS = re.MULTILINE | re.DOTALL

def hashtag(text):
    text = text.group()
    hashtag_body = text[1:]
    if hashtag_body.isupper():
        result = "<hashtag> {} <allcaps>".format(hashtag_body)
    else:
        result = " ".join(["<hashtag>"] + re.split(r"(?=[A-Z])", hashtag_body, flags=FLAGS))
    return result

def allcaps(text):
    text = text.group()
    return text.lower() + " <allcaps>"


def tokenize(text):
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # function so code less repetitive
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"#\S+", hashtag)
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")

    ## -- I just don't understand why the Ruby script adds <allcaps> to everything so I limited the selection.
    # text = re_sub(r"([^a-z0-9()<>'`\-]){2,}", allcaps)
    text = re_sub(r"([A-Z]){2,}", allcaps)

    return text.lower()




In [39]:
def read_corpus(corpus_file):
    '''Read in data set and returns docs and labels'''
    documents = []
    labels = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            tokens = line.strip()
            documents.append(tokens.split("\t")[0])
            # binary problem: NOT, OFF
            labels.append(tokens.split("\t")[1])
    return documents, labels

def read_word_emb(embeddings_file,voc):
    '''Read embeddings dictionary file'''
    fin = io.open(embeddings_file, 'r', encoding='utf-8', newline='\n', errors='ignore')
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        if tokens[0] in voc:
            data[tokens[0]] = np.array(list(map(float, tokens[1:])))
    return data

def get_emb_matrix(voc, emb):
    '''Get embedding matrix given vocab and the embeddings'''
    num_tokens = len(voc) + 2
    word_index = dict(zip(voc, range(len(voc))))
    # Bit hacky, get embedding dimension from the word "the"
    embedding_dim = len(emb["the"])
    # Prepare embedding matrix to the correct size
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = emb.get(word)
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    # Final matrix with pretrained embeddings that we can feed to embedding layer
    return embedding_matrix

def test_set_predict(model, X_test, Y_test, ident):
    '''Do predictions and measure accuracy on our own test set (that we split off train)'''
    # Get predictions using the trained model
    Y_pred = model.predict(X_test)
    # Finally, convert to labels to get scores with sklearn
    Y_pred=(Y_pred.flatten()>0.5)*1
    # If you have gold data, you can calculate accuracy
    Y_test = Y_test.flatten()
    print('Accuracy on own {1} set: {0}'.format(round(accuracy_score(Y_test, Y_pred), 3), ident))
    return Y_pred

def get_f1(y_true, y_pred): #taken from old keras source code
    '''for getting f1 scores during training'''
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

def scheduler(epoch, lr):
    '''learning rate scheduler'''
    if epoch < 7:
        return lr
    else:
        return lr * tf.math.exp(-0.1)
    
def train_model(model, X_train, Y_train, X_dev, Y_dev, batch_size, epochs):
    '''Train the model here'''
    verbose = 1
    batch_size = batch_size
    epochs = epochs
    # Early stopping
    callback1 = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
    #Learning rate scheduler using function
    callback2 = tf.keras.callbacks.LearningRateScheduler(scheduler)
    #Assigning class weights for imbalanced classification
    class_weight = {0: 1.,
                1: 2.}
    # Finally fit the model to our data
    model.fit(X_train, Y_train, verbose=verbose, epochs=epochs, callbacks=[callback1,callback2], batch_size=batch_size, validation_data=(X_dev, Y_dev),
             class_weight=class_weight)
    return model

In [40]:
#Setting seeds for reproducibility
np.random.seed(1234)
tf.random.set_seed(1234)
python_random.seed(1234)

In [41]:
# Read in the data
X_train, Y_train = read_corpus("datasets/train.tsv")
X_dev, Y_dev = read_corpus("datasets/val.tsv")

X_train = [tokenize(x) for x in X_train]
X_dev = [tokenize(x) for x in X_dev]

# Transform words to indices using a vectorizer
vectorizer = TextVectorization(standardize=None, output_sequence_length=50)
# Use train and dev to create vocab - could also do just train
text_ds = tf.data.Dataset.from_tensor_slices(X_train + X_dev)
with tf.device('/cpu:0'):
    vectorizer.adapt(text_ds)
    
# Dictionary mapping words to idx
voc = vectorizer.get_vocabulary()

#changing labels to binary
encoder = LabelBinarizer()
Y_train_bin = encoder.fit_transform(Y_train)  # Use encoder.classes_ to find mapping back
Y_dev_bin = encoder.fit_transform(Y_dev)

# Transform input to vectorized input
X_train_vect = vectorizer(np.array([[s] for s in X_train])).numpy()
X_dev_vect = vectorizer(np.array([[s] for s in X_dev])).numpy()

2022-11-03 17:45:14.474998: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


In [42]:
## Best model architecture used after experimenting
def create_model(Y_train, emb_matrix, lr):
    '''Create the Keras model to use'''
    
    loss_function = 'binary_crossentropy'
    optim = Adam(learning_rate=lr)
    
    # Take embedding dim and size from emb_matrix
    embedding_dim = len(emb_matrix[0])
    num_tokens = len(emb_matrix)
    
    # Now build the model
    model = Sequential()
    model.add(Embedding(num_tokens, embedding_dim, embeddings_initializer=Constant(emb_matrix),trainable=False))
    model.add(LSTM(embedding_dim, dropout=0.2))
    model.add(Dense(units=1, activation="sigmoid"))
    # Compile model using our settings, check for accuracy
    model.compile(loss=loss_function, optimizer=optim, metrics=[get_f1])
    return model

In [43]:
#Read embeddings
embeddings_ft = read_word_emb("embeddings/glove.twitter.27B.100d.txt", voc)
#embeddings matrix
emb_matrix = get_emb_matrix(voc, embeddings_ft)
# Create model
model = create_model(Y_train, emb_matrix, lr=0.0001)
# Train the model
model = train_model(model, X_train_vect, Y_train_bin, X_dev_vect, Y_dev_bin, 32, 50)
y_preds=test_set_predict(model, X_dev_vect, Y_dev_bin, "dev")
print("F1 score on dev set (macro):",f1_score(Y_dev_bin.flatten(),y_preds,average='macro'))
print("Accuracy on dev set (macro):",accuracy_score(Y_dev_bin.flatten(),y_preds))
print("Conf Matrix: ", classification_report(Y_dev_bin.flatten(), y_preds))

Epoch 1/50


2022-11-03 17:49:34.577470: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-03 17:49:34.682585: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


  3/383 [..............................] - ETA: 10s - loss: 0.9308 - get_f1: 0.4664 

2022-11-03 17:49:34.850336: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-11-03 17:49:42.099417: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-03 17:49:42.140137: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50

2022-11-03 17:51:10.079973: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-03 17:51:10.109230: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Accuracy on own dev set: 0.661
F1 score on dev set (macro): 0.658978081042501
Accuracy on dev set (macro): 0.661
Conf Matrix:                precision    recall  f1-score   support

           0       0.86      0.57      0.69       648
           1       0.51      0.83      0.63       352

    accuracy                           0.66      1000
   macro avg       0.69      0.70      0.66      1000
weighted avg       0.74      0.66      0.67      1000

