# Benchmarking and optimization: ELMo

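This notebook benchmarks a binary text classifier built on ELMo embeddings, first on a spam-detection task (Enron emails vs. fraudulent emails) and then on IMDB movie-review sentiment.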

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
from keras import backend as K
import keras.layers as layers
from keras.models import Model, load_model
from keras.engine import Layer
import numpy as np
import pandas as pd
import email
import nltk
from nltk.corpus import stopwords
import re
import os

In [None]:
# Create a TensorFlow session and register it as the one Keras uses.
sess = tf.Session()
K.set_session(sess)

In [None]:
class ElmoEmbeddingLayer(Layer):
    """Keras layer that wraps the TF-Hub ELMo v2 module.

    The layer takes a ``(batch, 1)`` string tensor (one text per row),
    squeezes it to ``(batch,)`` and feeds it to the module's ``"default"``
    signature. The ``"default"`` entry of the returned dict is the layer
    output, reported to Keras as shape ``(batch, 1024)``.
    """

    def __init__(self, **kwargs):
        # Width of the embedding reported by compute_output_shape().
        self.dimensions = 1024
        # Forwarded to hub.Module in build().
        # NOTE(review): the base Layer constructor may reassign `trainable`
        # from kwargs - confirm against the installed Keras version.
        self.trainable = True
        super(ElmoEmbeddingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Instantiate the hub module and register its variables with Keras."""
        self.elmo = hub.Module(
            "https://tfhub.dev/google/elmo/2",
            trainable=self.trainable,
            name=f"{self.name}_module"
        )
        # Collect the module's variables by name scope and add them to the
        # layer's trainable weights. The file-level `tf` import is used
        # instead of `K.tf`, which depends on the Keras backend re-exporting
        # TensorFlow and is not available in newer Keras releases.
        self.trainable_weights += tf.trainable_variables(
            scope=f"^{self.name}_module/.*"
        )
        super(ElmoEmbeddingLayer, self).build(input_shape)

    def call(self, x, mask=None):
        """Embed a ``(batch, 1)`` string tensor; ``mask`` is not used."""
        # Drop the singleton axis so the module receives a 1-D batch of strings.
        sentences = K.squeeze(K.cast(x, tf.string), axis=1)
        outputs = self.elmo(sentences, as_dict=True, signature="default")
        return outputs["default"]

    def compute_output_shape(self, input_shape):
        """Return ``(batch, self.dimensions)``."""
        return input_shape[0], self.dimensions

In [None]:
def convert_data(raw_data, header):
    """Turn token sequences into space-joined strings with matching labels.

    Args:
        raw_data: Array whose first axis indexes documents; each entry is
            a sequence of string tokens.
        header: Indexable sequence of labels, one per document.

    Returns:
        A ``(n, 1)`` object array of joined texts and a 1-D label array.
    """
    n_docs = raw_data.shape[0]
    # Build (text, label) pairs in one pass so each row is joined and its
    # label looked up together.
    pairs = [(" ".join(raw_data[row]), header[row]) for row in range(n_docs)]
    texts = [text for text, _ in pairs]
    labels = [label for _, label in pairs]
    # Add a trailing axis so every sample is a length-1 row.
    text_column = np.array(texts, dtype=object)[:, np.newaxis]
    return text_column, np.array(labels)

In [None]:
def build_model():
    """Build and compile the ELMo-based binary classifier.

    The network is: string input of shape ``(1,)`` -> ElmoEmbeddingLayer ->
    Dense(256, relu) -> Dense(1, sigmoid). It is compiled with binary
    cross-entropy, the Adam optimizer and an accuracy metric, and its
    summary is printed before the model is returned.
    """
    text_in = layers.Input(shape=(1,), dtype="string")
    elmo_out = ElmoEmbeddingLayer()(text_in)
    hidden = layers.Dense(256, activation="relu")(elmo_out)
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    classifier = Model(inputs=[text_in], outputs=probability)
    classifier.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    classifier.summary()
    return classifier

## Enron and fraudulent emails datasets

In [None]:
# Relative paths to the two email corpora used for the spam-detection task.
enron_filepath = "../data/enron-email-dataset/emails.csv"
# We will preserve the typo in the filename as that is how it appears on Kaggle.
fraud_filepath = "../data/fraudulent-email-corpus/fradulent_emails.txt"

In [None]:
# Load the Enron CSV into a DataFrame and display its dimensions.
emails = pd.read_csv(enron_filepath)
emails.shape

In [None]:
# Preview the first rows of the Enron table.
emails.head()

In [None]:
def extract_messages(df):
    """Parse each raw email in ``df["message"]`` and collect its payload.

    Args:
        df: Mapping-like object (e.g. a DataFrame) whose ``"message"``
            entry iterates over raw email strings.

    Returns:
        A list with one ``get_payload()`` result per input message, in
        input order.
    """
    return [
        email.message_from_string(raw_text).get_payload()
        for raw_text in df["message"]
    ]

In [None]:
# Extract the payload of every Enron message.
bodies = extract_messages(emails)

In [None]:
# Wrap the payloads in a single-column DataFrame (column 0) and preview it.
bodies_df = pd.DataFrame(bodies)
bodies_df.head()

In [None]:
# Read the whole fraud corpus as latin1 text and split it into pieces on
# the literal "From r" separator.
# NOTE(review): presumably every message in the file starts with a
# "From r..." line - confirm against the corpus.
with open(fraud_filepath, "r", encoding="latin1") as infile:
    data = infile.read()
fraud_emails = data.split("From r")
# Number of pieces produced by the split (includes the text before the
# first separator).
len(fraud_emails)

In [None]:
# Run the fraud pieces through the same payload extraction as the Enron data.
fraud_bodies = extract_messages(
    pd.DataFrame(fraud_emails, columns=["message"], dtype=str)
)
# Skip element 0: it comes from the text preceding the first "From r"
# separator, not from a message.
fraud_bodies_df = pd.DataFrame(fraud_bodies[1:])
fraud_bodies_df.head()

In [None]:
# Show the first fraudulent email body as a sanity check.
print(fraud_bodies_df[0][0])

In [None]:
# Number of documents sampled per class.
Nsamp = 1000
# Maximum number of tokens kept per document (see tokenize).
maxtokens = 50
# Maximum number of characters kept per token (see reg_expressions).
maxtokenlen = 20

In [None]:
def tokenize(row):
    """Split text on single spaces, keeping at most ``maxtokens`` tokens.

    Args:
        row: The text to split. Non-string values are converted with
            ``str()`` first.

    Returns:
        A list of tokens. ``None`` and the empty string yield an empty
        list, so the return type is a list for every input.
    """
    if row in [None, ""]:
        return []
    return str(row).split(" ")[:maxtokens]

In [None]:
def reg_expressions(row):
    """Normalize a sequence of tokens.

    Each token is lower-cased, stripped of every non-word character and
    every digit, and truncated to ``maxtokenlen`` characters.

    Args:
        row: Iterable of string tokens.

    Returns:
        The list of normalized tokens. If ``row`` is not iterable or a
        token is not a string, processing stops there and a single empty
        string is appended to whatever was collected so far.
    """
    tokens = []
    try:
        for token in row:
            token = token.lower()
            token = re.sub(r"[\W\d]", "", token)
            token = token[:maxtokenlen]
            tokens.append(token)
    except (TypeError, AttributeError):
        # TypeError: `row` is not iterable (e.g. None or a float NaN).
        # AttributeError: a token has no .lower(), i.e. is not a string.
        # Anything else (including KeyboardInterrupt) now propagates
        # instead of being silently swallowed.
        tokens.append("")
    return tokens

In [None]:
# Fetch the NLTK stop-word corpus.
nltk.download("stopwords")

In [None]:
# English stop words used by stop_word_removal.
english_stopwords = stopwords.words("english")

In [None]:
def stop_word_removal(row):
    """Drop English stop words and empty tokens from a token sequence.

    Args:
        row: Iterable of tokens.

    Returns:
        A list of the tokens that are truthy and not in
        ``english_stopwords``, in their original order. A real list is
        returned (not a lazy ``filter`` object), so the result can be
        iterated more than once, indexed and measured with ``len``.
    """
    return [
        token for token in row
        if token and token not in english_stopwords
    ]

In [None]:
# Draw the Nsamp-row samples *before* preprocessing, so the
# tokenize -> stop-word removal -> regex clean-up pipeline only runs on
# the rows that are kept instead of on the whole corpus. Sampling picks
# rows by position only, so this does not change which rows can be
# selected.
EnronEmails = bodies_df.iloc[:, 0].sample(Nsamp)
EnronEmails = EnronEmails.apply(tokenize)
EnronEmails = EnronEmails.apply(stop_word_removal)
EnronEmails = EnronEmails.apply(reg_expressions)
SpamEmails = fraud_bodies_df.iloc[:, 0].sample(Nsamp)
SpamEmails = SpamEmails.apply(tokenize)
SpamEmails = SpamEmails.apply(stop_word_removal)
SpamEmails = SpamEmails.apply(reg_expressions)
# Spam rows first, then Enron rows: the label vector built later relies
# on this order.
raw_data = pd.concat((SpamEmails, EnronEmails), axis=0).values

In [None]:
# Inspect the combined array: its shape and the first five token lists.
print(raw_data.shape)
print(raw_data[:5])

In [None]:
# Class names. NOTE(review): not referenced elsewhere in the visible
# notebook - presumably kept for reference.
Categories = ["spam", "notspam"]
# Labels aligned with raw_data: 1 for the first Nsamp rows (spam),
# 0 for the following Nsamp rows (Enron).
header = [1] * Nsamp + [0] * Nsamp

In [None]:
def unison_shuffle_data(data, header):
    """Shuffle ``data`` and ``header`` with one shared random permutation.

    Args:
        data: Array indexable by an integer index array.
        header: Sequence of labels; its length sets the permutation size.

    Returns:
        ``(shuffled_data, shuffled_header)`` where the header is returned
        as a NumPy array and row/label pairing is preserved.
    """
    order = np.random.permutation(len(header))
    return data[order], np.asarray(header)[order]

### ELMo training

In [None]:
# Shuffle texts and labels together, then split 70% / 30% into train and
# test sets and convert each part into model-ready arrays.
raw_data, header = unison_shuffle_data(raw_data, header)
idx = int(0.7 * raw_data.shape[0])
train_x, train_y = convert_data(raw_data[:idx], header[:idx])
test_x, test_y = convert_data(raw_data[idx:], header[idx:])

In [None]:
# Preview the first five training texts.
train_x[:5]

In [None]:
# Build a fresh classifier and train it on the email data; the held-out
# 30% split is used as validation data.
model = build_model()
model.fit(
    train_x,
    train_y,
    validation_data=(test_x, test_y),
    epochs=5,
    batch_size=32
)

## IMDB movie reviews dataset

In [None]:
def load_data(path):
    """Load, preprocess and shuffle the reviews stored under ``path``.

    Every file in ``path/neg`` (label 0) and ``path/pos`` (label 1) is
    read, tokenized, stripped of stop words and normalized with
    ``reg_expressions``.

    Args:
        path: Directory that contains the ``neg`` and ``pos`` sub-folders.

    Returns:
        ``(data, sentiments)`` shuffled in unison: a 1-D object array
        holding one token list per review, and the matching label array.
    """
    data, sentiments = [], []
    for folder, sentiment in (("neg", 0), ("pos", 1)):
        folder = os.path.join(path, folder)
        for name in os.listdir(folder):
            # Explicit encoding so decoding does not depend on the
            # platform's default locale.
            file_path = os.path.join(folder, name)
            with open(file_path, "r", encoding="utf-8") as reader:
                text = reader.read()
            text = tokenize(text)
            text = stop_word_removal(text)
            text = reg_expressions(text)
            data.append(text)
            sentiments.append(sentiment)
    # Token lists have different lengths. Fill a pre-allocated object
    # array element by element so the result is always 1-D: np.array(data)
    # raises on ragged input in recent NumPy releases and would produce a
    # 2-D array if all lengths happened to match.
    data_np = np.empty(len(data), dtype=object)
    for position, tokens in enumerate(data):
        data_np[position] = tokens
    data, sentiments = unison_shuffle_data(data_np, sentiments)
    return data, sentiments

In [None]:
# Load and preprocess the IMDB training split.
train_path = os.path.join("..", "data", "aclImdb", "train")
raw_data, raw_header = load_data(train_path)

In [None]:
# Sanity check: number of reviews and number of labels.
print(raw_data.shape)
print(len(raw_header))

In [None]:
# Keep a random subset of 2 * Nsamp reviews, drawn without replacement,
# and apply the same selection to the labels.
n_available = len(raw_header)
n_keep = Nsamp * 2
random_indices = np.random.choice(
    range(n_available),
    size=(n_keep,),
    replace=False,
)
raw_data, raw_header = raw_data[random_indices], raw_header[random_indices]

In [None]:
# Check that the random subset is roughly class-balanced by printing the
# distinct labels and how often each occurs (this only inspects the
# distribution; it does not rebalance it).
unique_elements, counts_elements = np.unique(raw_header, return_counts=True)
print(unique_elements)
print(counts_elements)

### ELMo training

In [None]:
# Shuffle reviews and labels together, then split 70% / 30% into train
# and test sets and convert each part into model-ready arrays.
raw_data, raw_header = unison_shuffle_data(raw_data, raw_header)
idx = int(0.7 * raw_data.shape[0])
train_x, train_y = convert_data(raw_data[:idx], raw_header[:idx])
test_x, test_y = convert_data(raw_data[idx:], raw_header[idx:])

In [None]:
# Preview the first five training reviews.
train_x[:5]

In [None]:
# Preview the matching labels.
train_y[:5]

In [None]:
# Build a fresh classifier and train it on the IMDB data; the held-out
# 30% split is used as validation data.
model = build_model()
model.fit(
    train_x,
    train_y,
    validation_data=(test_x, test_y),
    epochs=5,
    batch_size=32
)