In [46]:
import os
from typing import List

import numpy as np
import pandas as pd
import seaborn as sns
from comet_ml import Experiment
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [47]:
# Load the tweet dataset; the model consumes the pre-tokenized text column
# and predicts the label column.
df = pd.read_csv('../data/tweets_50.csv')
X, y = df['text_tokenized'], df['label']

# Two-stage split with a fixed seed: first hold out 20% as the test set,
# then take 25% of the remaining 80% as validation (60/20/20 overall).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=0
)

# Distinct label values; their count sizes the softmax output layer.
class_names = y.unique()

In [48]:
# --- Text vectorization / embedding configuration ---
# Batch size used both when adapting the vectorizer and when fitting the model.
batch_size = 128
# Upper bound on the vocabulary size learned by the TextVectorization layer.
max_tokens = 20_000
# Every input text is padded/truncated to this many token ids.
output_sequence_length = 400
# Width of the GloVe vectors; also selects which glove.6B.<dim>d.txt file is read.
embedding_dim = 50



def make_vectorizer(max_tokens: int, output_sequence_length: int) -> TextVectorization:
    """Build a TextVectorization layer and fit its vocabulary on the training texts.

    Reads the module-level ``X_train`` (training texts) and ``batch_size``.

    Args:
        max_tokens: Passed through as the layer's ``max_tokens``.
        output_sequence_length: Passed through as the layer's
            ``output_sequence_length``.

    Returns:
        The adapted TextVectorization layer.
    """
    # Batch the raw training strings so adapt() consumes them in chunks.
    train_batches = tf.data.Dataset.from_tensor_slices(X_train.values).batch(batch_size)

    text_vectorizer = TextVectorization(
        max_tokens=max_tokens,
        output_sequence_length=output_sequence_length,
    )
    text_vectorizer.adapt(train_batches)
    return text_vectorizer


def get_embedding_matrix(embedding_dim, num_tokens, word_index, glove_path=None):
    """Build an embedding matrix from a pre-trained GloVe text file.

    Args:
        embedding_dim: Number of columns in the matrix; also selects the
            default file ``glove.6B.{embedding_dim}d.txt``.
        num_tokens: Number of rows in the returned matrix.
        word_index: Mapping from word to the row index it occupies.
        glove_path: Optional path to the GloVe text file. When ``None`` the
            file is looked up at
            ``~/Documents/Datasets/glove.6B.{embedding_dim}d.txt``.

    Returns:
        A ``(num_tokens, embedding_dim)`` numpy array. Rows of words that have
        a GloVe vector hold that vector; all other rows stay zero.

    Raises:
        OSError: If the GloVe file cannot be opened.
    """
    if glove_path is None:
        glove_path = os.path.join(
            os.path.expanduser("~"), f"Documents/Datasets/glove.6B.{embedding_dim}d.txt"
        )

    embeddings_index = {}
    # Decode explicitly as UTF-8: relying on the platform default encoding can
    # fail or mis-decode non-ASCII tokens on non-UTF-8 locales.
    with open(glove_path, encoding="utf-8") as f:
        for line in f:
            # Each line is "<word> <v1> <v2> ..."; split off the word only.
            word, coefs = line.split(maxsplit=1)
            embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")
    print("Found {} word vectors.".format(len(embeddings_index)))

    hits = 0
    misses = 0
    # Prepare embedding matrix
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            # Words not found in the embedding index keep their all-zeros row.
            misses += 1
    print("Converted %d words (%d misses)" % (hits, misses))
    return embedding_matrix


def make_embedding_layer(voc: List[str], embedding_dim: int) -> layers.Embedding:
    """Create a frozen Embedding layer initialised from GloVe vectors.

    Args:
        voc: Vocabulary list; each word's position in the list is used as its
            token index.
        embedding_dim: Width of the embedding vectors, forwarded to
            ``get_embedding_matrix``.

    Returns:
        A non-trainable ``layers.Embedding`` with ``len(voc) + 2`` rows whose
        weights are the matrix returned by ``get_embedding_matrix``.
    """
    word_index = {word: idx for idx, word in enumerate(voc)}
    # Two rows more than the vocabulary size, as in the original code.
    # NOTE(review): presumably headroom for padding/OOV ids — confirm it is needed.
    num_tokens = len(voc) + 2

    # Sanity-check the word -> index mapping. `.get` is used so that a
    # vocabulary lacking one of these sample words prints None instead of
    # aborting the whole cell with a KeyError.
    test = ["the", "cat", "sat", "on", "the", "mat"]
    print("word_index test:")
    print(test, " --> ", [word_index.get(w) for w in test])

    embedding_matrix = get_embedding_matrix(embedding_dim, num_tokens, word_index)

    return layers.Embedding(
        num_tokens,
        embedding_dim,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        # Keep the pre-trained vectors fixed during training.
        trainable=False,
    )


# Fit the text vectorizer on the training texts.
vectorizer = make_vectorizer(
    max_tokens=max_tokens,
    output_sequence_length=output_sequence_length,
)

# The learned vocabulary determines which GloVe rows are loaded into the
# embedding layer.
vocabulary = vectorizer.get_vocabulary()
embedding_layer = make_embedding_layer(vocabulary, embedding_dim=embedding_dim)


word_index test:
['the', 'cat', 'sat', 'on', 'the', 'mat']  -->  [4, 1170, 2063, 17, 4, 8835]
Found 400000 word vectors.
Converted 18523 words (1477 misses)


In [49]:
# --- MLP architecture ---
n_layers = 5                          # number of hidden Dense + Dropout pairs
units = 2000                          # width of every hidden Dense layer
activation = 'selu'
kernel_initializer = 'lecun_normal'

# --- Regularization ---
l2_lambda = 1e-3                      # L2 factor applied to each hidden kernel
dropout = 0.2                         # dropout rate after each hidden layer

# --- Training ---
optimizer = 'nadam'
epochs = 200                          # upper bound; early stopping may end sooner
# Optional per-class loss weights, e.g. {0: 1., 1: 1., 2: 1., 3: 1}; None disables.
class_weight = None

def make_model():
    """Assemble and compile the MLP text classifier on frozen embeddings.

    Pipeline: raw string -> ``vectorizer`` -> ``embedding_layer`` -> Flatten ->
    ``n_layers`` x (Dense + Dropout) -> softmax over ``len(class_names)``.

    All hyper-parameters (``n_layers``, ``units``, ``activation``,
    ``kernel_initializer``, ``l2_lambda``, ``dropout``, ``optimizer``) are read
    from module-level variables. The model summary is printed as a side effect.

    Returns:
        The compiled ``keras.Model`` (sparse categorical cross-entropy loss,
        "acc" metric).
    """
    # A single L2 regularizer object is shared by every hidden layer.
    l2_penalty = keras.regularizers.l2(l2=l2_lambda)

    # Raw strings in; vectorization and embedding lookup happen inside the model.
    text_in = keras.Input(shape=(1,), dtype="string")
    token_ids = vectorizer(text_in)
    embedded = embedding_layer(token_ids)
    hidden = layers.Flatten()(embedded)

    # Stack of fully connected layers, each followed by dropout.
    # NOTE(review): Keras suggests AlphaDropout alongside SELU activations —
    # consider whether plain Dropout is intended here.
    for _ in range(n_layers):
        dense = layers.Dense(
            units=units,
            activation=activation,
            kernel_initializer=kernel_initializer,
            kernel_regularizer=l2_penalty,
        )
        hidden = dense(hidden)
        hidden = layers.Dropout(dropout)(hidden)

    class_probs = layers.Dense(len(class_names), activation="softmax")(hidden)

    model = keras.Model(text_in, class_probs)
    model.summary()
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=optimizer,
        metrics=["acc"],
    )
    return model

In [44]:
project_name = 'nlp_embeddings_mlp'

# SECURITY: the Comet API key must never be committed with the notebook.
# It is read from the COMET_API_KEY environment variable; when that is unset
# None is passed and Comet falls back to its own configuration lookup.
# The key that was previously hard-coded here should be revoked and rotated.
experiment = Experiment(
    project_name=project_name,
    auto_param_logging=True,
    # auto_histogram_weight_logging=True,
    auto_histogram_gradient_logging=True,
    auto_histogram_activation_logging=True,
    api_key=os.environ.get("COMET_API_KEY"),
    workspace="henrystoll",
)

# try/finally guarantees the experiment is closed even if training or
# evaluation raises, so no half-open run is left attached to the kernel.
try:
    # Record every tunable hyper-parameter alongside the run.
    params = {
        'n_layers': n_layers,
        'batch_size': batch_size,
        'max_tokens': max_tokens,
        'output_sequence_length': output_sequence_length,
        'embedding_dim': embedding_dim,
        'units': units,
        'activation': activation,
        'kernel_initializer': kernel_initializer,
        'l2_lambda': l2_lambda,
        'dropout': dropout,
        'class_weight': class_weight,
        'optimizer': optimizer,
        'epochs': epochs,
    }
    experiment.log_parameters(params)

    model = make_model()
    # Save an architecture diagram and attach it to the experiment.
    keras.utils.plot_model(model, "model-mlp.png", show_shapes=True)
    experiment.log_asset("model-mlp.png")

    # Stop once validation accuracy has not improved for 10 epochs and roll
    # back to the best weights seen.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_acc',
        patience=10,
        verbose=1,
        restore_best_weights=True)

    with experiment.train():
        history = model.fit(X_train, y_train,
                            batch_size=batch_size,
                            epochs=epochs,
                            validation_data=(X_val, y_val),
                            # Pass the configured weights so the logged
                            # 'class_weight' parameter reflects what was used;
                            # None is the Keras default (no weighting).
                            class_weight=class_weight,
                            verbose=1,
                            callbacks=[early_stopping])

    with experiment.test():
        loss, accuracy = model.evaluate(X_test, y_test)
        print(loss, accuracy)
        metrics = {
            'loss': loss,
            'accuracy': accuracy
        }
        experiment.log_metrics(metrics)

    # Convert per-class probabilities to predicted class ids for the
    # confusion matrix.
    y_predicted = model.predict(X_test)
    y_predicted = y_predicted.argmax(axis=1)
    experiment.log_confusion_matrix(y_test.to_numpy(), y_predicted)
finally:
    experiment.end()

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/henrystoll/nlp-embeddings-mlp/ada5e08b4c724dfba45b77f102ea2988
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     train_acc               : 0.39267829060554504
COMET INFO:     train_batch_acc [6]     : (0.0859375, 0.4296875)
COMET INFO:     train_batch_loss [6]    : (17.834327697753906, 112.7439193725586)
COMET INFO:     train_epoch_duration    : 311.7163088310001
COMET INFO:     train_loss              : 61.89064407348633
COMET INFO:     train_val_acc           : 0.3314344882965088
COMET INFO:     train_val_loss          : 19.35145378112793
COMET INFO:     validate_batch_acc [2]  : (0.35085228085517883, 0.421875)
COMET INFO:     validate_batch_loss [2] : (17.731243133544922, 19.21815299987793)
COMET INFO:   Others:
COMET INFO:     train_tr