In [None]:
import os
from typing import List

import numpy as np
import pandas as pd
import seaborn as sns
from comet_ml import Experiment
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
# Load the pre-split train / test CSV files.
train_df = pd.read_csv('../data/tweets_50_shuffled_train.csv')
test_df = pd.read_csv('../data/tweets_50_shuffled_test.csv')

# Features come from the 'text_tokenized' column, targets from the 'label' column.
X_train, y_train = train_df['text_tokenized'], train_df['label']
X_test, y_test = test_df['text_tokenized'], test_df['label']

# Distinct labels seen in the training split (its length sizes the output layer).
class_names = y_train.unique()

In [None]:
# Text-vectorisation / embedding settings.
batch_size = 32                  # batch size for vectorizer adaptation and model.fit
max_tokens = 30_000              # vocabulary cap handed to TextVectorization
output_sequence_length = 2000    # every sample is padded / truncated to this many tokens
embedding_dim = 100              # also selects which glove.6B.<dim>d.txt file is loaded



def make_vectorizer(max_tokens: int, output_sequence_length: int) -> TextVectorization:
    """Create a ``TextVectorization`` layer and adapt it to the training text.

    Reads the module-level ``X_train`` (text to adapt on) and ``batch_size``.

    Args:
        max_tokens: forwarded to ``TextVectorization(max_tokens=...)``.
        output_sequence_length: forwarded to
            ``TextVectorization(output_sequence_length=...)``.

    Returns:
        The adapted ``TextVectorization`` layer.
    """
    vectorizer_layer = TextVectorization(
        max_tokens=max_tokens,
        output_sequence_length=output_sequence_length,
    )
    # Only the training texts are used to learn the vocabulary; they are fed in batches.
    training_texts = tf.data.Dataset.from_tensor_slices(X_train.values)
    vectorizer_layer.adapt(training_texts.batch(batch_size))
    return vectorizer_layer


def get_embedding_matrix(embedding_dim, num_tokens, word_index):
    """Build an embedding matrix from the pre-trained GloVe vectors on disk.

    The vectors are read from
    ``~/Documents/Datasets/glove.6B.<embedding_dim>d.txt``; each line holds a
    word followed by its space-separated coefficients.

    Args:
        embedding_dim: vector width; also selects which GloVe file is opened.
        num_tokens: number of rows of the returned matrix.
        word_index: mapping ``word -> row index`` into the returned matrix.

    Returns:
        A ``(num_tokens, embedding_dim)`` NumPy array. Row ``i`` holds the GloVe
        vector of the word mapped to ``i``; rows of words without a GloVe vector
        (and rows no word maps to) stay all-zero.

    Raises:
        OSError: if the GloVe file cannot be opened.
    """
    path_to_glove_file = os.path.join(
        os.path.expanduser("~"), f"Documents/Datasets/glove.6B.{embedding_dim}d.txt"
    )
    embeddings_index = {}
    # Pin the encoding: the GloVe release is UTF-8 text, and relying on the
    # locale default breaks (or silently mis-decodes) non-ASCII tokens on
    # platforms whose default is not UTF-8.
    with open(path_to_glove_file, encoding="utf-8") as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            # "f" = float32; text mode is selected by the non-empty separator.
            embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")
    print("Found {} word vectors.".format(len(embeddings_index)))

    hits = 0
    misses = 0
    # Start from zeros so every word without a pre-trained vector keeps an
    # all-zero row.
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            # Not in GloVe: the row is left as zeros.
            misses += 1
    print("Converted %d words (%d misses)" % (hits, misses))
    return embedding_matrix


def make_embedding_layer(voc: List[str], embedding_dim: int) -> layers.Embedding:
    """Create a frozen ``Embedding`` layer initialised from GloVe vectors.

    Args:
        voc: vocabulary list; a token's position in the list is used as its
            embedding row index.
        embedding_dim: width of the embedding vectors; forwarded to
            ``get_embedding_matrix``.

    Returns:
        A non-trainable ``layers.Embedding`` with ``len(voc) + 2`` rows whose
        weights are the matrix returned by ``get_embedding_matrix``.
    """
    word_index = {word: idx for idx, word in enumerate(voc)}
    # Two more rows than the vocabulary has entries; no token maps to them, so
    # they stay all-zero in the matrix built below.
    num_tokens = len(voc) + 2

    # Sanity check of the word -> index mapping. ``.get`` is used so that a
    # corpus whose vocabulary lacks one of these words prints ``None`` for it
    # instead of aborting the whole notebook with a KeyError.
    test = ["the", "cat", "sat", "on", "the", "mat"]
    print("word_index test:")
    print(test, " --> ", [word_index.get(w) for w in test])

    embedding_matrix = get_embedding_matrix(embedding_dim, num_tokens, word_index)

    return layers.Embedding(
        num_tokens,
        embedding_dim,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        # Keep the pre-trained vectors fixed during training.
        trainable=False,
    )


# Fit the text vectorizer with the configured vocabulary size and sequence length.
vectorizer = make_vectorizer(
    max_tokens=max_tokens,
    output_sequence_length=output_sequence_length,
)

# The embedding layer is built from the vocabulary the vectorizer learned.
vocabulary = vectorizer.get_vocabulary()
embedding_layer = make_embedding_layer(voc=vocabulary, embedding_dim=embedding_dim)


In [None]:
# --- Training ---------------------------------------------------------------
epochs = 200            # upper bound passed to model.fit
optimizer = 'nadam'
class_weight = None     # alternative tried: {0: 1.5, 1: 1., 2: 2., 3: 3}

# --- Convolutional branches -------------------------------------------------
kernel_sizes = [1, 2, 3]              # one parallel Conv1D branch per entry (single size tried: 3)
n_convs_parallel = len(kernel_sizes)
n_convs_depth = 1                     # only logged; make_model builds one conv per branch
filters = 64
padding = 'same'
strides = 1                           # alternative tried: 2
pool_size = 2                         # alternative tried: 5
activation = 'relu'
kernel_initializer = 'he_normal'
l2_lambda = 1e-3                      # L2 factor applied to the Conv1D kernels

# --- Recurrent head ---------------------------------------------------------
rnn = 'GRU'             # label that is logged; make_model instantiates layers.GRU directly
units = 64
rnn_dropout = 0.2       # used for both dropout and recurrent_dropout of the GRU

# --- Dense head -------------------------------------------------------------
dropout = 0.4           # alternative tried: 0.3; the Dropout layer in make_model is commented out

def make_model():
    """Build and compile the parallel Conv1D -> MaxPooling1D -> GRU classifier.

    Raw strings go through the module-level ``vectorizer`` and
    ``embedding_layer``. The embedded sequence is then fed to one branch per
    entry of ``kernel_sizes``; the branch outputs are concatenated and passed
    to a softmax layer with ``len(class_names)`` units.

    All hyperparameters are read from module-level variables. ``dropout`` and
    ``n_convs_depth`` are not used here.

    Returns:
        The compiled ``keras.Model`` (its summary is printed as a side effect).
    """
    # A single L2 regulariser instance is shared by every Conv1D layer.
    l2_penalty = keras.regularizers.l2(l2=l2_lambda)

    text_in = keras.Input(shape=(1,), dtype="string")
    embedded = embedding_layer(vectorizer(text_in))

    def conv_gru_branch(kernel_size):
        """Return the GRU output of one Conv1D -> MaxPooling1D -> GRU branch."""
        conv_out = layers.Conv1D(
            filters=filters,
            kernel_size=kernel_size,
            activation=activation,
            kernel_initializer=kernel_initializer,
            kernel_regularizer=l2_penalty,
            padding=padding,
        )(embedded)
        pooled = layers.MaxPooling1D(pool_size=pool_size, strides=strides)(conv_out)
        gru = layers.GRU(units, dropout=rnn_dropout, recurrent_dropout=rnn_dropout)
        return gru(pooled)

    # Every branch reads the same embedded sequence, each with its own kernel size.
    branches = [conv_gru_branch(size) for size in kernel_sizes]
    merged = layers.Concatenate()(branches)

    class_probs = layers.Dense(len(class_names), activation="softmax")(merged)
    model = keras.Model(text_in, class_probs)
    model.summary()

    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=optimizer,
        metrics=["acc"],
    )
    return model
# Build the model once so that its summary is printed; the returned model is not assigned to a name here.
make_model()

In [5]:
project_name = 'nlp_split'

# The Comet API key must not be stored in the notebook. It is read from the
# COMET_API_KEY environment variable; when that is unset, ``None`` is passed
# and comet_ml falls back to its own configuration lookup.
# NOTE: the key that used to be hard-coded here should be treated as leaked
# and rotated.
experiment = Experiment(
    project_name=project_name,
    auto_param_logging=True,
    # auto_histogram_weight_logging=True,
    auto_histogram_gradient_logging=True,
    auto_histogram_activation_logging=True,
    api_key=os.environ.get("COMET_API_KEY"),
    workspace="henrystoll",
)

# try/finally guarantees the Comet run is closed even when training is
# interrupted (e.g. KeyboardInterrupt) or an error is raised.
try:
    params = {
        'batch_size': batch_size,
        'max_tokens': max_tokens,
        'output_sequence_length': output_sequence_length,
        'embedding_dim': embedding_dim,
        'filters': filters,
        'kernel_sizes': kernel_sizes,
        'pool_size': pool_size,
        'padding': padding,
        'strides': strides,
        'n_convs_depth': n_convs_depth,
        'n_convs_parallel': n_convs_parallel,
        'activation': activation,
        'kernel_initializer': kernel_initializer,
        'l2_lambda': l2_lambda,
        'dropout': dropout,
        'class_weight': class_weight,
        'optimizer': optimizer,
        'epochs': epochs,
        'rnn': rnn,
        'units': units,
        'rnn_dropout': rnn_dropout,
        # Free-text architecture description. Derived from the actual settings
        # so it cannot go stale (it used to say "LSTM" while the model is a GRU).
        'bert_model_name': f'CNN {kernel_sizes} -> {rnn}'
    }

    experiment.log_parameters(params)

    model = make_model()
    # keras.utils.plot_model(model, "model.png", show_shapes=True)
    # experiment.log_asset("model.png")

    # Stop once validation accuracy has not improved for 5 epochs and roll the
    # weights back to the best epoch.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_acc',
        patience=5,
        verbose=1,
        restore_best_weights=True)

    with experiment.train():
        model.summary()
        # NOTE(review): the test split doubles as the validation set that drives
        # early stopping, so the test metrics logged below are optimistically
        # biased. Consider carving a validation split out of the training data.
        history = model.fit(X_train, y_train,
                            batch_size=batch_size,
                            epochs=epochs,
                            validation_data=(X_test, y_test),
                            class_weight=class_weight,
                            verbose=1,
                            callbacks=[early_stopping])

    with experiment.test():
        loss, accuracy = model.evaluate(X_test, y_test)
        print('acc : {:.3f}'.format(accuracy))
        metrics = {
            'loss': loss,
            'accuracy': accuracy
        }
        experiment.log_metrics(metrics)

    # Turn the per-class probabilities into predicted class indices.
    y_predicted = model.predict(X_test)
    y_predicted = y_predicted.argmax(axis=1)
    experiment.log_confusion_matrix(y_test.to_numpy(), y_predicted)
finally:
    experiment.end()

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/henrystoll/nlp-split/d4197df0371f47b4aa56554f0b340ecc



Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
text_vectorization (TextVectori (None, 2000)         0           input_2[0][0]                    
__________________________________________________________________________________________________
embedding (Embedding)           (None, 2000, 100)    3000200     text_vectorization[1][0]         
__________________________________________________________________________________________________
conv1d_3 (Conv1D)               (None, 2000, 64)     6464        embedding[1][0]                  
____________________________________________________________________________________________

COMET INFO: Ignoring automatic log_parameter('verbose') because 'keras:verbose' is in COMET_LOGGING_PARAMETERS_IGNORE


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200

KeyboardInterrupt: 