In [1]:
import pandas as pd
import numpy as np

import utils
from gensim.models import KeyedVectors


# nltk.download("popular")



In [2]:
# Load the annotated discussions (helper called with keep_punctuation=True) and
# pull out the message-text and label columns used by the later cells.
data_df = utils.load_annotated_discussions_data(keep_punctuation=True)
messages, target = data_df[utils.COL_MESSAGE], data_df[utils.COL_TARGET]

In [3]:
# Location of the pre-trained GloVe vectors, relative to the notebook's working
# directory. The file name suggests the 100-dimensional "6B" text release; the
# vector width has to agree with `embedding_dim` used for the embedding matrix.
glove_file = "../models/glove.6B.100d.txt"

In [4]:
# Parse the text-format vectors into a gensim KeyedVectors lookup.
# `no_header=True` tells the word2vec-format reader not to expect the leading
# "<vocab_size> <dim>" line, which raw GloVe text files do not carry.
glove = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

In [5]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import tensorflow as tf

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [6]:
# Learn the token vocabulary from the message corpus, streamed in batches of 128.
# NOTE(review): the vocabulary is adapted on every message, including the ones
# that later end up in the validation split -- confirm this is intended.
vectorizer = TextVectorization(
    max_tokens=1625,  # upper bound on the vocabulary size
    output_sequence_length=250,  # fixed length of every encoded message
)
messages_ds = tf.data.Dataset.from_tensor_slices(messages).batch(128)
vectorizer.adapt(messages_ds)

In [7]:
# Map each vocabulary token to its integer id, i.e. its position in the
# vocabulary list reported by the vectorizer.
vocab = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(vocab)}

In [8]:
embedding_dim = 100
# Two rows more than the vocabulary size, so every id in `word_index` is in range.
n_tokens = len(vocab) + 2

# Rows start as zeros; a token without a GloVe vector keeps its all-zero row
# and is recorded in `misses`.
embedding_matrix = np.zeros((n_tokens, embedding_dim))
misses = []
for word, index in word_index.items():
    if word not in glove:
        misses.append(word)
        continue
    embedding_matrix[index] = glove[word]

# Number of vocabulary tokens that have no pre-trained vector.
print(len(misses))


37


In [31]:
from sklearn import model_selection

# Build the label <-> integer-id mapping used for the sparse targets.
# The labels are sorted before enumeration: iterating a bare `set` follows hash
# order, which for strings changes between interpreter sessions, so without the
# sort the same label could receive a different class id after a kernel restart.
# Assumes the labels are mutually comparable (e.g. all strings).
index_to_target = dict(enumerate(sorted(set(target))))
target_to_index = {label: idx for idx, label in index_to_target.items()}
target_index = target.map(target_to_index)

# Stratified 75/25 split with a fixed seed, so class proportions are kept in
# both parts and the partition is repeatable.
(
    messages_train,
    messages_test,
    target_train,
    target_test,
) = model_selection.train_test_split(
    messages, target_index, stratify=target_index, test_size=0.25, random_state=1
)

# Each message is wrapped in its own one-element row before being passed to
# the vectorizer; the result is converted to a NumPy array of token ids.
x_train = vectorizer(np.array([[s] for s in messages_train])).numpy()
x_val = vectorizer(np.array([[s] for s in messages_test])).numpy()

y_train = np.array(target_train)
y_val = np.array(target_test)

In [136]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras import layers
from tensorflow.keras import regularizers

# 1-D convolutional text classifier on top of frozen pre-trained embeddings.
# (Dropout(0.2) after each pooling stage and the bias regularisers are
# currently disabled.)
model = tf.keras.models.Sequential(
    [
        # Token ids -> embedding vectors; weights come from `embedding_matrix`
        # and are not updated during training.
        Embedding(
            n_tokens,
            embedding_dim,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            trainable=False,
        ),
        layers.Conv1D(
            256, 3, activation="relu", kernel_regularizer=regularizers.l2(1e-8)
        ),
        layers.MaxPooling1D(3),
        layers.Conv1D(
            256, 3, activation="relu", kernel_regularizer=regularizers.l2(1e-8)
        ),
        layers.GlobalMaxPooling1D(),
        layers.Dense(
            128, activation="relu", kernel_regularizer=regularizers.l2(1e-4)
        ),
        layers.Dropout(0.4),
        # One softmax unit per distinct label value.
        layers.Dense(
            len(set(target)),
            activation="softmax",
            kernel_regularizer=regularizers.l2(1e-4),
        ),
    ]
)
model.summary()

Model: "sequential_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_41 (Embedding)     (None, None, 100)         160500    
_________________________________________________________________
conv1d_83 (Conv1D)           (None, None, 256)         77056     
_________________________________________________________________
max_pooling1d_48 (MaxPooling (None, None, 256)         0         
_________________________________________________________________
conv1d_84 (Conv1D)           (None, None, 256)         196864    
_________________________________________________________________
global_max_pooling1d_34 (Glo (None, 256)               0         
_________________________________________________________________
dense_72 (Dense)             (None, 128)               32896     
_________________________________________________________________
dropout_48 (Dropout)         (None, 128)             

In [137]:
# Targets are integer class ids, hence the sparse categorical cross-entropy.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["acc"],
)

In [138]:
# Train for 40 epochs, evaluating on the held-out split after each epoch.
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=40,
    batch_size=128,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x7fdb0909a640>