In [48]:
# One-time setup: download and unpack the GloVe 6B word vectors (uncomment to run).
# NOTE(review): `unzip` extracts into the current working directory, whereas the
# embedding cell further down reads the vectors from ~/Documents/Datasets --
# move the extracted files there or adjust that path.
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip -q glove.6B.zip

In [49]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow import keras

from tensorflow.keras import layers

In [50]:
# Load the tweets; the text column is the model input, the label column the target.
df = pd.read_csv('../data/tweets_50.csv')
X, y = df['text_tokenized'], df['label']

# 60/20/20 train/validation/test split.
# Step 1: hold out 20% of all rows as the test set.
X_rest, X_test, y_rest, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
# Step 2: take 25% of the remaining 80% (= 20% overall) as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.25, random_state=0
)

# Distinct labels in order of first appearance (not sorted); the model cell
# below only uses the length of this list to size its output layer.
class_names = y.unique().tolist()
class_names

[2, 0, 1, 3]

In [51]:
# Build the text vectoriser and fit its vocabulary on the training split only.
vectorizer = TextVectorization(
    max_tokens=40000,            # upper bound on vocabulary size
    output_sequence_length=400,  # every sample is padded/truncated to this length
)

# Feed the raw training strings to `adapt` in batches of 128.
text_ds = tf.data.Dataset.from_tensor_slices(X_train.values)
text_ds = text_ds.batch(128)
vectorizer.adapt(text_ds)

# Peek at the first few vocabulary entries.
vectorizer.get_vocabulary()[:5]

['', '[UNK]', 't', 'to', 'the']

In [52]:
# Map every vocabulary token to its integer id, i.e. its position in the vocabulary.
voc = vectorizer.get_vocabulary()
word_index = {token: idx for idx, token in enumerate(voc)}

# Sanity check: look up a handful of words (a KeyError here would mean the
# word is not in the learned vocabulary).
test = ["the", "cat", "sat", "on", "the", "mat"]
[word_index[token] for token in test]

[4, 1170, 2063, 17, 4, 8835]

In [53]:
embedding_dim = 200

path_to_glove_file = os.path.join(
    os.path.expanduser("~"), f"Documents/Datasets/glove.6B.{embedding_dim}d.txt"
)

# Parse the GloVe text file into {word: vector}. Each line has the form
# "<word> <v1> <v2> ... <vN>", so split once to separate the word from the numbers.
# GloVe is distributed as UTF-8 text; pass the encoding explicitly instead of
# relying on the platform default, which is not UTF-8 everywhere (e.g. Windows).
embeddings_index = {}
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

print(f"Found {len(embeddings_index)} word vectors.")

# NOTE(review): the vocabulary printed earlier already contains the '' (padding)
# and '[UNK]' (OOV) entries, so the "+ 2" appears to only add two unused rows.
# It is kept as-is so the embedding shape / parameter count stays unchanged.
num_tokens = len(voc) + 2
hits = 0
misses = 0

# Prepare embedding matrix: row i holds the pretrained vector of the token with id i.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # Words not found in the embedding index keep their all-zeros row.
        # This includes the representation for "padding" and "OOV".
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print(f"Converted {hits} words ({misses} misses)")

# Frozen embedding layer initialised from the pretrained matrix.
embedding_layer = layers.Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

Found 400000 word vectors.
Converted 32150 words (7850 misses)


In [54]:
# End-to-end model: raw strings in, per-class probabilities out.
string_input = keras.Input(shape=(1,), dtype="string")

# Strings -> token ids -> frozen pretrained embeddings.
x = embedding_layer(vectorizer(string_input))

# Hidden stack, applied in the order listed: three conv layers with pooling
# in between, then a dense layer with dropout.
hidden_layers = [
    layers.Conv1D(128, 5, activation="relu"),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation="relu"),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation="relu"),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.5),
]
for layer in hidden_layers:
    x = layer(x)

# One softmax unit per distinct label.
output_ = layers.Dense(len(class_names), activation="softmax")(x)
model = keras.Model(string_input, output_)
model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
text_vectorization_4 (TextVe (None, 400)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 400, 200)          8000400   
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 396, 128)          128128    
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 79, 128)           0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 75, 128)           82048     
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 15, 128)           0   

In [55]:
# Integer class labels -> sparse categorical cross-entropy; track accuracy.
model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["acc"],
)

# Train on the raw training strings, evaluating on the validation split each epoch.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=128,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fc191165610>