In [1]:
import sys
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding
from tensorflow import keras

sys.path.append('../src/')

In [2]:
from preprocessors.pretrained_embeddings import Pretrained


In [3]:
# Build the pretrained-embedding lookup used by the rest of the notebook.
# Presumably the arguments are (vector dimension, embedding name) — TODO confirm against Pretrained.
pre_embeds = Pretrained(200, 'glove27b')
# NOTE(review): absolute path; this cell only runs where the file exists at this location.
pre_embeds.create_from_file('/embeddings/glove.twitter.27B.200d.txt')

In [4]:
# Spot-check: show the first 50 components of the vector stored for the word 'pippa'.
pre_embeds.embeddings_index.get('pippa')[:50]

array([-0.17313 , -0.083534,  0.094943,  0.27862 , -0.09849 , -0.64505 ,
       -0.034571, -0.033253, -0.14127 ,  0.77595 , -0.50909 ,  0.48752 ,
       -0.11292 ,  0.12668 ,  0.43857 ,  0.25239 ,  0.034386, -0.19116 ,
        0.1735  , -0.081495,  0.15889 , -0.46626 , -0.036711,  0.30366 ,
        0.26433 ,  0.59701 ,  0.45844 , -0.24775 , -0.35584 ,  0.037122,
        0.18221 ,  0.46557 , -0.44061 , -0.17489 ,  0.10444 ,  0.42832 ,
        0.37302 , -0.017115, -0.33439 , -0.071264,  0.77562 , -0.6526  ,
        0.47253 ,  0.32325 ,  0.095131,  0.13935 ,  0.074671,  0.31263 ,
       -0.53981 ,  0.041234], dtype=float32)

In [15]:
# Scratch check: dict.get on a missing key yields None (the lookup pattern
# the label/embedding code below relies on). Use an identity test for None.
tmp = {}
tmp.get('james') is None

True

In [20]:
class TextData:
    """Hold raw text/label lists and derive tf.data pipelines plus an embedding matrix.

    Typical use: load rows with ``train_data_csv`` / ``test_data_csv`` and then
    call ``brew`` to encode the labels, build the datasets, adapt the
    ``TextVectorization`` layer and fill ``embedding_matrix`` /
    ``embedding_layer`` from ``embedding_index``.
    """

    def __init__(self, embedding_index, embedding_dim, batch_size=64, buffer_size=10000, output_seq_length=250, max_tokens=10000):
        """Store the configuration and create an un-adapted vectorizer.

        Args:
            embedding_index: Object with a dict-like ``get(word)`` that returns
                the word's vector, or ``None`` when the word is unknown.
            embedding_dim: Length of each embedding vector.
            batch_size: Batch size used for both datasets.
            buffer_size: Shuffle buffer size for the training dataset.
            output_seq_length: Fixed length of every vectorized sequence.
            max_tokens: Vocabulary size cap passed to ``TextVectorization``.
        """
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.train_inputs = []
        self.train_labels = []
        self.test_inputs = []
        self.test_labels = []
        self.embedding_index = embedding_index
        self.embedding_dim = embedding_dim
        self.output_seq_length = output_seq_length
        self.vectorizer = TextVectorization(max_tokens=max_tokens, output_sequence_length=self.output_seq_length)
        self.missed_words = []
        self.label_index = {}
        # How many leading entries of each label list are already integer codes.
        self._train_labels_encoded = 0
        self._test_labels_encoded = 0

    def load_from_csv(self, path, delim, has_header, text_col, label_col, array, label):
        """Append the text and label column of a delimited file to two lists.

        The file is split on newlines and then on ``delim``; there is no
        quoting/escaping support, and empty lines are skipped.

        Args:
            path: File to read.
            delim: Column separator.
            has_header: When true, the first non-empty line names the columns
                and ``text_col`` / ``label_col`` are looked up by name;
                otherwise they are used directly as column positions.
            text_col: Header name (or position) of the text column.
            label_col: Header name (or position) of the label column.
            array: List that receives the text values.
            label: List that receives the label values.

        Raises:
            ValueError: If a header is expected but the file has no rows, or a
                named column is not in the header.
        """
        # Explicit encoding so reading does not depend on the platform locale.
        with open(path, encoding="utf-8") as f:
            contents = f.read()

        data = [row.split(delim) for row in contents.split("\n") if row != '']

        if has_header:
            if not data:
                raise ValueError("{} is empty but a header row was expected".format(path))
            header, data = data[0], data[1:]
            input_index, label_index = header.index(text_col), header.index(label_col)
        else:
            input_index, label_index = text_col, label_col

        for row in data:
            array.append(row[input_index])
            label.append(row[label_index])

    def train_data_csv(self, path, delim, has_header, text_col, label_col):
        """Load a delimited file into ``train_inputs`` / ``train_labels``."""
        self.load_from_csv(path, delim, has_header, text_col, label_col, self.train_inputs, self.train_labels)

    def test_data_csv(self, path, delim, has_header, text_col, label_col):
        """Load a delimited file into ``test_inputs`` / ``test_labels``."""
        self.load_from_csv(path, delim, has_header, text_col, label_col, self.test_inputs, self.test_labels)

    def auto_encode_labels(self):
        """Replace raw labels with integer codes, in place, for train and test.

        Codes are assigned in order of first appearance (training labels
        first) and recorded in ``label_index``. Test labels reuse the training
        mapping so both datasets share one encoding; a label that appears only
        in the test data gets the next free code. Entries that were already
        encoded by an earlier call are left untouched, so the method can be
        called again after more rows have been loaded.
        """
        self._train_labels_encoded = self._encode_labels(
            self.train_labels, getattr(self, '_train_labels_encoded', 0))
        self._test_labels_encoded = self._encode_labels(
            self.test_labels, getattr(self, '_test_labels_encoded', 0))

    def _encode_labels(self, labels, start):
        """Encode ``labels[start:]`` in place; return the new encoded length."""
        if start > len(labels):
            # The list was replaced by a shorter one since the last call.
            start = 0
        next_code = max(self.label_index.values(), default=-1) + 1
        for position in range(start, len(labels)):
            raw = labels[position]
            if raw not in self.label_index:
                self.label_index[raw] = next_code
                next_code += 1
            labels[position] = self.label_index[raw]
        return len(labels)

    def create_training_tensors(self):
        """Build ``training_data`` (shuffled) and ``testing_data`` as batched, prefetched datasets."""
        self.training_data = tf.data.Dataset.from_tensor_slices((self.train_inputs, self.train_labels)).shuffle(self.buffer_size).batch(self.batch_size).prefetch(tf.data.AUTOTUNE)
        self.testing_data = tf.data.Dataset.from_tensor_slices((self.test_inputs, self.test_labels)).batch(self.batch_size).prefetch(tf.data.AUTOTUNE)

    def create_embedding_matrix(self):
        """Fill ``embedding_matrix`` / ``embedding_layer`` from the adapted vocabulary.

        Row ``i`` holds the vector of vocabulary entry ``i``; entries without a
        vector in ``embedding_index`` keep an all-zero row and are collected in
        ``missed_words``. Also sets ``word_index`` (word -> row) and prints the
        hit/miss counts.
        """
        voc = self.vectorizer.get_vocabulary()
        # Derived here (not only in brew) so the method also works when called directly.
        self.word_index = dict(zip(voc, range(len(voc))))
        hits = 0
        misses = 0
        # Start fresh so repeated calls do not accumulate duplicate entries.
        self.missed_words = []
        # Two rows more than the vocabulary; code that builds its own Embedding
        # from this matrix must use the same row count.
        num_tokens = len(voc) + 2
        self.embedding_matrix = np.zeros((num_tokens, self.embedding_dim))
        for word, i in self.word_index.items():
            embedding_vector = self.embedding_index.get(word)
            if embedding_vector is not None:
                self.embedding_matrix[i] = embedding_vector
                hits += 1
            else:
                self.missed_words.append(word)
                misses += 1

        self.embedding_layer = Embedding(
            num_tokens,
            self.embedding_dim,
            embeddings_initializer=keras.initializers.Constant(self.embedding_matrix),
            trainable=False
        )

        print("converted {} words ({} misses)".format(hits, misses))

    def brew(self):
        """Run the full preparation pipeline; does nothing when no training text is loaded."""
        if len(self.train_inputs) > 0:
            self.auto_encode_labels()
            self.create_training_tensors()
            # Adapt on the text component only; labels are dropped by the map.
            self.vectorizer.adapt(self.training_data.map(lambda text, label: text))
            self.create_embedding_matrix()
        

In [21]:
# Create the data container on top of the pretrained embedding lookup and
# load the tab-separated train/test files, reading the 'Tweet' column as text
# and 'Intensity Class' as the label.
# NOTE(review): absolute paths; these only resolve where the data lives at /data.
text = TextData(pre_embeds.embeddings_index, pre_embeds.dim)
text.train_data_csv('/data/joy_train.txt', delim='\t', has_header=True, text_col='Tweet', label_col='Intensity Class')
text.test_data_csv('/data/joy_test.txt', delim='\t', has_header=True, text_col='Tweet', label_col='Intensity Class')

In [22]:
# Sanity check: how many rows were loaded into each split.
print("number of training samples: {}".format(len(text.train_inputs)))
print("number of testing samples:  {}".format(len(text.test_inputs)))

number of training samples: 1616
number of testing samples:  290


In [25]:
# Encode labels, build the datasets, adapt the vectorizer and build the
# embedding matrix; prints the embedding hit/miss counts.
text.brew()


converted 4208 words (1629 misses)


In [32]:
from tensorflow.keras import layers

def uncompiled_model(embedding_matrix, embedding_dim, num_tokens, num_classes):
    """Assemble (without compiling) a 1-D CNN text classifier on frozen embeddings.

    Args:
        embedding_matrix: Initial weights for the embedding table.
        embedding_dim: Width of each embedding vector.
        num_tokens: Number of rows in the embedding table.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled ``keras.Sequential`` model.
    """
    # Pretrained vectors are loaded as constants and kept out of training.
    frozen_embedding = Embedding(
        num_tokens,
        embedding_dim,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )

    network = keras.Sequential()
    network.add(frozen_embedding)

    # Three convolution stages; the first two are followed by local pooling,
    # the last by a global max over the sequence.
    network.add(layers.Conv1D(128, 5, activation='relu'))
    network.add(layers.MaxPooling1D(5))
    network.add(layers.Conv1D(128, 5, activation="relu"))
    network.add(layers.MaxPooling1D(5))
    network.add(layers.Conv1D(128, 5, activation="relu"))
    network.add(layers.GlobalMaxPooling1D())

    # Classification head.
    network.add(layers.Dense(128, activation="relu"))
    network.add(layers.Dropout(0.5))
    network.add(layers.Dense(num_classes, activation='softmax'))

    return network

In [33]:
# Build the classifier on the matrix produced by text.brew(); the token count
# mirrors the row count used in TextData.create_embedding_matrix (vocabulary + 2).
# NOTE(review): the class count 4 is hard-coded — presumably the number of
# distinct 'Intensity Class' values; confirm against text.label_index.
model = uncompiled_model(text.embedding_matrix, text.embedding_dim, len(text.vectorizer.get_vocabulary()) + 2, 4)

In [47]:
# Turn the raw training strings into padded integer id sequences with the
# adapted vectorizer. Each string is wrapped in its own list, so the vectorizer
# receives one string per row.
# No validation split is prepared here.

x_train = text.vectorizer(np.array([[s] for s in text.train_inputs])).numpy()
y_train = np.array(text.train_labels)

In [48]:
# Integer class labels, hence the sparse categorical loss.
# NOTE(review): "acc" and SparseCategoricalAccuracy presumably report the same
# quantity here — confirm and drop one if so.
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["acc", keras.metrics.SparseCategoricalAccuracy()]
)
# Trains on the full training set; no validation data is passed.
model.fit(x_train, y_train, batch_size=128, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f6850465c88>

In [62]:
# End-to-end wrapper: raw strings in, class probabilities out.
string_input = keras.Input(shape=(1,), dtype="string")

# Side experiment: run TextData.brew() on a one-sentence corpus and look at
# which words had no pretrained vector. This instance is NOT used for the
# end-to-end model below.
text2 = TextData(pre_embeds.embeddings_index, pre_embeds.dim)

text2.train_inputs = ["what is this"]
text2.train_labels = [1]
text2.brew()
text2.missed_words

# `model` was trained on ids produced by `text.vectorizer` (see x_train), so
# inference must use that same vectorizer. A vectorizer adapted on a different
# corpus assigns different ids to the same words, which would index the wrong
# embedding rows.
x = text.vectorizer(string_input)
preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

# probabilities = end_to_end_model.predict(
#     [["This day seems to be going just okay"]]
# )

# np.argmax(probabilities[0])

converted 3 words (2 misses)


In [209]:
# Inspect the embedding matrix; all-zero rows belong to indices for which no
# pretrained vector was written.
text.embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.49349999,  0.35698   ,  0.66068   , ...,  0.17705999,
        -0.53694999, -0.29699001],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [183]:
# Vectorize one sentence to see the integer ids and the zero padding up to the
# fixed output sequence length.
text.vectorizer([['I need to clean this up']])

<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[  5, 107,   3, 783,  17,  34,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
   

In [148]:
# Rebuild the tf.data datasets from the current input/label lists and show the
# training dataset's shapes and dtypes.
text.create_training_tensors()
text.training_data

<PrefetchDataset shapes: ((None,), (None,)), types: (tf.string, tf.string)>

In [149]:
# Pull a single batch and print its first two raw texts.
for example, label in text.training_data.take(1):
    print('texts:\n', example.numpy()[:2])

texts:
 [b'@StephaliciousD afternoon delight'
 b"Your lion's heart\\nWill protect you under stormy skies\\nAnd I will always be listening for your laughter and your tears"]


In [229]:
# List the adapted vocabulary; the first two entries are '' and '[UNK]'.
# NOTE(review): this prints the entire vocabulary — consider slicing it.
text.vectorizer.get_vocabulary()

['',
 '[UNK]',
 'the',
 'to',
 'a',
 'i',
 'and',
 'you',
 'is',
 'of',
 'in',
 'that',
 'be',
 'my',
 'so',
 'it',
 'for',
 'this',
 'me',
 'with',
 'on',
 'happy',
 'your',
 'at',
 'but',
 'im',
 'just',
 'lively',
 'its',
 'have',
 'love',
 'not',
 'by',
 'was',
 'up',
 'amp',
 'smile',
 'day',
 'will',
 'all',
 'hilarious',
 'good',
 'as',
 'are',
 'watch',
 'when',
 'optimism',
 'like',
 'if',
 'amazing',
 'about',
 'can',
 'he',
 'from',
 'laughter',
 'make',
 'out',
 'more',
 'we',
 'glee',
 'dont',
 'her',
 'broadcast',
 'musically',
 'his',
 'get',
 'or',
 'time',
 'see',
 'what',
 'they',
 'life',
 'know',
 'one',
 'do',
 'smiling',
 'how',
 'always',
 'now',
 'cheer',
 'she',
 'new',
 'an',
 'some',
 'much',
 'today',
 'because',
 'want',
 'over',
 'who',
 'u',
 'has',
 'feel',
 'our',
 'joyful',
 'rejoice',
 'delight',
 'youre',
 'still',
 'sparkling',
 'people',
 'got',
 'cheerful',
 'breezy',
 'pleasing',
 'no',
 'cheering',
 'need',
 'hilarity',
 'great',
 'animated',
 '