In [1]:
import os

# The allow-growth flag has to be in the environment before TensorFlow creates
# its GPU devices. Listing the local devices initialises them, so the flag is
# set here, ahead of the first TensorFlow import, instead of relying on a
# later cell having been run first.
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"

from tensorflow.python.client import device_lib

# Sanity check: print every device (CPU/GPU) TensorFlow can see.
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 14464988342089127054
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 4183621632
locality {
  bus_id: 1
  links {
  }
}
incarnation: 7389182106827825085
physical_device_desc: "device: 0, name: GeForce GTX 1660 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5"
]


In [1]:
import os

# TensorFlow's documented switch for on-demand GPU memory growth; it must be
# present in the environment before TensorFlow initialises the GPU.
os.environ.update({"TF_FORCE_GPU_ALLOW_GROWTH": "true"})

In [2]:
import tensorflow as tf
import numpy as np

In [3]:
# Shared seed: the training and validation subsets must be created with the
# same value so that the 80/20 split is consistent between the two calls.
seed = 101

# Training portion (80 %) of the labelled tweets under corona_tweets/train;
# class labels are inferred from the directory structure.
train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'corona_tweets/train',
    labels='inferred',
    validation_split=0.2,
    subset='training',
    seed=seed,
)

Found 40994 files belonging to 5 classes.
Using 32796 files for training.


In [4]:
# Validation portion (the remaining 20 %) of corona_tweets/train. It reuses
# `seed` and the same validation_split as the training call so the two
# subsets come from the same split.
val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'corona_tweets/train',
    labels='inferred',
    validation_split=0.2,
    subset='validation',
    seed=seed,
)

Found 40994 files belonging to 5 classes.
Using 8198 files for validation.


In [5]:
# Text vectorisation layer with default settings.
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization()
# adapt() sets the layer's vocabulary; only the text half of each
# (text, label) element of the training set is fed to it.
encoder.adapt(train_ds.map(lambda tweet, _label: tweet))

In [6]:
# Copy the vocabulary held by the adapted encoder into a NumPy array and
# print its first 20 entries as a quick sanity check.
vocab = np.array(encoder.get_vocabulary())
print(vocab[:20])

['' '[UNK]' 'covid' 'prices' 'food' 'store' 'supermarket' 'grocery'
 'people' 'amp' 'consumer' 'shopping' 'online' 'get' 'need' 'pandemic'
 'us' 'workers' 'panic' 'like']


In [7]:
# Classifier pipeline:
#   raw text -> token indices -> embeddings -> 4 stacked GRUs -> max-pool
#   over time -> dropout -> dense head producing 5 outputs (no activation).
model = tf.keras.Sequential(
    [
        # The encoder converts text into sequences of token indices.
        encoder,
        # Maps each token index to a 512-dimensional embedding vector.
        tf.keras.layers.Embedding(
            input_dim=len(encoder.get_vocabulary()),
            output_dim=512,
            # Use masking to handle the variable sequence lengths.
            mask_zero=True,
        ),
        # Recurrent stack of decreasing width; every layer returns the full
        # sequence so the next layer (and the pooling) sees all time steps.
        *[
            tf.keras.layers.GRU(units, return_sequences=True)
            for units in (512, 256, 128, 64)
        ],
        # Collapse the time axis by taking the per-feature maximum.
        tf.keras.layers.GlobalMaxPooling1D(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation='relu'),
        # Final layer has no activation: its 5 outputs are raw scores.
        tf.keras.layers.Dense(5),
    ]
)

In [7]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, None)              0         
_________________________________________________________________
embedding (Embedding)        (None, None, 512)         16961536  
_________________________________________________________________
gru (GRU)                    (None, None, 512)         1575936   
_________________________________________________________________
gru_1 (GRU)                  (None, None, 256)         591360    
_________________________________________________________________
gru_2 (GRU)                  (None, None, 128)         148224    
_________________________________________________________________
gru_3 (GRU)                  (None, None, 64)          37248     
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0

In [8]:
# Number of training epochs passed to model.fit.
epochs = 6

# Labels are integer class ids and the model's final Dense layer has no
# activation, hence the sparse loss with from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    name='sparse_categorical_crossentropy',
)

model.compile(
    loss=loss,
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=['accuracy'],
)

In [9]:
# Train on train_ds for `epochs` epochs, evaluating on val_ds after each one.
# The returned History object is kept so its metrics can be saved afterwards.
history = model.fit(
    train_ds,
    epochs=epochs,
    validation_data=val_ds,
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [10]:
import pickle

# Persist the metrics recorded by model.fit (history.history) to the file
# 'history_gru'. The context manager guarantees the file is flushed and
# closed even if pickling raises, which the manual open()/close() pair did not.
with open('history_gru', 'wb') as outfile:
    pickle.dump(history.history, outfile)

In [10]:
# Test set loaded from corona_tweets/test, with labels inferred from the
# directory structure.
# NOTE(review): the recorded output reports 40994 files here, the same count
# as corona_tweets/train -- confirm this directory really holds held-out data.
test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'corona_tweets/test',
    labels='inferred',
)

Found 40994 files belonging to 5 classes.


In [11]:
# Evaluate the trained model on the test set; with the compile settings used
# in this notebook the result is displayed as [loss, accuracy].
model.evaluate(test_ds)



[0.4006543755531311, 0.8826901316642761]

In [None]:
# Test-set [loss, accuracy] recorded from three separate training attempts:
#1: [0.41042381525039673, 0.8773478865623474]
#2: [0.3899401128292084, 0.8850807547569275]
#3: [0.4006543755531311, 0.8826901316642761]

In [16]:
# Average the test loss and accuracy over three training attempts.
# Each row is one attempt's [loss, accuracy] as returned by model.evaluate.
attempt_results = np.array([
    [0.41042381525039673, 0.8773478865623474],
    [0.3899401128292084, 0.8850807547569275],
    [0.4006543755531311, 0.8826901316642761],
])
# Column-wise mean -> [mean loss, mean accuracy]. Unlike the nested
# np.add(...)/3 form, this stays correct if more attempts are appended.
ave_result = np.mean(attempt_results, axis=0)
ave_result

array([0.40033943, 0.88170626])