<a href="https://colab.research.google.com/github/hellojustxn/nlp-projects/blob/main/classify_days.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
!python -m pip install tensorflow_text
import pathlib
import collections
import numpy as np
import tensorflow as tf
import tensorflow_text as tf_text
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import utils
import math
import random
from sklearn.metrics import accuracy_score

from datetime import datetime
from packaging import version
%load_ext tensorboard

print("TensorFlow version: ", tf.__version__)
assert version.parse(tf.__version__).release[0] >= 2, \
    "This notebook requires TensorFlow 2.0 or above."


TensorFlow version:  2.6.0


# Generate data
Populate an list of days and an list of labels

In [2]:
days_dict = {
    0: "Monday", 1: "Tuesday", 2: "Wednesday", 3: "Thursday", 4: "Friday", 5: "Saturday", 6: "Sunday"
}

days_arr = []
labels_arr = []
DATASET_SIZE = 90000
BATCH_SIZE = 64
# DATASET_SIZE = 90000
# BATCH_SIZE = 128

for i in range(DATASET_SIZE):
  label = [1, 1, 1, 1, 1, 1, 1]
  for i in range(random.randint(0,6)):
    random_day = random.randint(0,6)
    label[random_day] = 0
  event = [days_dict[index] + ", " if label[index] == 1 else "" for index, value in enumerate(label)]
  random.shuffle(event)
  days_arr.append(["".join(event)])
  labels_arr.append(label)

print(len(days_arr))
print(len(labels_arr))



90000
90000


In [3]:
for event in days_arr[0:10]:
  print(event)

['Sunday, Friday, Thursday, Saturday, Wednesday, ']
['Sunday, Friday, Wednesday, Tuesday, ']
['Wednesday, Saturday, Friday, ']
['Thursday, Friday, Sunday, Wednesday, Monday, Saturday, Tuesday, ']
['Sunday, Thursday, Monday, Saturday, Tuesday, ']
['Thursday, Tuesday, Sunday, ']
['Thursday, Saturday, Friday, ']
['Monday, Thursday, Sunday, ']
['Friday, Wednesday, Thursday, Tuesday, Monday, Sunday, ']
['Thursday, Wednesday, Sunday, Friday, Monday, Tuesday, Saturday, ']


# Vectorize each line

In [4]:
days_sequence = [ event[0].lower().replace(',', '').split() for event in days_arr]
print(len(days_sequence))

90000


In [5]:
for sequence in days_sequence[0:5]:
  print(sequence)

['sunday', 'friday', 'thursday', 'saturday', 'wednesday']
['sunday', 'friday', 'wednesday', 'tuesday']
['wednesday', 'saturday', 'friday']
['thursday', 'friday', 'sunday', 'wednesday', 'monday', 'saturday', 'tuesday']
['sunday', 'thursday', 'monday', 'saturday', 'tuesday']


Create a vocabulary

In [6]:
vocab, index = {}, 1 # Start the index at one since 0 is denoted as padding
vocab['<pad>'] = 0
for i in range(7):
  vocab[days_dict[i].lower()] = i+1

print(len(vocab))
print(vocab)

8
{'<pad>': 0, 'monday': 1, 'tuesday': 2, 'wednesday': 3, 'thursday': 4, 'friday': 5, 'saturday': 6, 'sunday': 7}


Create a reverse vocabulary

In [7]:
reverse_vocab = {vocab[key]: key for key in vocab}
print(len(reverse_vocab))
print(reverse_vocab)

8
{0: '<pad>', 1: 'monday', 2: 'tuesday', 3: 'wednesday', 4: 'thursday', 5: 'friday', 6: 'saturday', 7: 'sunday'}


Vectorize the sentence

In [8]:
text_vector = []
for sequence in days_sequence:
  tmp = []
  for day in sequence:
    tmp.append(vocab[day])
  text_vector.append(tmp)

print(len(text_vector))

90000


In [9]:
for sequence in text_vector[0:5]:
  print(sequence)

[7, 5, 4, 6, 3]
[7, 5, 3, 2]
[3, 6, 5]
[4, 5, 7, 3, 1, 6, 2]
[7, 4, 1, 6, 2]


In [10]:
#  text_ragged_tf = tf.ragged.constant(text_vector)
#  text_tensor = text_ragged_tf.to_tensor(default_value=0, shape=[None,11])

# TODO: Try padding manually instead of creating a ragged tensor
# Pad manually

SHAPE = 15
text_tensor = []
for sequence in text_vector:
  while len(sequence) != SHAPE:
    sequence.append(0)

print(type(text_vector))
print(len(text_vector))
print(text_vector)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [11]:
print(type(labels_arr))
print(len(labels_arr))
print(labels_arr)

<class 'list'>
90000
[[0, 0, 1, 1, 1, 1, 1], [0, 1, 1, 0, 1, 0, 1], [0, 0, 1, 0, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1], [1, 1, 0, 1, 0, 1, 1], [0, 1, 0, 1, 0, 0, 1], [0, 0, 0, 1, 1, 1, 0], [1, 0, 0, 1, 0, 0, 1], [1, 1, 1, 1, 1, 0, 1], [1, 1, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1, 0, 0], [0, 1, 1, 0, 0, 1, 0], [1, 1, 1, 1, 0, 0, 1], [1, 1, 1, 1, 0, 1, 0], [0, 1, 0, 0, 1, 1, 1], [0, 0, 1, 0, 1, 0, 1], [1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1], [0, 1, 0, 1, 0, 0, 0], [0, 1, 1, 0, 0, 1, 0], [0, 1, 1, 0, 0, 0, 0], [1, 0, 0, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 0, 1, 1], [1, 1, 1, 1, 0, 1, 0], [1, 1, 1, 0, 1, 1, 1], [0, 1, 0, 1, 0, 1, 1], [1, 1, 1, 1, 1, 1, 1], [0, 0, 1, 1, 0, 0, 1], [0, 0, 1, 0, 0, 0, 1], [1, 1, 1, 1, 1, 1, 1], [1, 1, 0, 0, 0, 1, 1], [0, 1, 0, 0, 0, 1, 0], [1, 0, 1, 1, 1, 1, 0], [1, 1, 1, 0, 1, 0, 0], [0, 1, 1, 0, 0, 1, 1], [1, 1, 0, 1, 1, 1, 1], [1, 0, 1, 1, 1, 1, 0], [1, 0, 0, 1, 0, 1, 1], [1, 1, 1, 1, 1, 1, 1], [1, 1, 0, 0,

In [12]:
for text in text_tensor[0:12]:
  print(text)

# Prepare training data

In [13]:
labeled_text_ds = tf.data.Dataset.from_tensor_slices((text_vector, labels_arr))
# labeled_text_ds = tf.data.Dataset.from_tensor_slices((text_tensor, labels_arr))
print(type(labeled_text_ds))
print(tf.data.experimental.cardinality(labeled_text_ds)) # Dataset contains 100 tensors, unable to get shape of each tensor

<class 'tensorflow.python.data.ops.dataset_ops.TensorSliceDataset'>
tf.Tensor(90000, shape=(), dtype=int64)


In [14]:
for text_tensor in labeled_text_ds.take(5):
  print(text_tensor)

(<tf.Tensor: shape=(15,), dtype=int32, numpy=array([7, 5, 4, 6, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)>, <tf.Tensor: shape=(7,), dtype=int32, numpy=array([0, 0, 1, 1, 1, 1, 1], dtype=int32)>)
(<tf.Tensor: shape=(15,), dtype=int32, numpy=array([7, 5, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)>, <tf.Tensor: shape=(7,), dtype=int32, numpy=array([0, 1, 1, 0, 1, 0, 1], dtype=int32)>)
(<tf.Tensor: shape=(15,), dtype=int32, numpy=array([3, 6, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)>, <tf.Tensor: shape=(7,), dtype=int32, numpy=array([0, 0, 1, 0, 1, 1, 0], dtype=int32)>)
(<tf.Tensor: shape=(15,), dtype=int32, numpy=array([4, 5, 7, 3, 1, 6, 2, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)>, <tf.Tensor: shape=(7,), dtype=int32, numpy=array([1, 1, 1, 1, 1, 1, 1], dtype=int32)>)
(<tf.Tensor: shape=(15,), dtype=int32, numpy=array([7, 4, 1, 6, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)>, <tf.Tensor: shape=(7,), dtype=int32, numpy=array([1, 1, 0, 1, 0, 1, 1], dtype=int32)>)


In [15]:
for text_tensor in labeled_text_ds.take(1):
  print(f"text shape: {text_tensor[0].shape}, label shape: {text_tensor[1].shape}")

text shape: (15,), label shape: (7,)


# Split the data to train and validate

In [16]:
# %%script false 

train_data = labeled_text_ds.skip(int(.2*DATASET_SIZE))
validation_data = labeled_text_ds.take(int(.2*DATASET_SIZE))
print(tf.data.experimental.cardinality(validation_data))
print(tf.data.experimental.cardinality(train_data))

tf.Tensor(18000, shape=(), dtype=int64)
tf.Tensor(72000, shape=(), dtype=int64)


In [17]:
%%script false 
for train_tensor in train_data.take(10):
  print(f"Text Shape: {train_tensor[0].shape} ---> Text Data: {train_tensor[0]}")

In [18]:
%%script false 
for train_tensor in train_data.take(3):
  print(train_tensor)

In [19]:
BUFFER_SIZE=65000

train_data = train_data.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE, padded_shapes=([SHAPE], [None]))
validation_data = validation_data.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE, padded_shapes=([SHAPE], [None]),)

Check the type of train data

In [20]:
print(type(train_data))

<class 'tensorflow.python.data.ops.dataset_ops.PaddedBatchDataset'>


Check the shape of the first element in train data

In [21]:
sample_text, label = next(iter(train_data))

print(sample_text[0])
print(label[0])

vocab['sunday']

tf.Tensor([6 7 1 5 4 2 3 0 0 0 0 0 0 0 0], shape=(15,), dtype=int32)
tf.Tensor([1 1 1 1 1 1 1], shape=(7,), dtype=int32)


7

In [22]:
AUTOTUNE = tf.data.AUTOTUNE


def configure_dataset(dataset):
  return dataset.cache().prefetch(buffer_size=AUTOTUNE)

train_data = configure_dataset(train_data)
validation_data = configure_dataset(validation_data)

# Build the model

In [23]:
logdir = "logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)


inputs = keras.Input(shape=(SHAPE)) # Create a layer that will take in the inputs object
inputs.shape
inputs.dtype

# Deeper = more steps
dense = layers.Dense(20, activation="relu") # The number of nodes by trial and error 
x = dense(inputs)
x = layers.Dense(20, activation="relu")(x) 
x = layers.Dense(20, activation="relu")(x) 
x = layers.Dense(20, activation="relu")(x) 


# # tut
# dense = layers.Dense(20, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu')
# outputs = layers.Dense(7, activation="sigmoid")(dense)

outputs = layers.Dense(7, activation="sigmoid")(x)

model = keras.Model(inputs=inputs, outputs=outputs, name="day_of_the_week_model")

model.compile(
    optimizer='adam',
    metrics=["accuracy"],
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
)

In [24]:
%tensorboard --logdir logs/scalars


Reusing TensorBoard on port 6006 (pid 20375), started 1:55:01 ago. (Use '!kill 20375' to kill it.)

<IPython.core.display.Javascript object>

In [25]:

training_history = model.fit(train_data,
                             validation_data=validation_data,
                             callbacks=[tensorboard_callback],
                             epochs=300,
                            )
print("Average test loss: ", np.average(training_history.history['loss']))

Epoch 1/300


  '"`binary_crossentropy` received `from_logits=True`, but the `output`'


Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78/300
Epoch 7

In [26]:
model.summary()

Model: "day_of_the_week_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 15)]              0         
_________________________________________________________________
dense (Dense)                (None, 20)                320       
_________________________________________________________________
dense_1 (Dense)              (None, 20)                420       
_________________________________________________________________
dense_2 (Dense)              (None, 20)                420       
_________________________________________________________________
dense_3 (Dense)              (None, 20)                420       
_________________________________________________________________
dense_4 (Dense)              (None, 7)                 147       
Total params: 1,727
Trainable params: 1,727
Non-trainable params: 0
___________________________________________

# Test

In [27]:
def encode(text):
  tokens = text.lower().replace(',', '').split()
  res = [ vocab[token] for token in tokens ]
  while len(res) < SHAPE:
    res.append(0)
  return res

encoded_str = encode("friday, saturday, Sunday monday")

prediction = model.predict([encoded_str])
print("Encoded String: ", encoded_str)
print("Predicted value")
print("----------")
for index, value in enumerate(prediction[0]):
  if value > 0.85:
    print(f"{reverse_vocab[index+1]}: {value}")
print(prediction[0])
# print(f"String: {example_string})
# Encoded string: {encode(example_string)} \n\
# Prediction: {prediction}")
print('\n')
print("True Value")
print("----------")
# print(np.array(label[0]))
for encoding in encoded_str:
  if encoding != 0:
    print(f"{reverse_vocab[encoding]}")

print("max", max(prediction[0]))

Encoded String:  [5, 6, 7, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Predicted value
----------
monday: 1.0
friday: 1.0
saturday: 1.0
sunday: 1.0
[1.0000000e+00 1.2126982e-01 1.9046456e-09 7.0096231e-09 1.0000000e+00
 1.0000000e+00 1.0000000e+00]


True Value
----------
friday
saturday
sunday
monday
max 1.0


In [28]:
test_days_arr = []
test_labels_arr = []
TEST_DATASET_SIZE = 5000

for i in range(TEST_DATASET_SIZE):
  label = [1, 1, 1, 1, 1, 1, 1]
  for i in range(random.randint(0,6)):
    random_day = random.randint(0,6)
    label[random_day] = 0
  event = [days_dict[index] + ", " if label[index] == 1 else "" for index, value in enumerate(label)]
  random.shuffle(event)
  test_days_arr.append(["".join(event)])
  test_labels_arr.append(label)

test_days_tensor = tf.data.Dataset.from_tensor_slices(test_days_arr)
test_labels_tensor = tf.data.Dataset.from_tensor_slices(test_labels_arr)

# Encode and pad
test_days_arr = [ encode(sequence[0]) for sequence in test_days_arr]

In [29]:
yhat = model.predict(test_days_arr)
yhat = yhat.round()

In [30]:
# calculate accuracy
acc = accuracy_score(test_labels_arr, yhat)
# store result
print('>%.3f' % acc)

>0.974
