In [92]:
import numpy as np
import tensorflow as tf
import keras
from keras import layers
import json

In [93]:
features = []
labels = []

with open("../../data/domain1_train.json") as f:
    for line in f:
        # read line by line
        data = json.loads(line)
        
        # add values
        features.append(data["text"])
        labels.append(data["label"])

with open("../../data/domain2_train.json") as f:
    for line in f:
        # read line by line
        data = json.loads(line)
        
        # add values
        features.append(data["text"])
        labels.append(data["label"])

## Padding
- Set 100-200 words first
- Increse the words limit to 1000.

In [94]:
padded_inputs = tf.keras.utils.pad_sequences(features, padding="post", value=5000)
print(len(padded_inputs[0]))
print(padded_inputs)

1075
[[  70  746  825 ... 5000 5000 5000]
 [1209  179 1952 ... 5000 5000 5000]
 [ 287    3 3330 ... 5000 5000 5000]
 ...
 [  10    0   21 ... 5000 5000 5000]
 [  18   39  316 ... 5000 5000 5000]
 [  10    0  859 ... 5000 5000 5000]]


In [95]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(padded_inputs, labels, test_size=0.20, random_state=42)

## Embeding


In [60]:
# embedding = layers.Embedding(input_dim=5001, output_dim=16, mask_zero=True)
# masked_output = embedding(padded_inputs)

# print(masked_output._keras_mask)

# masking_layer = layers.Masking()
# # Simulate the embedding lookup by expanding the 2D input to 3D,
# # with embedding dimension of 10.
# unmasked_embedding = tf.cast(
#     tf.tile(tf.expand_dims(padded_inputs, axis=-1), [1, 1, 10]), tf.float32
# )

# masked_embedding = masking_layer(unmasked_embedding)
# print(masked_embedding._keras_mask[0])

In [96]:
SEQUENCE_LENGTH = 300
EMBEDDING_SIZE = 128
HIDDEN_SIZE = 256
OUTPUT_SIZE = 2  # binary classification
BATCH_SIZE = 64
EPOCHS = 10
LEARNING_RATE = 0.001

In [104]:
model = keras.Sequential(
    [
        # layers.Embedding(input_dim=5001, output_dim=100, input_length = 238,  mask_zero=False),
        # layers.Masking(mask_value=5000),
        # layers.LSTM(64),
        # layers.Dense(64, activation='relu'),
        # layers.Dropout(0.5),
        # layers.Dense(1, activation='softmax')
        
        # Embedding layer for word embeddings
        keras.layers.Embedding(input_dim=5001, output_dim=EMBEDDING_SIZE, input_length=1075),
        layers.Masking(mask_value=5000),
        layers.LSTM(HIDDEN_SIZE, dropout=0.5),
        layers.Dense(HIDDEN_SIZE, activation='relu'),

        # Simple RNN layer with 8 units
        # keras.layers.SimpleRNN(20, activation='relu'),

        # Dense layer for binary classification
        # keras.layers.Dense(1, activation='sigmoid')
        layers.Dense(1, activation='softmax')
    ]
)

In [105]:
# inputs = keras.Input(shape=(None,), dtype="int32")
# x = layers.Embedding(input_dim=5000, output_dim=16, mask_zero=True)(inputs)
# outputs = layers.LSTM(32)(x)

# model = keras.Model(inputs, outputs)

In [106]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [107]:
X_train

array([[  11,  182,  158, ..., 5000, 5000, 5000],
       [   2, 1141,    8, ..., 5000, 5000, 5000],
       [1029,    6,   67, ..., 5000, 5000, 5000],
       ...,
       [ 407,  119,  175, ..., 5000, 5000, 5000],
       [  58, 1111,    4, ..., 5000, 5000, 5000],
       [   2,   48,  124, ..., 5000, 5000, 5000]], dtype=int32)

In [108]:
model.fit(np.array(X_train), np.array(y_train), epochs=5)

Epoch 1/5
148/860 [====>.........................] - ETA: 14:43 - loss: 0.6449 - accuracy: 0.3385

KeyboardInterrupt: 

In [91]:
model.evaluate(np.array(X_test), np.array(y_test), verbose=2)

122/122 - 13s - loss: 0.6931 - accuracy: 0.5018 - 13s/epoch - 104ms/step


[0.6931433081626892, 0.5017948746681213]

In [27]:
import tensorflow as tf
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.13.0


In [28]:
mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [29]:
model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape=(28, 28)),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(10)
])

In [30]:
predictions = model(x_train[:1]).numpy()
predictions

array([[-0.75905657, -0.1158942 ,  0.03621826, -0.34728807, -0.38111103,
         0.19758466,  0.9706395 , -0.07617448, -0.15358764, -0.21443114]],
      dtype=float32)

In [31]:
tf.nn.softmax(predictions).numpy()

array([[0.04573764, 0.08701529, 0.10131112, 0.06904027, 0.06674417,
        0.11905228, 0.25791177, 0.09054107, 0.08379643, 0.07884997]],
      dtype=float32)

In [32]:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [33]:
loss_fn(y_train[:1], predictions).numpy()

2.1281924

In [34]:
model.compile(optimizer='adam',
              loss=loss_fn,
              metrics=['accuracy'])

In [35]:
model.fit(x_train, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x2dacab580>

In [36]:
model.evaluate(x_test,  y_test, verbose=2)

313/313 - 0s - loss: 0.0779 - accuracy: 0.9764 - 187ms/epoch - 596us/step


[0.07792162895202637, 0.9764000177383423]