In [143]:
import numpy as np
import tensorflow as tf
import keras
from pathlib import Path
from keras.layers import TextVectorization
from keras.layers import (
    BatchNormalization,
    Conv1D,
    Dense,
    Dropout,
    GlobalAveragePooling1D,
    MaxPooling1D,
)

In [152]:
# Project directory layout, expressed relative to the notebook's working directory.
DATA = Path('../data/raw')
TRAIN, TEST = DATA / 'train', DATA / 'test'  # one sub-folder per dataset split
MODELS = Path('../models')                   # destination for saved models

In [98]:
BATCH = 64
# Each class lives in its own sub-directory of TRAIN. Sorting pins the label
# index assigned to every class (directory enumeration order is otherwise
# filesystem-dependent), `.name` keeps folder names that contain dots intact,
# and stray files or hidden folders such as `.ipynb_checkpoints` are skipped.
CLASSES = sorted(
    entry.name
    for entry in TRAIN.iterdir()
    if entry.is_dir() and not entry.name.startswith('.')
)
NUM_CLASSES = 2
# The network in this notebook ends in a single sigmoid unit, so anything other
# than two class folders would train on invalid labels; fail early instead.
assert len(CLASSES) == NUM_CLASSES, (
    f"expected {NUM_CLASSES} class folders in {TRAIN}, found {CLASSES}"
)

# Batched (text, label) dataset; integer labels follow the order of CLASSES.
TRAIN_DF = tf.keras.preprocessing.text_dataset_from_directory(
    TRAIN,
    batch_size=BATCH,
    class_names=CLASSES,
)

Found 139804 files belonging to 2 classes.


In [127]:
# Text Vectorization
# MAX_VOCAB = 64
# NOTE(review): MAX_LEN is defined but never passed to the layer (no
# output_sequence_length), so batches are not padded/truncated to 500 —
# confirm whether a fixed length was intended.
MAX_LEN = 500
text_vectorizer = TextVectorization(
    output_mode="int",
    # Built-in per-character splitting: same tokens as a
    # tf.strings.unicode_split(x, "UTF-8") lambda, but the layer config stays
    # serializable because no Python callable is stored in it.
    split="character",
)

# Learn the character vocabulary from the training texts only (labels dropped).
text_vectorizer.adapt(TRAIN_DF.map(lambda text, _label: text))
# Fetch the vocabulary once and reuse it for both the size and the display.
vocabulary = text_vectorizer.get_vocabulary()
VOCAB_SIZE = len(vocabulary)
vocabulary

['', '[UNK]', 't', 'a', 'g', 'c', 'n']

In [134]:
def vectorize_text(text, label):
    """Turn a batch of raw strings into shifted character ids.

    The vocabulary learned above starts with '' and '[UNK]', so subtracting 2
    moves those two entries to -2 and -1 and makes the first real character
    id 0.

    Args:
        text: string tensor holding one batch of documents.
        label: labels for the batch; returned unchanged.

    Returns:
        A ``(shifted_ids, label)`` tuple.
    """
    # The vectorizer is applied to a trailing axis of size 1 per document.
    as_column = tf.expand_dims(text, axis=-1)
    char_ids = text_vectorizer(as_column)
    shifted_ids = char_ids - 2
    return shifted_ids, label


train_data = TRAIN_DF.map(vectorize_text)

In [135]:
# Sanity check: print the tensor shapes of a single vectorized batch.
for text, label in train_data.take(1):
    print(text.shape, label.shape, sep="\n")

(64, 534)
(64,)


In [147]:
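# # Model (disabled subclassed draft, kept for reference only: the trailing
# # commas below turn the layers and activations into 1-tuples, so it would
# # not run as written)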
# class CharCNN(tf.keras.Model):
#     def __init__(self, vocab_size, num_classes):
#         super(CharCNN, self).__init__()
#         self.onehot = keras.layers.Lambda(lambda x: tf.one_hot(tf.cast(x, "int64"), vocab_size)),
#         self.conv1 = tf.keras.layers.Conv1D(32, kernel_size=8, activation='relu')
#         self.norm = keras.layers.BatchNormalization(),
#         self.conv2 = tf.keras.layers.Conv1D(16, kernel_size=8, activation='relu')
#         self.pool = tf.keras.layers.MaxPooling1D(),
#         self.conv3 = tf.keras.layers.Conv1D(4, kernel_size=8, activation='relu')
#         self.avgpool = tf.keras.layers.GlobalAveragePooling1D(),
#         self.drop = tf.keras.layers.Dropout(0.3),
#         self.fc = tf.keras.layers.Dense(1, activation='sigmoid')

#     def call(self, x):
#         x = self.onehot(x),
#         x = self.conv1(x),
#         x = self.norm(x),
#         x = self.conv2(x),
#         x = self.norm(x),
#         x = self.pool(x),
#         x = self.conv3(x),
#         x = self.norm(x),
#         x = self.pool(x),
#         x = self.drop(x),
#         x = self.avgpool(x),
#         return self.fc(x)
    
# model = CharCNN(VOCAB_SIZE, NUM_CLASSES)
def _conv_stage(filters):
    """Return one Conv1D -> BatchNormalization -> MaxPooling1D stage as a list of layers."""
    return [
        Conv1D(filters, kernel_size=8, data_format="channels_last", activation="relu"),
        BatchNormalization(),
        MaxPooling1D(),
    ]


# Expands integer character ids into one-hot vectors of depth VOCAB_SIZE.
# NOTE(review): ids arrive shifted by -2 from vectorize_text, so the two special
# tokens are negative here; tf.one_hot is documented to emit all-zero vectors
# for out-of-range indices — confirm that is the intended padding behaviour.
onehot_layer = tf.keras.layers.Lambda(lambda x: tf.one_hot(tf.cast(x, "int64"), VOCAB_SIZE))
# Single sigmoid unit: the model outputs one probability per sequence.
last_layer = Dense(1, activation='sigmoid')

# Character-level CNN: three conv stages with shrinking filter counts, then
# dropout and global average pooling in front of the binary head.
model = tf.keras.Sequential(
    [
        onehot_layer,
        *_conv_stage(32),
        *_conv_stage(16),
        *_conv_stage(4),
        Dropout(0.3),
        GlobalAveragePooling1D(),
        last_layer,
    ]
)

In [148]:
# Training setup: binary cross-entropy computed on probabilities (not logits),
# the Adam optimizer, and accuracy as the reported metric.
bce_on_probabilities = keras.losses.BinaryCrossentropy(from_logits=False)
model.compile(optimizer='adam', loss=bce_on_probabilities, metrics=['accuracy'])

In [149]:
# Fitting model
EPOCHS = 10
# Keep the returned History so the per-epoch loss/accuracy can be inspected or
# plotted afterwards; assigning it also stops the cell from echoing the
# object's repr as its output.
history = model.fit(train_data, epochs=EPOCHS)

Epoch 1/10
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 31ms/step - accuracy: 0.6329 - loss: 0.6374
Epoch 2/10
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 29ms/step - accuracy: 0.6890 - loss: 0.5863
Epoch 3/10
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 29ms/step - accuracy: 0.7061 - loss: 0.5657
Epoch 4/10
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 37ms/step - accuracy: 0.7149 - loss: 0.5542
Epoch 5/10
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 45ms/step - accuracy: 0.7226 - loss: 0.5457
Epoch 6/10
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 41ms/step - accuracy: 0.7296 - loss: 0.5391
Epoch 7/10
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m601s[0m 251ms/step - accuracy: 0.7323 - loss: 0.5342
Epoch 8/10
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 25ms/step - accuracy: 0.7359 - loss: 0.5306
Epoch 

<keras.src.callbacks.history.History at 0x7fbb6c78fdd0>

In [150]:
# Load the held-out split with the same batch size and class order as the
# training data, then apply the same vectorization.
test_raw = tf.keras.preprocessing.text_dataset_from_directory(
    TEST, batch_size=BATCH, class_names=CLASSES
)
test_data = test_raw.map(vectorize_text)

Found 34952 files belonging to 2 classes.


In [151]:
# Evaluate on the held-out split; the displayed list is [loss, accuracy],
# matching the loss and metric passed to model.compile.
model.evaluate(x=test_data)

[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 12ms/step - accuracy: 0.6250 - loss: 0.7853


[0.7829174995422363, 0.6252861022949219]

In [153]:
# Create the output directory first so saving does not fail on a fresh checkout.
MODELS.mkdir(parents=True, exist_ok=True)
# NOTE(review): the model contains a Lambda layer built from a Python lambda;
# verify the saved file can be reloaded, and consider the native `.keras`
# format instead of legacy HDF5 once downstream consumers are updated.
model.save(MODELS / 'char_cnn.h5')

