In [1]:
# load dependencies
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Bidirectional, Dense
from keras.metrics import Precision, Recall, Accuracy



In [2]:
# load the dataset
df_data = pd.read_csv("datasets/tweet_flagged_toxic.csv")
# data exploratory
df_data.head()

Unnamed: 0.1,Unnamed: 0,Toxicity,tweet
0,0,0,@user when a father is dysfunctional and is s...
1,1,0,@user @user thanks for #lyft credit i can't us...
2,2,0,bihday your majesty
3,3,0,#model i love u take with u all the time in ...
4,4,0,factsguide: society now #motivation


In [3]:
# normalize the columns
df_data.columns =  df_data.columns.str.lower().str.replace(' ', '_')
df_data.head()

Unnamed: 0,unnamed:_0,toxicity,tweet
0,0,0,@user when a father is dysfunctional and is s...
1,1,0,@user @user thanks for #lyft credit i can't us...
2,2,0,bihday your majesty
3,3,0,#model i love u take with u all the time in ...
4,4,0,factsguide: society now #motivation


In [4]:
# analyze the input features
# drop non-important features
df_data = df_data.drop(columns=['unnamed:_0'], axis=1)
# show data
df_data.head()

Unnamed: 0,toxicity,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [5]:
# prepare the data
y = df_data.toxicity.values
texts = df_data.tweet.values

print(y[:2])
print(texts[:2])
print(type(y))
print(type(texts))
print(y.shape)
print(texts.shape)

[0 0]
[' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run'
 "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked"]
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(56745,)
(56745,)


In [6]:
# preprocess input (x)
# tokenize/vectorize the text
# load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [7]:
# tokenize the sequences
sequences = texts.tolist()
print(type(sequences))
print(sequences[:2])

<class 'list'>
[' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run', "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked"]


In [8]:
# preprocess_sequences function
def preprocess_sequences(tokenizer, lst):
    return tokenizer(
        lst,
        padding='max_length',  # Changed from True to 'max_length'
        truncation=True,
        max_length=512,
        return_tensors='tf',
    )

In [10]:
# tokenize sequences
tokenized_sequences = preprocess_sequences(tokenizer, sequences)

print(type(tokenized_sequences))

2024-12-13 10:32:25.104860: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-12-13 10:32:25.104939: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-12-13 10:32:25.104952: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-12-13 10:32:25.104983: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-12-13 10:32:25.105015: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [11]:
# split the data (train, val, test)
sequence_ids = tokenized_sequences['input_ids']

print(type(sequence_ids))
print(sequence_ids.shape)

<class 'tensorflow.python.framework.ops.EagerTensor'>
(56745, 512)


In [12]:
# build the dataset
ds_full = tf.data.Dataset.from_tensor_slices((sequence_ids, y))
ds_full = ds_full.cache()
ds_full = ds_full.shuffle(160000)
ds_full = ds_full.batch(8)

In [13]:
# split the dataset
# Get the total size of the dataset:
total_size = tf.data.experimental.cardinality(ds_full).numpy()

# Calculate train, validation, and test sizes based on the desired percentages:
train_size = int(total_size * 0.7)
val_size = int(total_size * 0.2)
test_size = int(total_size * 0.1)
train_size = total_size - val_size - test_size 

# Create train, validation, and test datasets:
ds_train = ds_full.take(train_size)
ds_val = ds_full.skip(train_size).take(val_size)
ds_test = ds_full.skip(train_size + val_size).take(test_size) # Take test_size elements

# print out info
print(type(ds_train))
print(len(ds_train))

<class 'tensorflow.python.data.ops.take_op._TakeDataset'>
4967


In [15]:
# print out an example
ds_train_it = iter(ds_train)
for _ in range(5):
    x, y = next(ds_train_it)
    print('x.shape, y.shape', x.shape, y.shape)

x.shape, y.shape (8, 512) (8,)
x.shape, y.shape (8, 512) (8,)
x.shape, y.shape (8, 512) (8,)
x.shape, y.shape (8, 512) (8,)
x.shape, y.shape (8, 512) (8,)


In [16]:
# select the model
# model parameters
input_shape = sequence_ids.shape[1]
vocabulary_size = len(tokenizer.get_vocab())

print(input_shape)
print(vocabulary_size)

512
28996


In [17]:
# build model
model = Sequential(name="text-classifier")
model.add(Embedding(
    input_dim=vocabulary_size,             # Vocabulary size
    output_dim=32,                         # Embedding dimension
    input_shape=(input_shape,),            # Sequence length
    mask_zero=True                         # Handle padding
))
model.add(Bidirectional(LSTM(
    units=32,              # Number of LSTM units
    activation='tanh'      # Activation function
)))
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

  super().__init__(**kwargs)


In [18]:
# compile the model
model.compile(loss="binary_crossentropy", optimizer='Adam', metrics=['accuracy'])

In [19]:
# train the model
with tf.device('/device:GPU:0'):
    history = model.fit(ds_train, epochs=1, validation_data=ds_val)

2024-12-13 11:05:20.540871: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m4967/4967[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m410s[0m 82ms/step - accuracy: 0.5707 - loss: 0.6846 - val_accuracy: 0.5748 - val_loss: 0.6820


In [21]:
# test the model
ds_iter = iter(ds_test)
x_test, y_test = next(ds_iter)
print(type(x_test))
print(type(y_test))
print(x_test.shape)
print(y_test.shape)

<class 'tensorflow.python.framework.ops.EagerTensor'>
<class 'tensorflow.python.framework.ops.EagerTensor'>
(8, 512)
(8,)


In [24]:
# test the model
y_pred = model.predict(x_test)
print(type(y_pred))
print(y_pred.shape)
print(y_pred)
print(y_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
<class 'numpy.ndarray'>
(8, 1)
[[0.4332487]
 [0.4332487]
 [0.4332487]
 [0.4332487]
 [0.4332487]
 [0.4332487]
 [0.4332487]
 [0.4332487]]
tf.Tensor([1 0 0 1 1 0 1 1], shape=(8,), dtype=int64)


In [25]:
# test the model
ds_iter = ds_test.as_numpy_iterator()
x_test, y_test = next(ds_iter)
print(type(x_test))
print(type(y_test))
print(x_test.shape)
print(y_test.shape)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(8, 512)
(8,)


In [26]:
# test the model
y_pred = model.predict(x_test)
print(type(y_pred))
print(y_pred.shape)
print(y_pred)
print(y_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step
<class 'numpy.ndarray'>
(8, 1)
[[0.4332487]
 [0.4332487]
 [0.4332487]
 [0.4332487]
 [0.4332487]
 [0.4332487]
 [0.4332487]
 [0.4332487]]
[1 1 1 1 0 1 0 0]


In [28]:
# evaluate the model
pre = Precision()
rec = Recall()
acc = Accuracy()

for batch in ds_test.as_numpy_iterator():
   x_test, y_test = batch
   
   y_pred_proba = model.predict(x_test)
   y_pred = (y_pred_proba > 0.5).astype(int)  # Convert probabilities to 0 or 1
   
   pre.update_state(y_test, y_pred)
   rec.update_state(y_test, y_pred)
   acc.update_state(y_test, y_pred)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42

2024-12-14 09:55:22.818934: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [29]:
print("precision", pre.result().numpy())
print("recall", rec.result().numpy()) 
print("accuracy", acc.result().numpy())

precision 0.0
recall 0.0
accuracy 0.57069725
