In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf

from transformers import BertTokenizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense
from tensorflow.keras.metrics import Precision, Recall, Accuracy

  from .autonotebook import tqdm as notebook_tqdm


In [34]:
# Load dataset and remove an unused column
data_df = pd.read_csv("tweets_flagged_v2.csv", index_col=0)

print('Rows, cols:', data_df.shape)

# Display the first 10 rows to check the data
data_df.head(10)

Rows, cols: (56745, 2)


Unnamed: 0,harmful,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation
5,0,[2/2] huge fan fare and big talking before the...
6,0,@user camping tomorrow @user @user @user @use...
7,0,the next school year is the year for exams.ð...
8,0,we won!!! love the land!!! #allin #cavs #champ...
9,0,@user @user welcome here ! i'm it's so #gr...


In [7]:
# Display the distribution of the 'harmful' column values
data_df["harmful"].value_counts()

harmful
0    32592
1    24153
Name: count, dtype: int64

In [13]:
# Randomly sample 10 tweets and their 'harmful' status for inspection
data_df.sample(10)

Unnamed: 0,harmful,tweet
21244,0,@user very much looking forward to my observi...
15840,0,this keshi's news is really hard for me to acc...
36738,1,@SteveWorks4You Friggin Red State pussy Republ...
25032,0,we love you back ð!
50708,1,"RT @chefpolio: ""I don't send nudes"" -- a bitch..."
33194,1,&#8220;@DrummerKid0328: Reggie gives me headac...
21666,0,how to be even if things arenât perfect no...
3755,0,@user the mind is never :-( to see us :-) ...
46839,1,RT @DJ_MARVZ: Chris brown got no chill button....
10107,0,scott_brown-taking_drugs__out_of_my_brain-(plu...


In [14]:
# Preprocess text data
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# Convert tweets to a NumPy array for processing
X = data_df["tweet"].values
y = data_df["harmful"].values

# Tokenize tweets, add padding to ensure uniform sequence length, and convert to TensorFlow tensors
sequences = [sequence for sequence in X]
model_inputs = tokenizer(sequences, padding=True, return_tensors='tf')

Python(17288) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


In [22]:
model_inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [31]:
model_inputs['input_ids']

<tf.Tensor: shape=(56745, 484), dtype=int32, numpy=
array([[  101,   137,  4795, ...,     0,     0,     0],
       [  101,   137,  4795, ...,     0,     0,     0],
       [  101, 16516,  1324, ...,     0,     0,     0],
       ...,
       [  101,  1685,   171, ...,     0,     0,     0],
       [  101,  1128,  1358, ...,     0,     0,     0],
       [  101,   199,   199, ...,     0,     0,     0]], dtype=int32)>

In [30]:
model_inputs['token_type_ids']

<tf.Tensor: shape=(56745, 484), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>

In [48]:
# Create a TensorFlow dataset from the tokenized inputs and labels
dataset = tf.data.Dataset.from_tensor_slices((model_inputs['input_ids'],y))

# Optimize dataset by caching, shuffling, batching, and prefetching
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

# Split dataset into training, validation, and testing sets following a 70:20:10 ratio
train = dataset.take(int(len(dataset)*.7))
val   = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test  = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [49]:
# Define a sequential model architecture
model = Sequential(name="text-classifier")
model.add(Embedding(len(tokenizer.get_vocab()), 32)) # Embedding layer
model.add(Bidirectional(LSTM(32, activation='tanh'))) # Bidirectional LSTM layer
model.add(Dense(128, activation='relu')) # Dense layer with ReLU activation
model.add(Dense(256, activation='relu')) # Another Dense layer with ReLU
model.add(Dense(128, activation='relu')) # Additional Dense layer with ReLU

# Output layer with sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))

# Display model summary
model.summary()

In [50]:
# Compile model with binary cross-entropy loss and Adam optimizer
model.compile(loss="binary_crossentropy", optimizer='Adam')

In [51]:
# Train model on GPU, specifying the training and validation datasets
with tf.device("/device:GPU:0"):
  history = model.fit(train, epochs=1, batch_size=16, validation_data=val)

[1m2482/2482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 87ms/step - loss: 0.2698 - val_loss: 0.1185


In [52]:
# Initialize precision, recall, and accuracy metrics
pre = Precision()
rec = Recall()
acc = Accuracy()

# Evaluate model on test dataset and update metrics
for batch in test.as_numpy_iterator():
  x_true, y_true = batch
  y_hat = model.predict(x_true) # Predict on the test batch

  # Update precision, recall, and accuracy metrics based on predictions
  pre.update_state(y_true, y_hat)
  rec.update_state(y_true, y_hat)
  acc.update_state(y_true, y_hat)

# Print precision, recall, and accuracy values
print("precision", pre.result().numpy())
print("recall", rec.result().numpy())
print("accuracy", acc.result().numpy())

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1

2024-05-28 16:04:36.100406: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
