In [1]:
import datasets
import pandas as pd
from transformers import AutoTokenizer

In [2]:
# Labelled-tweet CSV as mounted inside the Kaggle runtime.
DATA_PATH = '/kaggle/input/hate-speech-and-offensive-language-dataset/labeled_data.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to check the columns that were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [4]:
import re

def remove_usernames(tweet: str) -> str:
    """Strip every ``@mention`` from ``tweet``.

    A mention is an ``@`` followed by one or more word characters. Only the
    mention itself is removed; surrounding whitespace and punctuation are
    left exactly as they were (so ``"Hello @Asad,"`` becomes ``"Hello ,"``).

    Args:
        tweet: Raw tweet text.

    Returns:
        The text with all mentions deleted.
    """
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', tweet)

In [5]:
# Quick manual check of remove_usernames on a sentence containing two mentions.
tweet = "Hello @Asad, thanks for the mention @user2!"
cleaned_tweet = remove_usernames(tweet)
# Only the mentions are dropped; the punctuation and spaces around them remain.
print(cleaned_tweet)  

Hello , thanks for the mention !


In [6]:
# Add a mention-free copy of each tweet; the raw text stays in the 'tweet' column.
df['tweet_cleaned'] = df['tweet'].apply(remove_usernames)

In [7]:
# Re-display the frame to confirm the new tweet_cleaned column sits next to the raw tweet.
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,tweet_cleaned
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,!!! RT : As a woman you shouldn't complain abo...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,!!!!! RT : boy dats cold...tyga dwn bad for cu...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,!!!!!!! RT Dawg!!!! RT : You ever fuck a bitc...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,!!!!!!!!! RT : she look like a tranny
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,!!!!!!!!!!!!! RT : The shit you hear about me ...


In [8]:
from datasets import Dataset, DatasetDict

# Wrap the DataFrame (including the new tweet_cleaned column) in a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)
# Show one record to confirm both the raw and the cleaned text were carried over.
dataset[0]

{'Unnamed: 0': 0,
 'count': 3,
 'hate_speech': 0,
 'offensive_language': 0,
 'neither': 3,
 'class': 2,
 'tweet': "!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...",
 'tweet_cleaned': "!!! RT : As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out..."}

In [9]:
# Fixed seed so the random splits, and every metric computed from them, are
# reproducible across kernel restarts.
SPLIT_SEED = 42

# First split: train vs. held-out, using the library's default test fraction.
train_test_valid = dataset.train_test_split(seed=SPLIT_SEED)
# Second split: carve the held-out part into validation (larger share) and
# test (smaller share), again with the default fraction.
test_valid = train_test_valid['test'].train_test_split(seed=SPLIT_SEED)

train_test_valid_dataset = DatasetDict({
    'train': train_test_valid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train'],
})

# Keep only the label ('class') and the two text columns; the per-annotator
# vote counts and the leftover CSV index column are not used for training.
dataset = train_test_valid_dataset.remove_columns(
    ['offensive_language', 'neither', 'Unnamed: 0', 'hate_speech', 'count']
)

In [10]:
# Show the DatasetDict to verify the split sizes and the columns that remain.
dataset

DatasetDict({
    train: Dataset({
        features: ['class', 'tweet', 'tweet_cleaned'],
        num_rows: 18587
    })
    test: Dataset({
        features: ['class', 'tweet', 'tweet_cleaned'],
        num_rows: 1549
    })
    valid: Dataset({
        features: ['class', 'tweet', 'tweet_cleaned'],
        num_rows: 4647
    })
})

In [11]:
# Name of the pretrained checkpoint whose tokenizer is loaded below.
# NOTE(review): `model` holds a string here but is rebound to the TF
# classification model in a later cell; a distinct name would be clearer.
model = 'bert-base-cased'

tokenizer = AutoTokenizer.from_pretrained(model)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [12]:
# Sample sentence mixing ordinary words, digits and punctuation, used to
# inspect what the tokenizer returns.
random_text = "The quick brown fox jumps over the lazy dog. This is a test sentence to check the BERT tokenizer. It includes numbers like 123 and symbols like !@#$%^&*()."

output = tokenizer(random_text)

# Bare expression so the notebook displays the encoding
# (input_ids, token_type_ids, attention_mask).
output

{'input_ids': [101, 1109, 3613, 3058, 17594, 15457, 1166, 1103, 16688, 3676, 119, 1188, 1110, 170, 2774, 5650, 1106, 4031, 1103, 139, 9637, 1942, 22559, 17260, 119, 1135, 2075, 2849, 1176, 13414, 1105, 9282, 1176, 106, 137, 108, 109, 110, 167, 111, 115, 113, 114, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [13]:
# Map the ids back to token strings to see how the text was split, including
# the [CLS]/[SEP] markers and the '##' continuation pieces.
tokens = tokenizer.convert_ids_to_tokens(output['input_ids'])
print(tokens)

['[CLS]', 'The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.', 'This', 'is', 'a', 'test', 'sentence', 'to', 'check', 'the', 'B', '##ER', '##T', 'token', '##izer', '.', 'It', 'includes', 'numbers', 'like', '123', 'and', 'symbols', 'like', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '.', '[SEP]']


In [14]:
def tokenized_function(train_dataset, max_length=None):
    """Tokenize the ``tweet_cleaned`` column of a batch to a fixed length.

    Args:
        train_dataset: Mapping that exposes a ``'tweet_cleaned'`` entry holding
            the texts to encode. Despite the name, it is applied to every
            split through ``Dataset.map``.
        max_length: Optional fixed sequence length to pad/truncate to. ``None``
            (the default) lets the tokenizer use the model's maximum length,
            which keeps the padded width the same as before.

    Returns:
        The tokenizer's encoding for the batch, padded to a fixed length.
    """
    return tokenizer(
        train_dataset['tweet_cleaned'],
        padding='max_length',
        # Without truncation an over-long text would yield a row longer than
        # the padded width and break stacking into a single tensor.
        truncation=True,
        max_length=max_length,
    )

In [15]:
# Tokenize every split; batched=True hands tokenized_function a batch of rows
# per call rather than a single row.
tokenized_dataset = dataset.map(tokenized_function, batched = True)

Map:   0%|          | 0/18587 [00:00<?, ? examples/s]

Map:   0%|          | 0/1549 [00:00<?, ? examples/s]

Map:   0%|          | 0/4647 [00:00<?, ? examples/s]

In [16]:
# Pull the three tokenized splits out of the DatasetDict under short names.
train_dataset, test_dataset, valid_dataset = (
    tokenized_dataset[split_name] for split_name in ('train', 'test', 'valid')
)

In [17]:
# Inspect the training split: the tokenizer outputs should now appear as extra columns.
train_dataset

Dataset({
    features: ['class', 'tweet', 'tweet_cleaned', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 18587
})

In [18]:
# Drop the raw/cleaned text (the model only consumes token ids) and make each
# split return TensorFlow tensors when indexed.
train_dataset, valid_dataset, test_dataset = (
    split.remove_columns(['tweet', 'tweet_cleaned']).with_format('tensorflow')
    for split in (train_dataset, valid_dataset, test_dataset)
)

2024-06-29 20:02:55.418694: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-29 20:02:55.418807: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-29 20:02:55.560807: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [19]:
# Confirm only the label and the tokenizer outputs are left after dropping the text columns.
train_dataset

Dataset({
    features: ['class', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 18587
})

In [20]:
# Gather exactly the inputs the tokenizer declares for the model, keyed by
# input name, as whole-column tensors.
train_features = {
    input_name: train_dataset[input_name]
    for input_name in tokenizer.model_input_names
}

In [21]:
# Display the feature tensors to check their shape and padding.
train_features

{'input_ids': <tf.Tensor: shape=(18587, 512), dtype=int64, numpy=
 array([[ 101,  146, 1306, ...,    0,    0,    0],
        [ 101, 1240,  170, ...,    0,    0,    0],
        [ 101,  155, 1942, ...,    0,    0,    0],
        ...,
        [ 101,  146, 1221, ...,    0,    0,    0],
        [ 101,  155, 1942, ...,    0,    0,    0],
        [ 101,  155, 1942, ...,    0,    0,    0]])>,
 'token_type_ids': <tf.Tensor: shape=(18587, 512), dtype=int64, numpy=
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])>,
 'attention_mask': <tf.Tensor: shape=(18587, 512), dtype=int64, numpy=
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]])>}

In [22]:
import tensorflow as tf

# Pair each example's feature dict with its integer class label so Keras
# receives (inputs, target) tuples.
train_labels = train_dataset['class']
train_set_for_final_model = tf.data.Dataset.from_tensor_slices(
    (train_features, train_labels)
)

In [23]:
# Shuffle with a buffer as large as the dataset, then group into batches of 4.
num_train_examples = len(train_set_for_final_model)
train_set_for_final_model = (
    train_set_for_final_model
    .shuffle(buffer_size=num_train_examples)
    .batch(4)
)

In [24]:
# Show the element spec of the batched training pipeline (feature dict, label).
train_set_for_final_model

<_BatchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [25]:
def _model_inputs(split):
    """Return the tokenizer-declared model inputs of ``split`` keyed by name."""
    return {name: split[name] for name in tokenizer.model_input_names}


def _batched_eval_set(features, labels, batch_size=4):
    """Build a batched (features, label) pipeline for evaluation.

    Evaluation data is deliberately NOT shuffled: ordering has no effect on
    aggregate metrics, and a fixed order keeps predictions aligned with labels
    and makes evaluation runs repeatable.
    """
    return tf.data.Dataset.from_tensor_slices((features, labels)).batch(batch_size)


val_features = _model_inputs(valid_dataset)
val_set_for_final_model = _batched_eval_set(val_features, valid_dataset['class'])

test_features = _model_inputs(test_dataset)
test_set_for_final_model = _batched_eval_set(test_features, test_dataset['class'])

In [26]:
from transformers import TFAutoModelForSequenceClassification

# Pretrained BERT body with a fresh 3-way classification head, one output per
# label id used in the 'class' column.
model = TFAutoModelForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels=3,
)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Handle on the BERT main layer inside the classification model, used below
# to reach its encoder blocks.
bert_model = model.bert

In [28]:
# Freeze every encoder block except the last two, so only those two blocks
# (plus whatever lies outside the encoder stack) keep receiving updates.
# NOTE(review): this loop does not touch anything outside encoder.layer --
# confirm that leaving the rest trainable is intended.
frozen_blocks = bert_model.encoder.layer[:-2]
for layer in frozen_blocks:
    layer.trainable = False

In [29]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# Targets are integer class ids and the model emits raw logits, hence the
# sparse loss with from_logits=True.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

model.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy])

In [30]:
# Print the layer/parameter summary; the trainable vs. non-trainable totals
# show the effect of the freezing step.
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
 dropout_37 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
Total params: 108312579 (413.18 MB)
Trainable params: 37433859 (142.80 MB)
Non-trainable params: 70878720 (270.38 MB)
_________________________________________________________________


In [31]:
import tensorflow as tf

# List the GPUs TensorFlow can see before starting the training run.
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available:", len(physical_devices))

Num GPUs Available: 1


In [32]:
# Fine-tune for three epochs, evaluating on the validation set after each one.
model.fit(
    train_set_for_final_model,
    validation_data=val_set_for_final_model,
    epochs=3,
)

Epoch 1/3
Cause: for/else statement not yet supported


I0000 00:00:1719691466.294249      91 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7b97e8c33010>

In [38]:
# Score the fine-tuned model on the held-out test split; the result unpacks
# into the loss followed by the compiled accuracy metric.
test_loss, test_acc = model.evaluate(test_set_for_final_model, verbose = 2)
print('Test Accuracy:', test_acc)

388/388 - 42s - loss: 0.2353 - sparse_categorical_accuracy: 0.9174 - 42s/epoch - 109ms/step
Test Accuracy: 0.9173660278320312


In [39]:
# Human-readable names for the three label ids used in the 'class' column.
predict_score_and_class_dict = {
    0: 'Hate Speech',
    1: 'Offensive Language',
    2: 'Neither',
}

# Encode one hand-written example as TensorFlow tensors and run it through the
# fine-tuned model; 'logits' holds one unnormalised score per class.
sample_inputs = tokenizer(
    ['He is not a good guy, I dont why he is breathing, He sucks. Everthing about him sucks'],
    return_tensors='tf',
    padding=True,
    truncation=True,
)
preds = model(sample_inputs)['logits']

print(preds)

tf.Tensor([[-1.2483144  3.2902615 -1.3268948]], shape=(1, 3), dtype=float32)


In [41]:
import numpy as np

# Index of the highest logit in each row is the predicted class id.
class_preds = np.argmax(preds, axis = 1)

# Translate each predicted id into its human-readable label.
for pred in class_preds:
    print(predict_score_and_class_dict[pred])

Offensive Language
