In [7]:
import warnings

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments

In [3]:
# Load the pre-cleaned tweet dataset from a path relative to the working directory.
cleaned_csv_path = '../cleaned/cyberbullying_tweets_cleaned.csv'
cyberbullying_tweets = pd.read_csv(cleaned_csv_path)

In [None]:
# Integer class id for each value expected in the `cyberbullying_type` column.
label_map = {
    'age': 0,
    'gender': 1,
    'religion': 2,
    'ethnicity': 3,
    'not_cyberbullying': 4,
    'other_cyberbullying': 5
}

# Translate category names to class ids. `Series.map` yields NaN for any value
# that is missing from `label_map`, so check for that explicitly instead of
# letting NaN (float) labels flow into the split and the training loop.
mapped_labels = cyberbullying_tweets['cyberbullying_type'].map(label_map)
unmapped_mask = mapped_labels.isna()
if unmapped_mask.any():
    unknown_types = cyberbullying_tweets.loc[unmapped_mask, 'cyberbullying_type'].unique().tolist()
    raise ValueError(f"Unmapped cyberbullying_type values: {unknown_types}")

cyberbullying_tweets['label'] = mapped_labels.astype(int)

In [5]:
# Check class distribution: number of rows per integer label.
label_counts = cyberbullying_tweets['label'].value_counts()
print(label_counts)

label
2    7998
0    7992
1    7973
3    7961
4    7945
5    7823
Name: count, dtype: int64


In [8]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
all_texts = cyberbullying_tweets['cleaned_text'].tolist()
all_labels = cyberbullying_tweets['label'].tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Drop examples whose cleaned text is missing and coerce the remaining texts to str.
# Each text is filtered together with its label: removing texts on their own would
# leave the label list longer than the text list and shift every later label onto
# the wrong tweet.
train_pairs = [
    (str(text), label)
    for text, label in zip(train_texts, train_labels)
    if pd.notnull(text)
]
val_pairs = [
    (str(text), label)
    for text, label in zip(val_texts, val_labels)
    if pd.notnull(text)
]

train_texts = [text for text, _ in train_pairs]
train_labels = [label for _, label in train_pairs]
val_texts = [text for text, _ in val_pairs]
val_labels = [label for _, label in val_pairs]

In [17]:
# Texts and labels are paired by position, so their lengths are expected to agree.
# Slicing only equalises the lengths; it cannot restore the pairing if entries were
# removed from one list but not the other, so make any mismatch visible.
if len(train_texts) != len(train_labels):
    warnings.warn(
        f"train_texts ({len(train_texts)}) and train_labels ({len(train_labels)}) "
        "differ in length; trimming may leave texts paired with the wrong labels.",
        RuntimeWarning,
    )

# Trim the longer one to match the shorter one
min_len_train = min(len(train_texts), len(train_labels))
train_texts = train_texts[:min_len_train]
train_labels = train_labels[:min_len_train]

In [18]:
# Validation texts and labels are paired by position, so their lengths are expected
# to agree. Slicing only equalises the lengths; it cannot restore the pairing if
# entries were removed from one list but not the other, so make any mismatch visible.
if len(val_texts) != len(val_labels):
    warnings.warn(
        f"val_texts ({len(val_texts)}) and val_labels ({len(val_labels)}) "
        "differ in length; trimming may leave texts paired with the wrong labels.",
        RuntimeWarning,
    )

# Trim the longer list to match the shorter one.
min_len_val = min(len(val_texts), len(val_labels))
val_texts = val_texts[:min_len_val]
val_labels = val_labels[:min_len_val]

In [19]:
# Tokenise both splits with the 'bert-base-uncased' tokenizer, using the same
# options for each: truncation and padding enabled, max_length of 128.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenize_options = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(val_texts, **tokenize_options)

In [20]:
class CyberbullyingDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping with 'input_ids' and 'attention_mask' entries, each an
            indexable sequence holding one list of ints per example.
        labels: Sequence of class ids, one per example, in the same order as
            the encodings.

    Raises:
        ValueError: If the number of encoded examples differs from the number
            of labels.
    """

    def __init__(self, encodings, labels):
        # Items are looked up by position, so a length mismatch means some
        # example would be returned with the wrong label (or none at all).
        n_examples = len(encodings['input_ids'])
        if n_examples != len(labels):
            raise ValueError(
                f"encodings hold {n_examples} examples but {len(labels)} labels were given"
            )
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tensors for example `idx` under the keys
        'input_ids', 'attention_mask' and 'labels'."""
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            # Force an int64 label tensor even if the label arrives as a float such as 4.0.
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Wrap each split's encodings and labels so the Trainer can index them.
train_dataset = CyberbullyingDataset(encodings=train_encodings, labels=train_labels)
val_dataset = CyberbullyingDataset(encodings=val_encodings, labels=val_labels)

In [21]:
# Load pretrained BERT with a classification head sized from `label_map` (rather
# than a hard-coded 6), and record the id <-> name mapping in the model config so
# it is stored alongside the weights when the model is saved.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_map),
    id2label={class_id: name for name, class_id in label_map.items()},
    label2id=dict(label_map),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation step.

    Args:
        eval_pred: Object exposing `predictions` (per-class scores, one row per
            example) and `label_ids` (true class ids), as supplied by `Trainer`.

    Returns:
        dict with a single 'accuracy' entry (fraction of correct predictions).
    """
    predicted_ids = eval_pred.predictions.argmax(axis=-1)
    return {'accuracy': float((predicted_ids == eval_pred.label_ids).mean())}


# Fine-tuning configuration: 4 epochs, evaluation at the end of every epoch,
# loss logged every 10 steps.
training_args = TrainingArguments(
    output_dir='./cyberbullying_results',
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
)

# Without `compute_metrics` evaluation only reports the loss; passing it adds
# accuracy to every evaluation result.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [24]:
# Sanity check: report how many training texts and labels there are.
n_train_texts = len(train_texts)
n_train_labels = len(train_labels)
print(f"train_texts: {n_train_texts}, train_labels: {n_train_labels}")

train_texts: 37979, train_labels: 37979


In [25]:
# Run fine-tuning with the Trainer configured above (model, training_args, train/val datasets).
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Evaluate the model on the eval_dataset the Trainer was given (val_dataset).
trainer.evaluate()

In [None]:
# Write the model and the tokenizer to the same directory so they can be reloaded together.
save_dir = "bert-cyberbullying"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Reload the tokenizer and the classifier from the "bert-cyberbullying" directory.
model_path = "bert-cyberbullying"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
# No explicit model.eval() call is made here.
# NOTE(review): from_pretrained is documented to return the model already in
# evaluation mode -- confirm for the installed transformers version.

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [2]:
# Category names listed in class-id order: position i holds the name for id i.
_class_names = [
    'age',
    'gender',
    'religion',
    'ethnicity',
    'not_cyberbullying',
    'other_cyberbullying',
]

# name -> class id
label_map = {name: class_id for class_id, name in enumerate(_class_names)}
# class id -> name, for turning a predicted index back into a readable category
inv_label_map = dict(enumerate(_class_names))

In [3]:
def predict_tweet(tweet_text):
    """Classify a single tweet and return its predicted category name.

    Uses the module-level `tokenizer`, `model` and `inv_label_map`.

    Args:
        tweet_text: Text of one tweet.

    Returns:
        The `inv_label_map` entry for the class with the highest logit.
    """
    # NOTE(review): training used the `cleaned_text` column, while `tweet_text` is
    # tokenised exactly as given -- confirm whether the same cleaning should be applied here.
    inputs = tokenizer(tweet_text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # Put the input tensors on the model's device so the call also works when the
    # model is not on the CPU.
    inputs = inputs.to(model.device)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    pred_label = outputs.logits.argmax(dim=1).item()
    return inv_label_map[pred_label]

In [4]:
# Interactive demo: classify tweets typed by the user until they enter 'exit'.
while True:
    try:
        tweet = input("Enter a tweet (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the cell was interrupted: stop instead of raising.
        break
    if tweet.lower() == 'exit':
        break
    if not tweet:
        # Nothing was typed; prompt again rather than classifying an empty string.
        continue
    prediction = predict_tweet(tweet)
    print(f"Prediction: {prediction}\n")

Prediction: religion

Prediction: religion

Prediction: religion

Prediction: gender

Prediction: religion

