# Importing hugging face transformers and libraries



In [13]:
# Install the Hugging Face Transformers library
!pip install transformers
# Import necessary libraries
import torch
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM
import os





# Setting the device

In [14]:

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")


Using device: cuda


# Hugging Face login

In [15]:
# Log in to Hugging Face Hub (optional, if you want to push models)
#from huggingface_hub import login

# Replace 'YOUR_HUGGINGFACE_TOKEN' with your actual token
# login("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")


# Loading tokenizer and model from hf

In [16]:
# 1. Load Tokenizer and Model for BERT
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(DEVICE)

# 2. Data Loaders and Training Loop remain mostly the same, but adjust for BERT-specific tokenization if necessary.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# Checkpoint Dir

In [17]:
# Create a directory for saving checkpoints
CHECKPOINT_DIR = '/content/checkpoints'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)


# Load clean training data

In [18]:
import pandas as pd
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

url = 'https://raw.githubusercontent.com/bvidgen/Dynamically-Generated-Hate-Speech-Dataset/refs/heads/main/Dynamically%20Generated%20Hate%20Dataset%20v0.2.3.csv'

df = pd.read_csv(url)
# print(df.head())

df_cleaned = df[['text', 'label']]
df_cleaned.dropna(subset=['text', 'label'], inplace = True)
df_cleaned['label'] = df_cleaned['label'].map ({'hate': 1, 'nothate' : 0})
cleaned_data = df_cleaned.sample(n=1000, random_state=42)
print(cleaned_data)

train_df, val_df = train_test_split(cleaned_data, test_size=0.2, random_state=42)

# Convert the DataFrames to lists for tokenization
train_texts = train_df['text'].tolist()
train_labels = train_df['label'].tolist()
val_texts = val_df['text'].tolist()
val_labels = val_df['label'].tolist()


                                                    text  label
4750   social services have a love/hate relationship ...      0
24147  Yes it's a joke and offensive isn't it? Why ar...      0
29898  I do not have passions in life, and I'm not su...      0
16529  You better be. That was the most half-assed co...      0
12701  Waging war on, and bombing, a foreign sovereig...      1
...                                                  ...    ...
4265   The Fucking British government is ruining our ...      0
14495         I've got nothing against maradona but lol.      0
10393  innocence is not a quality with which women ca...      1
19138  Ohhh wow, look it is as if women are equally g...      0
28465  Africa is vast, and Europe is tiny by comparis...      1

[1000 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.dropna(subset=['text', 'label'], inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['label'] = df_cleaned['label'].map ({'hate': 1, 'nothate' : 0})


# Tokenization

In [19]:
# Tokenize the text data
train_encodings = tokenizer(train_texts, truncation=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, max_length=512)


# Data Prep

In [20]:
import torch

class HateSpeechDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create datasets
train_dataset = HateSpeechDataset(train_encodings, train_labels)
val_dataset = HateSpeechDataset(val_encodings, val_labels)



# Data Loaders

In [21]:
from torch.utils.data import DataLoader

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)


# Training Setup

In [22]:
from transformers import AdamW

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)





# Training Loop

In [29]:
from transformers import AdamW
from torch.cuda.amp import GradScaler, autocast
optimizer = AdamW(model.parameters(), lr=5e-5)
scaler = GradScaler()

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        # Move input tensors to the GPU
        for key in batch:
            batch[key] = batch[key].to(DEVICE)

        optimizer.zero_grad()

        # Automatic Mixed Precision
        with autocast():
            outputs = model(**batch)
            loss = outputs.loss

        # Scale the loss for mixed precision
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Free up memory
        torch.cuda.empty_cache()

    print(f"Epoch {epoch + 1} finished.")


  scaler = GradScaler()
  with autocast():


Epoch 1 finished.
Epoch 2 finished.
Epoch 3 finished.


# Saving the model

In [33]:
# After your training loop
output_dir = "/drive/MyDrive/training_v1"  # Specify your save path

# Save the model
model.save_pretrained(output_dir)

# Save the tokenizer
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")


Model and tokenizer saved to /drive/MyDrive/training_v1
