# Importing hugging face transformers and libraries



In [1]:
!pip install transformers datasets pandas scikit-learn

# Import necessary libraries
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split  # Import train_test_split
from torch.utils.data import DataLoader


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

# Setting the device

In [2]:

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")


Using device: cuda


# Loading tokenizer and model from hf

In [3]:
# 1. Load Tokenizer and Model for BERT
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(DEVICE)


Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# Checkpoint Dir

In [None]:
# Create a directory for saving checkpoints
CHECKPOINT_DIR = '/content/checkpoints'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)


# Load clean training data

In [4]:

from sklearn.model_selection import train_test_split

# Load the dataset from Hugging Face
import pandas as pd

df = pd.read_parquet("hf://datasets/valurank/offensive-multi/data/train-00000-of-00001.parquet")

# Keep only the 'text' and 'label' columns to match the previous format
df = df[['text', 'label']]

# Remove mentions, hashtags, links, and "RT" from the text
df['text'] = df['text'].str.replace(r'@\w+', '', regex=True)   # Remove mentions
df['text'] = df['text'].str.replace(r'#\w+', '', regex=True)   # Remove hashtags
df['text'] = df['text'].str.replace(r'http\S+|www\S+|t.co/\S+', '', regex=True)  # Remove links
df['text'] = df['text'].str.replace(r'\bRT\b', '', regex=True)
df['text'] = df['text'].str.replace(':', '', regex=True)  # Remove "RT" for retweets

# Drop rows with any missing values (if any)
df.dropna(inplace=True)

# Split into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Display the cleaned dataset format
print(train_df.head(15))


# Convert the DataFrames to lists for tokenization
train_texts = train_df['text'].tolist()
train_labels = train_df['label'].tolist()
val_texts = val_df['text'].tolist()
val_labels = val_df['label'].tolist()


                                                    text  label
35232    it’s a con and a mighty big con. and by who ...      0
13738    roses are red,\ncamo is green.\ntrick or tre...      1
12166  my manager is retarded bruh always asking siri...      1
3923                                \ni likes da yankees      0
25897   faso nra scores 2016    nra - candidate posit...      0
28165   are you lot trying to rival west ham for infi...      0
20873     throw that flag bitch you aint the real bush!!      1
34662     you prove how little you know about history...      0
33914     german antifastock fought against the nazis...      0
24115   its a political hit job gleefully pushed by  ...      0
34775                   or a piss up in a brewery !! hat      1
32248   this terrorist fanboy  and his saddo  cronies...      1
24378   always looking for faults and forgetting to l...      0
5511           lol my bad, i forgot that hoe made ya sad      1
21227  weekend is here. what an amazing 

# Tokenization

In [None]:
# Tokenize the text data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)


# Data Prep

In [None]:
class HateSpeechDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create datasets
train_dataset = HateSpeechDataset(train_encodings, train_labels)
val_dataset = HateSpeechDataset(val_encodings, val_labels)



# Data Loaders

In [None]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Training Setup

In [None]:
from transformers import AdamW

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)





# Training Loop

In [None]:
from transformers import AdamW
from torch.cuda.amp import GradScaler, autocast
optimizer = AdamW(model.parameters(), lr=5e-5)
scaler = GradScaler()

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        # Move input tensors to the GPU
        for key in batch:
            batch[key] = batch[key].to(DEVICE)

        optimizer.zero_grad()

        # Automatic Mixed Precision
        with autocast():
            outputs = model(**batch)
            loss = outputs.loss

        # Scale the loss for mixed precision
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Free up memory
        torch.cuda.empty_cache()

    print(f"Epoch {epoch + 1} finished.")


  scaler = GradScaler()
  with autocast():


Epoch 1 finished.
Epoch 2 finished.
Epoch 3 finished.


# Saving the model

In [None]:
# After your training loop
output_dir = "/drive/MyDrive/training_v1"  # Specify your save path

# Save the model
model.save_pretrained(output_dir)

# Save the tokenizer
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")


Model and tokenizer saved to /drive/MyDrive/training_v1
