# Importing hugging face transformers and libraries



In [1]:
# Install the Hugging Face Transformers library
!pip install transformers

# Import necessary libraries
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os





# Setting the device

In [2]:

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")


Using device: cuda


# Hugging Face login

In [4]:
# Log in to Hugging Face Hub (optional, if you want to push models)
from huggingface_hub import login

# Replace 'YOUR_HUGGINGFACE_TOKEN' with your actual token
login("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")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Loading tokenizer and model from hf

In [5]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
model = AutoModelForSequenceClassification.from_pretrained("meta-llama/Llama-3.2-1B", num_labels=2).to(DEVICE)

tokenizer.pad_token = tokenizer.eos_token


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Checkpoint Dir

In [6]:
# Create a directory for saving checkpoints
CHECKPOINT_DIR = '/content/checkpoints'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)


# Load clean training data

In [7]:
import pandas as pd
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

url = 'https://raw.githubusercontent.com/bvidgen/Dynamically-Generated-Hate-Speech-Dataset/refs/heads/main/Dynamically%20Generated%20Hate%20Dataset%20v0.2.3.csv'

df = pd.read_csv(url)
# print(df.head())

df_cleaned = df[['text', 'label']]
df_cleaned.dropna(subset=['text', 'label'], inplace = True)
df_cleaned['label'] = df_cleaned['label'].map ({'hate': 1, 'nothate' : 0})
cleaned_data = df_cleaned.sample(n=100, random_state=42)
print(cleaned_data)

train_df, val_df = train_test_split(cleaned_data, test_size=0.2, random_state=42)

# Convert the DataFrames to lists for tokenization
train_texts = train_df['text'].tolist()
train_labels = train_df['label'].tolist()
val_texts = val_df['text'].tolist()
val_labels = val_df['label'].tolist()


                                                    text  label
4750   social services have a love/hate relationship ...      0
24147  Yes it's a joke and offensive isn't it? Why ar...      0
29898  I do not have passions in life, and I'm not su...      0
16529  You better be. That was the most half-assed co...      0
12701  Waging war on, and bombing, a foreign sovereig...      1
...                                                  ...    ...
12296  it was really embarrassing to watch you almost...      1
12829  In order to be a hu-MAN you have to be of Adam...      1
4144   why does everyone say camels are ugly :( they ...      1
12033  Fortunately black people are more susceptible ...      1
27629  I do think all black ppl r dumb. U know, they ...      1

[100 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.dropna(subset=['text', 'label'], inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['label'] = df_cleaned['label'].map ({'hate': 1, 'nothate' : 0})


# Tokenization

In [8]:
# Tokenize the text data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)


# Data Prep

In [9]:
import torch

class HateSpeechDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create datasets
train_dataset = HateSpeechDataset(train_encodings, train_labels)
val_dataset = HateSpeechDataset(val_encodings, val_labels)



# Data Loaders

In [10]:
from torch.utils.data import DataLoader

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)


# Training Setup

In [11]:
from transformers import AdamW

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Move model to the specified device
model.to(DEVICE)




LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
   

# Training Loop

In [12]:
# Training loop
from torch.cuda.amp import GradScaler, autocast
optimizer = AdamW(model.parameters(), lr=5e-5)
scaler = GradScaler()

num_epochs = 3  # Set your number of epochs
for epoch in range(2):
    model.train()
    for batch in train_loader:
        # Move input tensors to the GPU
        for key in batch:
            batch[key] = batch[key].to(DEVICE)

        optimizer.zero_grad()

        with autocast():  # Automatic Mixed Precision
            outputs = model(**batch)
            loss = outputs.loss

        scaler.scale(loss).backward()  # Scale the loss for mixed precision
        scaler.step(optimizer)           # Update the weights
        scaler.update()                  # Update the scale for next iteration

        torch.cuda.empty_cache()

    print(f"Epoch {epoch + 1} finished.")


  scaler = GradScaler()
  with autocast():  # Automatic Mixed Precision


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 15.06 MiB is free. Process 26681 has 14.73 GiB memory in use. Of the allocated memory 14.51 GiB is allocated by PyTorch, and 87.75 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)