In [1]:
# 1. Uninstall existing versions to clear conflicts
!pip uninstall -y protobuf tensorboard

# 2. Install a stable, compatible version of protobuf
!pip install -q protobuf==3.20.3

# 3. Reinstall tensorboard
!pip install -q tensorboard

Found existing installation: protobuf 6.33.0
Uninstalling protobuf-6.33.0:
  Successfully uninstalled protobuf-6.33.0
Found existing installation: tensorboard 2.18.0
Uninstalling tensorboard-2.18.0:
  Successfully uninstalled tensorboard-2.18.0
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
tensorflow 2.18.0 requires tensorboard<2.19,>=2.18, which is not installed.
opentelemetry-proto 1.37.0 requires protobuf<7.0,>=5.0, but you have protobuf 3.20.3 which is incompatible.
onnx 1.18.0 requires protobuf>=4.25.1, but you have protobuf 3.20.3 which is incompatible.
a2a-sdk 0.3.10 requires protobuf>=5.29.5, but you have protobuf 3.

In [2]:
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter  # <--- NEW IMPORT
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, accuracy_score, f1_score
import matplotlib.pyplot as plt

2025-12-27 19:26:33.904457: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766863594.085309      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766863594.140040      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Hyperparameters
MAX_LEN = 256      # Max length of tweets (BERT max is 512, but tweets are short)
BATCH_SIZE = 16    # 16 or 32 is standard for BERT
EPOCHS = 5         # BERT fine-tunes quickly (2-4 epochs is usually enough)
LEARNING_RATE = 1e-5 
USE_BALANCED = False
lambdaa = 1.2
# --- 3. Data Loading & Minimal Cleaning ---
writer = SummaryWriter(f'runs/BERT_CNN_{EPOCHS}_epochs_{LEARNING_RATE}_lr_BALANCED' if USE_BALANCED else f'BERT_CNN_{EPOCHS}_epochs_{LEARNING_RATE}_lr')

def clean_text_bert(text):
    # Minimal cleaning for BERT. It needs context, so we keep stopwords.
    text = str(text).lower()
    text = re.sub(r'rt\s', '', text)               # Remove RT
    text = re.sub(r'@\w+', '', text)               # Remove mentions
    text = re.sub(r'https?://\S+|www\.\S+', '', text) # Remove URLs
    text = re.sub(r'&#[0-9]+;', '', text)          # Remove HTML
    # We KEEP punctuation because BERT uses it for context/structure
    return text.strip()

# Load Data
df = pd.read_csv('/kaggle/input/sentiment-analysis-twitter-hate-speech/train.csv')
df_test = pd.read_csv('/kaggle/input/sentiment-analysis-twitter-hate-speech/test.csv')
df['clean_text'] = df['tweet'].apply(clean_text_bert)
df_test['clean_text'] = df_test['tweet'].apply(clean_text_bert)
# Split Data
X_train, X_val, y_train, y_val = train_test_split(
    df['clean_text'], df['class'], test_size=0.2, random_state=42
)

df_balanced_data = pd.read_csv('/kaggle/input/sentiment-analysis-twitter-hate-speech/balanced_data.csv')
X_train_balanced, X_val_balanced, y_train_balanced, y_val_balanced = train_test_split(
    df_balanced_data['clean_text'], df_balanced_data['class'], test_size=0.2, random_state=42
)

Using device: cuda


In [4]:
# --- 1. Initialize Tokenizer ---z
# 
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# --- 2. Custom Dataset Class ---
class TwoHeadDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        # Reset index to avoid errors if dataframe was shuffled/split
        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        # Encoding: This handles Tokenization, Padding, and Truncation
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,    # Add [CLS] and [SEP]
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',       # Pad to max_len
            truncation=True,            # Truncate if too long
            return_attention_mask=True,
            return_tensors='pt',        # Return PyTorch tensors
        )

        return {
            'text': text,
            # Flatten because DataLoader adds the batch dimension later
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # We pass the raw label (0, 1, or 2). 
            # The train_fn logic will handle splitting this into binary targets.
            'labels': torch.tensor(label, dtype=torch.long)
        }

# --- 3. Create DataLoaders ---

# Create Dataset objects
if not USE_BALANCED:
    train_dataset = TwoHeadDataset(
        texts=X_train, 
        labels=y_train, 
        tokenizer=tokenizer, 
        max_len=MAX_LEN
    )
    
    val_dataset = TwoHeadDataset(
        texts=X_val, 
        labels=y_val, 
        tokenizer=tokenizer, 
        max_len=MAX_LEN
    )
else:
    train_dataset = TwoHeadDataset(
        texts=X_train_balanced, 
        labels=y_train_balanced, 
        tokenizer=tokenizer, 
        max_len=MAX_LEN
    )
    
    val_dataset = TwoHeadDataset(
        texts=X_val_balanced, 
        labels=y_val_balanced, 
        tokenizer=tokenizer, 
        max_len=MAX_LEN
    )
test_dataset = TwoHeadDataset(
    texts=df_test['clean_text'],
    labels=df_test['class'],
    tokenizer=tokenizer,
    max_len=MAX_LEN
)
# Create DataLoaders
# shuffle=True for training to break correlations
train_loader = DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True,
    num_workers=2 # Optional: speeds up data loading
)

# shuffle=False for validation so results are reproducible
val_loader = DataLoader(
    val_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=False,
    num_workers=2 
)

test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2
)

print(f"Data Loaded: {len(train_dataset)} training samples, {len(val_dataset)} validation samples.")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Data Loaded: 15860 training samples, 3966 validation samples.


In [5]:
# --- 1. Custom Focal Loss ---
class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2, reduction='mean'):
        """
        Args:
            alpha (float, list, or torch.Tensor): 
                - If float: Applies the same weight to all classes.
                - If list/Tensor: Weights for each class (e.g., [1.0, 0.5, 0.1]).
                  Must match the number of classes.
            gamma (float): Focusing parameter (default 2).
            reduction (str): 'mean', 'sum', or 'none'.
        """
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.reduction = reduction
        
        # Handle alpha
        if isinstance(alpha, (list, tuple, np.ndarray)):
            self.alpha = torch.tensor(alpha).float()
        else:
            self.alpha = alpha

    def forward(self, inputs, targets):
        # Move alpha to the correct device (GPU/CPU) automatically
        if self.alpha is not None and isinstance(self.alpha, torch.Tensor):
            self.alpha = self.alpha.to(inputs.device)

        # 1. Calculate Standard Cross Entropy (raw log_softmax)
        # We assume inputs are raw logits (not probabilities)
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        
        # 2. Calculate probabilities (pt)
        pt = torch.exp(-ce_loss)
        
        # 3. Calculate Alpha Factor
        if self.alpha is not None:
            if isinstance(self.alpha, torch.Tensor):
                # Select the specific weight for the target class of each sample
                alpha_t = self.alpha[targets]
            else:
                # Scalar alpha
                alpha_t = self.alpha
        else:
            alpha_t = 1.0
            
        # 4. Focal Loss Formula
        # Loss = -alpha * (1 - pt)^gamma * log(pt)
        # Note: ce_loss is already -log(pt)
        focal_loss = alpha_t * (1 - pt) ** self.gamma * ce_loss

        # 5. Reduction
        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

# --- 2. BERT + CNN Model Architecture ---
class BertCNN(nn.Module):
    def __init__(self, n_classes, dropout=0.3):
        super(BertCNN, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        
        # CNN Hyperparameters
        embedding_dim = self.bert.config.hidden_size # 768 for bert-base
        n_filters = 100
        filter_sizes = [2, 3, 4] # Look at 2-grams, 3-grams, 4-grams
        
        # Convolutional Layers
        # We create a ModuleList of Conv1d layers for different window sizes
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, out_channels=n_filters, kernel_size=fs)
            for fs in filter_sizes
        ])
        
        # Fully Connected Layer
        self.fc = nn.Linear(len(filter_sizes) * n_filters, n_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        # BERT Output
        # We need the 'last_hidden_state' which has shape [batch_size, seq_len, hidden_dim]
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = output.last_hidden_state
        
        # Permute for CNN: [batch, hidden_dim, seq_len]
        # Conv1d expects channels (hidden_dim) as the second dimension
        embedded = last_hidden_state.permute(0, 2, 1)
        
        # Apply CNN & Max Pooling
        # For each filter size: Conv1d -> ReLU -> MaxPool1d
        conved = [F.relu(conv(embedded)) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        # Concatenate pooled features
        # Shape: [batch, n_filters * len(filter_sizes)]
        cat = self.dropout(torch.cat(pooled, dim=1))
        
        # Final Classification
        return self.fc(cat)

In [6]:
def train_fn(data_loader, criterion, model, optimizer, device, scheduler=None, epoch_index = 0):
    model.train()
    total_loss = 0
    losses = []
    for idx, batch in enumerate(tqdm(data_loader, desc="Training")):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        # Labels are 0 (Hate), 1 (Offensive), 2 (Neither)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # 1. Forward Pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        
        # Focal Loss
        loss = criterion(outputs, labels)

        losses.append(loss.item())        
        loss.backward()
        
        # Clip gradients to prevent explosion
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()
        if scheduler:
            scheduler.step()
            
        total_loss += loss.item()
        current_step = epoch_index * len(data_loader) + idx
        writer.add_scalar('Loss/Train', loss.item(), current_step)
    return total_loss / len(data_loader)

def evaluate_fn(data_loader, criterion, model, device, epoch_index = 0, is_testing = False):
    model.eval()
    
    final_targets = []
    final_predictions = []
    val_losses = []
    
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            # 1. Forward Pass
            # The BertCNN model returns raw logits of shape [Batch, 3]
            logits = model(input_ids, attention_mask)
            
            # 2. Calculate Loss (Validation Loss)
            # We use the same Focal Loss criterion passed from the main loop
            loss = criterion(logits, labels)
            val_losses.append(loss.item())
            
            # 3. Get Predictions
            # Since this is a standard Multi-class problem (0, 1, 2) for the CNN,
            # we just take the argmax. 
            # (Note: The hierarchical logic was for the Two-Head model. 
            # For this BertCNN 3-class model, we use standard argmax).
            _, preds = torch.max(logits, dim=1)
            
            final_targets.extend(labels.cpu().numpy())
            final_predictions.extend(preds.cpu().numpy())
            
    # 4. Calculate Metrics
    avg_val_loss = np.mean(val_losses)
    val_f1 = f1_score(final_targets, final_predictions, average='macro')
    if not is_testing:
        writer.add_scalar('Loss/Validation', avg_val_loss, epoch_index)
        writer.add_scalar('F1/Validation', val_f1, epoch_index)
        print(f"\nValidation Loss: {avg_val_loss:.4f} | Validation F1: {val_f1:.4f}")
    
    target_names = ['Hate Speech (0)', 'Offensive (1)', 'Neither (2)']
    print(classification_report(final_targets, final_predictions, target_names=target_names))
    
    return avg_val_loss, val_f1

In [7]:
model = BertCNN(n_classes=3)
model = model.to(device)
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
w_norm = np.array(weights) / np.array(weights).sum()
# Initialize Optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
class_weights = [5.0, 2.0, 1.0]
# Class 0 (Hate): High weight (e.g., 5.0)
# Class 1 (Offensive): Medium weight (e.g., 2.0)
# Class 2 (Neither): Low weight (e.g., 1.0)
criterion = FocalLoss(alpha=w_norm, gamma=2)

# Assume train_loader and val_loader are already created from previous steps
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, 0, total_steps)
best_val_loss = float('inf')
save_path = f"BERT_CNN_{EPOCHS}_epochs_{LEARNING_RATE}_lr_BALANCED.pth" if USE_BALANCED else f"BERT_CNN_{EPOCHS}_epochs_{LEARNING_RATE}_lr.pth"

# Training Loop
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    train_loss = train_fn(train_loader, criterion, model, optimizer, device, scheduler, epoch)
    print(f"Train Loss: {train_loss}")
    val_loss, val_f1 = evaluate_fn(val_loader, criterion, model, device, epoch)
    model.eval()
    
    
    # --- 4. Save Model if Val Loss Improved ---
    if val_loss < best_val_loss:
        print(f"Validation loss decreased ({best_val_loss:.4f} --> {val_loss:.4f}). Saving model...")
        torch.save(model.state_dict(), save_path)
        print(f"Saved at {save_path}")
        best_val_loss = val_loss

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1/5


Training: 100%|██████████| 992/992 [07:13<00:00,  2.29it/s]


Train Loss: 0.029167861787080314


Evaluating: 100%|██████████| 248/248 [00:28<00:00,  8.73it/s]



Validation Loss: 0.0187 | Validation F1: 0.7202
                 precision    recall  f1-score   support

Hate Speech (0)       0.27      0.74      0.39       220
  Offensive (1)       0.98      0.83      0.90      3052
    Neither (2)       0.83      0.92      0.87       694

       accuracy                           0.84      3966
      macro avg       0.69      0.83      0.72      3966
   weighted avg       0.91      0.84      0.86      3966

Validation loss decreased (inf --> 0.0187). Saving model...
Saved at BERT_CNN_5_epochs_1e-05_lr.pth
Epoch 2/5


Training: 100%|██████████| 992/992 [07:13<00:00,  2.29it/s]


Train Loss: 0.016790944611725974


Evaluating: 100%|██████████| 248/248 [00:28<00:00,  8.73it/s]



Validation Loss: 0.0208 | Validation F1: 0.7750
                 precision    recall  f1-score   support

Hate Speech (0)       0.39      0.67      0.49       220
  Offensive (1)       0.97      0.91      0.94      3052
    Neither (2)       0.87      0.92      0.90       694

       accuracy                           0.90      3966
      macro avg       0.74      0.83      0.77      3966
   weighted avg       0.92      0.90      0.91      3966

Epoch 3/5


Training: 100%|██████████| 992/992 [07:13<00:00,  2.29it/s]


Train Loss: 0.012412560905607778


Evaluating: 100%|██████████| 248/248 [00:28<00:00,  8.73it/s]



Validation Loss: 0.0219 | Validation F1: 0.7194
                 precision    recall  f1-score   support

Hate Speech (0)       0.24      0.77      0.37       220
  Offensive (1)       0.97      0.81      0.89      3052
    Neither (2)       0.88      0.92      0.90       694

       accuracy                           0.83      3966
      macro avg       0.70      0.83      0.72      3966
   weighted avg       0.92      0.83      0.86      3966

Epoch 4/5


Training: 100%|██████████| 992/992 [07:13<00:00,  2.29it/s]


Train Loss: 0.008350925207152261


Evaluating: 100%|██████████| 248/248 [00:28<00:00,  8.73it/s]



Validation Loss: 0.0265 | Validation F1: 0.7348
                 precision    recall  f1-score   support

Hate Speech (0)       0.28      0.69      0.40       220
  Offensive (1)       0.97      0.85      0.91      3052
    Neither (2)       0.87      0.93      0.90       694

       accuracy                           0.86      3966
      macro avg       0.71      0.82      0.73      3966
   weighted avg       0.91      0.86      0.88      3966

Epoch 5/5


Training: 100%|██████████| 992/992 [07:13<00:00,  2.29it/s]


Train Loss: 0.006454673334188086


Evaluating: 100%|██████████| 248/248 [00:28<00:00,  8.73it/s]


Validation Loss: 0.0290 | Validation F1: 0.7386
                 precision    recall  f1-score   support

Hate Speech (0)       0.29      0.70      0.41       220
  Offensive (1)       0.97      0.86      0.91      3052
    Neither (2)       0.89      0.91      0.90       694

       accuracy                           0.86      3966
      macro avg       0.71      0.82      0.74      3966
   weighted avg       0.92      0.86      0.88      3966






In [8]:
saved_model = BertCNN(n_classes=3).to(device)
saved_model.load_state_dict(torch.load(save_path, map_location=device))
test_loss, test_f1 = evaluate_fn(test_loader, criterion, saved_model, device, epoch, is_testing = False)

Evaluating: 100%|██████████| 310/310 [00:35<00:00,  8.74it/s]


Validation Loss: 0.0191 | Validation F1: 0.7227
                 precision    recall  f1-score   support

Hate Speech (0)       0.27      0.73      0.40       286
  Offensive (1)       0.98      0.83      0.90      3838
    Neither (2)       0.83      0.92      0.87       833

       accuracy                           0.84      4957
      macro avg       0.69      0.83      0.72      4957
   weighted avg       0.91      0.84      0.87      4957




