In [None]:
!pip install torch torchvision transformers pandas

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

class ClickbaitDataset(Dataset):
    def __init__(self, titles, labels, tokenizer, max_len):
        self.titles = titles
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, idx):
        title = self.titles[idx]
        label = int(self.labels[idx])

        encoding = self.tokenizer.encode_plus(
            title,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def create_data_loader(titles, labels, tokenizer, max_len, batch_size):
    ds = ClickbaitDataset(
        titles=titles,
        labels=labels,
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(ds, batch_size=batch_size, num_workers=4, pin_memory=True)

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model.train()

    losses = []
    correct_predictions = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        _, preds = torch.max(outputs.logits, dim=1)
        loss = loss_fn(outputs.logits, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
    model.eval()

    losses = []
    correct_predictions = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            _, preds = torch.max(outputs.logits, dim=1)
            loss = loss_fn(outputs.logits, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    return correct_predictions.double() / n_examples, np.mean(losses), all_labels, all_preds

def main():
    print("Starting main process")
    df = pd.read_csv('/content/drive/MyDrive/clickbait/english.csv')

    df = df.dropna(subset=['label'])
    df['label'] = pd.to_numeric(df['label'], errors='coerce').dropna().astype(int)

    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

    RANDOM_SEED = 42
    MAX_LEN = 128
    BATCH_SIZE = 16
    EPOCHS = 10
    LEARNING_RATE = 2e-5
    WEIGHT_DECAY = 0.01  # Adding weight decay

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # Add dropout with probability 0.3 to BERT layers
    model.config.hidden_dropout_prob = 0.3
    model = model.to(device)

    train_titles = df_train['title'].tolist()
    train_labels = df_train['label'].tolist()
    test_titles = df_test['title'].tolist()
    test_labels = df_test['label'].tolist()

    train_data_loader = create_data_loader(train_titles, train_labels, tokenizer, MAX_LEN, BATCH_SIZE)
    test_data_loader = create_data_loader(test_titles, test_labels, tokenizer, MAX_LEN, BATCH_SIZE)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    total_steps = len(train_data_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    loss_fn = torch.nn.CrossEntropyLoss().to(device)

    best_accuracy = 0
    early_stop_patience = 3  # Early stopping after 3 epochs without improvement
    epochs_no_improvement = 0

    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')

        train_acc, train_loss = train_epoch(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler,
            len(df_train)
        )

        test_acc, test_loss, all_labels, all_preds = eval_model(
            model,
            test_data_loader,
            loss_fn,
            device,
            len(df_test)
        )

        print(f'Train loss {train_loss} accuracy {train_acc}')
        print(f'Test loss {test_loss} accuracy {test_acc}')

        # Calculate additional metrics
        precision = precision_score(all_labels, all_preds, average='weighted')
        recall = recall_score(all_labels, all_preds, average='weighted')
        f1 = f1_score(all_labels, all_preds, average='weighted')

        print(f'Test Precision: {precision}')
        print(f'Test Recall: {recall}')
        print(f'Test F1 Score: {f1}')

        # Early stopping check
        if test_acc > best_accuracy:
            best_accuracy = test_acc
            epochs_no_improvement = 0
            torch.save(model.state_dict(), 'best_model_state.bin')  # Save best model
        else:
            epochs_no_improvement += 1
            if epochs_no_improvement >= early_stop_patience:
                print('Early stopping due to no improvement.')
                break

if __name__ == '__main__':
    main()


Starting main process


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Train loss 0.06040877926025132 accuracy 0.9796875
Test loss 0.03537472735290066 accuracy 0.9873437500000001
Test Precision: 0.9873558155787214
Test Recall: 0.98734375
Test F1 Score: 0.9873443574207812
Epoch 2/10
Train loss 0.01307406977462051 accuracy 0.9959375
Test loss 0.04330345459136879 accuracy 0.98828125
Test Precision: 0.9884521692566088
Test Recall: 0.98828125
Test F1 Score: 0.9882827188345058
Epoch 3/10
Train loss 0.006484577016240109 accuracy 0.998125
Test loss 0.03523693567720329 accuracy 0.98984375
Test Precision: 0.989886088021622
Test Recall: 0.98984375
Test F1 Score: 0.9898445915936724
Epoch 4/10
Train loss 0.004058393228706336 accuracy 0.9986328125
Test loss 0.03731407635532378 accuracy 0.9912500000000001
Test Precision: 0.9912554232446149
Test Recall: 0.99125
Test F1 Score: 0.9912502906585943
Epoch 5/10
Train loss 0.00097237457323331 accuracy 0.9998046875000001
Test loss 0.07664182438819807 accuracy 0.9890625000000001
Test Precision: 0.9891584020413688
Test 

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
