In [None]:
# Mounting Google Drive is optional: main() reads its CSV files from /content.
# The mount is guarded so that "Run all" keeps going on runtimes where Colab's
# Drive integration is unavailable (this cell has failed with
# KeyError: 'TBE_EPHEM_CREDS_ADDR' before).
# NOTE(review): if a later cell does read from /content/drive, re-raise here instead.
try:
    from google.colab import drive
    drive.mount('/content/drive')
except (ImportError, KeyError) as exc:
    print(f'Google Drive not mounted ({type(exc).__name__}: {exc}); continuing without it.')

KeyError: 'TBE_EPHEM_CREDS_ADDR'

In [None]:
!pip install torch torchvision transformers pandas

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset, random_split
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import time
import traceback

class ClickbaitDataset(Dataset):
    """Map-style dataset pairing headline strings with integer class labels.

    Every item is tokenized on access and padded/truncated to ``max_len`` tokens.

    Args:
        titles: Indexable, sized sequence of headline strings.
        labels: Indexable, sized sequence aligned with ``titles``; each entry
            must be convertible with ``int()``.
        tokenizer: Callable tokenizer that accepts the keyword arguments used in
            ``__getitem__`` and returns a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Sequence length every sample is padded or truncated to.

    Raises:
        ValueError: If ``titles`` and ``labels`` have different lengths.
    """

    def __init__(self, titles, labels, tokenizer, max_len):
        # Fail early: a length mismatch would otherwise surface as an IndexError
        # (or silently ignored labels) somewhere in the middle of an epoch.
        if len(titles) != len(labels):
            raise ValueError(
                f"titles and labels must have the same length "
                f"(got {len(titles)} and {len(labels)})."
            )
        self.titles = titles
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.titles)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it as a dict of tensors.

        Returns:
            dict with keys ``'title_text'`` (the raw title), ``'input_ids'`` and
            ``'attention_mask'`` (flattened 1-D tensors) and ``'labels'``
            (0-dim ``torch.long`` tensor).

        Raises:
            ValueError: If the label cannot be converted to an integer.
        """
        title = self.titles[idx]
        label = self.labels[idx]

        # int(None) raises TypeError while int('abc') / int(nan) raise ValueError;
        # report all of them uniformly with the offending index.
        try:
            label = int(label)
        except (TypeError, ValueError) as exc:
            raise ValueError(f"Label {label} at index {idx} is not a valid integer.") from exc

        # Calling the tokenizer directly is the supported replacement for the
        # deprecated `encode_plus` and takes the same keyword arguments.
        encoding = self.tokenizer(
            title,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'title_text': title,
            # The tokenizer output carries a leading batch axis of size 1; flatten it away.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def create_data_loader(titles, labels, tokenizer, max_len, batch_size,
                       shuffle=False, num_workers=4, pin_memory=True):
    """Wrap titles/labels in a ``ClickbaitDataset`` and return a ``DataLoader`` over it.

    Args:
        titles: Sequence of headline strings.
        labels: Sequence of labels aligned with ``titles``.
        tokenizer: Tokenizer handed to ``ClickbaitDataset``.
        max_len: Padded/truncated sequence length per sample.
        batch_size: Number of samples per batch.
        shuffle: Reshuffle the samples every epoch. Defaults to ``False``
            (the previous, fixed behaviour); pass ``True`` for training data.
        num_workers: Worker processes used for loading. Defaults to 4.
        pin_memory: Whether batches are placed in pinned host memory.
            Defaults to ``True``.

    Returns:
        torch.utils.data.DataLoader yielding the dicts produced by
        ``ClickbaitDataset.__getitem__``, collated into batches.
    """
    ds = ClickbaitDataset(
        titles=titles,
        labels=labels,
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )

def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    """Run one training pass over ``data_loader`` and update ``model`` in place.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` whose
            result exposes a ``.logits`` tensor.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        tuple ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim float64
        tensor (correct predictions / ``n_examples``) and ``mean_loss`` is the
        mean of the per-batch losses (NaN when the loader yields no batches).
    """
    model.train()

    losses = []
    # A 0-dim tensor (not a Python int) so `.double()` below also works when the
    # loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    start_time = time.time()

    for batch_idx, d in enumerate(data_loader):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        # Clear gradients *before* the forward/backward pass so anything left on
        # the parameters (e.g. by an interrupted earlier run in the same kernel)
        # cannot leak into this step.
        optimizer.zero_grad()

        # Labels are not forwarded to the model: the loss is computed below with
        # `loss_fn`, and any loss the model derives from labels was never read.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        logits = outputs.logits
        loss = loss_fn(logits, labels)

        loss.backward()
        optimizer.step()
        scheduler.step()

        preds = logits.argmax(dim=1)
        correct_predictions += (preds == labels).sum()
        batch_loss = loss.item()
        losses.append(batch_loss)

        if batch_idx % 10 == 0:
            print(f'Batch {batch_idx}/{len(data_loader)}, Loss: {batch_loss}')

    total_time = time.time() - start_time
    print(f'Training epoch completed in: {total_time // 60:.0f}m {total_time % 60:.0f}s')

    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss

def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating its weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` whose
            result exposes a ``.logits`` tensor.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        tuple ``(accuracy, mean_loss, all_labels, all_preds)``: ``accuracy`` is a
        0-dim float64 tensor, ``mean_loss`` the mean per-batch loss (NaN when the
        loader yields no batches), and the two lists hold the true and predicted
        class ids in iteration order.
    """
    model.eval()

    losses = []
    # A 0-dim tensor (not a Python int) so `.double()` below also works when the
    # loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    start_time = time.time()

    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch_idx, d in enumerate(data_loader):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            # Labels are not forwarded to the model: the loss is computed below
            # with `loss_fn`, and any loss the model derives itself was never read.
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            logits = outputs.logits
            preds = logits.argmax(dim=1)
            batch_loss = loss_fn(logits, labels).item()

            correct_predictions += (preds == labels).sum()
            losses.append(batch_loss)

            # Collected on the CPU so the caller can compute further metrics.
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

            if batch_idx % 10 == 0:
                print(f'Batch {batch_idx}/{len(data_loader)}, Test Loss: {batch_loss}')

    total_time = time.time() - start_time
    print(f'Test evaluation completed in: {total_time // 60:.0f}m {total_time % 60:.0f}s')

    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss, all_labels, all_preds

def clean_dataset(df):
    """Return a copy of ``df`` keeping only rows whose ``'label'`` is numeric.

    Rows whose label is missing or cannot be parsed as a number are dropped, and
    the remaining labels are cast to ``int`` (non-integral values are truncated
    by the cast). The input frame is left unmodified.

    Args:
        df: DataFrame with a ``'label'`` column.

    Returns:
        New DataFrame with the same columns and an integer ``'label'`` column.

    Raises:
        KeyError: If ``df`` has no ``'label'`` column.
    """
    # Missing labels and unparseable labels both become NaN here, so a single
    # mask covers the two drop conditions.
    numeric_labels = pd.to_numeric(df['label'], errors='coerce')
    keep = numeric_labels.notna()

    # Work on an explicit copy: assigning into the result of a filter is what
    # triggers pandas' SettingWithCopyWarning.
    cleaned = df.loc[keep].copy()
    cleaned['label'] = numeric_labels.loc[keep].astype(int)
    return cleaned

def main():
    """Fine-tune ``sagorsarker/bangla-bert-base`` as a 2-class classifier and report metrics.

    Reads ``/content/bangla_train.csv`` and ``/content/bangla_test.csv`` (both
    must provide ``title`` and ``label`` columns), cleans them, trains for
    ``EPOCHS`` epochs and, after every epoch, prints loss, accuracy and weighted
    precision/recall/F1 on the test file.

    Any exception is reported with its traceback instead of being propagated, so
    the function itself never raises.
    """
    print("Starting main process")

    try:
        RANDOM_SEED = 42
        MAX_LEN = 128
        BATCH_SIZE = 16
        EPOCHS = 10

        # Seed NumPy and PyTorch so the newly initialised classifier head and the
        # order of the shuffled training batches are repeatable between runs.
        np.random.seed(RANDOM_SEED)
        torch.manual_seed(RANDOM_SEED)

        print('Loading the dataset.')
        df_train = pd.read_csv('/content/bangla_train.csv')  # Use pd.read_excel for XLSX files
        print('Training dataset loaded successfully.')

        print('Cleaning the training dataset.')
        # Rows without a title cannot be tokenized, so drop them along with bad labels.
        df_train = clean_dataset(df_train).dropna(subset=['title'])
        print('Training dataset cleaned.')

        print('Loading the test dataset.')
        df_test = pd.read_csv('/content/bangla_test.csv')  # Use pd.read_excel for XLSX files
        print('Test dataset loaded successfully.')

        # The test file gets the same cleaning as the training file; otherwise a
        # single missing or non-numeric label aborts evaluation.
        print('Cleaning the test dataset.')
        df_test = clean_dataset(df_test).dropna(subset=['title'])
        print('Test dataset cleaned.')

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f'Using device: {device}')

        print('Loading the tokenizer and model.')
        tokenizer = BertTokenizer.from_pretrained('sagorsarker/bangla-bert-base')
        model = BertForSequenceClassification.from_pretrained('sagorsarker/bangla-bert-base', num_labels=2)
        model = model.to(device)
        print('Model and tokenizer loaded and moved to device.')

        print('Creating data loaders.')
        train_titles = df_train['title'].tolist()
        train_labels = df_train['label'].tolist()
        test_titles = df_test['title'].tolist()
        test_labels = df_test['label'].tolist()

        # shuffle=True so every epoch sees the training examples in a new order;
        # the evaluation loader keeps the file order.
        train_data_loader = DataLoader(
            ClickbaitDataset(
                titles=train_titles,
                labels=train_labels,
                tokenizer=tokenizer,
                max_len=MAX_LEN
            ),
            batch_size=BATCH_SIZE,
            shuffle=True,
            num_workers=4,
            pin_memory=True
        )
        test_data_loader = create_data_loader(test_titles, test_labels, tokenizer, MAX_LEN, BATCH_SIZE)
        print('Data loaders created.')

        optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
        # The scheduler is stepped once per batch, so it spans all batches of all epochs.
        total_steps = len(train_data_loader) * EPOCHS
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        loss_fn = torch.nn.CrossEntropyLoss().to(device)
        print('Optimizer, scheduler, and loss function defined.')

        for epoch in range(EPOCHS):
            print(f'Starting epoch {epoch + 1}/{EPOCHS}')

            train_acc, train_loss = train_epoch(
                model,
                train_data_loader,
                loss_fn,
                optimizer,
                device,
                scheduler,
                len(train_titles)
            )
            print(f'Train loss {train_loss} accuracy {train_acc}')

            test_acc, test_loss, all_labels, all_preds = eval_model(
                model,
                test_data_loader,
                loss_fn,
                device,
                len(test_titles)
            )
            print(f'Test loss {test_loss} accuracy {test_acc}')

            # Calculate additional metrics
            precision = precision_score(all_labels, all_preds, average='weighted')
            recall = recall_score(all_labels, all_preds, average='weighted')
            f1 = f1_score(all_labels, all_preds, average='weighted')

            print(f'Test Precision: {precision}')
            print(f'Test Recall: {recall}')
            print(f'Test F1 Score: {f1}')

    except Exception as e:
        # Best-effort run: report the failure with its traceback rather than raising.
        print(f'An error occurred: {e}')
        traceback.print_exc()

# Entry point: run the whole pipeline when this code is executed directly
# (as a script or as a notebook cell), but not when it is imported as a module.
if __name__ == '__main__':
    main()


Starting main process
Loading the dataset.


  df_train = pd.read_csv('/content/bangla_train.csv')  # Use pd.read_excel for XLSX files
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = pd.to_numeric(df['label'], errors='coerce')


Training dataset loaded successfully.
Cleaning the training dataset.
Training dataset cleaned.
Loading the test dataset.
Test dataset loaded successfully.
Using device: cuda
Loading the tokenizer and model.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and tokenizer loaded and moved to device.
Creating data loaders.
Data loaders created.
Optimizer, scheduler, and loss function defined.
Starting epoch 1/10
Batch 0/677, Loss: 0.9987808465957642
Batch 10/677, Loss: 0.6476734280586243
Batch 20/677, Loss: 0.5750921368598938
Batch 30/677, Loss: 0.6597618460655212
Batch 40/677, Loss: 0.6294072866439819
Batch 50/677, Loss: 0.535251796245575
Batch 60/677, Loss: 0.42236781120300293
Batch 70/677, Loss: 0.3186330497264862
Batch 80/677, Loss: 0.4654485583305359
Batch 90/677, Loss: 0.45308491587638855
Batch 100/677, Loss: 0.2870646119117737
Batch 110/677, Loss: 0.2723543643951416
Batch 120/677, Loss: 0.5982267260551453
Batch 130/677, Loss: 0.5895584225654602
Batch 140/677, Loss: 0.4520467221736908
Batch 150/677, Loss: 0.4688686728477478
Batch 160/677, Loss: 0.555928647518158
Batch 170/677, Loss: 0.5776329636573792
Batch 180/677, Loss: 0.5739564299583435
Batch 190/677, Loss: 0.39260953664779663
Batch 200/677, Loss: 0.28276458382606506
Batch 2