In [1]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset read
# later in main() lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install openpyxl




In [7]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset, random_split
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time
import traceback

class ClickbaitDataset(Dataset):
    """Map-style dataset that tokenizes one headline per item.

    Args:
        titles: Indexable sequence of headline texts.
        labels: Indexable sequence of class labels; each must be convertible
            with ``int()``.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_len: Length every encoded title is padded / truncated to.
    """

    def __init__(self, titles, labels, tokenizer, max_len):
        self.titles = titles
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of titles in the dataset."""
        return len(self.titles)

    def __getitem__(self, idx):
        """Return the encoded title and its label for position ``idx``.

        Returns:
            dict with keys ``title_text`` (str), ``input_ids`` and
            ``attention_mask`` (flattened tensors from the tokenizer) and
            ``labels`` (0-dim ``torch.long`` tensor).

        Raises:
            ValueError: If the label cannot be converted to an integer.
        """
        title = self.titles[idx]
        label = self.labels[idx]

        # Spreadsheet cells are not guaranteed to be strings (a headline such as
        # "2020" may have been parsed as a number); the tokenizer needs text.
        if not isinstance(title, str):
            title = str(title)

        # int(None) raises TypeError rather than ValueError, so catch both to
        # make every bad label surface with the same descriptive message.
        try:
            label = int(label)
        except (TypeError, ValueError) as exc:
            raise ValueError(f"Label {label} at index {idx} is not a valid integer.") from exc

        encoding = self.tokenizer.encode_plus(
            title,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'title_text': title,
            # return_tensors='pt' yields a leading batch axis; flatten drops it
            # so the DataLoader can stack items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def create_data_loader(titles, labels, tokenizer, max_len, batch_size,
                       shuffle=False, num_workers=4):
    """Wrap titles and labels in a ``ClickbaitDataset`` and return a DataLoader.

    Args:
        titles: Sequence of headline texts.
        labels: Sequence of class labels aligned with ``titles``.
        tokenizer: Tokenizer handed to ``ClickbaitDataset``.
        max_len: Padded / truncated token length per title.
        batch_size: Number of examples per batch.
        shuffle: Reshuffle the data every epoch. Defaults to ``False``, which
            matches the previous fixed-order behaviour.
        num_workers: Worker processes used for loading. Defaults to ``4``, the
            previously hard-coded value.

    Returns:
        torch.utils.data.DataLoader over the constructed dataset.
    """
    ds = ClickbaitDataset(
        titles=titles,
        labels=labels,
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        # Pinned host memory only helps host-to-GPU copies; requesting it on a
        # CPU-only runtime just produces a warning.
        pin_memory=torch.cuda.is_available(),
    )

def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    """Run one training pass over ``data_loader`` and report accuracy and loss.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments; its return value must expose
            ``.logits``.
        data_loader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors) that supports ``len()``.
        loss_fn: Callable ``(logits, labels) -> scalar loss tensor``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        tuple: ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim float64
        tensor (correct predictions / ``n_examples``) and ``mean_loss`` is the
        mean of the per-batch losses (NaN when the loader yields no batches).
    """
    model.train()

    losses = []
    # A tensor accumulator (instead of the int 0) keeps `.double()` valid even
    # when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    start_time = time.time()

    # Drop gradients left over from an earlier, possibly interrupted, run so
    # they cannot leak into the first optimizer step of this epoch.
    optimizer.zero_grad()

    for batch_idx, d in enumerate(data_loader):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        # NOTE: labels are also passed to the model, but only the value from
        # loss_fn below is used for optimisation and reporting.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        _, preds = torch.max(outputs.logits, dim=1)
        loss = loss_fn(outputs.logits, labels)

        correct_predictions += torch.sum(preds == labels)
        # Read the scalar once and reuse it for bookkeeping and logging.
        loss_value = loss.item()
        losses.append(loss_value)

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        if batch_idx % 10 == 0:
            print(f'Batch {batch_idx}/{len(data_loader)}, Loss: {loss_value}')

    total_time = time.time() - start_time
    print(f'Training epoch completed in: {total_time // 60:.0f}m {total_time % 60:.0f}s')

    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    mean_loss = np.mean(losses) if losses else np.float64("nan")
    return correct_predictions.double() / n_examples, mean_loss

def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments; its return value must expose
            ``.logits``.
        data_loader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors) that supports ``len()``.
        loss_fn: Callable ``(logits, labels) -> scalar loss tensor``.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        tuple: ``(accuracy, mean_loss, all_labels, all_preds)``. ``accuracy``
        is a 0-dim float64 tensor, ``mean_loss`` the mean per-batch loss (NaN
        when the loader yields no batches), and the two lists hold the true and
        predicted class ids in iteration order.
    """
    model.eval()

    losses = []
    # A tensor accumulator (instead of the int 0) keeps `.double()` valid even
    # when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    start_time = time.time()

    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch_idx, d in enumerate(data_loader):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            # NOTE: labels are also passed to the model, but only the value
            # from loss_fn below is reported.
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            _, preds = torch.max(outputs.logits, dim=1)
            loss = loss_fn(outputs.logits, labels)

            correct_predictions += torch.sum(preds == labels)
            # Read the scalar once and reuse it for bookkeeping and logging.
            loss_value = loss.item()
            losses.append(loss_value)

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

            if batch_idx % 10 == 0:
                print(f'Batch {batch_idx}/{len(data_loader)}, test Loss: {loss_value}')

    total_time = time.time() - start_time
    print(f'test completed in: {total_time // 60:.0f}m {total_time % 60:.0f}s')

    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    mean_loss = np.mean(losses) if losses else np.float64("nan")
    return correct_predictions.double() / n_examples, mean_loss, all_labels, all_preds

def clean_dataset(df):
    """Return a copy of ``df`` that keeps only rows with a usable integer label.

    The ``label`` column is parsed with ``pd.to_numeric``; rows whose label is
    missing, non-numeric or non-finite are dropped and the remaining labels are
    cast to ``int``. The input frame is left untouched and the surviving rows
    keep their original index.

    Args:
        df: DataFrame containing a ``label`` column.

    Returns:
        pandas.DataFrame: New frame with an integer ``label`` column.
    """
    # Coercion already maps missing and unparseable values to NaN, so a single
    # validity mask replaces the two separate dropna passes.
    numeric_labels = pd.to_numeric(df['label'], errors='coerce')
    # +/-inf survives to_numeric but cannot be cast to int, so filter it too.
    keep = (numeric_labels.notna() & ~numeric_labels.isin([np.inf, -np.inf])).to_numpy()

    # Work on an explicit copy: assigning into the result of dropna() is
    # chained assignment and triggers SettingWithCopyWarning.
    cleaned = df.loc[keep].copy()
    # Assign by position so a non-unique index cannot break alignment.
    cleaned['label'] = numeric_labels[keep].astype(int).to_numpy()
    return cleaned

def main(data_path='/content/drive/MyDrive/clickbait/arabic.xlsx'):
    """Fine-tune ``bert-base-multilingual-cased`` on the clickbait spreadsheet.

    Loads the Excel file, cleans it, makes a seeded 80/20 train/test split,
    then trains for a fixed number of epochs, printing loss, accuracy,
    precision, recall and F1 on the test split after every epoch.

    Args:
        data_path: Path of the ``.xlsx`` file; it must provide ``title`` and
            ``label`` columns. Defaults to the original Google Drive location.

    Any exception is reported (message plus traceback) instead of propagating.
    """
    print("Starting main process")

    try:
        RANDOM_SEED = 42
        MAX_LEN = 128
        BATCH_SIZE = 16
        EPOCHS = 10

        # Seed before anything random happens so the split, the freshly
        # initialised classifier head and the batch order are reproducible.
        np.random.seed(RANDOM_SEED)
        torch.manual_seed(RANDOM_SEED)

        print('Loading the dataset.')
        df = pd.read_excel(data_path)  # Use pd.read_excel for XLSX files
        print('Dataset loaded successfully.')

        print('Cleaning the dataset.')
        df = clean_dataset(df)
        # Rows without a headline cannot be tokenized, so drop them as well.
        df = df.dropna(subset=['title'])
        print('Dataset cleaned.')

        print('Splitting the dataset into training and test sets.')
        train_size = int(0.8 * len(df))
        split_generator = torch.Generator().manual_seed(RANDOM_SEED)
        # Only row positions are split; the subsets expose them via `.indices`.
        df_train, df_test = random_split(
            range(len(df)),
            [train_size, len(df) - train_size],
            generator=split_generator,
        )
        print('Dataset split into training and test sets.')

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f'Using device: {device}')

        print('Loading the tokenizer and model.')
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
        model = model.to(device)
        print('Model and tokenizer loaded and moved to device.')

        print('Creating data loaders.')
        # Select whole columns by position instead of building one row Series
        # per example (df.iloc[i].title), which is slow on large frames.
        titles = df['title'].astype(str)
        labels = df['label']
        train_titles = titles.iloc[df_train.indices].tolist()
        train_labels = labels.iloc[df_train.indices].tolist()
        test_titles = titles.iloc[df_test.indices].tolist()
        test_labels = labels.iloc[df_test.indices].tolist()

        train_data_loader = create_data_loader(train_titles, train_labels, tokenizer, MAX_LEN, BATCH_SIZE)
        # The loader built above iterates in a fixed order; rebuild it around
        # the same dataset so training batches are reshuffled every epoch.
        train_data_loader = DataLoader(
            train_data_loader.dataset,
            batch_size=BATCH_SIZE,
            shuffle=True,
            num_workers=train_data_loader.num_workers,
            pin_memory=train_data_loader.pin_memory,
        )
        test_data_loader = create_data_loader(test_titles, test_labels, tokenizer, MAX_LEN, BATCH_SIZE)
        print('Data loaders created.')

        optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
        # The scheduler is stepped once per batch, so it needs the total number
        # of optimizer steps across all epochs.
        total_steps = len(train_data_loader) * EPOCHS
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        loss_fn = torch.nn.CrossEntropyLoss().to(device)
        print('Optimizer, scheduler, and loss function defined.')

        for epoch in range(EPOCHS):
            print(f'Starting epoch {epoch + 1}/{EPOCHS}')

            train_acc, train_loss = train_epoch(
                model,
                train_data_loader,
                loss_fn,
                optimizer,
                device,
                scheduler,
                len(df_train)
            )
            print(f'Train loss {train_loss} accuracy {train_acc}')

            test_acc, test_loss, all_labels, all_preds = eval_model(
                model,
                test_data_loader,
                loss_fn,
                device,
                len(df_test)
            )
            print(f'test loss {test_loss} accuracy {test_acc}')

            # Calculate additional metrics
            precision = precision_score(all_labels, all_preds, average='weighted')
            recall = recall_score(all_labels, all_preds, average='weighted')
            f1 = f1_score(all_labels, all_preds, average='weighted')
            accuracy = accuracy_score(all_labels, all_preds)

            print(f'Test Precision: {precision}')
            print(f'Test Recall: {recall}')
            print(f'Test F1 Score: {f1}')
            print(f'Test Accuracy: {accuracy}')

    except Exception as e:
        # Deliberate best-effort: report the failure with its traceback rather
        # than letting it propagate out of the cell.
        print(f'An error occurred: {e}')
        traceback.print_exc()

# Entry point: runs the full pipeline when this code is executed directly
# (a notebook cell also runs with __name__ == '__main__').
if __name__ == '__main__':
    main()


Starting main process
Loading the dataset.
Dataset loaded successfully.
Cleaning the dataset.
Dataset cleaned.
Splitting the dataset into training and test sets.
Dataset split into training and test sets.
Using device: cuda
Loading the tokenizer and model.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and tokenizer loaded and moved to device.
Creating data loaders.
Data loaders created.
Optimizer, scheduler, and loss function defined.
Starting epoch 1/10
Batch 0/774, Loss: 0.7239124774932861
Batch 10/774, Loss: 0.7020589113235474
Batch 20/774, Loss: 0.715133547782898
Batch 30/774, Loss: 0.7172030806541443
Batch 40/774, Loss: 0.6756219267845154
Batch 50/774, Loss: 0.6937083601951599
Batch 60/774, Loss: 0.643505871295929
Batch 70/774, Loss: 0.6289806962013245
Batch 80/774, Loss: 0.5488048791885376
Batch 90/774, Loss: 0.8170941472053528
Batch 100/774, Loss: 0.6192912459373474
Batch 110/774, Loss: 0.5544247031211853
Batch 120/774, Loss: 0.550691545009613
Batch 130/774, Loss: 0.32311686873435974
Batch 140/774, Loss: 0.41098347306251526
Batch 150/774, Loss: 0.49459147453308105
Batch 160/774, Loss: 0.18418629467487335
Batch 170/774, Loss: 0.6211909055709839
Batch 180/774, Loss: 0.4202614724636078
Batch 190/774, Loss: 0.46695902943611145
Batch 200/774, Loss: 0.6642058491706848
Batch 2