In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install torch torchvision transformers pandas

In [3]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time
import traceback

class ClickbaitDataset(Dataset):
    def __init__(self, titles, labels, tokenizer, max_len):
        self.titles = titles
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, idx):
        title = self.titles[idx]
        label = self.labels[idx]

        try:
            label = int(label)
        except ValueError:
            raise ValueError(f"Label {label} at index {idx} is not a valid integer.")

        encoding = self.tokenizer.encode_plus(
            title,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'title_text': title,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def create_data_loader(titles, labels, tokenizer, max_len, batch_size):
    ds = ClickbaitDataset(
        titles=titles,
        labels=labels,
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(ds, batch_size=batch_size, num_workers=4, pin_memory=True)

def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    model.train()

    losses = []
    correct_predictions = 0
    start_time = time.time()

    for batch_idx, d in enumerate(data_loader):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        _, preds = torch.max(outputs.logits, dim=1)
        loss = loss_fn(outputs.logits, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        if batch_idx % 10 == 0:
            print(f'Batch {batch_idx}/{len(data_loader)}, Loss: {loss.item()}')

    total_time = time.time() - start_time
    print(f'Training epoch completed in: {total_time // 60:.0f}m {total_time % 60:.0f}s')

    return correct_predictions.double() / n_examples, np.mean(losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
    model.eval()

    losses = []
    correct_predictions = 0
    start_time = time.time()

    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch_idx, d in enumerate(data_loader):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            _, preds = torch.max(outputs.logits, dim=1)
            loss = loss_fn(outputs.logits, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

            if batch_idx % 10 == 0:
                print(f'Batch {batch_idx}/{len(data_loader)}, Test Loss: {loss.item()}')

    total_time = time.time() - start_time
    print(f'Test evaluation completed in: {total_time // 60:.0f}m {total_time % 60:.0f}s')

    return correct_predictions.double() / n_examples, np.mean(losses), all_labels, all_preds

def clean_dataset(df):
    df = df.dropna(subset=['label'])
    df['label'] = pd.to_numeric(df['label'], errors='coerce')
    df = df.dropna(subset=['label'])
    df['label'] = df['label'].astype(int)
    return df

def main():
    print("Starting main process")

    try:
        print('Loading the dataset.')
        df_train = pd.read_excel('/content/drive/MyDrive/clickbait/hebrew.xlsx')
        print('Training dataset loaded successfully.')

        print('Cleaning the training dataset.')
        df_train = clean_dataset(df_train)
        print('Training dataset cleaned.')

        print('Loading the test dataset.')
        df_test = pd.read_excel('/content/drive/MyDrive/clickbait/‏‏hebrew_test.xlsx')
        print('Test dataset loaded successfully.')

        RANDOM_SEED = 42
        MAX_LEN = 128
        BATCH_SIZE = 16
        EPOCHS = 5
        EARLY_STOPPING_PATIENCE = 2
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f'Using device: {device}')

        print('Loading the tokenizer and model.')
        tokenizer = BertTokenizer.from_pretrained('avichr/heBERT')
        model = BertForSequenceClassification.from_pretrained('avichr/heBERT', num_labels=2)

        # Adding Dropout
        model.config.hidden_dropout_prob = 0.3
        model.config.attention_probs_dropout_prob = 0.3

        # Apply L2 regularization
        optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

        model = model.to(device)
        print('Model and tokenizer loaded and moved to device.')

        print('Creating data loaders.')
        train_titles = df_train['title'].tolist()
        train_labels = df_train['label'].tolist()
        test_titles = df_test['title'].tolist()
        test_labels = df_test['label'].tolist()

        train_data_loader = create_data_loader(train_titles, train_labels, tokenizer, MAX_LEN, BATCH_SIZE)
        test_data_loader = create_data_loader(test_titles, test_labels, tokenizer, MAX_LEN, BATCH_SIZE)
        print('Data loaders created.')

        total_steps = len(train_data_loader) * EPOCHS
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        loss_fn = torch.nn.CrossEntropyLoss().to(device)
        print('Optimizer, scheduler, and loss function defined.')

        best_val_loss = float('inf')
        patience_counter = 0

        for epoch in range(EPOCHS):
            print(f'Starting epoch {epoch + 1}/{EPOCHS}')

            train_acc, train_loss = train_epoch(
                model,
                train_data_loader,
                loss_fn,
                optimizer,
                device,
                scheduler,
                len(df_train)
            )
            print(f'Train loss {train_loss} accuracy {train_acc}')

            test_acc, test_loss, all_labels, all_preds = eval_model(
                model,
                test_data_loader,
                loss_fn,
                device,
                len(df_test)
            )
            print(f'Test loss {test_loss} accuracy {test_acc}')

            # Calculate additional metrics
            precision = precision_score(all_labels, all_preds, average='weighted')
            recall = recall_score(all_labels, all_preds, average='weighted')
            f1 = f1_score(all_labels, all_preds, average='weighted')

            print(f'Test Precision: {precision}')
            print(f'Test Recall: {recall}')
            print(f'Test F1 Score: {f1}')

            # Early stopping
            if test_loss < best_val_loss:
                best_val_loss = test_loss
                patience_counter = 0
                print('Validation loss improved, saving model.')
                # Save model checkpoint if needed
            else:
                patience_counter += 1
                if patience_counter >= EARLY_STOPPING_PATIENCE:
                    print('Early stopping triggered.')
                    break

    except Exception as e:
        print(f'An error occurred: {e}')
        traceback.print_exc()

if __name__ == '__main__':
    main()


Starting main process
Loading the dataset.
Training dataset loaded successfully.
Cleaning the training dataset.
Training dataset cleaned.
Loading the test dataset.
Test dataset loaded successfully.
Using device: cpu
Loading the tokenizer and model.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and tokenizer loaded and moved to device.
Creating data loaders.
Data loaders created.
Optimizer, scheduler, and loss function defined.
Starting epoch 1/5




Batch 0/66, Loss: 0.7183716297149658
Batch 10/66, Loss: 0.5318698287010193
Batch 20/66, Loss: 0.4278326630592346
Batch 30/66, Loss: 0.4228333532810211
Batch 40/66, Loss: 0.38035330176353455
Batch 50/66, Loss: 0.30478718876838684
Batch 60/66, Loss: 0.28793928027153015
Training epoch completed in: 22m 59s
Train loss 0.46569276736541226 accuracy 0.796384395813511
Batch 0/17, Test Loss: 0.4037087559700012
Batch 10/17, Test Loss: 0.41212737560272217
Test evaluation completed in: 1m 49s
Test loss 0.3120093231692034 accuracy 0.8787878787878788
Test Precision: 0.8790529758011772
Test Recall: 0.8787878787878788
Test F1 Score: 0.8787182126454499
Validation loss improved, saving model.
Starting epoch 2/5




Batch 0/66, Loss: 0.3205702006816864
Batch 10/66, Loss: 0.31417325139045715
Batch 20/66, Loss: 0.18482893705368042
Batch 30/66, Loss: 0.1350516378879547
Batch 40/66, Loss: 0.2185259312391281
Batch 50/66, Loss: 0.17412744462490082
Batch 60/66, Loss: 0.09435988962650299
Training epoch completed in: 22m 24s
Train loss 0.22420745151061 accuracy 0.9181731684110371
Batch 0/17, Test Loss: 0.4352775514125824
Batch 10/17, Test Loss: 0.4576916992664337
Test evaluation completed in: 1m 49s
Test loss 0.3273055969792254 accuracy 0.8825757575757576
Test Precision: 0.8835155118490375
Test Recall: 0.8825757575757576
Test F1 Score: 0.8824220675551813
Starting epoch 3/5




Batch 0/66, Loss: 0.1428309679031372
Batch 10/66, Loss: 0.17387254536151886
Batch 20/66, Loss: 0.04425901547074318
Batch 30/66, Loss: 0.10249293595552444
Batch 40/66, Loss: 0.10581494867801666
Batch 50/66, Loss: 0.036190878599882126
Batch 60/66, Loss: 0.04705500975251198
Training epoch completed in: 22m 32s
Train loss 0.09330155975608663 accuracy 0.9705042816365367
Batch 0/17, Test Loss: 0.7020514607429504
Batch 10/17, Test Loss: 0.6937597990036011
Test evaluation completed in: 1m 49s
Test loss 0.4944565953577266 accuracy 0.8712121212121212
Test Precision: 0.8778845690610397
Test Recall: 0.8712121212121212
Test F1 Score: 0.8704071969696969
Early stopping triggered.
