In [1]:
#!pip install transformers

In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import pandas as pd
from sklearn import preprocessing

import warnings
warnings.filterwarning('ignore')

# Load your clickbait/non-clickbait dataset (replace 'your_dataset.csv' with your actual dataset file)
train_df = pd.read_csv('clickbait_training.csv')
test_df = pd.read_csv('Clickbait_testing.csv')

'''# Encode labels (clickbait: 1, non-clickbait: 0)
le = preprocessing.LabelEncoder()
df['label'] = le.fit_transform(df['label'])'''

# Split the dataset into training and testing sets
#train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Define the tokenizer and load the pre-trained BERT model for sequence classification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenize and encode the training data
train_encodings = tokenizer(list(train_df['text']), truncation=True, padding=True, max_length=128, return_tensors='pt')
train_labels = torch.tensor(list(train_df['label']))

# Tokenize and encode the testing data
test_encodings = tokenizer(list(test_df['text']), truncation=True, padding=True, max_length=128, return_tensors='pt')
test_labels = torch.tensor(list(test_df['label']))

# Create PyTorch DataLoader for training and testing data
train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * 2)

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Accuracy: 0.49748743718592964





In [3]:
# Training loop
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()

for epoch in range(35):  # Adjust the number of epochs as needed
    for batch in tqdm(train_loader):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

# Evaluation
model.eval()
predictions = []

for batch in tqdm(test_loader):
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())

# Calculate accuracy
accuracy = accuracy_score(test_df['label'], predictions)
print(f'Accuracy: {accuracy}')

100%|█████████████████████████████████████████| 308/308 [26:58<00:00,  5.25s/it]
100%|█████████████████████████████████████████| 308/308 [03:18<00:00,  1.55it/s]
100%|█████████████████████████████████████████| 308/308 [16:21<00:00,  3.19s/it]
100%|█████████████████████████████████████████| 308/308 [04:20<00:00,  1.18it/s]
100%|█████████████████████████████████████████| 308/308 [03:25<00:00,  1.50it/s]
100%|█████████████████████████████████████████| 308/308 [03:27<00:00,  1.48it/s]
100%|█████████████████████████████████████████| 308/308 [03:25<00:00,  1.50it/s]
  2%|▉                                          | 7/308 [00:04<03:16,  1.54it/s]


KeyboardInterrupt: 