In [34]:
!pip install torch transformers pandas scikit-learn seaborn matplotlib requests



In [35]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import zipfile
import os
import json

In [36]:
!wget https://gist.githubusercontent.com/amitness/0a2ddbcb61c34eab04bad5a17fd8c86b/raw/66ad13dfac4bd1201e09726677dd8ba8048bb8af/clickbait.csv

--2024-04-21 08:52:39--  https://gist.githubusercontent.com/amitness/0a2ddbcb61c34eab04bad5a17fd8c86b/raw/66ad13dfac4bd1201e09726677dd8ba8048bb8af/clickbait.csv
Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1835406 (1.8M) [text/plain]
Saving to: 'clickbait.csv.1'


2024-04-21 08:52:39 (69.6 MB/s) - 'clickbait.csv.1' saved [1835406/1835406]



In [37]:
df = pd.read_csv('/kaggle/working/clickbait.csv')

In [38]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [39]:
class ClickbaitDataset(Dataset):
    def __init__(self, titles, labels, tokenizer, max_len):
        self.titles = titles
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, item):
        title = str(self.titles[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
          title,
          add_special_tokens=True,
          max_length=self.max_len,
          return_token_type_ids=False,
          pad_to_max_length=True,
          return_attention_mask=True,
          return_tensors='pt',
        )

        return {
          'title_text': title,
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'labels': torch.tensor(label, dtype=torch.long)
        }

In [40]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [41]:
max_len = 128
batch_size = 16

In [42]:
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = ClickbaitDataset(
        titles=df.title.to_numpy(),
        labels=df.label.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=2
    )


In [43]:
train_data_loader = create_data_loader(train_df, tokenizer, max_len, batch_size)
val_data_loader = create_data_loader(val_df, tokenizer, max_len, batch_size)

In [44]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to('cuda')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)



In [46]:
!pip install tqdm



In [47]:
from tqdm import tqdm

def train_epoch(model, data_loader, optimizer, device, n_examples):
    model = model.train()

    losses = []
    correct_predictions = 0

    # Set up the progress bar
    progress_bar = tqdm(data_loader, desc='Training', leave=False)

    for d in progress_bar:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        outputs = model(
          input_ids=input_ids,
          attention_mask=attention_mask,
          labels=labels
        )

        loss = outputs[0]
        logits = outputs[1]
        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        running_loss = np.mean(losses)
        running_accuracy = correct_predictions.double() / n_examples
        
        progress_bar.set_postfix({
            'loss': f'{running_loss:.4f}',
            'acc': f'{running_accuracy:.4f}'
        })
    
    progress_bar.close()
    return correct_predictions.double() / n_examples, np.mean(losses)


In [None]:
epochs = 10

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        optimizer,
        device='cuda',
        n_examples=len(train_df)
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')


Epoch 1/10


Training:   0%|          | 0/1600 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
                                                                                      

Train loss 0.033697033705902865 accuracy 0.9880412693450055
Epoch 2/10


Training:   0%|          | 0/1600 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
                                                                                      

Train loss 0.009607010565378004 accuracy 0.9969516961075504
Epoch 3/10


Training:   0%|          | 0/1600 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
                                                                                      

Train loss 0.005557714428850886 accuracy 0.998632171330311
Epoch 4/10


Training:   0%|          | 0/1600 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
                                                                                      

Train loss 0.0045272355472502565 accuracy 0.9988666562451148
Epoch 5/10


Training:   0%|          | 0/1600 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Training:   2%|▏         | 39/1600 [00:07<05:05,  5.10it/s, loss=0.0008, acc=0.0244]

In [None]:
torch.save(model.state_dict(), 'clickbait_classifier_model.bin')

In [None]:
def eval_model(model, data_loader, device, n_examples):
    model = model.eval()

    predictions = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            labels = d['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            _, preds = torch.max(outputs[0], dim=1)

            predictions.extend(preds)
            real_values.extend(labels)

    predictions = torch.stack(predictions).cpu()
    real_values = torch.stack(real_values).cpu()
    return predictions, real_values

In [None]:
predictions, real_values = eval_model(
    model,
    val_data_loader,
    device='cuda',
    n_examples=len(val_df)
)

In [None]:
print(classification_report(real_values, predictions, target_names=['Non-Clickbait', 'Clickbait']))

In [None]:
cm = confusion_matrix(real_values, predictions)
df_cm = pd.DataFrame(cm, index=['Non-Clickbait', 'Clickbait'], columns=['Non-Clickbait', 'Clickbait'])

# Plotting the heatmap
plt.figure(figsize=(10, 7))
sns.heatmap(df_cm, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
zip_url = 'https://zenodo.org/records/6362726/files/webis-clickbait-22.zip'
zip_path = 'data.zip'

response = requests.get(zip_url)
with open(zip_path, 'wb') as f:
    f.write(response.content)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall('data')

In [None]:
data_dir = 'data'
output_file = 'target_descriptions.txt'

with open(output_file, 'w', encoding='utf-8') as out_f:
    # List all files in the data directory
    for filename in os.listdir(data_dir):
        # Check if the file is a .jsonl file
        if filename.endswith('.jsonl'):
            # Open the .jsonl file
            with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as file:
                # Read each line in the .jsonl file
                for line in file:
                    try:
                        # Parse JSON object from line
                        json_obj = json.loads(line)
                        # Write the 'targetDescription' field to the output file
                        target_description = json_obj.get('targetDescription', '')
                        if target_description and len(target_description) >= 10:
                            out_f.write(target_description + '\n')
                    except json.JSONDecodeError:
                        continue


In [None]:
file_path = '/kaggle/working/target_descriptions.txt'

unlabeled_titles = []

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        title = line.strip()
        if title:
            unlabeled_titles.append(title)

print(f"Loaded {len(unlabeled_titles)} titles from the file.")


In [None]:
class UnlabeledDataset(Dataset):
    def __init__(self, titles, tokenizer, max_len):
        self.titles = titles
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, item):
        title = str(self.titles[item])

        encoding = self.tokenizer.encode_plus(
            title,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

In [None]:
unlabeled_dataset = UnlabeledDataset(unlabeled_titles, tokenizer, max_len)
unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=16, num_workers=2)

In [None]:
from transformers import BertForSequenceClassification

teacher_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model_path = '/kaggle/working/clickbait_classifier_model.bin'
teacher_model.load_state_dict(torch.load(model_path))

teacher_model = teacher_model.to('cuda')

In [None]:
def get_pseudo_labels(model, data_loader, device):
    model.eval()
    predictions = []

    with torch.no_grad():
        for d in tqdm(data_loader, desc='Generating pseudo-labels'):
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())

    return predictions

In [None]:
pseudo_labels = get_pseudo_labels(teacher_model, unlabeled_loader, 'cuda')

In [None]:
combined_titles = list(train_df['title']) + unlabeled_titles
combined_labels = list(train_df['label']) + pseudo_labels

combined_dataset = ClickbaitDataset(combined_titles, combined_labels, tokenizer, max_len)
combined_loader = DataLoader(combined_dataset, batch_size=16, shuffle=True, num_workers=2)

In [None]:
student_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, problem_type="single_label_classification", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1)
student_model.to('cuda')

optimizer = AdamW(student_model.parameters(), lr=2e-5)

for epoch in range(3):
    print(f'Epoch {epoch + 1}/3')
    train_acc, train_loss = train_epoch(
        student_model,
        combined_loader,
        optimizer,
        'cuda',
        len(combined_dataset)
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

In [None]:
s_predictions, s_real_values = eval_model(
    student_model,
    val_data_loader,
    device='cuda',
    n_examples=len(val_df)
)

In [None]:
print(classification_report(s_real_values, s_predictions, target_names=['Non-Clickbait', 'Clickbait']))

In [None]:
cm = confusion_matrix(s_real_values, s_predictions)
df_cm = pd.DataFrame(cm, index=['Non-Clickbait', 'Clickbait'], columns=['Non-Clickbait', 'Clickbait'])

# Plotting the heatmap
plt.figure(figsize=(10, 7))
sns.heatmap(df_cm, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()