In [1]:
!pip install transformers datasets accelerate

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
from datasets import load_dataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from transformers import BertTokenizer, BertModel, AdamW, get_scheduler
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
from tqdm.auto import tqdm

In [2]:
from sklearn.preprocessing import LabelEncoder
import json

df_train = pd.read_csv('LDA_train_df.csv')
df_val = pd.read_csv('LDA_val_df.csv')
df_test = pd.read_csv('LDA_test_df.csv')

with open('label_mapping.json', 'r') as f:
    label_mapping = json.load(f)

label_encoder = LabelEncoder()
label_encoder.classes_ = np.array(list(label_mapping.keys()))

In [3]:
df_train['topic_distribution'] = df_train['topic_distribution'].apply(lambda x: list(map(float, x.split(','))))
df_val['topic_distribution'] = df_val['topic_distribution'].apply(lambda x: list(map(float, x.split(','))))
df_test['topic_distribution'] = df_test['topic_distribution'].apply(lambda x: list(map(float, x.split(','))))

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel

class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.attn = nn.Linear(hidden_size, 1)

    def forward(self, pooled_output, topic_distribution):
        topic_distribution = topic_distribution.unsqueeze(1)
        attn_weights = F.softmax(self.attn(topic_distribution), dim=1)
        attended_topic = torch.bmm(attn_weights.transpose(1, 2), pooled_output.unsqueeze(1))
        attended_topic = attended_topic.squeeze(1)
        return attended_topic

class LDA_attended_BERTModel(nn.Module):
    def __init__(self, bert_model, num_labels, num_topic_distribution=10, hidden_size=768, hidden_layers=2, dropout_prob=0.1):
        super(LDA_attended_BERTModel, self).__init__()
        self.bert = bert_model
        self.attention = Attention(num_topic_distribution)

        layers = []
        input_size = hidden_size
        for _ in range(hidden_layers):
            layers.append(nn.Linear(input_size, input_size))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_prob))
        self.fc_layers = nn.Sequential(*layers)

        self.classifier = nn.Linear(input_size, num_labels)

    def forward(self, input_ids, attention_mask, topic_distribution):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        attended_topic_distribution = self.attention(pooled_output, topic_distribution)
        fc_output = self.fc_layers(attended_topic_distribution)
        logits = self.classifier(fc_output)
        return logits

In [5]:
class NewsDataset(Dataset):
    def __init__(self, documents, labels, topic_distributions, tokenizer, max_length):
        self.documents = documents
        self.labels = labels
        self.topic_distributions = topic_distributions
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.documents)

    def __getitem__(self, idx):
        document = self.documents[idx]
        label = self.labels[idx]
        topic_distribution = self.topic_distributions[idx]

        encoding = self.tokenizer(
            document,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        item = {key: val.squeeze() for key, val in encoding.items()}
        item['labels'] = torch.tensor(label)
        item['topic_distribution'] = torch.tensor(topic_distribution, dtype=torch.float)  # Convert to tensor

        return item

In [6]:
tokenizer = BertTokenizer.from_pretrained('klue/roberta-base')
model = LDA_attended_BERTModel(num_labels=len(label_encoder.classes_), bert_model = BertModel.from_pretrained('klue/roberta-base'))

train_dataset = NewsDataset(df_train['document'].tolist(), df_train['press_encoded'].tolist(), df_train['topic_distribution'].tolist(), tokenizer, max_length=512)
val_dataset = NewsDataset(df_val['document'].tolist(), df_val['press_encoded'].tolist(), df_val['topic_distribution'].tolist(), tokenizer, max_length=512)
test_dataset = NewsDataset(df_test['document'].tolist(), df_test['press_encoded'].tolist(), df_test['topic_distribution'].tolist(), tokenizer, max_length=512)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8)
test_dataloader = DataLoader(test_dataset, batch_size=8)

optimizer = AdamW(model.parameters(), lr=1e-4)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.ou

LDA_attended_BERTModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, el

In [None]:
progress_bar = tqdm(range(num_training_steps))

total_steps = 0
ts = []
tl = []
ta = []

for epoch in range(num_epochs):
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for step, batch in enumerate(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        topic_distribution = batch['topic_distribution'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, topic_distribution=topic_distribution)

        loss_fn = nn.CrossEntropyLoss()
        loss = loss_fn(outputs, labels)

        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        predictions = torch.argmax(outputs, dim=-1)
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.size(0)

        progress_bar.update(1)

        if (total_steps + 1) % 100 == 0:
            avg_loss = total_loss / (step + 1)
            accuracy = correct_predictions / total_predictions
            print(f"Step {total_steps + 1}/{len(train_dataloader)}")
            ts.append(total_steps)
            print(f"Train Loss: {avg_loss:.4f}, Train Accuracy: {accuracy:.4f}")
            tl.append(avg_loss)
            ta.append(accuracy)

        total_steps += 1

    avg_loss = total_loss / len(train_dataloader)
    accuracy = correct_predictions / total_predictions

  0%|          | 0/2520 [00:00<?, ?it/s]

Step 100/840
Train Loss: 1.5988, Train Accuracy: 0.2500
Step 200/840
Train Loss: 1.5971, Train Accuracy: 0.2519
Step 300/840
Train Loss: 1.5945, Train Accuracy: 0.2525
Step 400/840
Train Loss: 1.5951, Train Accuracy: 0.2481
Step 500/840
Train Loss: 1.5905, Train Accuracy: 0.2537
Step 600/840
Train Loss: 1.5892, Train Accuracy: 0.2525
Step 700/840
Train Loss: 1.5877, Train Accuracy: 0.2557
Step 800/840
Train Loss: 1.5882, Train Accuracy: 0.2558
Step 900/840
Train Loss: 1.5879, Train Accuracy: 0.2292


In [None]:
def evaluate(model, dataloader):
    model.eval()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    all_labels = []
    all_predictions = []
    all_probabilities = []

    with torch.no_grad():
        for batch in dataloader:
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)

            loss = outputs.loss
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=-1)

            total_loss += loss.item()
            predictions = torch.argmax(logits, dim=-1)
            correct_predictions += (predictions == inputs['labels']).sum().item()
            total_predictions += inputs['labels'].size(0)

            all_labels.extend(inputs['labels'].cpu().numpy())
            all_predictions.extend(predictions.cpu().numpy())
            all_probabilities.extend(probabilities.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    accuracy = correct_predictions / total_predictions
    f1 = f1_score(all_labels, all_predictions, average='weighted')
    precision = precision_score(all_labels, all_predictions, average='weighted')
    recall = recall_score(all_labels, all_predictions, average='weighted')
    roc_auc = roc_auc_score(all_labels, np.array(all_probabilities), multi_class='ovr')

    print(f"Loss: {avg_loss:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")

    return avg_loss, accuracy, f1, precision, recall, roc_auc