In [None]:
!pip install transformers datasets accelerate

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [

In [None]:
from datasets import load_dataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_scheduler
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
from tqdm.auto import tqdm

In [None]:
df_train = pd.read_csv('train_data.csv')
df_val = pd.read_csv('val_data.csv')
df_test = pd.read_csv('test_data.csv')

In [None]:
import json

with open('label_mapping.json', 'r') as f:
    label_mapping = json.load(f)

label_encoder = LabelEncoder()
label_encoder.classes_ = np.array(list(label_mapping.keys()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['press_encoded'] = label_encoder.fit_transform(df_filtered['press'])


In [None]:
class NewsDataset(Dataset):
    def __init__(self, documents, labels, tokenizer, max_length):
        self.documents = documents
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.documents)

    def __getitem__(self, idx):
        document = self.documents[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            document,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        item = {key: val.squeeze() for key, val in encoding.items()}
        item['labels'] = torch.tensor(label)
        return item

In [None]:
tokenizer = BertTokenizer.from_pretrained('klue/roberta-base')
model = BertForSequenceClassification.from_pretrained('klue/roberta-base', num_labels=len(label_encoder.classes_))

train_dataset = NewsDataset(df_train['document'].tolist(), df_train['press_encoded'].tolist(), tokenizer, max_length=512)
val_dataset = NewsDataset(df_val['document'].tolist(), df_val['press_encoded'].tolist(), tokenizer, max_length=512)
test_dataset = NewsDataset(df_test['document'].tolist(), df_test['press_encoded'].tolist(), tokenizer, max_length=512)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8)
test_dataloader = DataLoader(test_dataset, batch_size=8)

optimizer = AdamW(model.parameters(), lr=1e-3)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.laye

  0%|          | 0/2520 [00:00<?, ?it/s]

In [None]:
progress_bar = tqdm(range(num_training_steps))

total_steps = 0
ts = []
tl = []
ta = []

for epoch in range(num_epochs):
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for step, batch in enumerate(train_dataloader):
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**inputs)

        loss = outputs.loss
        logits = outputs.logits

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        predictions = torch.argmax(logits, dim=-1)
        correct_predictions += (predictions == inputs['labels']).sum().item()
        total_predictions += inputs['labels'].size(0)

        progress_bar.update(1)

        if (total_steps + 1) % 100 == 0:
            avg_loss = total_loss / (step + 1)
            accuracy = correct_predictions / total_predictions
            print(f"Step {total_steps + 1}/{len(train_dataloader)}")
            ts.append(total_steps)
            print(f"Train Loss: {avg_loss:.4f}, Train Accuracy: {accuracy:.4f}")
            tl.append(avg_loss)
            ta.append(accuracy)

        total_steps += 1

    avg_loss = total_loss / len(train_dataloader)
    accuracy = correct_predictions / total_predictions

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {avg_loss:.4f}, Train Accuracy: {accuracy:.4f}")

  0%|          | 0/2520 [00:00<?, ?it/s]

Step 100/840
Train Loss: 1.6173, Train Accuracy: 0.2425
Step 200/840
Train Loss: 1.6018, Train Accuracy: 0.2481
Step 300/840
Train Loss: 1.5649, Train Accuracy: 0.2787
Step 400/840
Train Loss: 1.5261, Train Accuracy: 0.2972
Step 500/840
Train Loss: 1.4809, Train Accuracy: 0.3210
Step 600/840
Train Loss: 1.4081, Train Accuracy: 0.3550
Step 700/840
Train Loss: 1.3471, Train Accuracy: 0.3857
Step 800/840
Train Loss: 1.3004, Train Accuracy: 0.4050
Epoch 1/3
Train Loss: 1.2827, Train Accuracy: 0.4122
Step 100/840
Train Loss: 0.8231, Train Accuracy: 0.6538
Step 200/840
Train Loss: 0.7904, Train Accuracy: 0.6631
Step 300/840
Train Loss: 0.7420, Train Accuracy: 0.6871
Step 400/840
Train Loss: 0.7081, Train Accuracy: 0.7059
Step 500/840
Train Loss: 0.6732, Train Accuracy: 0.7198
Step 600/840
Train Loss: 0.6407, Train Accuracy: 0.7350
Step 700/840
Train Loss: 0.6059, Train Accuracy: 0.7525
Step 800/840
Train Loss: 0.5871, Train Accuracy: 0.7619
Epoch 2/3
Train Loss: 0.5748, Train Accuracy: 0.767

In [None]:
def evaluate(model, dataloader):
    model.eval()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    all_labels = []
    all_predictions = []
    all_probabilities = []

    with torch.no_grad():
        for batch in dataloader:
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)

            loss = outputs.loss
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=-1)

            total_loss += loss.item()
            predictions = torch.argmax(logits, dim=-1)
            correct_predictions += (predictions == inputs['labels']).sum().item()
            total_predictions += inputs['labels'].size(0)

            all_labels.extend(inputs['labels'].cpu().numpy())
            all_predictions.extend(predictions.cpu().numpy())
            all_probabilities.extend(probabilities.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    accuracy = correct_predictions / total_predictions
    f1 = f1_score(all_labels, all_predictions, average='weighted')
    precision = precision_score(all_labels, all_predictions, average='weighted')
    recall = recall_score(all_labels, all_predictions, average='weighted')
    roc_auc = roc_auc_score(all_labels, np.array(all_probabilities), multi_class='ovr')

    print(f"Loss: {avg_loss:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")

    return avg_loss, accuracy, f1, precision, recall, roc_auc

In [None]:
print("Validation metrics:")
evaluate(model, val_dataloader)

Validation metrics:
Loss: 0.2101
Accuracy: 0.9249
F1 Score: 0.9250
Precision: 0.9252
Recall: 0.9249
ROC AUC: 0.9927


(0.21014134915959504,
 0.9249478804725504,
 0.9250478802701833,
 0.9252291958919711,
 0.9249478804725504,
 0.992734786779703)

In [None]:
print("Test metrics:")
evaluate(model, test_dataloader)

Test metrics:
Loss: 0.2229
Accuracy: 0.9222
F1 Score: 0.9223
Precision: 0.9224
Recall: 0.9222
ROC AUC: 0.9916


(0.22288933710402084,
 0.9222222222222223,
 0.9222808626963898,
 0.9224003328944993,
 0.9222222222222223,
 0.9916056382497789)

In [None]:
!pip install huggingface

Collecting huggingface
  Downloading huggingface-0.0.1-py3-none-any.whl (2.5 kB)
Installing collected packages: huggingface
Successfully installed huggingface-0.0.1


In [None]:
from huggingface_hub import login

login(token='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')

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

local_model_path = "news_press_classification"

model.save_pretrained(local_model_path)
tokenizer.save_pretrained(local_model_path)

model.push_to_hub(repo_id="a2ran/news_press_classification")
tokenizer.push_to_hub(repo_id="a2ran/news_press_classification")

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/a2ran/news_press_classification/commit/3210d9459c5c56b8239dcaa4768b8509602201a8', commit_message='Upload tokenizer', commit_description='', oid='3210d9459c5c56b8239dcaa4768b8509602201a8', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

model_name = "a2ran/news_press_classification"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/971 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]