In [1]:
# HuggingFace transformers 설치
!pip install transformers
!pip install datasets
!pip install pytorch_lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 8.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 59.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 41.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.4 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

In [2]:
import numpy as np
import pandas as pd
import random
import torch
import sklearn
from sklearn import metrics
import pytorch_lightning as pl
from torch.nn import functional as F
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from transformers import AutoTokenizer, ElectraModel, ElectraConfig, AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
import gc
from datasets import load_dataset

In [3]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [4]:
# 랜덤 시드 고정
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)  # type: ignore
torch.cuda.manual_seed_all(SEED) # if use multi-GPU
torch.backends.cudnn.deterministic = True  # type: ignore
torch.backends.cudnn.benchmark = True  # type: ignore


# set parameter
max_length = 512
n_epochs = 30
batch_size = 16
lr = 2e-5
device = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
def one_hot_encoder(dataset, n_labels=44):
    one_hot = [0] * n_labels
    label_idx = dataset
    for idx in label_idx:
        one_hot[idx] = 1
    return {"labels": torch.LongTensor(one_hot)}

In [6]:
# data set
tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")
dataset = load_dataset("searle-j/kote")
dataset = dataset.map(lambda x: tokenizer(x["text"],
                                          add_special_tokens=True,
                                          max_length=max_length,
                                          return_token_type_ids=False,
                                          padding="max_length",
                                          return_attention_mask=True), batched=True)
dataset = dataset.map(lambda x: one_hot_encoder(x["labels"]))
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

train_dataset = dataset["train"] 
val_dataset = dataset["validation"]
test_dataset = dataset["test"]

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Downloading:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/504 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/387k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

No config specified, defaulting to: kote/dichotomized


Downloading and preparing dataset kote/dichotomized to /root/.cache/huggingface/datasets/searle-j___kote/dichotomized/0.0.0/9e18d6e4c5fb5b54c412810da99dfa5e5ece83c40924ee5eb3f41ce5b4d5b436...


Downloading data:   0%|          | 0.00/2.90M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/365k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/370k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset kote downloaded and prepared to /root/.cache/huggingface/datasets/searle-j___kote/dichotomized/0.0.0/9e18d6e4c5fb5b54c412810da99dfa5e5ece83c40924ee5eb3f41ce5b4d5b436. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/40000 [00:00<?, ?ex/s]

  0%|          | 0/5000 [00:00<?, ?ex/s]

  0%|          | 0/5000 [00:00<?, ?ex/s]

In [7]:
class D_model(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.electra = ElectraModel.from_pretrained("beomi/KcELECTRA-base").to(device)
        self.classifier = nn.Linear(self.electra.config.hidden_size, 44).to(device)
        
    def forward(self, input_ids=None, attention_mask=None):
        output = self.electra(input_ids, attention_mask)
        output = output.last_hidden_state[:,0,:]
        output = self.classifier(output)
        output = torch.sigmoid(output)
        torch.cuda.empty_cache()
        
        return output

In [8]:
def log_metrics(preds, labels):
    preds = torch.stack(preds)
    preds = preds.cpu().detach().numpy()
    labels = torch.stack(labels)
    labels = labels.cpu().detach().numpy()

    f1_score= metrics.f1_score(labels,preds,average='micro')
    return {"f1_score": f1_score}

In [9]:
def loss_function(outputs, labels):
    if labels is None:
        return None
    return nn.BCEWithLogitsLoss()(outputs, labels.float())

In [10]:
def train(model, train_dataloader, scheduler, batch_size, n_epochs, lr=1e-5):

    total_loss = 0
        
    model.train()

    for train_input in tqdm(train_dataloader):
        optimizer.zero_grad()
        y_batch = train_input["labels"].to(device)
        mask = train_input["attention_mask"].to(device)
        input_id = train_input["input_ids"].to(device)
        y_pred = model(input_id, mask)
        loss = loss_function(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()


    return total_loss

In [11]:
def valid(model, val_dataloader, batch_size):
    
    val_loss = 0
    targets = []
    outputs = []

    model.eval()
    with torch.no_grad():
        for val_input in tqdm(val_dataloader):
            y_batch = val_input["labels"].to(device)
            mask = val_input["attention_mask"].to(device)
            input_id = val_input["input_ids"].to(device)
            y_pred = model(input_id, mask)
            loss = loss_function(y_pred, y_batch)
            val_loss += loss.item()

            targets.extend(y_batch)
            outputs.extend(y_pred)

    
    return val_loss, targets, outputs

In [12]:
def test(model, test_dataloader, batch_size):
    
    test_loss = 0
    targets = []
    outputs = []

    model.eval()
    with torch.no_grad():
        for test_input in tqdm(test_dataloader):
            y_batch = test_input["labels"].to(device)
            mask = test_input["attention_mask"].to(device)
            input_id = test_input["input_ids"].to(device)
            y_pred = model(input_id, mask)
            loss = loss_function(y_pred, y_batch)
            test_loss += loss.item()

            targets.extend(y_batch)
            outputs.extend(y_pred)

    
    return test_loss, targets, outputs

In [None]:
model = D_model()

optimizer = AdamW(model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=2500, num_training_steps = len(train_dataset) / batch_size * n_epochs)
best_val_loss = 100
f1_score = 0

for epoch in range(n_epochs):
    gc.collect()
    torch.cuda.empty_cache()
    train_loss= train(model, train_dataloader, scheduler, batch_size, n_epochs, lr=lr)
    val_loss, labels, preds = valid(model, val_dataloader, batch_size)

    for i in range(len(preds)):
        preds[i] = preds[i]+0.3
        preds[i] = torch.round(preds[i])
    f1_score = log_metrics(preds, labels)['f1_score']
    avg_train_loss, avg_val_loss = train_loss / len(train_dataloader), val_loss / len(val_dataloader)

    print(f"[{epoch+1}/{n_epochs}]")
    print(f"f1_score: {f1_score}")
    print(f"Average Train Loss: {avg_train_loss}")
    print(f"Average Valid Loss: {avg_val_loss}")
    print("\n")
    
torch.save(model, "/content/drive/MyDrive/RoBERTa/best_model.pt")
print(f"Model saved as current valid loss: {avg_val_loss}")

Downloading:   0%|          | 0.00/475M [00:00<?, ?B/s]

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

[1/30]
f1_score: 0.2312196608546116
Average Train Loss: 0.7491513370990753
Average Valid Loss: 0.6942182158509763




  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

[2/30]
f1_score: 0.42182309349103786
Average Train Loss: 0.6890119303941726
Average Valid Loss: 0.6847059035453552




  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

[3/30]
f1_score: 0.4712499564657123
Average Train Loss: 0.6823501253604889
Average Valid Loss: 0.6819483755876462




  0%|          | 0/2500 [00:00<?, ?it/s]