In [1]:
! pip install transformers==4.40.1 datasets==2.19.0 huggingface_hub==0.23.0 -qqq

# Hugging Face Transformer COdes with BERT & GPT-2 Models

In [2]:
from transformers import AutoTokenizer, AutoModel

input_str = 'What is Huggingface Transformers?'

# BERT Model

bert_model = AutoModel.from_pretrained('bert-base-uncased')
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
input_enc = bert_tokenizer(input_str, return_tensors='pt')
output_enc = bert_model(**input_enc)



In [3]:
output_enc

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.3009,  0.0158,  0.0698,  ..., -0.3406,  0.5976,  0.5820],
         [-0.1109,  0.0754, -0.1906,  ...,  0.2970,  0.4278, -0.0391],
         [-0.5813, -0.0042,  0.4034,  ..., -0.2549,  0.2216,  0.8121],
         ...,
         [ 0.9971,  0.3301, -0.0688,  ..., -0.4873,  0.0168, -0.0345],
         [-0.2394, -0.0573, -0.5885,  ..., -0.0415,  0.3123, -0.0288],
         [ 0.7884,  0.4039,  0.0217,  ...,  0.3869, -0.4785, -0.4116]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-8.7247e-01, -3.1464e-01, -5.2733e-01,  7.3685e-01,  1.8460e-01,
         -1.8295e-01,  8.9985e-01,  2.9762e-01, -3.9314e-01, -9.9994e-01,
          7.1272e-02,  7.2618e-01,  9.7232e-01,  3.2968e-01,  9.1370e-01,
         -7.5043e-01, -3.5754e-01, -5.6530e-01,  2.8968e-01, -6.4040e-01,
          6.0199e-01,  9.9856e-01,  3.8675e-01,  2.9078e-01,  4.0431e-01,
          7.7010e-01, -7.6515e-01,  9.1830e-01,  9.4834e-01,  7.255

# Intro to Hugging Face Libraries

### Load Model using `model_id`

In [4]:
from transformers import AutoModel

model_id = 'klue/roberta-base'
model = AutoModel.from_pretrained(model_id)

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Load Model including Classifier Head

In [5]:
from transformers import AutoModelForSequenceClassification

model_id = 'SamLowe/roberta-base-go_emotions'
classification_model = AutoModelForSequenceClassification.from_pretrained(model_id)

### Load Model with Classifier Head randomly reset

In [6]:
from transformers import AutoModelForSequenceClassification

model_id = 'klue/roberta-base'
classification_model = AutoModelForSequenceClassification.from_pretrained(model_id)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Load Tokenizer

In [7]:
from transformers import AutoTokenizer

model_id = 'klue/roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_id)

### Use Tokenizer

In [8]:
text_str = '토크나이저는 텍스트를 토큰 단위로 나눈다'
text_tok = tokenizer(text_str)
for k, v in  text_tok.items():
    print(k, ':', v)

input_ids : [0, 9157, 7461, 2190, 2259, 8509, 2138, 1793, 2855, 5385, 2200, 20950, 2]
token_type_ids : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
attention_mask : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [9]:
text_ids = text_tok['input_ids']

print(' '.join(tokenizer.convert_ids_to_tokens(text_ids)))
print(tokenizer.decode(text_ids))
print(tokenizer.decode(text_ids, skip_special_tokens=True))



[CLS] 토크 ##나이 ##저 ##는 텍스트 ##를 토 ##큰 단위 ##로 나눈다 [SEP]
[CLS] 토크나이저는 텍스트를 토큰 단위로 나눈다 [SEP]
토크나이저는 텍스트를 토큰 단위로 나눈다


### Use Multiple Sentences with Tokenizer

In [10]:
input_list = ['첫 번째 문장', '두 번째 문장']
input_tok = tokenizer(input_list)
for k, v in input_tok.items():
    print(k, ':', v)

input_ids : [[0, 1656, 1141, 3135, 6265, 2], [0, 864, 1141, 3135, 6265, 2]]
token_type_ids : [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]
attention_mask : [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]


### Use Multiple Sentences as One Document with Tokenizer

In [11]:
input_list = [['첫 번째 문장', '두 번째 문장']]
input_tok = tokenizer(input_list)
for k, v in input_tok.items():
    print(k, ':', v)

input_ids : [[0, 1656, 1141, 3135, 6265, 2, 864, 1141, 3135, 6265, 2]]
token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]


### Decode Token IDs to Original String

In [12]:
input_list = ['첫 번째 문장', '두 번째 문장']
input_tok = tokenizer(input_list)
input_ids = input_tok['input_ids']

print(tokenizer.batch_decode(input_ids))

input_list = [['첫 번째 문장', '두 번째 문장']]
input_tok = tokenizer(input_list)
input_ids = input_tok['input_ids']

print(tokenizer.batch_decode(input_ids))

['[CLS] 첫 번째 문장 [SEP]', '[CLS] 두 번째 문장 [SEP]']
['[CLS] 첫 번째 문장 [SEP] 두 번째 문장 [SEP]']


### `token_type_ids`: BERT Tokenizer vs. RoBERTa Tokenizer

In [13]:
input_list = [['첫 번째 문장', '두 번째 문장']]
bert_tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')
input_tok = bert_tokenizer(input_list)
print('BERT:')
for k, v in input_tok.items():
    print(k, ':', v)

input_list = [['첫 번째 문장', '두 번째 문장']]
roberta_tokenizer = AutoTokenizer.from_pretrained('klue/roberta-base')
input_tok = roberta_tokenizer(input_list)
print('RoBERTa Base:')
for k, v in input_tok.items():
    print(k, ':', v)

input_list = [['first sentence', 'second sentence']]
en_roberta_tokenizer = AutoTokenizer.from_pretrained('roberta-base')
input_tok = en_roberta_tokenizer(input_list)
print('RoBERTa:')
for k, v in input_tok.items():
    print(k, ':', v)

BERT:
input_ids : [[2, 1656, 1141, 3135, 6265, 3, 864, 1141, 3135, 6265, 3]]
token_type_ids : [[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]
attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
RoBERTa Base:
input_ids : [[0, 1656, 1141, 3135, 6265, 2, 864, 1141, 3135, 6265, 2]]
token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
RoBERTa:
input_ids : [[0, 9502, 3645, 2, 2, 10815, 3645, 2]]
attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1]]


### `attention_mask`

In [14]:
input_list = ['첫 번째 문장은 짧다.', '두 번째 문장은 첫 번째 문장 보다 더 길다.']
input_tok = tokenizer(input_list, padding='longest')

for k, v in input_tok.items():
    print(k, ':', v)

input_ids : [[0, 1656, 1141, 3135, 6265, 2073, 1599, 2062, 18, 2, 1, 1, 1, 1, 1, 1], [0, 864, 1141, 3135, 6265, 2073, 1656, 1141, 3135, 6265, 3632, 831, 647, 2062, 18, 2]]
token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]


### Download Dataset from KLUE MRC

In [15]:
from datasets import load_dataset
klue_mrc_data = load_dataset('klue', 'mrc')
# klue_mrc_train_data = load_dataset('klue', 'mrc', split='train')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Load Dataset from Local Environment

In [None]:
# Local Files
from datasets import load_dataset
data = load_dataset('csv', data_files='my_files.csv')

# Pandas Dictionaries
from datasets import Dataset
my_dict = {'a': [1, 2, 3]}
data = Dataset.from_dict(my_dict)

# Pandas Dataframe
from datasets import Dataset
import pandas as pd
data_df = pd.DataFrame(my_dict)
data = Dataset.from_pandas(data_df)

### Download YNAT Datasets to Train Model

In [17]:
from datasets import load_dataset

train_data = load_dataset('klue', 'ynat', split='train')
eval_data = load_dataset('klue', 'ynat', split='validation')

In [18]:
train_data[0]

{'guid': 'ynat-v1_train_00000',
 'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영',
 'label': 3,
 'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0008508947',
 'date': '2016.06.30. 오전 10:36'}

In [19]:
train_data.features['label'].names

['IT과학', '경제', '사회', '생활문화', '세계', '스포츠', '정치']

### Remove Unnecessary Columns

In [20]:
train_data = train_data.remove_columns(['guid', 'url', 'date'])
eval_data = eval_data.remove_columns(['guid', 'url', 'date'])
train_data

Dataset({
    features: ['title', 'label'],
    num_rows: 45678
})

### Add `label_str` Column

In [21]:
def make_label_str(batch):
    labels = train_data.features['label']
    batch['label_str'] = labels.int2str(batch['label'])
    return batch

train_data = train_data.map(
    make_label_str,
    batched=True,
    batch_size=1000
)

train_data[0]

{'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영', 'label': 3, 'label_str': '생활문화'}

### Split into Train/Validate/Test Datasets

In [22]:
train_data = train_data.train_test_split(
    test_size=10000,
    shuffle=True,
    seed=42
)['test']

eval_data = eval_data.train_test_split(
    test_size=1000,
    shuffle=True,
    seed=42
)

val_data = eval_data['train'].train_test_split(
    test_size=1000,
    shuffle=True,
    seed=42
)['test']
test_data = eval_data['test']

In [23]:
print('Train Dataset:', len(train_data))
print('Validate Dataset:', len(val_data))
print('Test Dataset:', len(test_data))

Train Dataset: 10000
Validate Dataset: 1000
Test Dataset: 1000


### Train Model with Trainer API

In [24]:
import numpy as np

from transformers import (
    Trainer,
    TrainingArguments,
    AutoModelForSequenceClassification,
    AutoTokenizer
)
import torch

def tokenize(examples):
    return tokenizer(
        examples['title'],
        padding='max_length',
        truncation=True
    )

model_id = 'klue/roberta-base'
targets_cnt = len(train_data.features['label'].names)
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=targets_cnt
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

train_data = train_data.map(tokenize, batched=True)
val_data = val_data.map(tokenize, batched=True)
test_data = test_data.map(tokenize, batched=True)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [25]:
hyperparams = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy='epoch',
    learning_rate=5e-5,
    push_to_hub=False
)

def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)
    return {'accuracy': (predictions == labels).mean()}

In [26]:
trainer = Trainer(
    model=model,
    args=hyperparams,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()
trainer.evaluate(test_data)

  0%|          | 0/1250 [00:00<?, ?it/s]

{'loss': 0.6584, 'grad_norm': 44.51023864746094, 'learning_rate': 3e-05, 'epoch': 0.4}
{'loss': 0.5223, 'grad_norm': 29.636672973632812, 'learning_rate': 1e-05, 'epoch': 0.8}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.524187445640564, 'eval_accuracy': 0.837, 'eval_runtime': 114.8583, 'eval_samples_per_second': 8.706, 'eval_steps_per_second': 1.088, 'epoch': 1.0}
{'train_runtime': 4235.7883, 'train_samples_per_second': 2.361, 'train_steps_per_second': 0.295, 'train_loss': 0.5675425598144531, 'epoch': 1.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.4738537073135376,
 'eval_accuracy': 0.852,
 'eval_runtime': 129.5776,
 'eval_samples_per_second': 7.717,
 'eval_steps_per_second': 0.965,
 'epoch': 1.0}

### Train Model without Trainer API

* (1) Prepare Model & Tokenizer

In [27]:
import torch
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from transformers import AdamW

def tokenize(examples):
    return tokenizer(
        examples['title'],
        padding='max_length',
        truncation=True
    )

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_id = 'klue/roberta-base'
targets_cnt = len(train_data.features['label'].names)
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=targets_cnt
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

* Prepare Data to Train

In [28]:
def make_dataloader(dataset, batch_size, shuffle=True):
    dataset = dataset.map(tokenize, batched=True).with_format('torch')
    dataset = dataset.rename_column('label', 'labels')
    dataset = dataset.remove_columns(column_names=['title'])
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

train_dataloader = make_dataloader(train_data, batch_size=8, shuffle=True)
val_dataloader = make_dataloader(val_data, batch_size=8, shuffle=True)
test_dataloader = make_dataloader(test_data, batch_size=8, shuffle=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

* Define Function to Train

In [29]:
def train_epoch(model, data_loader, optimizer):
    model.train()
    total_loss = 0

    for batch in tqdm(data_loader):
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)                
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        loss.backward()        
        optimizer.step()
        
        total_loss += loss.item()

    avg_loss = total_loss / len(data_loader)
    return avg_loss

* Define Function to Evaluate

In [30]:
def evaluate(model, data_loader):
    model.eval()
    total_loss = 0

    preds = []
    actual_labels = []    

    with torch.no_grad():
        for batch in tqdm(data_loader):

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids,
                attention_mask=attention_mask,
                labels=labels
            )                                    
            logits = outputs.logits
            loss = outputs.loss
            total_loss += loss.item()
            pred = torch.argmax(logits, dim=-1)
            preds.extend(pred.cpu().numpy())
            actual_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(data_loader)
    accuracy = np.mean(
        np.array(preds) == np.array(actual_labels)
    )
    return avg_loss, accuracy

* Start to Train

In [31]:
epochs_cnt = 1
optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(epochs_cnt):
    print(f'Epoch {epoch + 1} / {epochs_cnt}')    
    valid_loss, valid_accuracy = evaluate(model, val_dataloader)
    print(f'Validate Loss:', {valid_loss})
    print(f'Validate Accuracy:', {valid_accuracy})

_, test_accuracy = evaluate(model, test_dataloader)
print(f'Test Accuracy: {test_accuracy}')

Epoch 1 / 1




  0%|          | 0/125 [00:00<?, ?it/s]

Validate Loss: {1.9003448028564454}
Validate Accuracy: {0.386}


  0%|          | 0/125 [00:00<?, ?it/s]

Test Accuracy: 0.398


### Upload Model to Hugging Face Hub

In [32]:
id2label = {id:label for id, label in enumerate(train_data.features['label'].names)}
label2id = {label:id for id, label in id2label.items()}

model.config.id2label = id2label
model.config.label2id = label2id

In [33]:
import os
from dotenv import load_dotenv
from huggingface_hub import login

load_dotenv()
huggingface_token = os.getenv('HUGGINGFACE_TOKEN')
repo_id = 'joshua-data/roberta-base-klue-ynat-classification'

login(token=huggingface_token)

# Trained by Trainer API
trainer.push_to_hub(repo_id)

# Trained by Self Trained Model
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/joshuakim/.cache/huggingface/token
Login successful


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/joshua-data/roberta-base-klue-ynat-classification/commit/98ae2bc2658cdc44f088aedbb525135bb4d406cd', commit_message='Upload tokenizer', commit_description='', oid='98ae2bc2658cdc44f088aedbb525135bb4d406cd', pr_url=None, pr_revision=None, pr_num=None)

### Inference Model

* With Pipeline

In [34]:
import torch
import torch.nn.functional as F
from datasets import load_dataset

dataset = load_dataset(
    'klue',
    'ynat',
    split='validation'
)

from transformers import pipeline

model_id = 'joshua-data/roberta-base-klue-ynat-classification'
model_pipeline = pipeline(
    'text-classification',
    model=model_id
)
model_pipeline(
    dataset['title'][:5]
)



config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/971 [00:00<?, ?B/s]

[{'label': '사회', 'score': 0.16757720708847046},
 {'label': '사회', 'score': 0.17347490787506104},
 {'label': '사회', 'score': 0.1690426915884018},
 {'label': '사회', 'score': 0.1701769381761551},
 {'label': '사회', 'score': 0.17067451775074005}]

* With Custom Pipeline

In [37]:
import torch
from torch.nn.functional import softmax
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer
)

class CustomPipeline:
    
    def __init__(self, model_id):
        self.model = AutoModelForSequenceClassification.from_pretrained(model_id)
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)        
        self.model.eval()

    def __call__(self, texts):
        tokenized = self.tokenizer(
            texts,
            return_tensors='pt',
            padding=True,
            truncation=True
        )
        with torch.no_grad():
            outputs = self.model(**tokenized)
            logits = outputs.logits

        probs = softmax(logits, dim=-1)
        scores, labels = torch.max(probs, dim=-1)
        labels_str = [self.model.config.id2label[label_idx] for label_idx in labels.tolist()]                

        return [
            {'label': label, 'score': score.item()}
            for label, score in zip(labels_str, scores)
        ]

custom_pipeline = CustomPipeline(model_id)
custom_pipeline(dataset['title'][:5])

[{'label': '사회', 'score': 0.16757717728614807},
 {'label': '사회', 'score': 0.17347487807273865},
 {'label': '사회', 'score': 0.1690426915884018},
 {'label': '사회', 'score': 0.1701769083738327},
 {'label': '사회', 'score': 0.17067457735538483}]