# 2025.06.04

In [None]:
from dotenv import load_dotenv
load_dotenv() # Returns True when the .env file was read successfully.
# The loaded values become environment variables temporarily (for this process only).

In [None]:
import os

# os.getenv returns None (instead of raising) when the variable is missing,
# e.g. os.getenv("HUGGINGFACE_API_KEYsssda") -> None
key = os.getenv("HUGGINGFACE_API_KEY")
# Report only whether the key was found: printing the secret itself would
# store it in the notebook output.
print("HUGGINGFACE_API_KEY loaded:", key is not None)

In [None]:
# os.environ[...] raises KeyError when the variable is missing,
# e.g. os.environ['HUGGINGFACE_API_KEYs'] -> KeyError
key = os.environ['HUGGINGFACE_API_KEY']
# Confirm the lookup succeeded without echoing the secret into the notebook output.
print("HUGGINGFACE_API_KEY loaded:", bool(key))

## Fine Tuning

### Naver Movie comments Classification

#### Huggingface Dataset Package

* `pip install datasets`
* https://huggingface.co/datasets
* https://github.com/huggingface/datasets

#### Loading

* `load_dataset('dataset_name')`

In [None]:
%pip install --upgrade datasets

In [None]:
from datasets import load_dataset

# NOTE(review): trust_remote_code = True presumably allows `datasets` to run the
# loading script shipped with the 'e9t/nsmc' repository — confirm it is still
# accepted by the installed `datasets` version.
nsmc = load_dataset('e9t/nsmc', trust_remote_code = True)   # NSMC: Naver movie review sentiment data
type(nsmc)

In [None]:
# Dataset: actual dataset
# DatasetDict: dictionary of dataset collection(train/valid/test)

nsmc

In [None]:
# Pull the individual splits out of `nsmc` by name.
trainset = nsmc['train']
# .get('test') is the dict-style lookup; presumably it yields None rather than
# raising if the split were absent.
testset = nsmc.get('test')
trainset, testset

In [None]:
trainset['document'][:5], trainset['label'][:5], trainset['id'][:5]

In [None]:
df = trainset.to_pandas()
df.head()

In [None]:
# A Dataset can also be built from in-memory or on-disk data through the
# datasets.Dataset.from_<source>(...) constructors; from_pandas is the one used here.
import datasets

# Build a small Dataset from the first 100 rows of the training DataFrame.
d = datasets.Dataset.from_pandas(df.head(100))
d

#### sampling

* train: 10000

* test: 5000

In [None]:
# Fix the shuffle seed so the sampled subsets (and therefore the fine-tuning
# results) are the same on every run.
SAMPLE_SEED = 42

sample_train = trainset.shuffle(seed = SAMPLE_SEED).select(range(10000))
sample_test = testset.shuffle(seed = SAMPLE_SEED).select(range(5000))
# select() keeps the result a Dataset; slicing (testset.shuffle()[:5000])
# would return a plain dict of columns instead.

In [None]:
print(sample_train, sample_test, sep = '\n')

#### Model, Tokenizer by AutoClass

* https://huggingface.co/docs/transformers/model_doc/auto

In [None]:
%pip install transformers

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Pretrained checkpoint used as the starting point for fine-tuning.
model_id = 'beomi/kcbert-base'

tokenizer = AutoTokenizer.from_pretrained(model_id)
# num_labels = 2 requests a two-class classification head
# (presumably negative / positive for the NSMC `label` column — confirm).
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels = 2)

In [None]:
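# beomi/kcbert-base is a pretrained feature-extraction model, not a classification model.

# Loading it for sequence classification therefore prints the message:
#   "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference."
# -> the classification head is still untrained and must be fine-tuned before use.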

In [None]:
print(model)

#### pytorch Dataset, DataLoader

In [None]:
# Split each sampled subset into its inputs (review text) and its targets (labels).
train_X, train_y = sample_train['document'], sample_train['label']
test_X, test_y = sample_test['document'], sample_test['label']

In [None]:
# Tokenization
# padding = True pads every sequence to the longest one in the call.
# truncation = True cuts any review longer than the tokenizer's configured
# maximum length, so an unusually long comment cannot exceed the number of
# positions the model supports.
train_encoding = tokenizer(train_X, return_tensors = 'pt', padding = True, truncation = True)
test_encoding = tokenizer(test_X, return_tensors = 'pt', padding = True, truncation = True)

In [None]:
train_encoding.keys()
train_encoding['input_ids'].shape

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class NSMCDataset(Dataset):
    """Map-style dataset pairing tokenized comments with their class labels."""

    def __init__(self, comments, labels):
        """
        Args:
            comments(dict): tokenizer output; every value must be indexable by
                sample position (e.g. input_ids, token_type_ids, attention_mask)
            labels(list): one class label per sample, in the same order
        """
        self.comments = comments
        self.labels = labels

    def __len__(self):
        """Return the number of samples (the length of the label list)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """
        to look up a single sequence in the batch
        BERT Input Format -> (input_ids, token_type_ids, attention_mask)
        Args:
            idx(int): position of the sample
        Returns:
            dict - every key of `comments` indexed at `idx`, plus `labels`
        """
        data = {key: val[idx] for key, val in self.comments.items()}
        # Store the label as a 0-dim tensor: stacking samples then gives labels
        # of shape (batch,), which lines up with the 1-D argmax predictions used
        # when computing metrics. Wrapping it in a list would give (batch, 1).
        data['labels'] = torch.tensor(self.labels[idx], dtype = torch.int64)
        return data

In [None]:
train_encoding

In [None]:
train_set = NSMCDataset(train_encoding, train_y) 
test_set = NSMCDataset(test_encoding, test_y)

In [None]:
len(train_set), len(test_set)

In [None]:
train_set[0]

#### Train

* `TrainingArguments`, `Trainer` -> convenience classes that configure and run the training loop for you

In [None]:
from transformers import TrainingArguments, Trainer

N_EPOCHS = 1
BATCH_SIZE = 64

# Training configuration handed to the Trainer.
train_args = TrainingArguments(
    output_dir = 'models/nsmc',
    num_train_epochs = N_EPOCHS,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = BATCH_SIZE,

    eval_strategy = 'epoch',  # 'no', 'steps' or 'epoch'
    save_strategy = 'epoch',

    save_total_limit = 1,
    load_best_model_at_end = True,   # Save and reload the best checkpoint; requires eval_strategy == save_strategy

    metric_for_best_model = 'eval_loss',
    greater_is_better = False,      # eval_loss is a lower-is-better metric

    report_to = 'none'
)

In [None]:
import evaluate

# Performance Evaluation
acc_fn = evaluate.load('accuracy')   # f1, recall, precision, etc.
acc_fn

In [None]:
pred = torch.tensor([0, 1, 0, 1])
ref = torch.tensor([0, 1, 0, 0])

acc_fn.compute(predictions = pred, references = ref)

In [None]:
def compute_metrics(pred):
    """
    to get predicted values and evaluate performance while training
    Args:
        pred(EvalPrediction): Predicted values (`predictions`, the model logits
            as a NumPy array) with ground truths (`label_ids`)
    Returns:
        dict - dict(score_name = score), each score a plain number
    """
    # Flatten so labels shaped (n, 1) still line up with the (n,) predictions.
    labels = pred.label_ids.reshape(-1)
    # `predictions` is a NumPy array, so argmax takes `axis`
    # (`dim` is the torch spelling and raises TypeError here).
    preds = pred.predictions.argmax(axis = -1)

    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")

    acc = accuracy_metric.compute(predictions = preds, references = labels)
    f1 = f1_metric.compute(predictions = preds, references = labels)
    # compute() returns {'accuracy': value} / {'f1': value}; unwrap them so the
    # Trainer receives scalar metric values rather than nested dicts.
    return {"Accuracy": acc["accuracy"], "F1 Score": f1["f1"]}

In [None]:
trainer = Trainer(
    model = model,
    args = train_args,
    train_dataset = train_set,   # torch.utils.data.Dataset, used by trainer.train()
    # Evaluate on the tokenized 5,000-sample NSMCDataset. The raw `testset`
    # split is untokenized, so it has no input_ids / attention_mask to feed
    # the model.
    eval_dataset = test_set,     # used by trainer.evaluate()
    compute_metrics = compute_metrics   # extra metrics reported alongside the loss
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
# save: always write the tokenizer and the model to the same path,
# so both can be reloaded from that single directory
save_path = 'models/nsmc'
tokenizer.save_pretrained(save_path)
model.save_pretrained(save_path)

In [None]:
# load
from transformers import AutoTokenizer, AutoModelForSequenceClassification

load_tokenizer = AutoTokenizer.from_pretrained(save_path)   # local path
load_model = AutoModelForSequenceClassification.from_pretrained(save_path)

#### Prediction

In [None]:
sentence = ["이걸 영화라고 만든 거냐?", "아무 기대 없이 봤는데 재미있네.", "내가 감독이어도 이것보다 재미있게 만들겠다.", "시간이 어떻게 가는 줄 모르고 봤다."]

In [None]:
from transformers import pipeline

pipe = pipeline(task = 'text-classification', tokenizer = load_tokenizer, model = load_model)
result = pipe(sentence)
result

In [None]:
from huggingface_hub import login

login(key)

In [None]:
# Repository name for the fine-tuned model.
# Note: this rebinds `model_id`, which earlier held the base checkpoint name.
model_id = 'kcbert-nsmc-10000'
# Push the tokenizer and the model under the same repository name.
load_tokenizer.push_to_hub(model_id)
load_model.push_to_hub(model_id)