In [None]:
!pip install transformers==4.30.2
!pip install datasets==2.13.1
!pip install accelerate==0.20.3
!pip install sentencepiece==0.1.99
!pip install umap-learn==0.5.3

In [None]:
# Authenticate against the Hugging Face Hub. The training cell further down
# sets push_to_hub=True and the last cell calls trainer.push_to_hub(), both of
# which upload to the Hub.
from huggingface_hub import notebook_login

notebook_login()


In [None]:
from datasets import load_dataset

# Hub identifier of the classification dataset. The cells below read its
# "train" and "validation" splits and its "text" and "label" columns.
dataset_name = "djifg/GROW-classification"
dataset = load_dataset(dataset_name)

In [None]:
from transformers import AutoTokenizer

# Checkpoint shared by the tokenizer, the feature-extraction encoder and the
# fine-tuned classifier defined in later cells.
model_ckpt = "beomi/kcbert-large"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
def tokenize(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with the notebook-level tokenizer.

    Args:
        batch: Mapping that provides the raw strings under the ``"text"`` key.

    Returns:
        The tokenizer output for those strings, produced with ``padding=True``
        and ``truncation=True``.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

In [None]:
# batch_size=None passes each split to `tokenize` as one single batch, so the
# padding=True inside `tokenize` pads every example of a split to the same length.
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)

In [None]:
from transformers import AutoModel
import torch
# Same checkpoint as the tokenizer; repeated so this cell can be read on its own.
model_ckpt = "beomi/kcbert-large"

# Prefer the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Bare encoder (no classification head); used below to extract hidden states.
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)

In [None]:
def extract_hidden_states(batch):
    """Run the notebook-level ``model`` on a batch and keep the first-token vector.

    Args:
        batch: Mapping of column name to tensor. Only the entries whose key is
            listed in ``tokenizer.model_input_names`` are forwarded to the model.

    Returns:
        A dict with one key, ``"hidden_state"``, holding position 0 of the
        model's ``last_hidden_state`` for every example, as a NumPy array.
    """
    accepted = tokenizer.model_input_names
    inputs = {}
    for name, tensor in batch.items():
        if name in accepted:
            # Inputs must live on the same device as the model.
            inputs[name] = tensor.to(device)

    # Pure inference: no autograd bookkeeping is needed.
    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state

    first_token = last_hidden_state[:, 0]
    return {"hidden_state": first_token.cpu().numpy()}

In [None]:
# Expose the listed columns as torch tensors; `extract_hidden_states` calls
# `.to(device)` on the batch values it forwards to the model.
# Note: set_format changes `dataset_encoded` in place.
dataset_encoded.set_format("torch",
                            columns=["input_ids", "attention_mask", "label"])

In [None]:
# Every batch is one forward pass through the encoder, so its size drives peak
# memory. Without an explicit batch_size, `map` falls back to its default of
# 1000 examples per batch, which is a lot for a BERT-large model on sequences
# padded to a common length. The extracted vectors are computed per example,
# so a smaller batch only changes memory use, not the result.
feature_batch_size = 64
dataset_hidden = dataset_encoded.map(
    extract_hidden_states, batched=True, batch_size=feature_batch_size
)

In [None]:
import numpy as np

# Materialise features (X) and targets (y) for both splits as NumPy arrays.
X_train, X_valid = (
    np.array(dataset_hidden[split]["hidden_state"])
    for split in ("train", "validation")
)
y_train, y_valid = (
    np.array(dataset_hidden[split]["label"])
    for split in ("train", "validation")
)
# Last expression of the cell: display the two feature-matrix shapes.
X_train.shape, X_valid.shape

In [None]:
import pandas as pd
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler


# Rescale each hidden-state dimension with MinMaxScaler before projecting.
X_scaled = MinMaxScaler().fit_transform(X_train)

# UMAP is stochastic: without a fixed random_state every run yields a different
# 2-D layout, so the embedding could not be reproduced after a kernel restart.
mapper = UMAP(n_components=2, metric="cosine", random_state=42).fit(X_scaled)

# One row per training example: 2-D coordinates plus the class label.
df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"])
df_emb["label"] = y_train
df_emb.head()

In [None]:
from transformers import AutoModelForSequenceClassification

# Number of output classes for the classification head.
num_labels = 5

# NOTE: this rebinds the notebook-level name `model`, which previously held the
# bare encoder. `extract_hidden_states` looks up `model` at call time, so
# re-running the feature-extraction cell after this one would use this
# classifier instead of the encoder.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
)
model = model.to(device)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [None]:
from transformers import Trainer, TrainingArguments


batch_size = 64
# Log about once per epoch (train examples // batch size). Clamp to at least 1:
# a training split smaller than one batch would otherwise give logging_steps=0,
# which TrainingArguments rejects under its default step-based logging strategy.
logging_steps = max(1, len(dataset_encoded["train"]) // batch_size)
# Used as the local output directory for checkpoints (see output_dir below).
model_name = "djifg/grow_classification_kcbert"

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=5,
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # Evaluate and save on the same schedule; load_best_model_at_end needs the
    # two strategies to match.
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=True,
    save_strategy="epoch",
    load_best_model_at_end=True,
    log_level="error",
)


trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    tokenizer=tokenizer
)


trainer.train()


In [None]:
# Upload the trainer's model to the Hugging Face Hub with an explicit commit message.
trainer.push_to_hub(commit_message="Training completed!")