In [None]:
!pip install -q evaluate
!pip install -q -U transformers peft accelerate optimum
!pip install --quiet bitsandbytes
!pip install transformers

In [None]:
import json
from datasets import Dataset

def load_jsonl(file_path):
    """Read a JSON Lines file into a list of decoded records.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON document per line.

    Returns:
        list: The decoded JSON values, in the order they appear in the file.
        Blank (whitespace-only) lines are ignored.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterate over the file lazily instead of materialising every line first,
        # and skip whitespace-only lines (e.g. an extra trailing newline), which
        # would otherwise make json.loads raise.
        return [json.loads(line) for line in file if line.strip()]

In [None]:
# Read the raw JSONL records of the two labelled splits.
# Note: despite its name, `test_data` holds the *dev* file at this point; it is
# what the trainer below receives as its evaluation set.
train_data = load_jsonl('./data/nikluge-iau-2023-train.jsonl')
test_data = load_jsonl('./data/nikluge-iau-2023-dev.jsonl')

In [None]:
def _as_labeled_dataset(records):
    """Turn JSONL records into a `Dataset` with `id`, `text` and `label` columns.

    `label` is 1 when the record's `output` equals 'POSITIVE' and 0 otherwise.
    """
    ids, texts, labels = [], [], []
    for record in records:
        ids.append(record['id'])
        texts.append(record['input'])
        labels.append(int(record['output'] == 'POSITIVE'))
    return Dataset.from_dict({'id': ids, 'text': texts, 'label': labels})


train_dataset = _as_labeled_dataset(train_data)
test_dataset = _as_labeled_dataset(test_data)

In [None]:
from transformers import AutoTokenizer

# Hub id used to load the tokenizer.
# NOTE(review): this id looks like the base KR-ELECTRA discriminator rather than an
# NSMC fine-tuned model; the fine-tuned weights appear to be loaded from the local
# `./ckpt/krelectra-nsmc-ckpt/...` checkpoint further down — confirm.
model_id = "snunlp/KR-ELECTRA-discriminator"
tokenizer = AutoTokenizer.from_pretrained(model_id)

def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch of examples.

    The module-level ``tokenizer`` is called with ``padding="max_length"`` and
    ``truncation=True``; its output is returned unchanged so that ``Dataset.map``
    can attach it to the examples.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded


# Tokenize both splits in batches.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

In [None]:
import os
# Path to the fine-tuned model.
# Set this to the checkpoint directory you want to use.
model_checkpoint = "./ckpt/krelectra-nsmc-ckpt/checkpoint-2000/"

# Verify the path up front so a missing checkpoint fails here, before any model loading.
if os.path.exists(model_checkpoint):
    print(f"Path {model_checkpoint} exists. Proceeding to load the model.")
else:
    raise ValueError(f"Path {model_checkpoint} does not exist.")

In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, AutoModelForSequenceClassification
import torch

# Load the two-class sequence-classification model from the checkpoint path that
# was configured and validated in the preceding cell. `model_checkpoint` is
# intentionally NOT reassigned here: re-hard-coding the path would silently
# override whatever checkpoint was chosen (and checked) above.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)


In [None]:
import numpy as np
import evaluate

# Load the `accuracy` metric from the `evaluate` library.
# NOTE(review): `metric` is not referenced by any cell visible here (`compute_metrics`
# relies on scikit-learn instead), so this load may be removable — confirm before deleting.
metric = evaluate.load("accuracy")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    """Score a set of predictions against its reference labels.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the argmax over
            the last axis is taken as the predicted class) and ``label_ids``
            (the reference labels).

    Returns:
        dict: ``accuracy`` together with ``precision``, ``recall`` and ``f1``,
        the latter three computed with ``average='weighted'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # Accuracy takes no averaging mode; the other scorers share the same call shape.
    scores = {'accuracy': accuracy_score(y_true, y_pred)}
    weighted_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for name, scorer in weighted_scorers:
        scores[name] = scorer(y_true, y_pred, average='weighted')
    return scores

In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

import dataclasses

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# (the old name was deprecated). Because this notebook installs transformers
# unpinned, pick whichever keyword the installed version actually declares so the
# cell works on both older and newer releases.
_declared_args = {field.name for field in dataclasses.fields(TrainingArguments)}
_eval_strategy_arg = "eval_strategy" if "eval_strategy" in _declared_args else "evaluation_strategy"

# Evaluate and save once per epoch (the two strategies are kept identical, which
# `load_best_model_at_end=True` relies on), for at most 10 epochs.
training_args = TrainingArguments(
    output_dir="test_trainer",
    save_strategy="epoch",
    num_train_epochs=10,
    load_best_model_at_end=True,
    **{_eval_strategy_arg: "epoch"},
)

In [None]:
# Early-stopping callback configured with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Wire the model, arguments, tokenized splits and metric function into one Trainer.
# `tokenized_test_dataset` (built from the dev file) serves as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

In [None]:
# Evaluate on the trainer's `eval_dataset`; the returned metrics are shown as the cell output.
trainer.evaluate()

In [None]:
# nikluge-iau-2023-test.jsonl를 가져와서 output을 예측
test_data = load_jsonl('./data/nikluge-iau-2023-test.jsonl')
test_dataset = Dataset.from_dict({
    'id': [item['id'] for item in test_data],
    'text': [item['input'] for item in test_data],
})

tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
predictions = trainer.predict(tokenized_test_dataset)

output = []
for idx, pred in enumerate(predictions.predictions):
    output.append({
        'id': test_data[idx]['id'],
        'input': test_data[idx]['input'],
        'output': 'POSITIVE' if pred.argmax() == 1 else 'NEGATIVE'
    })
    
# JSONL 파일로 저장
with open('./data/test-predictions.jsonl', 'w', encoding='utf-8') as file:
    for item in output:
        file.write(json.dumps(item, ensure_ascii=False) + '\n')

!head -n 5 ./data/test-predictions.jsonl