# Attitude detection for inappropriate utterances
- model: kcbert-large

In [1]:
import torch, gc
# Run Python garbage collection first, then ask PyTorch to release the
# unused memory held by its CUDA caching allocator.
gc.collect()
torch.cuda.empty_cache()

In [2]:
from transformers import AutoTokenizer, BertForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
import numpy as np
import evaluate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import json
from datasets import Dataset
from peft import LoraConfig, TaskType
from peft import get_peft_model


def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path of a UTF-8 encoded file holding one JSON document
            per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Stream the file instead of materialising every line, and skip
        # blank lines (e.g. a trailing empty line) that json.loads rejects.
        return [json.loads(line) for line in file if line.strip()]

def _to_labeled_dataset(records):
    """Build a Dataset with id/text/label columns from parsed JSONL records."""
    return Dataset.from_dict({
        'id': [record['id'] for record in records],
        'text': [record['input'] for record in records],
        # Binary target: 1 when the output is 'POSITIVE', 0 for anything else.
        'label': [int(record['output'] == 'POSITIVE') for record in records],
    })


# NOTE: the names `test_data` / `test_dataset` hold the records read from the
# dev file here.
train_data = load_jsonl('./data/nikluge-iau-2023-train.jsonl')
test_data = load_jsonl('./data/nikluge-iau-2023-dev.jsonl')

train_dataset = _to_labeled_dataset(train_data)
test_dataset = _to_labeled_dataset(test_data)


# Checkpoint identifier; reused below to load the model, to tag the score row
# and to name the predictions file.
model_id = "beomi/kcbert-large"
# Tokenizer loaded from the same checkpoint identifier as the model.
tokenizer = AutoTokenizer.from_pretrained(model_id)

def tokenize_function(examples):
    """Tokenize the "text" column of a batch with the module-level tokenizer.

    Sequences are truncated and padded to the tokenizer's maximum length.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Tokenize both splits in batched mode, train split first.
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# LoRA adapter settings for the sequence-classification task.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
)

# Load the pretrained checkpoint with a 2-label classification head and wrap
# it with the LoRA adapters.
model = get_peft_model(
    BertForSequenceClassification.from_pretrained(model_id, num_labels=2),
    lora_config,
)


def compute_metrics(pred):
    """Compute classification metrics for a Trainer evaluation pass.

    Args:
        pred: Object exposing `label_ids` (gold labels) and `predictions`
            (per-class scores; the arg-max over the last axis is taken as
            the predicted class).

    Returns:
        Dict with accuracy, weighted precision/recall, and macro/micro F1.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='weighted'),
        'recall': recall_score(y_true, y_pred, average='weighted'),
        'f1_macro': f1_score(y_true, y_pred, average='macro'),
        'f1_micro': f1_score(y_true, y_pred, average='micro'),
    }

# Evaluate and checkpoint once per epoch so the checkpoint with the highest
# macro F1 can be reloaded when training ends.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 3 evaluations on the tracked metric.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()

# Evaluate the trained model on the evaluation split and keep the metrics.
result = trainer.evaluate()

# Append one row (model id followed by the metrics) to score.csv.
# The encoding is explicit so it matches the other file accesses in this
# notebook instead of depending on the platform default.
with open('./data/score.csv', 'a', encoding='utf-8') as file:
    file.write(f'{model_id},{result["eval_accuracy"]},{result["eval_precision"]},{result["eval_recall"]},{result["eval_f1_macro"]},{result["eval_f1_micro"]}\n')


# Predict labels for the unlabeled test file.
test_data = load_jsonl('./data/nikluge-iau-2023-test.jsonl')
test_dataset = Dataset.from_dict({
    'id': [record['id'] for record in test_data],
    'text': [record['input'] for record in test_data],
})

tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
predictions = trainer.predict(tokenized_test_dataset)

# Pair every input record with its score row; class index 1 maps to
# 'POSITIVE', anything else to 'NEGATIVE'.
output = [
    {
        'id': record['id'],
        'input': record['input'],
        'output': 'POSITIVE' if scores.argmax() == 1 else 'NEGATIVE',
    }
    for record, scores in zip(test_data, predictions.predictions)
]

# Save as a JSONL file named after the model (the part after the '/').
predictions_path = './data/' + model_id.split('/')[1] + '-predictions.jsonl'
with open(predictions_path, 'w', encoding='utf-8') as file:
    for item in output:
        file.write(json.dumps(item, ensure_ascii=False) + '\n')

!head -n 5 ./data/nikluge-iau-2023-test-predictions.jsonl



  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 13/13 [00:01<00:00, 12.12ba/s]
100%|██████████| 2/2 [00:00<00:00, 16.16ba/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/kcbert-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33m-zero[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Macro,F1 Micro
1,0.346,0.312591,0.883005,0.882781,0.883005,0.832135,0.883005
2,0.3221,0.297201,0.895936,0.892557,0.895936,0.840927,0.895936
3,0.2924,0.273224,0.897783,0.894766,0.897783,0.841454,0.897783
4,0.3026,0.289258,0.897167,0.893932,0.897167,0.844184,0.897167
5,0.2888,0.276797,0.900246,0.897285,0.900246,0.849665,0.900246
6,0.2698,0.294756,0.899015,0.897203,0.899015,0.852198,0.899015
7,0.2699,0.292666,0.902094,0.89926,0.902094,0.852608,0.902094
8,0.25,0.299843,0.901478,0.89872,0.901478,0.852471,0.901478
9,0.2496,0.298238,0.899631,0.896615,0.899631,0.848574,0.899631
10,0.2489,0.299038,0.901478,0.899077,0.901478,0.854014,0.901478




100%|██████████| 2/2 [00:00<00:00, 16.14ba/s]


{"id": "nikluge-2023-iau-test-000001", "input": "아니 진짜 미친놈아니에요?????", "output": "NEGATIVE"}
{"id": "nikluge-2023-iau-test-000002", "input": "아진심 미쳘냐공ㄱ", "output": "POSITIVE"}
{"id": "nikluge-2023-iau-test-000003", "input": "먹고후회할바엔 먹지말자 ㅅㅂ", "output": "NEGATIVE"}
{"id": "nikluge-2023-iau-test-000004", "input": "심멎사진 나갑니다", "output": "POSITIVE"}
{"id": "nikluge-2023-iau-test-000005", "input": "아시발너무 ..", "output": "NEGATIVE"}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
