In [12]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
from data_preprocessing import Autodata
from datasets import concatenate_datasets
from sklearn.model_selection import train_test_split
from dataloader import CustomDataset
from utils import compute_metrics


In [2]:
model_name = "monologg/koelectra-small-v3-discriminator"
data = Autodata("./data")
cfn_question = data.concat_dataset["question"]
normal_instruction_question = data.load_instruction_dataset("nlpai-lab/kullm-v2")

In [3]:
normal_instruction_question = normal_instruction_question[:1000]

In [4]:
cfn_data = data.label_indexing(cfn_question, state=0)
normal_instruction_data = data.label_indexing(normal_instruction_question, state=1)

total_data = concatenate_datasets([cfn_data, normal_instruction_data])
# train_dataset, val_dataset = train_test_split(
#     total_data, test_size=0.2, random_state=42
# )
question = total_data["question"]
target = total_data["target"]

In [5]:
question_train, question_valid, target_train, target_valid = train_test_split(
    question, target, test_size=0.2, random_state=42, stratify=target
)

In [16]:
question_valid

['BackupPlan 리소스 유형을 가진 백업 계획을 생성하는 템플릿을 만들어줘',
 'Signaling Channel 생성 템플릿을 만들고 채널 이름을 UniqueChannelName으로 설정해줘, 태그는 environment를 key로 prod를 value로 설정해줘',
 '주어진 키워드로 스토리를 생성합니다.',
 'Cognito IdentityPoolRoleAttachment 리소스를 IdentityPoolId를 region:guid 형식으로, authenticated와 unauthenticated역할을 설정해줘',
 '은유가 드러나도록 문장을 다시 작성합니다.',
 "Elastic Beanstalk 애플리케이션의 configuration template을 만들어줘. 애플리케이션 이름을 MyApplication으로, 설명을 'Elastic Beanstalk configuration template example'로 설정하고, PHP 7.1 실행 환경을 사용해줘.",
 '사용자 정보 집합이 주어지면 유효한 사용자 이름을 생성합니다.',
 '출근길에 건강한 간식을 추천합니다.',
 '주어진 회사의 로고를 디자인합니다.',
 '건강한 아침 식사를 위한 5가지 품목의 쇼핑 목록을 작성하세요.',
 'Capacity Reservation Fleet 템플릿을 생성해줘. EndDate를 2023-12-31T23:59:59Z로 설정하고, NoRemoveEndDate가 false이며, RemoveEndDate도 false로 설정해줘.',
 "HTTP API를 생성하려고 해. 이 API의 이름은 'ProductService', 프로토콜 타입은 'HTTP'로, 라우트 선택 표현식을 '${request.method} ${request.path}'로 설정해줘.",
 'Route53 RecordSetGroup을 만들어서 메일 서버의 A, MX, TXT 레코드를 각각 지정해줘. A 레코드는 IP 주소 \'192.0.2.44\'로 설정하고, MX 레코드는 \'10 mail.exa

In [7]:
train_data = CustomDataset(
    data=question_train,
    target=target_train,
    model_name=model_name,
    text_columns="question",
    max_length=256,
    state="train",
)

val_data = CustomDataset(
    data=question_valid,
    target=target_valid,
    model_name=model_name,
    text_columns="question",
    max_length=256,
    state="train",
)

Tokenizing: 100%|██████████| 1330/1330 [00:00<00:00, 10779.17it/s]
Tokenizing: 100%|██████████| 333/333 [00:00<00:00, 10460.94it/s]


In [8]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, ignore_mismatched_sizes=True
)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
args = TrainingArguments(
    output_dir="output_dir",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    dataloader_num_workers=4,
    logging_steps=50,
    seed=42,
    group_by_length=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  0%|          | 0/210 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARAL

{'eval_loss': 0.6844255328178406, 'eval_f1': 0.7504690431519699, 'eval_runtime': 31.6654, 'eval_samples_per_second': 10.516, 'eval_steps_per_second': 0.189, 'epoch': 1.0}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'eval_loss': 0.6841882467269897, 'eval_f1': 0.7504690431519699, 'eval_runtime': 29.3795, 'eval_samples_per_second': 11.334, 'eval_steps_per_second': 0.204, 'epoch': 2.0}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'loss': 0.6614, 'grad_norm': 1.0868926048278809, 'learning_rate': 7.61904761904762e-06, 'epoch': 2.38}


 30%|███       | 63/210 [03:55<17:38,  7.20s/it]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. D

{'eval_loss': 0.6861188411712646, 'eval_f1': 0.9393939393939394, 'eval_runtime': 29.5784, 'eval_samples_per_second': 11.258, 'eval_steps_per_second': 0.203, 'epoch': 3.0}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'eval_loss': 0.6874817609786987, 'eval_f1': 0.15668202764976957, 'eval_runtime': 30.604, 'eval_samples_per_second': 10.881, 'eval_steps_per_second': 0.196, 'epoch': 4.0}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'loss': 0.5543, 'grad_norm': 1.4174736738204956, 'learning_rate': 5.2380952380952384e-06, 'epoch': 4.76}


 50%|█████     | 105/210 [06:42<12:40,  7.24s/it]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

{'eval_loss': 0.682039737701416, 'eval_f1': 0.2608695652173913, 'eval_runtime': 31.2133, 'eval_samples_per_second': 10.669, 'eval_steps_per_second': 0.192, 'epoch': 5.0}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'eval_loss': 0.5800318717956543, 'eval_f1': 0.8636363636363636, 'eval_runtime': 29.3466, 'eval_samples_per_second': 11.347, 'eval_steps_per_second': 0.204, 'epoch': 6.0}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'eval_loss': 0.5767043232917786, 'eval_f1': 0.8235294117647058, 'eval_runtime': 29.766, 'eval_samples_per_second': 11.187, 'eval_steps_per_second': 0.202, 'epoch': 7.0}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'loss': 0.4746, 'grad_norm': 2.605710506439209, 'learning_rate': 2.8571428571428573e-06, 'epoch': 7.14}


 80%|████████  | 168/210 [10:52<05:01,  7.18s/it]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

{'eval_loss': 0.4978097081184387, 'eval_f1': 0.9159891598915989, 'eval_runtime': 29.603, 'eval_samples_per_second': 11.249, 'eval_steps_per_second': 0.203, 'epoch': 8.0}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'eval_loss': 0.4933280944824219, 'eval_f1': 0.9130434782608695, 'eval_runtime': 30.1528, 'eval_samples_per_second': 11.044, 'eval_steps_per_second': 0.199, 'epoch': 9.0}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'loss': 0.4349, 'grad_norm': 1.2843250036239624, 'learning_rate': 4.7619047619047623e-07, 'epoch': 9.52}


100%|██████████| 210/210 [13:39<00:00,  7.20s/it]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

{'eval_loss': 0.5021293759346008, 'eval_f1': 0.8980716253443526, 'eval_runtime': 29.4923, 'eval_samples_per_second': 11.291, 'eval_steps_per_second': 0.203, 'epoch': 10.0}


100%|██████████| 210/210 [14:09<00:00,  4.05s/it]

{'train_runtime': 849.6029, 'train_samples_per_second': 15.654, 'train_steps_per_second': 0.247, 'train_loss': 0.526094118754069, 'epoch': 10.0}





TrainOutput(global_step=210, training_loss=0.526094118754069, metrics={'train_runtime': 849.6029, 'train_samples_per_second': 15.654, 'train_steps_per_second': 0.247, 'train_loss': 0.526094118754069, 'epoch': 10.0})

In [11]:
model

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Li

In [23]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
import torch.nn.functional as F

text = "이름이 'T'로 시작하는 쥬라기 시대의 공룡 종을 찾습니다."
base_model = "monologg/koelectra-small-v3-discriminator"
model = AutoModelForSequenceClassification.from_pretrained(
    "WinF/stackorderflow-filter-v1",
    num_labels=2,
)
tokenizer = AutoTokenizer.from_pretrained(base_model)
inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits.detach().cpu()
pr = F.softmax(logits).numpy()
arg = np.argmax(pr, axis=1)
print(logits)
print(pr)
print(int(arg))
if int(arg) == 0 and (pr[0][0] >= 0.98).all():
    print("관련된 질문입니다.")
else:
    print("관련된 질문이 아닙니다.")

tensor([[ 0.0521, -0.0423]])
[[0.5235834  0.47641668]]
0
관련된 질문이 아닙니다.


  pr = F.softmax(logits).numpy()
  print(int(arg))
  if int(arg) == 0 and (pr[0][0] >= 0.98).all():
