In [23]:
import os
import pickle as pickle
from pydoc import locate
from datetime import datetime

import pandas as pd

import torch
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

import data_loaders.data_loader as dataloader
from data_loaders.data_loader import MyDataCollatorWithPadding
import utils.util as utils
from omegaconf import OmegaConf

In [27]:
# Load the run configuration from YAML; later cells read `conf.model.model_name`
# and the dataset locations under `conf.path` from it.
# (A plain string literal is used: the former f-string had no placeholders.)
conf = OmegaConf.load("./config/base_config.yaml")

In [31]:

# Use the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = conf.model.model_name
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
data_collator = MyDataCollatorWithPadding(tokenizer=tokenizer)

# Sequence-classification model with a 3-way output head.
# NOTE(review): the captured load warnings report the classifier weights as
# "newly initialized", i.e. this loads the base checkpoint rather than a
# fine-tuned one, so predictions below would come from an untrained head.
# Confirm whether `conf.model.model_name` should point at a trained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# (The bare `model.parameters` expression that used to sit here was a no-op:
# it neither called the method nor displayed anything mid-cell.)
model.to(device)
model.eval()  # inference mode: disables dropout
print("model")

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classif

model


In [37]:
## Load the prediction and test datasets (both are built with the tokenizer above).
data_paths = conf.path
RE_predict_dataset = dataloader.load_predict_dataset(tokenizer, data_paths.predict_path, conf)
RE_test_dataset = dataloader.load_dataset(tokenizer, data_paths.test_path, conf)

tokenizing:   0%|          | 0/4846 [00:00<?, ?it/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
        labels                                           sentence  \
0      neutral  According to Gran, the company has no plans to...   
1      neutral  Technopolis plans to develop in stages an area...   
2     negative  The international electronic industry company ...   
3     positive  With the new production plant the company woul...   
4     positive  According to the company's updated strategy fo...   
...        ...                                                ...   
4841  negative  LONDON MarketWatch -- Share prices ended lower...   
4842   neutral  Rinkuskiai's beer sales fell by 6.5 per cent t...   
4843  negative  Operating profit fell to EUR 35.4 mn from EUR ...   
4844  negative  Net sales of the Paper segment decreased to EU...   
4845  negative  Sales in Finland decreased by 10.5 % in Januar...   

                                           kor_sentence  
0     Gran에 따르면, 그 회사는 회사가 성장하고 있는 곳이지만, 모든 생산을 러시아로

tokenizing:   0%|          | 0/4846 [00:00<?, ?it/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


{'input_ids': tensor([    0, 15895, 17664,  2008,  2039,  4586,  7052, 15053,  3947,    16,
          5062, 20793, 22058, 13866,  2041, 19665, 16434, 16654,  7052,    80,
         15176, 21058, 31138,  4596,  7052,    54, 23559,  7856,    16, 13532,
          7088, 24899, 16845, 11376, 14092, 16387,  5062, 20793, 22058, 11376,
         23246,  5936,  4586,    18,     2]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'labels': tensor(0)}

In [41]:
# Display one test example (index 4) to sanity-check the fields the dataset yields.
RE_test_dataset[4]

{'input_ids': tensor([    0, 15895, 17664,  2008,  2039,  4586,  7052,  5062, 20793, 22058,
            11,    86, 19763,  2039, 22079,  7982, 27064,  2076,  8672,  5062,
            92, 10220,  2041,  4877,    17,  4708,    16, 29432,  2175, 14452,
            87,  4092,  8398, 13077,    68,    79,  9971,    17,    87, 19131,
         17640,    86, 18481,  2041, 23246,  5936,  7088,  5011,  5062,    85,
         21349,  4868,  3619,     9,    17,  4064,     9, 13412, 11171, 17630,
          3762, 24047, 12862, 30421, 27669,  2064,  3898,  4868,  3633,     9,
            17,  3619,     9,  4868, 17640,    86, 18481,  2041,    18,     2]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [43]:
# Inference-only Trainer configuration: no training, prediction enabled,
# one example per device per batch, and no dropping of a trailing partial batch.
test_args = TrainingArguments(
    output_dir="./prediction",
    do_train=False,
    do_predict=True,
    per_device_eval_batch_size=1,
    dataloader_drop_last=False,
)

# The Trainer wraps the model together with the padding collator and the
# project's metric function so that `predict` / `evaluate` can be called on it.
trainer = Trainer(
    model=model,
    args=test_args,
    data_collator=data_collator,
    compute_metrics=utils.compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [44]:
# Run the model over the test split and turn its raw predictions into
# class probabilities and predicted class indices.
# NOTE(review): the saved run of this cell aborted with a CUDA
# "device-side assert triggered" error. That is commonly an out-of-range index
# (e.g. a label id >= num_labels, or a token/position id beyond the embedding
# table) but the cause is not visible here. Re-run on CPU or with
# CUDA_LAUNCH_BLOCKING=1 to locate the failing op. TODO: investigate.
outputs = trainer.predict(RE_test_dataset)

# `as_tensor` replaces the legacy `torch.FloatTensor(...)` constructor. The
# tensor is created on the CPU without gradient tracking, so the former
# `.detach().cpu()` calls were redundant, and `dim=` is the native torch
# keyword (previously the numpy-style `axis=` alias was used for argmax).
logits = torch.as_tensor(outputs.predictions, dtype=torch.float32)
prob = F.softmax(logits, dim=-1).numpy()
result = torch.argmax(logits, dim=-1).numpy()

# Map class indices to label names via the project helper; keep the
# per-class probabilities as plain Python lists for the CSV column.
pred_answer = utils.num_to_label(result.tolist())
output_prob = prob.tolist()

# Attach the predictions to the source CSV. pandas raises ValueError if the
# number of predictions does not match the number of rows in the file.
output = pd.read_csv("./dataset/finance_data.csv")
output["pred_label"] = pred_answer
output["probs"] = output_prob

***** Running Prediction *****
  Num examples = 4846
  Batch size = 1
THCudaCheck FAIL file=../aten/src/THC/THCCachingHostAllocator.cpp line=280 error=710 : device-side assert triggered


RuntimeError: cuda runtime error (710) : device-side assert triggered at ../aten/src/THC/THCCachingHostAllocator.cpp:280

In [None]:

# Record the inference start time; it is embedded in the output file names.
now = datetime.now()
inference_start_time = now.strftime("%d-%H-%M")

# `path` is not assigned in any visible cell, so using it unconditionally raised
# NameError before anything was saved. Treat it as an optional extra output
# directory: when an earlier cell does define `path`, a copy is still written
# there exactly as before; otherwise only ./prediction is used.
extra_output_dir = globals().get("path")


def _save_predictions(frame, filename):
    """Save `frame` as CSV (without the index) under ./prediction.

    Args:
        frame: DataFrame holding the predictions to persist.
        filename: Bare file name, e.g. "submission_<timestamp>.csv".

    A second copy is written to `extra_output_dir` when that is set.
    """
    if extra_output_dir is not None:
        frame.to_csv(os.path.join(extra_output_dir, filename), index=False)
    # `to_csv` does not create missing directories, so make sure the target exists.
    os.makedirs("./prediction", exist_ok=True)
    frame.to_csv(f"./prediction/{filename}", index=False)


# Check the test score
predict_dev = True  # get prediction results for the dev set (for output analysis)
predict_submit = False  # only evaluate on the dev set, then produce the results to submit
if predict_dev:
    # `output` is the prediction DataFrame built by the earlier prediction cell.
    # Save the final predicted labels as a CSV file.
    _save_predictions(output, f"dev_submission_{inference_start_time}.csv")
if predict_submit:
    metrics = trainer.evaluate(RE_test_dataset)
    print("Training is complete!")
    print("==================== Test metric score ====================")
    print("eval loss: ", metrics["eval_loss"])
    print("eval auprc: ", metrics["eval_auprc"])
    print("eval micro f1 score: ", metrics["eval_micro f1 score"])

    # Predict on the submission dataset and convert the raw predictions into
    # class probabilities and predicted class indices.
    outputs = trainer.predict(RE_predict_dataset)
    logits = torch.as_tensor(outputs.predictions, dtype=torch.float32)
    prob = F.softmax(logits, dim=-1).numpy()
    result = torch.argmax(logits, dim=-1).numpy()

    pred_answer = utils.num_to_label(result.tolist())
    output_prob = prob.tolist()

    # Fill the sample submission file with the predicted labels and probabilities.
    output = pd.read_csv("./prediction/sample_submission.csv")
    output["pred_label"] = pred_answer
    output["probs"] = output_prob

    # Save the final predicted labels as a CSV file.
    _save_predictions(output, f"submission_{inference_start_time}.csv")
#### Required!! ##############################################
print("==================== Inference finish! ====================")