In [34]:
# Load libraries
import os
# Switch the working directory so the relative paths used by later cells
# ('./data/test.json', 'submissions/...') resolve against the project folder.
# NOTE(review): hardcoded absolute path — only valid where this directory exists.
os.chdir('/aiffel/aiffel/dlthon-minions/dlthon-stuart')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
import json

import re

In [2]:
# Hugging Face checkpoint id; the tokenizer and the BART backbone are both
# loaded from this name in later cells.
PRETRAINED_MODEL = "gogamza/kobart-base-v2"

# Read the test set and flip rows/columns. Later cells read a 'text' column
# and use the frame's index as the file names.
submission_data = pd.read_json('./data/test.json').T

In [3]:
# 전처리
def preprocess_sentence(sentence):
    """Clean one raw conversation string before tokenisation.

    Three regex substitutions are applied in order:
      1. every newline becomes a single space;
      2. every character that is not Hangul jamo (ㄱ-ㅣ), a Hangul syllable
         (가-힣), one of ``. ? ! ,`` or a space is removed;
      3. each remaining ``? . ! ,`` is wrapped in one space on each side.

    Runs of spaces created along the way are left as they are.

    Args:
        sentence: the text to clean.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs; the order matters because step 3 relies on
    # step 2 having already dropped every other symbol.
    substitutions = (
        ("\n", " "),
        ("[^ㄱ-ㅣ가-힣.?!, ]", ""),
        (r"([?.!,])", r" \1 "),
    )
    for pattern, replacement in substitutions:
        sentence = re.sub(pattern, replacement, sentence)
    return sentence


# Clean every test conversation; the result is a plain Python list in row order.
preprocessed = [preprocess_sentence(text) for text in submission_data['text']]

In [4]:
import torch
from torch import nn
from transformers import BartModel, BartConfig

# BART-based sequence classifier
class BartForSequenceClassification(nn.Module):
    """Pretrained BART backbone followed by a dropout + linear classification head.

    Attributes:
        num_labels: number of output classes.
        config: ``BartConfig`` loaded from ``PRETRAINED_MODEL``.
        bart: ``BartModel`` loaded from ``PRETRAINED_MODEL``.
        classifier: ``nn.Sequential(Dropout(0.1), Linear(hidden_size, num_labels))``.
    """

    def __init__(self, num_labels):
        """Load the pretrained backbone and build the classification head.

        Args:
            num_labels: number of classes the head should output.
        """
        super().__init__()
        self.num_labels = num_labels
        self.config = BartConfig.from_pretrained(PRETRAINED_MODEL, num_labels=num_labels)
        self.bart = BartModel.from_pretrained(PRETRAINED_MODEL, config=self.config)
        # Layer order fixes the sub-module indices (0 = dropout, 1 = linear),
        # which are part of the state-dict key names.
        head_layers = [
            nn.Dropout(0.1),
            nn.Linear(self.config.hidden_size, num_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, *args, token_type_ids=None, labels=None, **kwargs):
        """Run the backbone and classify from its last sequence position.

        Args:
            *args, **kwargs: passed through unchanged to ``self.bart``.
            token_type_ids: accepted so callers may pass it, but never
                forwarded to ``self.bart``.
            labels: optional class indices; when given, a cross-entropy loss
                is computed against the logits.

        Returns:
            ``(loss, logits)``; ``loss`` is ``None`` when ``labels`` is ``None``.
        """
        backbone_out = self.bart(*args, **kwargs)
        # Take the first backbone output at the last index of its second axis
        # (assumed to be (batch, seq, hidden) — TODO confirm).
        # NOTE(review): if inputs are right-padded to a fixed length, this
        # position is a padding slot rather than the final real token — confirm
        # this matches how the checkpoint was trained.
        last_position = backbone_out[0][:, -1, :]
        logits = self.classifier(last_position)

        if labels is None:
            return None, logits

        loss = nn.functional.cross_entropy(
            logits.view(-1, self.num_labels), labels.view(-1)
        )
        return loss, logits

In [6]:

# Cap every conversation at 128 tokens
MAX_LENGTH = 128

from transformers import BartTokenizerFast, PreTrainedTokenizerFast
# Load the tokenizer that belongs to the pretrained checkpoint
tokenizer = PreTrainedTokenizerFast.from_pretrained(PRETRAINED_MODEL)

# Encode all cleaned conversations in a single call. Every sequence is
# truncated or padded to exactly MAX_LENGTH and returned as PyTorch tensors.
encode_options = dict(
    max_length=MAX_LENGTH,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)
tokenized = tokenizer(preprocessed, **encode_options)

In [8]:
from torch.utils.data import Dataset, StackDataset

# Wrap the tokenizer output: each field of `tokenized` is passed to
# StackDataset as a keyword argument under its own name.
dataset = StackDataset(**{field: tensor for field, tensor in tokenized.items()})

In [10]:
from torch.utils.data import DataLoader

# Batches are served in the original row order (no shuffling), so the
# predictions can later be paired with the rows of the input frame.
PRED_BATCH_SIZE = 16
pred_loader = DataLoader(dataset, shuffle=False, batch_size=PRED_BATCH_SIZE)

In [11]:
import wandb

# This cell's output was flooded with huggingface/tokenizers warnings about the
# process being forked after the tokenizer had already used parallelism. The
# warning itself asks for TOKENIZERS_PARALLELISM to be set explicitly; do so
# before anything forks. setdefault keeps a value the user exported themselves.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Start a W&B run and fetch the fine-tuned checkpoint artifact to local disk.
run = wandb.init()
artifact = run.use_artifact('aiffel_minions/DLthon_finetune_koBart/model-epoch-best:v3', type='model')
# Local directory that contains the downloaded artifact files.
artifact_dir = artifact.download()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[34m[1mwandb[0m: Currently logged in as: [33mhojae-choi[0m ([33maiffel_minions[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Downloading large artifact model-epoch-best:v3, 472.60MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:5.6


In [12]:
import torch
import os
model_path = os.path.join(artifact_dir, 'model_epochbest.pth')

# Load the state dictionary.
# - map_location='cpu' lets the checkpoint load on a machine without the
#   device it was saved from.
# - weights_only=True restricts unpickling to tensors and plain containers
#   instead of running arbitrary pickled code from the downloaded file.
state_dict = torch.load(model_path, map_location='cpu', weights_only=True)

# Five output classes, matching the five labels of the class/type mapping
# defined in a later cell.
model = BartForSequenceClassification(5)

# Apply the state dictionary to the model (the returned key report is
# displayed as the cell output).
model.load_state_dict(state_dict)

<All keys matched successfully>

In [15]:
from tqdm import tqdm

# Run on the GPU when one is available and fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()  # Set the model to evaluation mode

predictions = []
with torch.no_grad():
    for batch in tqdm(pred_loader):
        # Each batch is a mapping of field name -> tensor; move it to the
        # same device as the model before the forward pass.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        # The model returns (loss, logits); keep only the logits and bring
        # them back to the CPU so later cells can call .numpy() on them.
        predictions.append(model(**batch)[1].cpu())

100%|██████████| 32/32 [02:19<00:00,  4.35s/it]


In [23]:
# Join the per-batch logits into one tensor, pick the highest-scoring class
# index for every row, and convert to a plain list of ints.
# NOTE(review): this rebinds `predictions` from a list of tensors to a list of
# ints, so the cell cannot be run twice without re-running the inference cell.
all_logits = torch.cat(predictions, dim=0)
predictions = all_logits.argmax(dim=1).numpy().tolist()

In [24]:
# Mapping from class name ('class') to integer label ('type').
# The names are, in label order: threat, extortion, workplace harassment,
# other harassment, and general conversation.
class_to_type = dict(zip(
    ('협박 대화', '갈취 대화', '직장 내 괴롭힘 대화', '기타 괴롭힘 대화', '일반 대화'),
    range(5),
))
# Inverse lookup: integer label -> class name.
type_to_class = {type_id: class_name for class_name, type_id in class_to_type.items()}

In [25]:
# Translate every predicted integer label into its class name.
# NOTE(review): no later visible cell reads `class_predictions`; the submission
# frame is built from the integer `predictions` — confirm that is intended.
class_predictions = [type_to_class[label_id] for label_id in predictions]

In [39]:
# Submission table: the predicted integer label in 'class', paired with the
# index of the input frame in 'file_name'.
pred_df = pd.DataFrame({
    'class': predictions,
    'file_name': submission_data.index,
})

In [40]:
# Show the prediction table as the cell output for a visual sanity check
# (the rendered output is abbreviated in the middle).
pred_df

Unnamed: 0,class,file_name
0,3,t_000
1,2,t_001
2,3,t_002
3,1,t_003
4,0,t_004
...,...,...
495,2,t_495
496,0,t_496
497,1,t_497
498,2,t_498


In [41]:
# Make sure the output folder exists first: to_csv does not create missing
# parent directories and would raise on a fresh checkout.
os.makedirs('submissions', exist_ok=True)
# Write the submission file without the positional row index.
pred_df.to_csv('submissions/submission_kobart_1.csv', index=False)