## Import

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import random
import os

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from torch.optim import Adam

import matplotlib as mpl
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
device

device(type='cuda')

## Hyperparameter Setting

In [None]:
# Training hyperparameters, looked up by key throughout the notebook.
CFG = dict(
    EPOCHS=2,            # number of passes over the training split
    LEARNING_RATE=1e-6,  # learning rate handed to the optimizer
    BATCH_SIZE=4,        # samples per DataLoader batch
    SEED=2023,           # value passed to seed_everything
)

## Fix Random Seed

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook uses.

    Seeds Python's `random`, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.

    Note:
        PYTHONHASHSEED is exported for completeness; Python reads it at
        interpreter start-up, so it only affects processes launched afterwards.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU (not just the current one); a silent no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick kernels per run, which can
    # differ between runs and undermines the deterministic flag above.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Fix the random seeds using the configured SEED

## Load Data

In [None]:
# Colab-only: mount Google Drive so the files under /content/drive/MyDrive
# that this notebook reads and writes are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the train and test CSV files from the mounted Drive data folder.
train = pd.read_csv('/content/drive/MyDrive/data/train.csv')
test = pd.read_csv('/content/drive/MyDrive/data/test.csv')

In [None]:
train

Unnamed: 0,ID,문장,유형,극성,시제,확실성,label
0,TRAIN_00000,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실
1,TRAIN_00001,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,사실형,긍정,과거,확실,사실형-긍정-과거-확실
2,TRAIN_00002,정부가 고유가 대응을 위해 7월부터 연말까지 유류세 인하 폭을 30%에서 37%까지...,사실형,긍정,미래,확실,사실형-긍정-미래-확실
3,TRAIN_00003,"서울시는 올해 3월 즉시 견인 유예시간 60분을 제공하겠다고 밝혔지만, 하루 만에 ...",사실형,긍정,과거,확실,사실형-긍정-과거-확실
4,TRAIN_00004,익사한 자는 사다리에 태워 거꾸로 놓고 소금으로 코를 막아 가득 채운다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실
...,...,...,...,...,...,...,...
16536,TRAIN_16536,"＇신동덤＇은 ＇신비한 동물사전＇과 ＇해리 포터＇ 시리즈를 잇는 마법 어드벤처물로, ...",사실형,긍정,과거,확실,사실형-긍정-과거-확실
16537,TRAIN_16537,"수족냉증은 어릴 때부터 심했으며 관절은 어디 한 곳이 아니고 목, 어깨, 팔꿈치, ...",사실형,긍정,과거,확실,사실형-긍정-과거-확실
16538,TRAIN_16538,김금희 소설가는 ＂계약서 조정이 그리 어려운가 작가를 격려한다면서 그런 문구 하나 ...,사실형,긍정,과거,확실,사실형-긍정-과거-확실
16539,TRAIN_16539,1만명이 넘는 방문자수를 기록한 이번 전시회는 총 77개 작품을 넥슨 사옥을 그대로...,사실형,긍정,과거,불확실,사실형-긍정-과거-불확실


## Label encoding

In [None]:
# Replace the '유형' strings with integer class ids. `le1` is kept so the
# predicted ids can be mapped back to the original strings later.
le1 = LabelEncoder()
train['유형'] = le1.fit_transform(train['유형'])

In [None]:
# Replace the '극성' strings with integer class ids. `le2` is kept so the
# predicted ids can be mapped back to the original strings later.
le2 = LabelEncoder()
train['극성'] = le2.fit_transform(train['극성'])

In [None]:
# Replace the '시제' strings with integer class ids. `le3` is kept so the
# predicted ids can be mapped back to the original strings later.
le3 = LabelEncoder()
train['시제'] = le3.fit_transform(train['시제'])

In [None]:
# Replace the '확실성' strings with integer class ids. `le4` is kept so the
# predicted ids can be mapped back to the original strings later.
le4 = LabelEncoder()
train['확실성'] = le4.fit_transform(train['확실성'])

## Train/Validation split

In [None]:
# Rough ~8:2 hold-out split by position (no shuffling): rows from position
# 13000 onward become the validation set, the first 13000 stay for training.
# `valid` must be taken first because `train` is rebound on the next line.
valid = train.iloc[13000:].reset_index(drop=True)
train = train.iloc[:13000].reset_index(drop=True)

train_len, val_len = len(train), len(valid)

print(train_len, val_len, sep='\n')

13000
3541


## Define Tokenizer

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m63.7 MB/s[0m eta [36m0:00:0

In [None]:
from transformers import AutoModel, AutoTokenizer

# One checkpoint id shared by both loads so the encoder and tokenizer match.
PRETRAINED_CHECKPOINT = 'klue/roberta-large'

# NOTE: despite its name, `model_name` holds the loaded model object itself,
# not a string; `tokenizers` holds the matching tokenizer.
model_name = AutoModel.from_pretrained(PRETRAINED_CHECKPOINT)
tokenizers = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

Downloading (…)lve/main/config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

## CustomDataset

In [None]:
class CustomDataset(Dataset):
    """Serve tokenized sentences (and, in train mode, their four labels).

    Args:
        data: DataFrame with a '문장' text column. In "train" mode it must
            also provide the '유형', '극성', '시제' and '확실성' label columns.
        mode: "train" returns the labels with each sample; any other value
            returns only the model inputs.
        tokenizer: Callable used to tokenize one sentence. Defaults to the
            notebook-level `tokenizers` object when omitted.
        max_length: Length every sample is padded/truncated to (default 512,
            the value previously hard-coded).
    """

    def __init__(self, data, mode = "train", tokenizer=None, max_length=512):
        self.dataset = data
        # Resolve the notebook-level tokenizer lazily so a custom one can be injected.
        self.tokenizer = tokenizer if tokenizer is not None else tokenizers
        self.mode = mode
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return `(input_ids, attention_mask[, type, polarity, tense, certainty])` for row `idx`."""
        # `idx` is a position (0..len-1), so use .iloc: a plain `series[idx]`
        # is a label lookup and fails or picks the wrong row when the frame's
        # index is not the default 0..n-1 range.
        text = self.dataset['문장'].iloc[idx]
        inputs = self.tokenizer(
            text,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
        )
        # The tokenizer output is indexed with [0] to take the first (only) row.
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        if self.mode == "train":
            st_type = self.dataset['유형'].iloc[idx]
            st_polarity = self.dataset['극성'].iloc[idx]
            st_tense = self.dataset['시제'].iloc[idx]
            st_certainty = self.dataset['확실성'].iloc[idx]
            return input_ids, attention_mask, st_type, st_polarity, st_tense, st_certainty
        else:
            return input_ids, attention_mask

In [None]:
# Wrap both splits as datasets. The validation split also uses mode="train"
# so that its labels are returned with each sample.
train = CustomDataset(train, mode="train")
valid = CustomDataset(valid, mode="train")

# Only the training batches are shuffled; validation keeps its row order.
train_dataloader = DataLoader(train, batch_size=CFG['BATCH_SIZE'], shuffle=True)
val_dataloader = DataLoader(valid, batch_size=CFG['BATCH_SIZE'], shuffle=False)

## Define Model

In [None]:
class BaseModel(nn.Module):
    """Shared text encoder with four independent classification heads.

    The heads predict sentence type (4 classes), polarity (3), tense (3) and
    certainty (2) from the encoder's pooled output.

    Args:
        dropout: Dropout probability applied before each head's linear layer.
            This argument used to be ignored (the heads were fixed at 0.3);
            it is now honoured, and its default is 0.3 so that `BaseModel()`
            builds exactly the same network as before.
        encoder: Encoder module to wrap. Defaults to the notebook-level
            pretrained model (`model_name`) when omitted.
    """

    def __init__(self, dropout=0.3, encoder=None):

        super(BaseModel, self).__init__()

        self.nlp_model = encoder if encoder is not None else model_name
        # Size the heads from the encoder's config when it exposes one; fall
        # back to 1024, the value that was previously hard-coded.
        hidden_size = getattr(getattr(self.nlp_model, 'config', None), 'hidden_size', 1024)

        # Attribute names and the (Dropout, Linear) layout are unchanged so
        # previously saved checkpoints keep the same parameter keys.
        self.type_classifier = self._make_head(hidden_size, 4, dropout)
        self.polarity_classifier = self._make_head(hidden_size, 3, dropout)
        self.tense_classifier = self._make_head(hidden_size, 3, dropout)
        self.certainty_classifier = self._make_head(hidden_size, 2, dropout)

    @staticmethod
    def _make_head(in_features, num_classes, dropout):
        """Build one classification head: dropout followed by a linear layer."""
        return nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(in_features=in_features, out_features=num_classes),
        )

    def forward(self, input_id, mask):
        """Return `(type, polarity, tense, certainty)` logits for a batch.

        Args:
            input_id: Token ids passed to the encoder as `input_ids`.
            mask: Attention mask passed to the encoder as `attention_mask`.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output) feeds the heads.
        _, pooled_output = self.nlp_model(input_ids=input_id, attention_mask=mask, return_dict=False)
        type_output = self.type_classifier(pooled_output)
        polarity_output = self.polarity_classifier(pooled_output)
        tense_output = self.tense_classifier(pooled_output)
        certainty_output = self.certainty_classifier(pooled_output)

        return type_output, polarity_output, tense_output, certainty_output

## Train

In [None]:
def train(model, optimizer, train_dataloader, val_dataloader, scheduler, device,
          epochs=None, save_path="/content/drive/MyDrive/Allineone-KOR-best-model2.pth"):
    """Train the multi-head classifier and return it with its best weights.

    After every epoch the model is evaluated with `validation`. Whenever the
    validation loss improves, the weights are snapshotted and the whole model
    object is saved to `save_path`.

    Args:
        model: Module returning (type, polarity, tense, certainty) logits.
        optimizer: Optimizer over `model`'s parameters.
        train_dataloader: Yields (input_ids, attention_mask, type, polarity,
            tense, certainty) batches.
        val_dataloader: Same batch layout, used for validation.
        scheduler: Optional scheduler stepped with the validation loss (the
            `step(metric)` form used by ReduceLROnPlateau), or None.
        device: Device to train on.
        epochs: Number of epochs; defaults to CFG['EPOCHS'] when None.
        save_path: Where the best model object is written.

    Returns:
        `model` with the weights of the best-validation-loss epoch loaded, or
        None if no epoch improved on the initial loss (e.g. zero epochs).
    """
    n_epochs = CFG['EPOCHS'] if epochs is None else epochs

    model.to(device)

    criterion = {
        task: nn.CrossEntropyLoss().to(device)
        for task in ('type', 'polarity', 'tense', 'certainty')
    }

    best_loss = float('inf')
    # Holding a reference to `model` would just alias the live, still-training
    # network, so keep a CPU copy of the best weights instead.
    best_state = None

    for epoch in range(1, n_epochs + 1):
        model.train()
        train_loss = []

        for sentence, attention_mask, type_label, polarity_label, tense_label, certainty_label in tqdm(train_dataloader):
            sentence = sentence.to(device)
            mask = attention_mask.to(device)
            # CrossEntropyLoss expects int64 class indices.
            type_label = type_label.long().to(device)
            polarity_label = polarity_label.long().to(device)
            tense_label = tense_label.long().to(device)
            certainty_label = certainty_label.long().to(device)

            optimizer.zero_grad()

            type_logit, polarity_logit, tense_logit, certainty_logit = model(sentence, mask)

            # Equal-weight average of the four task losses.
            loss = 0.25 * criterion['type'](type_logit, type_label) + \
                    0.25 * criterion['polarity'](polarity_logit, polarity_label) + \
                    0.25 * criterion['tense'](tense_logit, tense_label) + \
                    0.25 * criterion['certainty'](certainty_logit, certainty_label)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        val_loss, val_type_f1, val_polarity_f1, val_tense_f1, val_certainty_f1 = validation(model, val_dataloader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}] 유형 F1 : [{val_type_f1:.5f}] 극성 F1 : [{val_polarity_f1:.5f}] 시제 F1 : [{val_tense_f1:.5f}] 확실성 F1 : [{val_certainty_f1:.5f}]')

        if scheduler is not None:
            scheduler.step(val_loss)

        if best_loss > val_loss:
            best_loss = val_loss
            best_state = {
                key: tensor.detach().cpu().clone()
                for key, tensor in model.state_dict().items()
            }
            # The whole model object is pickled (not just a state_dict) so the
            # file can be loaded directly with torch.load().
            torch.save(model, save_path)
            print('Model saved!')

    if best_state is None:
        return None

    # Roll back to the best epoch's weights; later epochs may have been worse.
    model.load_state_dict(best_state)
    return model

In [None]:
def validation(model, val_dataloader, criterion, device):
    """Evaluate the model on the validation batches.

    Args:
        model: Module returning (type, polarity, tense, certainty) logits.
        val_dataloader: Yields (input_ids, attention_mask, type, polarity,
            tense, certainty) batches.
        criterion: Dict of loss functions keyed by 'type', 'polarity',
            'tense' and 'certainty'.
        device: Device the batches are moved to.

    Returns:
        Tuple of (mean batch loss, type F1, polarity F1, tense F1,
        certainty F1); every F1 is weighted-averaged.
    """
    tasks = ('type', 'polarity', 'tense', 'certainty')

    model.eval()
    batch_losses = []
    predicted = {task: [] for task in tasks}
    expected = {task: [] for task in tasks}

    with torch.no_grad():
        for batch in tqdm(iter(val_dataloader)):
            input_ids, attention_mask, y_type, y_polarity, y_tense, y_certainty = batch
            input_ids = input_ids.to(device)
            targets = {
                'type': y_type.long().to(device),
                'polarity': y_polarity.long().to(device),
                'tense': y_tense.long().to(device),
                'certainty': y_certainty.long().to(device),
            }
            attention_mask = attention_mask.to(device)

            out_type, out_polarity, out_tense, out_certainty = model(input_ids, attention_mask)
            logits = {
                'type': out_type,
                'polarity': out_polarity,
                'tense': out_tense,
                'certainty': out_certainty,
            }

            # Equal-weight sum of the four task losses, accumulated in task order.
            loss = None
            for task in tasks:
                term = 0.25 * criterion[task](logits[task], targets[task])
                loss = term if loss is None else loss + term
            batch_losses.append(loss.item())

            for task in tasks:
                predicted[task].extend(logits[task].argmax(1).tolist())
                expected[task].extend(targets[task].tolist())

    scores = [
        f1_score(expected[task], predicted[task], average='weighted')
        for task in tasks
    ]

    return (np.mean(batch_losses), *scores)

## Run!!

In [None]:
model = BaseModel()
# train() calls model.train() at the start of every epoch, so this eval()
# only affects the model's mode before training begins.
model.eval()

optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=CFG["LEARNING_RATE"],
)

# Halve the learning rate once the monitored value (train() passes the
# validation loss) has stopped improving for more than `patience` epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
    threshold_mode='abs',
    min_lr=1e-8,
    verbose=True,
)

infer_model = train(model, optimizer, train_dataloader, val_dataloader, scheduler, device)

100%|██████████| 3250/3250 [1:09:03<00:00,  1.27s/it]
100%|██████████| 886/886 [05:56<00:00,  2.49it/s]


Epoch : [1] Train Loss : [0.36271] Val Loss : [0.25327] 유형 F1 : [0.87343] 극성 F1 : [0.93024] 시제 F1 : [0.88626] 확실성 F1 : [0.92409]
Model saved!


100%|██████████| 3250/3250 [1:09:01<00:00,  1.27s/it]
100%|██████████| 886/886 [05:56<00:00,  2.49it/s]


Epoch : [2] Train Loss : [0.22896] Val Loss : [0.21791] 유형 F1 : [0.86928] 극성 F1 : [0.96775] 시제 F1 : [0.89170] 확실성 F1 : [0.91658]
Model saved!


## Inference

In [None]:
# SECURITY: torch.load unpickles arbitrary Python objects; only load
# checkpoint files you created yourself.
# The training loop saved the whole model object (not a state_dict), so full
# unpickling (weights_only=False) is required. map_location remaps the stored
# tensors onto the current device, so the file also loads on CPU-only machines.
infer_model = torch.load(
    "/content/drive/MyDrive/Allineone-KOR-best-model2.pth",
    map_location=device,
    weights_only=False,
)

In [None]:
# Re-read the test CSV (the same file loaded in the Data Load section) so
# this inference section starts from a fresh DataFrame.
test = pd.read_csv('/content/drive/MyDrive/data/test.csv')

In [None]:
# mode="test" makes the dataset return only (input_ids, attention_mask).
test = CustomDataset(test, mode="test")
# shuffle=False keeps predictions in the same order as the rows.
test_dataloader = DataLoader(test, batch_size=CFG['BATCH_SIZE'], shuffle=False)

In [None]:
def inference(model, test_loader, device):
    """Predict class ids for every sample yielded by `test_loader`.

    Args:
        model: Module returning (type, polarity, tense, certainty) logits.
        test_loader: Yields (input_ids, attention_mask) batches.
        device: Device the model and batches are moved to.

    Returns:
        Four lists of predicted class indices, in the order
        (type, polarity, tense, certainty), aligned with the loader's order.
    """
    model.to(device)
    model.eval()

    # One bucket per head, in the order the model returns its logits.
    buckets = ([], [], [], [])

    with torch.no_grad():
        for input_ids, attention_mask in tqdm(test_loader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            type_logit, polarity_logit, tense_logit, certainty_logit = model(input_ids, attention_mask)

            head_logits = (type_logit, polarity_logit, tense_logit, certainty_logit)
            for bucket, logit in zip(buckets, head_logits):
                # argmax over dim 1 picks the highest-scoring class per sample.
                bucket.extend(logit.argmax(1).tolist())

    type_preds, polarity_preds, tense_preds, certainty_preds = buckets
    return type_preds, polarity_preds, tense_preds, certainty_preds

In [None]:
# Run the loaded model over the test batches; each result is a list of
# integer class ids, one per test row.
type_preds, polarity_preds, tense_preds, certainty_preds = inference(infer_model, test_dataloader, device)

100%|██████████| 1773/1773 [11:50<00:00,  2.49it/s]


In [None]:
# Map the predicted class ids back to the original label values, using the
# encoder that was fitted for each column.
type_preds, polarity_preds, tense_preds, certainty_preds = (
    encoder.inverse_transform(class_ids)
    for encoder, class_ids in zip(
        (le1, le2, le3, le4),
        (type_preds, polarity_preds, tense_preds, certainty_preds),
    )
)

In [None]:
# Join the four decoded parts of each row into one 'type-polarity-tense-certainty' string.
predictions = [
    '-'.join(parts)
    for parts in zip(type_preds, polarity_preds, tense_preds, certainty_preds)
]

## Submit

In [None]:
# Load the sample submission file and preview its first rows.
submit = pd.read_csv('/content/drive/MyDrive/data/sample_submission.csv')
submit.head(15)

Unnamed: 0,ID,label
0,TEST_0000,추론형-긍정-현재-확실
1,TEST_0001,추론형-긍정-현재-확실
2,TEST_0002,추론형-긍정-현재-확실
3,TEST_0003,추론형-긍정-현재-확실
4,TEST_0004,추론형-긍정-현재-확실
5,TEST_0005,추론형-긍정-현재-확실
6,TEST_0006,추론형-긍정-현재-확실
7,TEST_0007,추론형-긍정-현재-확실
8,TEST_0008,추론형-긍정-현재-확실
9,TEST_0009,추론형-긍정-현재-확실


In [None]:
# Overwrite the 'label' column with the model's predictions (assigned by
# position, so `predictions` must be in the same row order as `submit`).
submit['label'] = predictions
submit.head()

Unnamed: 0,ID,label
0,TEST_0000,사실형-긍정-현재-확실
1,TEST_0001,사실형-긍정-현재-확실
2,TEST_0002,사실형-긍정-과거-확실
3,TEST_0003,사실형-긍정-과거-확실
4,TEST_0004,사실형-긍정-과거-확실


In [None]:
# Write the submission without the DataFrame index column.
submit.to_csv('/content/drive/MyDrive/submit_allinone_kor2.csv', index=False)

In [None]:
# Read the written file back to check what was saved.
pd.read_csv('/content/drive/MyDrive/submit_allinone_kor2.csv')

Unnamed: 0,ID,label
0,TEST_0000,사실형-긍정-현재-확실
1,TEST_0001,사실형-긍정-현재-확실
2,TEST_0002,사실형-긍정-과거-확실
3,TEST_0003,사실형-긍정-과거-확실
4,TEST_0004,사실형-긍정-과거-확실
...,...,...
7085,TEST_7085,사실형-긍정-현재-확실
7086,TEST_7086,추론형-부정-현재-확실
7087,TEST_7087,사실형-긍정-현재-확실
7088,TEST_7088,추론형-긍정-현재-확실
