In [None]:
# roberta+lstm small = 0.2225
# roberta+base+lstm  small= 0.234 

In [None]:
import pandas as pd
import numpy as np
import torch
import os
import random
from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
from tqdm.auto import tqdm, trange
from konlpy.tag import Mecab

# for graphing
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
train_original = pd.read_csv('train.csv')
# Remove duplicates: first the four hand-picked IDs, then any repeated sentence
# (keeping its first occurrence). The ID column is not needed afterwards.
train_original = train_original.loc[
    ~train_original.ID.isin(['TRAIN_14989', 'TRAIN_03364', 'TRAIN_07099', 'TRAIN_02108'])
]
train_original = train_original.drop_duplicates('문장', keep='first').drop(columns=['ID'])

# The test sentences get the same ID-free layout; the sample submission is
# loaded unchanged.
test = pd.read_csv('test.csv').drop(columns=['ID'])
submission = pd.read_csv('sample_submission.csv')

In [None]:
 tagger = [
        "NNG", # 일반 명사
        "NNP", # 고유 명사
        "NNB", # 의존 명사
        "NNBC", # 단위를 나타내는 명사
        "NR", # 수사
        "NP", # 대명사
        "VV", # 동사
        "VA", # 형용사
        "VX", # 보조 용언
        "VCP", # 긍정 지정사
        "VCN",# 부정 지정사
        "MM", # 관형사
        "MAG", # 일반 부사
        "MAJ", # 접속 부사
        "IC", # 감탄사
        "JKC",
        "JKS", # 주격 조사
        "JKG", # 관형격 조사
        "JKO", # 목적격 조사
        "JKB", # 부사격 조사
        "JKV", # 호격 조사
        "JKQ", # 인용격 조사
        "JX", # 보조사
        "JC", # 접속 조사
        "EP", # 선어말 어미
        "EF", # 종결 어미
        "EC", # 연결 어미
        "ETN", # 명사형 전성 어미
        "ETM", # 관형형 전성 어미
        "XPN", # 체언 접두사
        "XSN", # 명사 파생 접미사
        "XSV", # 동사 파생 접미사
        "XSA", # 형용사 파생 접미사
        "XR",  # 어근
        'SF', # 마침표, 물음표, 느낌표
        "SE", # 줄임표
        "SSO", # 여는 괄호
        "SSC", # 닫는 괄호
        "SC", # 구분자
        "SY",
        "SL", # 외국어
        "SH", # 한자
        "SN", # 숫자
         "UNKNOWN",
         "NA"
        ]

In [None]:
def seed_everything(seed):
  """Seed every random number generator the notebook relies on.

  Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices),
  exports PYTHONHASHSEED, and configures cuDNN for deterministic execution.

  Args:
    seed: integer seed applied to all generators.
  """
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  # Seed every visible GPU, not only the current one; a no-op without CUDA.
  torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic = True
  # The cuDNN auto-tuner may pick different kernels from run to run, so it has
  # to be disabled for the deterministic flag above to give repeatable results.
  torch.backends.cudnn.benchmark = False

# Training hyper-parameters shared by the cells below.
CFG = {
    'EPOCHS': 5,
    'LEARNING_RATE': 2e-5,
    'BATCH_SIZE': 2,
    'SEED': 42
}

seed_everything(CFG['SEED'])  # fix the seed for reproducibility

# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA instead of failing at
# the first `.to(device)` call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# 80/20 hold-out split. The 'label' column is passed as the second array only
# so it is split alongside the frame; its two halves are discarded.
train, val, _, _ = train_test_split(
    train_original,
    train_original['label'],
    test_size=0.2,
    random_state=CFG['SEED'],
)
# Restart both indices at 0: the dataset class later indexes labels by position.
train, val = train.reset_index(drop=True), val.reset_index(drop=True)

In [None]:
# Checkpoint name shared by the tokenizer and the encoder.
# NOTE: `model` is rebound to the classifier instance in a later cell.
model = 'klue/roberta-small'
tokenizer = AutoTokenizer.from_pretrained(model)
base_model = AutoModel.from_pretrained(model)

In [None]:
class SentenceTypeDataset(Dataset):
  """Dataset of pre-tokenized sentences with optional one-hot labels.

  Every sentence in the '문장' column is tokenized once at construction time
  (padded/truncated to `max_length`) and kept in memory.

  Args:
    dataframe: frame with a '문장' column holding the raw sentences.
    tokenizer: callable invoked as
      `tokenizer(text, padding=..., max_length=..., truncation=..., return_tensors='pt')`.
    labels: optional dict with keys 'type', 'polarity', 'tense' and
      'certainty'; each value is indexed by row position and must yield a
      sequence of numbers. When omitted, placeholder targets filled with -1
      are returned.
    max_length: padding/truncation length handed to the tokenizer.
  """

  def __init__(self, dataframe, tokenizer, labels=None, max_length=512):
    texts = dataframe['문장'].values.tolist()

    self.texts = [
        tokenizer(text, padding='max_length', max_length=max_length, truncation=True, return_tensors='pt')
        for text in texts
    ]
    self.labels = labels

  def __len__(self):
    """Return the number of sentences."""
    return len(self.texts)

  def __getitem__(self, idx):
    """Return `(encoding, type, polarity, tense, certainty)` for row `idx`."""
    # The original also read `self.input_pos[idx]`, an attribute that is never
    # created, so every item access raised AttributeError. The value was not
    # used, so the lookup is simply removed.
    text = self.texts[idx]
    if self.labels is None:
      # Unlabelled data: dummy targets with the same widths as the real heads
      # (4 / 3 / 3 / 2) so batches keep the same tuple layout.
      return (text,
              torch.Tensor([-1, -1, -1, -1]),
              torch.Tensor([-1, -1, -1]),
              torch.Tensor([-1, -1, -1]),
              torch.Tensor([-1, -1]))

    return (text,
            torch.Tensor(self.labels['type'][idx]),
            torch.Tensor(self.labels['polarity'][idx]),
            torch.Tensor(self.labels['tense'][idx]),
            torch.Tensor(self.labels['certainty'][idx]))

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence
import torch.nn.functional as F

class SentenceClassifier(nn.Module):
  """Encoder + BiLSTM with four classification heads.

  The encoder's per-token hidden states are fed through a bidirectional LSTM;
  the LSTM output at the last time step is projected to 32 features, which
  feed four independent linear heads (type: 4 classes, polarity: 3,
  tense: 3, certainty: 2). The heads return raw logits (no softmax).

  Args:
    base_model: encoder module exposing `config.hidden_size` and returning
      the per-token hidden states as element 0 of its output when called
      with `input_ids=` and `attention_mask=`.
    input_size: kept for backward compatibility and stored on the instance;
      the LSTM input width is taken from `base_model.config.hidden_size` so
      it always matches the encoder.
  """

  def __init__(self, base_model, input_size=768):
    super().__init__()
    self.roberta = base_model  # from transformers package
    hidden_size = self.roberta.config.hidden_size
    self.input_size = input_size

    # LSTM over the encoder's token representations.
    self.BiLstm = nn.LSTM(input_size=hidden_size,
                          hidden_size=320,
                          num_layers=1,
                          batch_first=True,
                          bidirectional=True)
    # 320 * 2 because forward and backward LSTM outputs are concatenated.
    self.fc = nn.Sequential(nn.Dropout(0.5),
                            nn.Linear(320 * 2, 32),
                            nn.ReLU())
    # Output heads.
    self.type_clf = nn.Sequential(
        nn.Dropout(p=0.3),
        nn.Linear(in_features=32, out_features=4),
    )
    self.polarity_clf = nn.Sequential(
        nn.Dropout(p=0.3),
        nn.Linear(in_features=32, out_features=3),
    )
    self.tense_clf = nn.Sequential(
        nn.Dropout(p=0.3),
        nn.Linear(in_features=32, out_features=2 + 1),
    )
    self.certainty_clf = nn.Sequential(
        nn.Dropout(p=0.3),
        nn.Linear(in_features=32, out_features=2),
    )

  def forward(self, input_ids, attention_mask, pos_input=None):
    """Return `(type, polarity, tense, certainty)` logits for a batch.

    Args:
      input_ids: token ids, expected as (batch, seq_len).
      attention_mask: mask forwarded unchanged to the encoder.
      pos_input: unused. It is optional because every caller in this
        notebook invokes the model with two arguments only.
    """
    # Keep the whole token sequence. The original sliced out the first token
    # (`[:, 0]`), producing a 2-D tensor that the LSTM treats as one unbatched
    # sequence, after which the 3-D indexing below raised IndexError.
    sequence_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)[0]
    outputs, _ = self.BiLstm(sequence_out)
    # NOTE(review): with inputs padded to a fixed length the last time step
    # is usually a padding position; consider packing the sequence or
    # selecting the last unmasked step instead.
    outputs = outputs[:, -1, :]
    outputs = self.fc(outputs)

    type_output = self.type_clf(outputs)
    polarity_output = self.polarity_clf(outputs)
    tense_output = self.tense_clf(outputs)
    certainty_output = self.certainty_clf(outputs)

    return type_output, polarity_output, tense_output, certainty_output

In [None]:
from sklearn.metrics import f1_score

def _multitask_loss(criterion, outputs, labels):
    """Return the equally weighted (0.25 each) sum of the four task losses."""
    loss = 0
    for task, output, label in zip(('type', 'polarity', 'tense', 'certainty'), outputs, labels):
        loss = loss + 0.25 * criterion[task](output, label.float())
    return loss


def sentence_train(model, train_dataloader, val_dataloader, learning_rate, epochs, model_nm):
    """Fine-tune `model` on four classification tasks with early stopping.

    After every epoch the summed validation loss is compared with the best
    value so far. On improvement the whole model object is written to
    `{model_nm}.pt`; after two consecutive epochs without improvement
    training stops.

    Args:
        model: module called as `model(input_ids, attention_mask)` and
            returning `(type, polarity, tense, certainty)` logits.
        train_dataloader: yields `(encoding, type, polarity, tense, certainty)`
            batches, where `encoding` has 'input_ids' and 'attention_mask'.
        val_dataloader: same layout, used for validation.
        learning_rate: Adam learning rate.
        epochs: maximum number of epochs.
        model_nm: checkpoint file name without the '.pt' suffix.
    """
    best_val_loss = float('inf')
    early_stopping_threshold_count = 0

    tasks = ('type', 'polarity', 'tense', 'certainty')
    criterion = {task: nn.CrossEntropyLoss().to(device) for task in tasks}

    # Move the model before building the optimizer so the optimizer is created
    # against the parameters on their final device.
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        total_loss_train = 0

        model.train()
        for train_input, type_label, polarity_label, tense_label, certainty_label in tqdm(train_dataloader):
            # Each encoding carries an extra singleton dimension at position 1
            # after batching; drop it from BOTH tensors (the original squeezed
            # only input_ids, so the mask reached the model with a different rank).
            attention_mask = train_input['attention_mask'].squeeze(1).to(device)
            input_ids = train_input['input_ids'].squeeze(1).to(device)
            labels = tuple(label.to(device) for label in
                           (type_label, polarity_label, tense_label, certainty_label))

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)  # from the forward function
            loss = _multitask_loss(criterion, outputs, labels)
            total_loss_train += loss.item()

            loss.backward()
            optimizer.step()

        total_loss_val = 0
        model.eval()
        with torch.no_grad():  # no gradients are needed for validation
            for val_input, vtype_label, vpolarity_label, vtense_label, vcertainty_label in tqdm(val_dataloader):
                attention_mask = val_input['attention_mask'].squeeze(1).to(device)
                input_ids = val_input['input_ids'].squeeze(1).to(device)
                labels = tuple(label.to(device) for label in
                               (vtype_label, vpolarity_label, vtense_label, vcertainty_label))

                outputs = model(input_ids, attention_mask)  # from the forward function
                total_loss_val += _multitask_loss(criterion, outputs, labels).item()

        print(f"Epochs:{epoch + 1} "
              f"| Train Loss:{total_loss_train / len(train_dataloader): .3f}"
              f'| Val Loss: {total_loss_val / len(val_dataloader): .3f} '
              )

        if best_val_loss > total_loss_val:
            best_val_loss = total_loss_val  # keep only the best checkpoint
            # The full module is pickled (not just a state_dict) because the
            # inference cell restores it with a plain torch.load().
            torch.save(model, f"{model_nm}.pt")
            print("Saved model")
            early_stopping_threshold_count = 0
        else:
            # Count consecutive epochs in which the validation loss did not improve.
            early_stopping_threshold_count += 1

        if early_stopping_threshold_count >= 2:  # stop after 2 epochs without improvement
            print('Early stopping')
            break




In [None]:
# One-hot encode the four label columns of the training split; the sentence
# column stays first, followed by the dummy columns of each label in the
# order listed in `columns`.
train_tmp = pd.get_dummies(
    train.loc[:, ['문장', '유형', '극성', '시제', '확실성']],
    columns=['유형', '극성', '시제', '확실성'],
)
train_tmp

In [None]:
# Slice the one-hot columns per task by position (column 0 is the sentence).
# Assumes the encoded frame has 4 / 3 / 3 / 2 dummy columns for
# type / polarity / tense / certainty, in that order.
train_labels = dict(
    type=train_tmp.iloc[:, 1:5].values.tolist(),
    polarity=train_tmp.iloc[:, 5:8].values.tolist(),
    tense=train_tmp.iloc[:, 8:11].values.tolist(),
    certainty=train_tmp.iloc[:, 11:13].values.tolist(),
)

# Keep the individual names available as well.
train_type = train_labels['type']
train_polarity = train_labels['polarity']
train_tense = train_labels['tense']
train_certainty = train_labels['certainty']

In [None]:
val_tmp = val[['문장', '유형', '극성', '시제', '확실성']]
val_tmp = pd.get_dummies(val_tmp, columns=['유형', '극성', '시제', '확실성'])

# get_dummies only creates columns for classes that occur in this split. If the
# validation split lacks a class (or orders columns differently), the
# positional slices below would pick up the wrong columns without any error.
# Align to the training layout: missing classes become all-zero columns, and a
# class that training never saw is reported instead of being silently dropped.
unexpected_columns = val_tmp.columns.difference(train_tmp.columns)
if len(unexpected_columns) > 0:
    raise ValueError(f"Validation split has classes unseen in training: {list(unexpected_columns)}")
val_tmp = val_tmp.reindex(columns=train_tmp.columns, fill_value=0)

# Column 0 is the sentence; the rest are the per-task one-hot blocks.
val_type = val_tmp.iloc[:, 1:5].values.tolist()
val_polarity = val_tmp.iloc[:, 5:8].values.tolist()
val_tense = val_tmp.iloc[:, 8:11].values.tolist()
val_certainty = val_tmp.iloc[:, 11:13].values.tolist()
val_labels = {
    'type': val_type,
    'polarity': val_polarity,
    'tense': val_tense,
    'certainty': val_certainty
}

In [None]:
# Training batches are reshuffled every epoch; validation keeps file order.
train_dataloader = DataLoader(
    dataset=SentenceTypeDataset(train_tmp, tokenizer, train_labels),
    batch_size=CFG['BATCH_SIZE'],
    shuffle=True,
    num_workers=0,
)
val_dataloader = DataLoader(
    dataset=SentenceTypeDataset(val_tmp, tokenizer, val_labels),
    batch_size=CFG['BATCH_SIZE'],
    num_workers=0,
)

In [None]:
# Wrap the pretrained encoder in the multi-head classifier. `eval()` returns
# the module itself, so as the last expression of the cell it also displays
# the architecture; the training function switches back to train mode.
model = SentenceClassifier(base_model=base_model)
model.eval()

In [None]:
import gc

# Release unreferenced Python objects first, then hand cached CUDA blocks back
# to the driver, so training starts with as much free GPU memory as possible.
# (`torch` is already imported at the top of the notebook.)
gc.collect()
torch.cuda.empty_cache()


In [None]:
# Train with the configured hyper-parameters; the best checkpoint is written
# to 'roberta+lstm.pt'.
sentence_train(
    model,
    train_dataloader,
    val_dataloader,
    learning_rate=CFG['LEARNING_RATE'],
    epochs=CFG['EPOCHS'],
    model_nm='roberta+lstm',
)

In [None]:
def get_type_predictions(model, loader):
  """Run `model` over `loader` and collect the raw outputs of the four heads.

  Args:
    model: module called as `model(input_ids, attention_mask)` and returning
      `(type, polarity, tense, certainty)` outputs.
    loader: yields 5-tuples whose first element is an encoding with
      'input_ids' and 'attention_mask'; the remaining elements are ignored.

  Returns:
    Four NumPy arrays (type, polarity, tense, certainty), one row per sample
    in loader order. The values are whatever the model emits (no softmax is
    applied here).
  """
  # Fall back to the CPU when CUDA is unavailable instead of crashing.
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model = model.to(device)
  model.eval()

  type_probs, polarity_probs, tense_probs, clarity_probs = [], [], [], []
  with torch.no_grad():
    for data_input, _, _, _, _ in tqdm(loader):
      # Drop the singleton dimension from both tensors (the original squeezed
      # only input_ids, leaving the mask with a different rank).
      attention_mask = data_input['attention_mask'].squeeze(1).to(device)
      input_ids = data_input['input_ids'].squeeze(1).to(device)

      type_output, polarity_output, tense_output, clarity_output = model(input_ids, attention_mask)
      # Move each batch to the CPU right away so GPU memory does not grow
      # with the size of the dataset.
      type_probs.append(type_output.cpu())
      polarity_probs.append(polarity_output.cpu())
      tense_probs.append(tense_output.cpu())
      clarity_probs.append(clarity_output.cpu())

  return torch.cat(type_probs).numpy(), \
            torch.cat(polarity_probs).numpy(), \
            torch.cat(tense_probs).numpy(), \
            torch.cat(clarity_probs).numpy()

In [None]:
# Restore the best checkpoint written during training.
# NOTE(review): torch.load unpickles a full module object here, so only load
# files you produced yourself. Recent PyTorch releases may require an explicit
# `weights_only=False` for this kind of checkpoint - confirm against the
# installed version.
model = torch.load('roberta+lstm.pt')

# Test data has no labels; order is preserved so predictions line up with the
# submission rows.
test_dataloader = DataLoader(
    dataset=SentenceTypeDataset(test, tokenizer),
    batch_size=CFG['BATCH_SIZE'],
    shuffle=False,
)

In [None]:
# One array of per-class scores per task, rows in test-set order.
(
    test_pred_type,
    test_pred_polarity,
    test_pred_tense,
    test_pred_certainty,
) = get_type_predictions(model, test_dataloader)

In [None]:
# Turn the highest-scoring column of each row back into its class name.
# The tuples list the class names in column order for each task.
test_type = [('대화형', '사실형', '예측형', '추론형')[i] for i in np.argmax(test_pred_type, axis=1)]
test_polarity = [('긍정', '미정', '부정')[i] for i in np.argmax(test_pred_polarity, axis=1)]
test_tense = [('과거', '미래', '현재')[i] for i in np.argmax(test_pred_tense, axis=1)]
test_certainty = [('불확실', '확실')[i] for i in np.argmax(test_pred_certainty, axis=1)]

In [None]:
# Join the four predicted attributes into the 'type-polarity-tense-certainty'
# label format and write the submission file.
label_sum = [
    f'{sentence_type}-{polarity}-{tense}-{certainty}'
    for sentence_type, polarity, tense, certainty
    in zip(test_type, test_polarity, test_tense, test_certainty)
]

submission['label'] = label_sum
submission.to_csv('roberta_feature_lstm.csv', index=False)

In [None]:
# Display the submission frame (with the 'label' column filled in above) as a
# final visual check.
submission