In [1]:
import re
import numpy as np
import pandas as pd
import tqdm

from sklearn.model_selection import StratifiedKFold

# torch
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset
import torchmetrics

# pytorch lightning
import pytorch_lightning as pl
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

# transformers
from transformers import AdamW, ElectraForSequenceClassification, ElectraTokenizer
# from transformers import AdamW, XLMRobertaForSequenceClassification, XLMRobertaTokenizer
from transformers.optimization import get_linear_schedule_with_warmup

In [2]:
def preprocessing():
    """Build the label vocabulary and the (text, class-index) training pairs.

    Reads the industry-classification code table (Excel) and the training
    text file from ``data/``.

    Returns:
        tuple: ``(label_dict, data_list)``.
            ``label_dict`` maps a space-joined ``"코드 코드.1 코드.2"`` string to
            an integer class index (its position in the code table).
            ``data_list`` is a list of ``[text, class_index]`` pairs, one per
            row of the training file.

    Raises:
        KeyError: if a training row's ``digit_1 digit_2 digit_3`` combination
            is not present in the code table.
    """
    def join_codes(row):
        # Space-join the stringified values of one row.
        return ' '.join(row.values.astype(str))

    label_df = pd.read_excel('data/한국표준산업분류(10차)_국문.xlsx', header=2)
    # Keep only columns 0, 2 and 4 of the sheet.
    label_df = label_df.iloc[:, range(0, 5, 2)]
    # DataFrame.fillna(method='ffill') is deprecated; .ffill() is the
    # supported spelling of the same forward fill.
    label_df = label_df.ffill()
    label_df = label_df.drop_duplicates().reset_index(drop=True)
    label_df[['코드.1', '코드.2']] = label_df[['코드.1', '코드.2']].astype(int)
    label_df['target'] = label_df[['코드', '코드.1', '코드.2']].apply(join_codes, axis=1)

    df = pd.read_table('data/1. 실습용자료.txt', sep='|', encoding='cp949')
    df = df.fillna('')
    df['text'] = df['text_obj'] + ' ' + df['text_mthd'] + ' ' + df['text_deal']
    # Replace every character outside the ㄱ-힣 range (and space) with a space.
    clean = re.compile("[^ㄱ-힣 ]")
    df['text'] = df['text'].apply(lambda x: clean.sub(' ', str(x)))
    df['target'] = df[['digit_1', 'digit_2', 'digit_3']].apply(join_codes, axis=1)

    label_dict = {value: idx for idx, value in enumerate(label_df['target'])}
    data_list = [[text, label_dict[target]] for text, target in zip(df['text'], df['target'])]

    return label_dict, data_list

In [3]:
class CONFIG:
    """Notebook-wide settings and shared objects.

    Note that evaluating this class body runs ``preprocessing()`` and loads
    the pretrained model and tokenizer, so defining the class is expensive.
    """
    # Hugging Face checkpoint used for both the model and the tokenizer.
    PRETRAINED_NAME = "monologg/koelectra-base-v3-discriminator"

    LABEL_DICT, DATA_LIST = preprocessing()
    CLASSES = len(LABEL_DICT)
    MODEL = ElectraForSequenceClassification.from_pretrained(PRETRAINED_NAME, num_labels=CLASSES)
    TOKENIZER = ElectraTokenizer.from_pretrained(PRETRAINED_NAME)
    SEED = 3413
    BATCH_SIZE = 64
    # The misspelled name is kept because other cells read it; EPOCHS is the
    # correctly spelled alias for new code.
    EPOHCS = 30
    EPOCHS = EPOHCS
    MAX_LENGTH = 64  # tokenizer truncation length
    LEARNING_RATE = 6e-6
    # Fall back to CPU so inference still runs on a machine without CUDA.
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    N_JOBS = 14  # DataLoader worker count
    FOLD = 2  # number of cross-validation folds

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

In [4]:
class LitData(pl.LightningDataModule):
    """Data module serving one stratified cross-validation fold.

    Args:
        fold: zero-based index of the fold used for validation; must be in
            ``range(CONFIG.FOLD)``.
        tokenizer: tokenizer exposing ``batch_encode_plus``.
        batch_size: batch size for both dataloaders.

    Raises:
        ValueError: if ``fold`` is outside ``range(CONFIG.FOLD)``.
    """

    def __init__(self, fold, tokenizer, batch_size):
        super().__init__()
        # Total number of splits is kept separate from the fold index. Storing
        # the split count as the index meant no split ever matched, so the
        # last split was always the one used.
        self.n_splits = CONFIG.FOLD
        if not 0 <= fold < self.n_splits:
            raise ValueError(f"fold must be in [0, {self.n_splits}), got {fold}")
        self.fold = fold
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.data_list = CONFIG.DATA_LIST
        self.max_length = CONFIG.MAX_LENGTH
        self.seed = CONFIG.SEED

    def setup(self, stage=None):
        """Tokenize all texts and materialise the tensors for ``self.fold``."""
        texts = [text for text, _ in self.data_list]
        labels = np.array([label for _, label in self.data_list])
        encoded = self.tokenizer.batch_encode_plus(texts,
                                                   max_length=self.max_length,
                                                   add_special_tokens=True,
                                                   return_attention_mask=True,
                                                   padding='longest',
                                                   truncation=True)

        input_ids = np.array(encoded['input_ids'])
        attention_mask = np.array(encoded['attention_mask'])
        cross_validation = StratifiedKFold(self.n_splits, shuffle=True, random_state=self.seed)

        # One split call yields the index arrays; the same indices are applied
        # to ids, masks and labels so the three stay aligned.
        splits = list(cross_validation.split(input_ids, labels))
        train_idx, val_idx = splits[self.fold]

        self.train_inputs = torch.tensor(input_ids[train_idx])
        self.validation_inputs = torch.tensor(input_ids[val_idx])
        self.train_labels = torch.tensor(labels[train_idx], dtype=torch.long)
        self.validation_labels = torch.tensor(labels[val_idx], dtype=torch.long)
        self.train_masks = torch.tensor(attention_mask[train_idx], dtype=torch.long)
        self.validation_masks = torch.tensor(attention_mask[val_idx], dtype=torch.long)

    def train_dataloader(self):
        """Return a randomly sampled loader over the training split."""
        train_data = TensorDataset(self.train_inputs, self.train_masks, self.train_labels)
        train_sampler = RandomSampler(train_data)
        return DataLoader(train_data, sampler=train_sampler, batch_size=self.batch_size, num_workers=CONFIG.N_JOBS, pin_memory=True)

    def val_dataloader(self):
        """Return a sequential loader over the validation split."""
        validation_data = TensorDataset(self.validation_inputs, self.validation_masks, self.validation_labels)
        validation_sampler = SequentialSampler(validation_data)
        return DataLoader(validation_data, sampler=validation_sampler, batch_size=self.batch_size, num_workers=CONFIG.N_JOBS, pin_memory=True)

In [5]:
class LitModel(pl.LightningModule):
    """Lightning wrapper around the sequence-classification model.

    Args:
        steps_per_epoch: optimizer steps per epoch, used to size the linear
            learning-rate schedule. Defaults to 189, the value that was
            previously hard-coded.
    """

    def __init__(self, steps_per_epoch=189):
        super().__init__()
        import copy

        # Each instance gets its own copy of the pretrained weights. Sharing
        # CONFIG.MODEL directly would let a later fold start from weights that
        # an earlier fold already fine-tuned.
        self.model = copy.deepcopy(CONFIG.MODEL)
        self.f1_score = torchmetrics.F1Score(num_classes=CONFIG.CLASSES)
        self.steps_per_epoch = steps_per_epoch

    def forward(self, b_input_ids, b_input_mask, b_labels):
        """Run the wrapped model; ``b_labels`` is forwarded as ``labels``."""
        output = self.model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
        return output

    def training_step(self, batch, batch_idx):
        """Return the training loss for one ``(ids, mask, labels)`` batch."""
        b_input_ids, b_input_mask, b_labels = batch[0], batch[1], batch[2]
        z = self(b_input_ids, b_input_mask, b_labels)
        return z[0]

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` and ``val_f1_score`` for one batch; return the loss."""
        b_input_ids, b_input_mask, b_labels = batch[0], batch[1], batch[2]
        z = self(b_input_ids, b_input_mask, b_labels)
        val_loss = z[0]
        logits = z[1]
        self.log('val_loss', val_loss, prog_bar=True)
        self.log('val_f1_score', self.f1_score(logits, b_labels), prog_bar=True)
        return val_loss

    def configure_optimizers(self):
        """Create AdamW and a linear decay schedule without warmup."""
        # Optimize this module's own parameters rather than whatever object
        # happens to be bound to a notebook-level name.
        optimizer = AdamW(self.parameters(), lr=CONFIG.LEARNING_RATE)
        # NOTE(review): num_training_steps is expressed in optimizer steps, but
        # a scheduler returned in this list form is, per the Lightning docs,
        # stepped once per epoch by default - confirm the intended decay rate.
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=0,
                                                    num_training_steps=self.steps_per_epoch * CONFIG.EPOHCS)
        return [optimizer], [scheduler]

    def flat_accuracy(self, preds, labels):
        """Return the fraction of rows whose argmax over axis 1 equals the label."""
        pred_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
# Train one model per cross-validation fold. After the loop, `model`, `dm`
# and `trainer` remain bound to the objects of the last fold only.
for fold in range(CONFIG.FOLD):
    # Seed Python, NumPy and torch so shuffling and dropout are repeatable
    # from run to run.
    pl.seed_everything(CONFIG.SEED)

    dm = LitData(fold=fold, tokenizer=CONFIG.TOKENIZER, batch_size=CONFIG.BATCH_SIZE)

    # Keep only the checkpoint with the highest validation F1.
    chk_callback = ModelCheckpoint(monitor='val_f1_score',
                                   filename='model_best',
                                   save_top_k=1,
                                   mode='max')

    # Stop once validation F1 has not improved by min_delta for `patience`
    # consecutive checks.
    es_callback = EarlyStopping(monitor='val_f1_score',
                                min_delta=0.001,
                                patience=5,
                                verbose=False,
                                mode='max')
    model = LitModel()

    trainer = pl.Trainer(gpus=1,
                         max_epochs=CONFIG.EPOHCS,
                         callbacks=[chk_callback, es_callback])

    trainer.fit(model, dm)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                             | Params
--------------------------------------------------------------
0 | model    | ElectraForSequenceClassification | 113 M 
1 | f1_score | F1Score                          | 0     
--------------------------------------------------------------
113 M     Trainable params
0         Non-trainable params
113 M     Total params
452.399   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [None]:
def run_inference(model, device, batch_size):
    """Predict classification codes for the model-development file.

    Args:
        model: a single model, or a list/tuple of models to ensemble. Each is
            called as ``model(input_ids, attention_mask, None)`` and the first
            element of its output is used as the logits.
        device: torch device (or device string) to run on.
        batch_size: inference batch size.

    Returns:
        pandas.DataFrame: the input rows plus ``text`` and ``label`` columns,
        with ``digit_1``/``digit_2``/``digit_3`` set to the three
        space-separated parts of the predicted label.

    Raises:
        ValueError: if ``model`` is an empty list or tuple.
    """
    # `import tqdm` alone does not import the `tqdm.notebook` submodule;
    # tqdm.auto selects the notebook widget when one is available.
    from tqdm.auto import tqdm as progress_bar

    df = pd.read_table('data/2. 모델개발용자료.txt', sep='|', encoding='cp949')
    df = df.fillna('')
    df['text'] = df['text_obj'] + ' ' + df['text_mthd'] + ' ' + df['text_deal']
    clean = re.compile("[^ㄱ-힣 ]")
    df['text'] = df['text'].apply(lambda x: clean.sub(' ', str(x)))

    texts = df['text'].tolist()

    encoded = CONFIG.TOKENIZER.batch_encode_plus(texts,
                                                 max_length=CONFIG.MAX_LENGTH,
                                                 add_special_tokens=True,
                                                 return_attention_mask=True,
                                                 padding='longest',
                                                 truncation=True)

    test_inputs = torch.tensor(encoded["input_ids"])
    test_masks = torch.tensor(encoded["attention_mask"])

    # Sequential order keeps predictions aligned with the rows of `df`.
    test_data = TensorDataset(test_inputs, test_masks)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

    print('Predicting labels')

    # To ensemble per-fold checkpoints, load each one into its own model and
    # pass them as a list, e.g. from
    # ./lightning_logs/version_<N>/checkpoints/model_best.ckpt
    models = list(model) if isinstance(model, (list, tuple)) else [model]
    if not models:
        raise ValueError("run_inference needs at least one model")

    summed_logits = None
    for member in models:
        member.eval()
        member.to(device)

        batch_logits = []
        for batch in progress_bar(test_dataloader, total=len(test_dataloader)):
            b_input_ids, b_input_mask = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                outputs = member(b_input_ids, b_input_mask, None)

            batch_logits.append(outputs[0].detach().cpu().numpy())

        member_logits = np.concatenate(batch_logits, axis=0)
        summed_logits = member_logits if summed_logits is None else summed_logits + member_logits

    # Combine models in logit space and take the argmax once. Class indices
    # are categorical, so averaging the per-model argmax values would produce
    # unrelated classes.
    preds = np.argmax(summed_logits, axis=1)

    label_dict_reverse = {idx: label for label, idx in CONFIG.LABEL_DICT.items()}
    df['label'] = [label_dict_reverse[int(i)] for i in preds]

    code_parts = df['label'].str.split()
    df['digit_1'] = code_parts.str[0]
    df['digit_2'] = code_parts.str[1]
    df['digit_3'] = code_parts.str[2]

    return df

In [None]:
# Run inference with the model currently bound to `model`.
predict = run_inference(
    model=model,
    device=CONFIG.DEVICE,
    batch_size=CONFIG.BATCH_SIZE,
)

In [None]:
# Drop the helper columns by name instead of by position, so the output does
# not depend on where `text` and `label` sit in the frame.
predict.drop(columns=['text', 'label']).to_csv('submission_3.csv', index=False)