In [1]:
import os
import warnings

warnings.filterwarnings(action='ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [2]:
import random
import torch

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

CONFIG = {
    'SEED': 42,
    'BATCH_SIZE': 16,
    'SUBMISSION_NUMBER': 2,
    'MODEL_NAME': 'roberta-base',
    'LEARNING_RATE': 2e-5
}

seed_everything(CONFIG['SEED']) # Seed 고정

In [3]:
import pandas as pd

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/sample_submission.csv')

In [4]:
train = train.drop(['ID', 'label'], axis=1)
train.columns = ['sentence', 'type', 'polarity', 'tense', 'certainty']
train

Unnamed: 0,sentence,type,polarity,tense,certainty
0,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,사실형,긍정,현재,확실
1,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,사실형,긍정,과거,확실
2,정부가 고유가 대응을 위해 7월부터 연말까지 유류세 인하 폭을 30%에서 37%까지...,사실형,긍정,미래,확실
3,"서울시는 올해 3월 즉시 견인 유예시간 60분을 제공하겠다고 밝혔지만, 하루 만에 ...",사실형,긍정,과거,확실
4,익사한 자는 사다리에 태워 거꾸로 놓고 소금으로 코를 막아 가득 채운다.,사실형,긍정,현재,확실
...,...,...,...,...,...
16536,"＇신동덤＇은 ＇신비한 동물사전＇과 ＇해리 포터＇ 시리즈를 잇는 마법 어드벤처물로, ...",사실형,긍정,과거,확실
16537,"수족냉증은 어릴 때부터 심했으며 관절은 어디 한 곳이 아니고 목, 어깨, 팔꿈치, ...",사실형,긍정,과거,확실
16538,김금희 소설가는 ＂계약서 조정이 그리 어려운가 작가를 격려한다면서 그런 문구 하나 ...,사실형,긍정,과거,확실
16539,1만명이 넘는 방문자수를 기록한 이번 전시회는 총 77개 작품을 넥슨 사옥을 그대로...,사실형,긍정,과거,불확실


In [5]:
test = test.drop(['ID'], axis=1)
test.columns = ['sentence']
test

Unnamed: 0,sentence
0,"장욱진의 ＇가족＇은 허물 없는 가족애를, 처음 공개되는 정약용의 ＇정효자전＇과 ＇정..."
1,"조지 W 부시, 버락 오바마 전 대통령도 전쟁 위험 때문에 버린 카드다."
2,지난해 1분기 128억원이었던 영업이익이 올해 1분기 505억원으로 급증했다.
3,수상 작가와 맺으려던 계약서 내용 가운데 일부가 ＇독소 조항＇으로 해석돼 수정을 요...
4,결국 최근 KDB산업은행은 대규모 손실 위기에 닥친 에어부산에 140억원 금융지원을...
...,...
7085,"2020 세계국가편람 모바일 앱은 세계 216개국의 국가개황과 주요 경제지표, 사회..."
7086,탈세계화 징후들이 반갑지 않은 이유다.
7087,"틱톡은 6월 ＇인터넷 안전의 달＇을 맞아 올바른 개인정보 보호 관리 방법, 앱 내 ..."
7088,만약 3개월 간 채굴자들의 투표를 거쳐 2/3 이상의 해시파워가 ＇채굴세＇ 도입에 ...


In [6]:
from sklearn.preprocessing import LabelEncoder

# 2. Label Encoding (유형, 극성, 시제, 확실성)
type_le = LabelEncoder()
train["type"] = type_le.fit_transform(train["type"].values)

polarity_le = LabelEncoder()
train["polarity"] = polarity_le.fit_transform(train["polarity"].values)

tense_le = LabelEncoder()
train["tense"] = tense_le.fit_transform(train["tense"].values)

certainty_le = LabelEncoder()
train["certainty"] = certainty_le.fit_transform(train["certainty"].values)
train

Unnamed: 0,sentence,type,polarity,tense,certainty
0,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,1,0,2,1
1,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,1,0,0,1
2,정부가 고유가 대응을 위해 7월부터 연말까지 유류세 인하 폭을 30%에서 37%까지...,1,0,1,1
3,"서울시는 올해 3월 즉시 견인 유예시간 60분을 제공하겠다고 밝혔지만, 하루 만에 ...",1,0,0,1
4,익사한 자는 사다리에 태워 거꾸로 놓고 소금으로 코를 막아 가득 채운다.,1,0,2,1
...,...,...,...,...,...
16536,"＇신동덤＇은 ＇신비한 동물사전＇과 ＇해리 포터＇ 시리즈를 잇는 마법 어드벤처물로, ...",1,0,0,1
16537,"수족냉증은 어릴 때부터 심했으며 관절은 어디 한 곳이 아니고 목, 어깨, 팔꿈치, ...",1,0,0,1
16538,김금희 소설가는 ＂계약서 조정이 그리 어려운가 작가를 격려한다면서 그런 문구 하나 ...,1,0,0,1
16539,1만명이 넘는 방문자수를 기록한 이번 전시회는 총 77개 작품을 넥슨 사옥을 그대로...,1,0,0,0


In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(CONFIG['MODEL_NAME'])

In [8]:
from data_module import RoBertaDataModule

dm = RoBertaDataModule(tokenizer=tokenizer, train_df=train, predict_df=test, batch_size=CONFIG['BATCH_SIZE'], max_token_len=512)

In [9]:
from model import RoBERTa

roberta = RoBERTa(model_name=CONFIG['MODEL_NAME'])

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

# make directory if it's not exists
if not os.path.exists('ckpts'):
    os.mkdir('ckpts')

trainer_config = {
    'max_epochs': 10,
    'accelerator': 'auto',
    'precision': 16,
    'amp_backend': 'apex',
    'callbacks': [
        ModelCheckpoint(
            dirpath='ckpts',
            filename='{epoch:02d}-{val/loss-total:.3f}',
            verbose=True,
            save_last=True,
            save_top_k=5,
            monitor='val/loss-total',
            mode='min'
        ),
        EarlyStopping(monitor='val/loss-total', min_delta=0.00, patience=2, verbose=True, mode='min'),
    ],
    'logger': WandbLogger(project="sentense_classification")
}

[34m[1mwandb[0m: Currently logged in as: [33mkevan[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [11]:
import pytorch_lightning as pl

trainer = pl.Trainer(**trainer_config)

Using 16bit apex Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [12]:
trainer.fit(roberta, dm)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                 | Type         | Params
------------------------------------------------------
0 | roberta              | RobertaModel | 124 M 
1 | type_classifier      | Sequential   | 296 K 
2 | polarity_classifier  | Sequential   | 296 K 
3 | tense_classifier     | Sequential   | 296 K 
4 | certainty_classifier | Sequential   | 296 K 
------------------------------------------------------
125 M     Trainable params
0         Non-trainable params
125 M     Total params
503.326   Total estimated model params size (MB)


Selected optimization level O2:  FP16 training with FP32 batchnorm and FP32 master weights.

Defaults for this optimization level are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0


Validation: 0it [00:00, ?it/s]

Metric val/loss-total improved. New best score: 0.403
Epoch 0, global step 827: 'val/loss-total' reached 0.40270 (best 0.40270), saving model to '/home/fourind/projects/sentense_clf/ckpts/epoch=00-val/loss-total=0.403.ckpt' as top 5


Validation: 0it [00:00, ?it/s]

Metric val/loss-total improved by 0.058 >= min_delta = 0.0. New best score: 0.345
Epoch 1, global step 1654: 'val/loss-total' reached 0.34454 (best 0.34454), saving model to '/home/fourind/projects/sentense_clf/ckpts/epoch=01-val/loss-total=0.345.ckpt' as top 5


Validation: 0it [00:00, ?it/s]

Metric val/loss-total improved by 0.040 >= min_delta = 0.0. New best score: 0.305
Epoch 2, global step 2481: 'val/loss-total' reached 0.30472 (best 0.30472), saving model to '/home/fourind/projects/sentense_clf/ckpts/epoch=02-val/loss-total=0.305.ckpt' as top 5


Validation: 0it [00:00, ?it/s]

Metric val/loss-total improved by 0.016 >= min_delta = 0.0. New best score: 0.288
Epoch 3, global step 3308: 'val/loss-total' reached 0.28824 (best 0.28824), saving model to '/home/fourind/projects/sentense_clf/ckpts/epoch=03-val/loss-total=0.288.ckpt' as top 5


Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0


Validation: 0it [00:00, ?it/s]

Metric val/loss-total improved by 0.010 >= min_delta = 0.0. New best score: 0.279
Epoch 4, global step 4135: 'val/loss-total' reached 0.27864 (best 0.27864), saving model to '/home/fourind/projects/sentense_clf/ckpts/epoch=04-val/loss-total=0.279.ckpt' as top 5


Validation: 0it [00:00, ?it/s]

Metric val/loss-total improved by 0.002 >= min_delta = 0.0. New best score: 0.277
Epoch 5, global step 4962: 'val/loss-total' reached 0.27693 (best 0.27693), saving model to '/home/fourind/projects/sentense_clf/ckpts/epoch=05-val/loss-total=0.277.ckpt' as top 5


Validation: 0it [00:00, ?it/s]

Metric val/loss-total improved by 0.004 >= min_delta = 0.0. New best score: 0.273
Epoch 6, global step 5789: 'val/loss-total' reached 0.27337 (best 0.27337), saving model to '/home/fourind/projects/sentense_clf/ckpts/epoch=06-val/loss-total=0.273.ckpt' as top 5


Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0


Validation: 0it [00:00, ?it/s]

Metric val/loss-total improved by 0.000 >= min_delta = 0.0. New best score: 0.273
Epoch 7, global step 6616: 'val/loss-total' reached 0.27295 (best 0.27295), saving model to '/home/fourind/projects/sentense_clf/ckpts/epoch=07-val/loss-total=0.273.ckpt' as top 5


Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0


Validation: 0it [00:00, ?it/s]

Metric val/loss-total improved by 0.009 >= min_delta = 0.0. New best score: 0.264
Epoch 8, global step 7443: 'val/loss-total' reached 0.26380 (best 0.26380), saving model to '/home/fourind/projects/sentense_clf/ckpts/epoch=08-val/loss-total=0.264.ckpt' as top 5


Validation: 0it [00:00, ?it/s]

Epoch 9, global step 8270: 'val/loss-total' reached 0.26861 (best 0.26380), saving model to '/home/fourind/projects/sentense_clf/ckpts/epoch=09-val/loss-total=0.269.ckpt' as top 5
`Trainer.fit` stopped: `max_epochs=10` reached.


In [13]:
output = trainer.predict(roberta, dm, ckpt_path='best')

Restoring states from the checkpoint path at /home/fourind/projects/sentense_clf/ckpts/epoch=08-val/loss-total=0.264.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /home/fourind/projects/sentense_clf/ckpts/epoch=08-val/loss-total=0.264.ckpt


Predicting: 827it [00:00, ?it/s]

In [14]:
import torch
import torch.nn.functional as F

preds = pd.DataFrame()
for k in ['type', 'polarity', 'tense', 'certainty']:
    logit = torch.concat([o[k] for o in output], dim=0)
    _, preds[k] = torch.max(F.softmax(logit), 1)

In [15]:
# decoding
preds["type"] = type_le.inverse_transform(preds["type"].values)
preds["polarity"] = polarity_le.inverse_transform(preds["polarity"].values)
preds["tense"] = tense_le.inverse_transform(preds["tense"].values)
preds["certainty"] = certainty_le.inverse_transform(preds["certainty"].values)

In [16]:
# combine preds
preds['label'] = ['-'.join([row['type'],
                            row['polarity'],
                            row['tense'],
                            row['certainty']]) for i, row in preds.iterrows()]

In [17]:
# make submission csv
submission['label'] = preds['label']
submission.to_csv('submissions/submission_1.csv', index=False)