In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader

import random
import os

from sklearn.metrics import f1_score,accuracy_score, confusion_matrix

from sklearn.model_selection import train_test_split

from model import BaseModel
from dataloader import BertDataset
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda')

In [2]:
# params = {
#     'EPOCHS':3,
#     'LEARNING_RATE':2e-5,
#     'BATCH_SIZE':16,
#     'SEED':45,
#     'TRAIN_DATA_PATH': '../../make_data/preprocessed_data/article_train.csv',
#     'VALID_DATA_PATH': '../../make_data/preprocessed_data/article_valid.csv',
#     'TEST_DATA_PATH': '../../make_data/preprocessed_data/article_test.csv',
#     'SAVE_PATH':'../Models/Bert_Article.pt',
#     'transformer':"bert-base-multilingual-cased",
#     'max_length':512
# }

In [3]:
params = {
    'EPOCHS':3,
    'LEARNING_RATE':2e-5,
    'BATCH_SIZE':16,
    'SEED':45,
    'TRAIN_DATA_PATH': '../../make_data/preprocessed_data/article_train.csv',
    'VALID_DATA_PATH': '../../make_data/preprocessed_data/article_valid.csv',
    'TEST_DATA_PATH': '../../make_data/preprocessed_data/article_test.csv',
    'SAVE_PATH':'../Models/Klue_Bert_Article.pt',
    'transformer':"klue/bert-base",
    'max_length':512
}

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(params['SEED']) # Seed 고정

In [5]:
train_data = pd.read_csv(params['TRAIN_DATA_PATH'],index_col=[0])
valid_data = pd.read_csv(params['VALID_DATA_PATH'],index_col=[0])
test_data = pd.read_csv(params['TEST_DATA_PATH'],index_col=[0])

train_data = train_data.sample(frac=1).reset_index(drop=True)
valid_data = valid_data.sample(frac=1).reset_index(drop=True)
test_data = test_data.sample(frac=1).reset_index(drop=True)

train_data.reset_index(drop=True,inplace=True)
valid_data.reset_index(drop=True,inplace=True)
test_data.reset_index(drop=True,inplace=True)

In [6]:
print(len(train_data))
print(len(train_data.drop_duplicates(['sent_jo'])))

print(len(valid_data))
print(len(valid_data.drop_duplicates(['sent_jo'])))

print(len(test_data))
print(len(test_data.drop_duplicates(['sent_jo'])))

print(train_data['label'].value_counts()/len(train_data))
print(valid_data['label'].value_counts()/len(valid_data))
print(test_data['label'].value_counts()/len(test_data))

64591
64591
11851
11851
15123
15123
0    0.91889
1    0.08111
Name: label, dtype: float64
0    0.944562
1    0.055438
Name: label, dtype: float64
0    0.956292
1    0.043708
Name: label, dtype: float64


In [7]:
train_dataset = BertDataset(train_data,params)
train_dataloader = DataLoader(train_dataset,batch_size=params['BATCH_SIZE'])
valid_dataset = BertDataset(valid_data,params)
valid_dataloader = DataLoader(valid_dataset,batch_size=params['BATCH_SIZE'])
test_dataset = BertDataset(test_data,params)
test_dataloader = DataLoader(test_dataset,batch_size=params['BATCH_SIZE'])

In [8]:
def train(model, train_loader,valid_loader, params, device):
    
    model.to(device)

    criterion = nn.BCELoss()
    optimizer = optim.AdamW(model.parameters(),lr=params['LEARNING_RATE'])
    
    best_score = 0
    best_model = "None"
    for epoch_num in range(1,params["EPOCHS"]+1):

        model.train()
        
        train_loss = []
        for input_ids,masks,labels in tqdm(train_loader):
            
            train_input_ids = input_ids.to(device)
            train_masks = masks.to(device)
            train_labels = labels.to(device)
            
            optimizer.zero_grad()

            output = model(train_input_ids,train_masks)     
            output = output.reshape(-1)
            
            batch_loss = criterion(output.to(torch.float32), train_labels.to(torch.float32)) 
            train_loss.append(batch_loss.item())
            
            batch_loss.backward()
            optimizer.step()

        val_loss, val_acc, val_f1 = validation(model, criterion, valid_loader, device)
        print(f'Epoch [{epoch_num}], Train Loss : [{np.mean(train_loss) :.5f}] \
              Val Loss : [{np.mean(val_loss) :.5f}] Val Accuracy Score : [{val_acc:.5f}] Val F1 Score : [{val_f1:.5f}]')
        
        val_score = val_f1
        if best_score < val_score:
            best_model = model
            best_score = val_score
        
    return best_model                         

In [9]:
def competition_metric(true, pred):
    return accuracy_score(true,pred),f1_score(true, pred, average="macro")

def validation(model, criterion, test_loader, device):
    model.eval()

    val_loss = []
    model_preds = []
    true_labels = []  
    with torch.no_grad():
        for input_ids, masks, labels in tqdm(test_loader):
            valid_labels = labels.to(device)
            valid_input_ids = input_ids.to(device)
            valid_masks = masks.to(device)

            output = model(valid_input_ids, valid_masks)
            output = output.reshape(-1)
            
            batch_loss = criterion(output.to(torch.float32), valid_labels.to(torch.float32)) 
            val_loss.append(batch_loss.item())      
            
            output[output>0.5] = 1
            output[output<=0.5] = 0
            model_preds += output.detach().cpu().numpy().tolist()
            true_labels += valid_labels.detach().cpu().numpy().tolist()
        val_acc, val_f1 = competition_metric(true_labels, model_preds)
    return val_loss, val_acc, val_f1    

In [10]:
def inference(model, test_loader, device):
    model.to(device)
    model.eval()
    
    test_predict = []
    true_labels = []
    
    with torch.no_grad():
        for input_ids, masks, labels in tqdm(test_loader):
            test_labels = labels.to(device)
            test_input_ids= input_ids.to(device)
            test_masks = masks.to(device)

            output = model(test_input_ids, test_masks)  
            
            output[output>0.5] = 1
            output[output<=0.5] = 0
            test_predict += output.detach().cpu().numpy().tolist()
            true_labels += test_labels.detach().cpu().numpy().tolist()
    print('Done.')
    return test_predict

In [11]:
model = BaseModel(params)
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = params["LEARNING_RATE"])

infer_model = train(model, train_dataloader, valid_dataloader, params, device)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 4037/4037 [23:42<00:00,  2.84it/s]
100%|██████████| 741/741 [01:31<00:00,  8.13it/s]


Epoch [1], Train Loss : [0.12895]               Val Loss : [0.07094] Val Accuracy Score : [0.97680] Val F1 Score : [0.88961]


100%|██████████| 4037/4037 [23:45<00:00,  2.83it/s]
100%|██████████| 741/741 [01:31<00:00,  8.13it/s]


Epoch [2], Train Loss : [0.06829]               Val Loss : [0.07060] Val Accuracy Score : [0.97941] Val F1 Score : [0.90114]


100%|██████████| 4037/4037 [23:45<00:00,  2.83it/s]
100%|██████████| 741/741 [01:31<00:00,  8.13it/s]

Epoch [3], Train Loss : [0.04792]               Val Loss : [0.07782] Val Accuracy Score : [0.97890] Val F1 Score : [0.89914]





In [12]:
test_labels = test_data['label']
test_preds = inference(infer_model,test_dataloader,device)

print(competition_metric(test_labels,test_preds))

print(confusion_matrix(test_labels,test_preds))

100%|██████████| 946/946 [01:56<00:00,  8.13it/s]


Done.
(0.981022283938372, 0.8892847379417568)
[[14301   161]
 [  126   535]]


In [13]:
labels = train_data['label']
preds = inference(infer_model,train_dataloader,device)

print(competition_metric(labels,preds))

print(confusion_matrix(labels,preds))

100%|██████████| 4037/4037 [08:17<00:00,  8.11it/s]


Done.
(0.9899057144184175, 0.9652619365439461)
[[59171   181]
 [  471  4768]]


In [14]:
labels = valid_data['label']
preds = inference(infer_model,valid_dataloader,device)

print(competition_metric(labels,preds))

print(confusion_matrix(labels,preds))

100%|██████████| 741/741 [01:31<00:00,  8.12it/s]

Done.
(0.9789047337777402, 0.8991427602697197)
[[11070   124]
 [  126   531]]





In [15]:
torch.save(infer_model.state_dict(),params['SAVE_PATH'])