In [1]:
# Install third-party packages into the notebook runtime.
!pip install sentence-transformers transformers wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install wandb
# Enter the wandb API key when prompted
!wandb login

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[34m[1mwandb[0m: Currently logged in as: [33mkdb[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV inputs and checkpoints live under this path.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModel, RobertaModel, RobertaTokenizer, RobertaForSequenceClassification, ElectraTokenizer, ElectraForSequenceClassification
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, random_split
import numpy as np
from torch.nn.utils import clip_grad_norm_
from sklearn.model_selection import train_test_split
import random
import torch
from functools import partial
import wandb
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
from scipy.stats import pearsonr

In [5]:
# Load the training and test CSV files from the mounted Drive.
train = pd.read_csv('/content/drive/MyDrive/NLP/df.csv')
test = pd.read_csv('/content/drive/MyDrive/NLP/test.csv')

In [6]:
# Keep only the sentence pair and the two label columns in both frames.
_kept_columns = ['sentence1', 'sentence2', 'real-label', 'binary-label']
train = train[_kept_columns]
test = test[_kept_columns]

In [7]:
# Give the test label columns underscore names (CustomDataset reads 'real_label').
test = test.rename(columns={'real-label': 'real_label', 'binary-label': 'binary_label'})

In [8]:
import html
import regex as re
from bs4 import BeautifulSoup

def preprocess(sentence):
    """Strip markup from a sentence and keep only ASCII letters, digits and Hangul.

    HTML entities are unescaped and the text content is extracted with
    BeautifulSoup. Every character outside ``a-z``, ``A-Z``, ``0-9`` and
    ``가-힣`` is then replaced by one space each (newlines, quotes and
    punctuation included). Runs of spaces are not collapsed.

    Args:
        sentence: Raw sentence text, possibly containing HTML.

    Returns:
        The cleaned sentence string.
    """
    sen = BeautifulSoup(html.unescape(sentence), 'html.parser').text     # html parse
    # A single whitelist pass is enough: the former newline/quote replacements and
    # the trailing punctuation-stripping regex only touched characters that this
    # substitution already turns into spaces, so they never changed the result.
    # (That punctuation pattern also relied on invalid escape sequences in a
    # non-raw string, which modern Python warns about.)
    sen = re.sub(r"[^a-zA-Z0-9가-힣]", " ", sen)
    return sen



def preprocess_train(df):
    """Build a cleaned copy of the training frame with underscore label names.

    Both sentence columns are passed through ``preprocess``. The ``real-label``
    and ``binary-label`` columns are carried over unchanged and come out as
    ``real_label`` and ``binary_label``.

    Args:
        df: DataFrame with ``sentence1``, ``sentence2``, ``real-label`` and
            ``binary-label`` columns.

    Returns:
        A new DataFrame with columns ``sentence1``, ``sentence2``,
        ``real_label`` and ``binary_label``.
    """
    first_raw = df['sentence1'].tolist()
    second_raw = df['sentence2'].tolist()
    scores = df['real-label'].tolist()
    binary_flags = df['binary-label'].tolist()

    first_clean = [preprocess(text) for text in first_raw]
    second_clean = [preprocess(text) for text in second_raw]

    rows = list(zip(first_clean, second_clean, scores, binary_flags))
    return pd.DataFrame(
        rows,
        columns = ['sentence1', 'sentence2', 'real_label', 'binary_label'],
    )

In [9]:
# Clean the training sentences; the returned frame uses underscore label column names.
train = preprocess_train(train)

In [10]:
# Hold out 10% of the cleaned training rows for validation (fixed random_state).
train, val = train_test_split((train), test_size=0.1, random_state = 42)

In [11]:
# Seed the Python, NumPy and PyTorch (CPU and CUDA) random number generators.
seed = 42
random.seed(seed)
np.random.seed(seed)  # NumPy's global RNG was previously left unseeded
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [12]:
# Renumber each split from 0 and discard the previous index.
train_data = train.reset_index(drop=True)
valid_data = val.reset_index(drop=True)
test_data = test.reset_index(drop=True)

In [13]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
_cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if _cuda_available else "cpu")
if _cuda_available:
    print(f"# available GPUs : {torch.cuda.device_count()}")
    print(f"GPU name : {torch.cuda.get_device_name()}")
print(device)

# available GPUs : 1
GPU name : Tesla T4
cuda


In [14]:
# Load klue/roberta-base with a single-output head (num_labels=1) and its tokenizer.
# `tokenizer` is read as a module-level global by custom_collate_fn.
model = RobertaForSequenceClassification.from_pretrained("klue/roberta-base", num_labels = 1)
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classif

In [15]:
class CustomDataset(Dataset):
    """Sentence-pair dataset backed by a pandas DataFrame.

    Each item is ``((sentence1, sentence2), real_label)``. Items are addressed
    by row position, independent of the DataFrame's index labels.
    """

    def __init__(self, data) -> None:
        """Cache the sentence pairs and labels in row order.

        Args:
            data: DataFrame with ``sentence1``, ``sentence2`` and
                ``real_label`` columns.
        """
        self.data = data
        self.input = list(zip(self.data['sentence1'], self.data['sentence2']))
        # Store labels as a positional array. Indexing the Series directly is
        # label-based, so a frame with a shuffled or non-default index would
        # pair a sentence with the wrong label (or raise KeyError).
        self.label = self.data['real_label'].to_numpy()

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.label) # len(y)

    def __getitem__(self, index):
        """Return ``((sentence1, sentence2), label)`` for the row at position ``index``."""
        return self.input[index], self.label[index]

In [16]:
def custom_collate_fn(batch, max_length):
    """Tokenize a batch of ``((sentence1, sentence2), label)`` items.

    Sentence pairs are encoded with the module-level ``tokenizer`` using
    ``padding="max_length"`` and ``truncation=True``, and returned as PyTorch
    tensors.

    Args:
        batch: Sequence of ``(sentence_pair, label)`` items.
        max_length: Value passed to the tokenizer as ``max_length``.

    Returns:
        ``(encoded_inputs, labels)`` where ``labels`` is ``torch.tensor`` of
        the batch labels.
    """
    sentence_pairs, labels = zip(*batch)
    pair_list = [(pair[0], pair[1]) for pair in sentence_pairs]

    encoded = tokenizer.batch_encode_plus(
        pair_list,
        add_special_tokens=True,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

    return encoded, torch.tensor(labels)

In [17]:
# Wrap each split in a CustomDataset, which yields ((sentence1, sentence2), real_label) items.
train_dataset = CustomDataset(train_data)
valid_dataset = CustomDataset(valid_data)
test_dataset = CustomDataset(test_data)

In [18]:
def save_checkpoint(model, optimizer, scheduler, epoch, loss,
                    ckpt_dir='/content/drive/MyDrive/AI09'):
    """Save model, optimizer and scheduler state for one epoch.

    The checkpoint is written to ``<ckpt_dir>/model_F.ckpt.<epoch>``. The
    directory is created when missing, so a finished epoch is not lost to a
    missing folder.

    Args:
        model: Object exposing ``state_dict()``.
        optimizer: Object exposing ``state_dict()``.
        scheduler: Object exposing ``state_dict()``.
        epoch: Epoch number; stored in the checkpoint and used as file suffix.
        loss: Loss value stored alongside the states.
        ckpt_dir: Target directory. Defaults to the original hard-coded
            Drive folder.
    """
    import os  # local import keeps this cell self-contained

    os.makedirs(ckpt_dir, exist_ok=True)
    file_name = os.path.join(ckpt_dir, f'model_F.ckpt.{epoch}')

    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'loss' : loss
        },
        file_name
    )

    print(f"Saving epoch {epoch} checkpoint at {file_name}")

In [19]:
def validate(model, dataloader):
    """Evaluate ``model`` on ``dataloader`` and log the metrics to wandb.

    Predictions and labels are thresholded at 3 to obtain binary classes for
    the F1 score; Pearson correlation is computed on the raw values.

    Args:
        model: Model called as ``model(**inputs, labels=...)`` whose output
            exposes ``loss`` and ``logits``.
        dataloader: Iterable of ``(inputs, labels)`` batches; both items must
            support ``.to(device)``.

    Returns:
        ``(total_valid_loss, f1_percent, pearson_percent)``.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()

    total_loss = 0
    batch_count = 0
    batch_loss = 0
    num_batches = 0
    # Collect per-batch arrays and concatenate once instead of re-allocating
    # with np.append on every step.
    pred_chunks = []
    label_chunks = []

    for step, batch in enumerate(dataloader):
        batch_count += 1
        num_batches += 1
        batch = tuple(item.to(device) for item in batch)

        batch_input, batch_label = batch

        with torch.no_grad():
            outputs = model(**batch_input, labels = batch_label.float())

        loss = outputs.loss
        # reshape(-1) keeps a 1-D array even when the batch holds one sample;
        # a bare squeeze() would give a 0-d array that cannot be concatenated.
        pred_chunks.append(outputs.logits.detach().cpu().numpy().reshape(-1))
        label_chunks.append(batch_label.detach().cpu().numpy().reshape(-1))

        batch_loss += loss.item()
        total_loss += loss.item()

        if (step % 10) == 0 and step != 0:
            print(f"Step : {step}, valid Loss : {batch_loss / batch_count:.4f}")
            wandb.log({'valid_loss': batch_loss / batch_count})
            batch_loss = 0
            batch_count = 0

    if num_batches == 0:
        raise ValueError("validate() received an empty dataloader")

    pred_list = np.concatenate(pred_chunks)
    label_list = np.concatenate(label_chunks)

    fone_pred = np.where(pred_list >=3, 1, 0)
    fone_label = np.where(label_list >=3, 1, 0)
    # f1_score expects (y_true, y_pred).
    fone = f1_score(fone_label, fone_pred) * 100
    p_score = pearsonr(pred_list, label_list)[0] * 100

    total_valid_loss = total_loss / num_batches

    # NOTE: the "total_f1_score " key has a trailing space; it is kept as-is so
    # the logged metric name does not change.
    wandb.log({'total_valid_loss': total_valid_loss, "total_f1_score ": fone, "total_pearsonr" : p_score})

    return total_valid_loss, fone, p_score

In [20]:
def train(model, optimizer, scheduler, train_dataloader, valid_dataloader, epochs):
    """Train ``model`` for ``epochs`` epochs, validating and checkpointing after each.

    Per step: gradients are zeroed, the loss is back-propagated, the gradient
    norm is clipped to 1.0, and the optimizer and scheduler are stepped. Every
    10 steps the running average loss and current learning rate are printed
    and logged to wandb. After each epoch ``validate`` is run on
    ``valid_dataloader`` and ``save_checkpoint`` is called with the
    validation loss.

    Args:
        model: Model to train; moved to the module-level ``device``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        train_dataloader: Iterable of ``(inputs, labels)`` training batches.
        valid_dataloader: Iterable passed to ``validate``.
        epochs: Number of epochs to run.
    """
    wandb.watch(model, log="all", log_freq = 10)

    for epoch in range(epochs):
        print(f'****** Starting To Train Epoch #{epoch} ******')

        epoch_loss_sum = 0
        window_loss_sum = 0
        window_steps = 0

        model.to(device)
        model.train()

        for step, batch in enumerate(train_dataloader):
            window_steps += 1
            batch_input, batch_label = tuple(item.to(device) for item in batch)

            model.zero_grad()
            outputs = model(**batch_input, labels = batch_label.float())
            loss = outputs.loss

            step_loss = loss.item()
            window_loss_sum += step_loss
            epoch_loss_sum += step_loss

            loss.backward()
            clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            # Report a running average every 10 steps, then reset the window.
            if step != 0 and step % 10 == 0:
                current_lr = optimizer.param_groups[0]['lr']
                window_avg = window_loss_sum / window_steps
                wandb.log({'train_loss': window_avg, 'train_lr': current_lr})
                print(f"Epoch: {epoch}, Step : {step}, LR : {current_lr}, Avg Loss : {window_avg:.4f}")
                window_loss_sum, window_steps = 0, 0

        epoch_avg_loss = epoch_loss_sum / (step + 1)
        wandb.log({'total_train_loss': epoch_avg_loss, 'total_train_lr': optimizer.param_groups[0]['lr'], "epoch" : (epoch + 1)})
        print(f"Epoch {epoch} total_train_loss : {epoch_avg_loss:.4f}")
        print(f"***** Finish To Train Epoch {epoch} *****\n")

        print(f"*****Epoch {epoch} Valid Start*****")
        total_valid_loss, fone, p_score = validate(model, valid_dataloader)
        print('total_valid_loss : ', total_valid_loss, "val_f1_score : ",  fone,  "val_pearsonr :",  p_score)
        print(f"Epoch {epoch} total_Valid Loss : {total_valid_loss:.4f}")
        print(f"*****Epoch {epoch} Valid Finish*****\n")
        save_checkpoint(model, optimizer, scheduler,  epoch, total_valid_loss)

    print("Train Finished")

In [21]:
# Each hyperparameter is a categorical with exactly one candidate value.
_single_value_hyperparameters = {
    "epochs": 5,
    "learning_rate": 2e-5,
    "eps": 1e-8,
    "train_batch_size": 8,
    "valid_batch_size": 32,
    "weight_decay": 0,
    "warm_up_ratio": 0.1,
    "max_length": 128,
    "grad_norm": 1.0,
}

# wandb sweep definition: Bayesian search minimizing total_valid_loss,
# with hyperband early termination.
sweep_config = {
    "name": "AI09_F",
    "method": "bayes",
    "metric": {"name": "total_valid_loss", "goal": "minimize"},
    "parameters": {
        hp_name: {"distribution": "categorical", "values": [hp_value]}
        for hp_name, hp_value in _single_value_hyperparameters.items()
    },
    "early_terminate": {"type": "hyperband", "min_iter": 2, "eta": 2},
}

In [22]:
def initializer(config=None):
    """Run one wandb sweep trial: build model, optimizer and schedule, then train.

    Optimizer and schedule hyperparameters are read from ``wandb.config``
    (``learning_rate``, ``eps``, ``weight_decay``, ``warm_up_ratio``,
    ``epochs``). A key that is absent falls back to the value that was
    previously hard-coded here, or to the module-level ``epochs``.

    The sweep's ``train_batch_size``, ``valid_batch_size``, ``max_length`` and
    ``grad_norm`` are still not applied by this function: it uses the
    module-level ``train_dataloader`` / ``valid_dataloader``, and ``train``
    clips at a fixed 1.0.

    Args:
        config: Optional config forwarded to ``wandb.init``.
    """
    wandb.init(config=config)
    run_config = wandb.config

    def _hyperparameter(key, default):
        # Explicit None check so a legitimate 0 (e.g. weight_decay) is kept.
        value = run_config.get(key)
        return default if value is None else value

    learning_rate = _hyperparameter("learning_rate", 2e-5)
    adam_eps = _hyperparameter("eps", 1e-8)
    weight_decay = _hyperparameter("weight_decay", 0)
    warm_up_ratio = _hyperparameter("warm_up_ratio", 0.1)
    num_epochs = run_config.get("epochs")
    if num_epochs is None:
        num_epochs = epochs  # module-level fallback

    model = RobertaForSequenceClassification.from_pretrained("klue/roberta-base", num_labels = 1)

    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    optimizer = AdamW(
                      optimizer_grouped_parameters,
                      lr = learning_rate,
                      eps = adam_eps
                      )
    num_training_steps = num_epochs * len(train_dataloader)

    scheduler = get_linear_schedule_with_warmup(
                                                optimizer=optimizer,
                                                num_warmup_steps= (num_training_steps * warm_up_ratio),
                                                num_training_steps = num_training_steps
                                                )

    train(model, optimizer, scheduler, train_dataloader, valid_dataloader, num_epochs)

In [23]:
# All three loaders share one collate function that tokenizes with max_length=128.
_collate_with_fixed_length = partial(custom_collate_fn, max_length=128)

# Training batches are drawn in random order; validation and test keep dataset order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=8,
    sampler=RandomSampler(train_dataset),
    collate_fn=_collate_with_fixed_length,
)
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=32,
    sampler=SequentialSampler(valid_dataset),
    collate_fn=_collate_with_fixed_length,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=32,
    sampler=SequentialSampler(test_dataset),
    collate_fn=_collate_with_fixed_length,
)

In [24]:
# Module-level epoch count referenced by initializer().
epochs = 5
# Register the sweep in project "AI09_f", then run initializer() through a wandb agent (count = 5).
sweep_id = wandb.sweep(sweep_config, project = "AI09_f")
wandb.agent(sweep_id, initializer, count = 5)

Create sweep with ID: qm6ty8sp
Sweep URL: https://wandb.ai/kdb/AI09_f/sweeps/qm6ty8sp


[34m[1mwandb[0m: Agent Starting Run: cib6ykvf with config:
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	eps: 1e-08
[34m[1mwandb[0m: 	grad_norm: 1
[34m[1mwandb[0m: 	learning_rate: 2e-05
[34m[1mwandb[0m: 	max_length: 128
[34m[1mwandb[0m: 	train_batch_size: 8
[34m[1mwandb[0m: 	valid_batch_size: 32
[34m[1mwandb[0m: 	warm_up_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: Currently logged in as: [33mkdb[0m. Use [1m`wandb login --relogin`[0m to force relogin


Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classif

****** Starting To Train Epoch #0 ******
Epoch: 0, Step : 10, LR : 2.2529441884280595e-07, Avg Loss : 9.1273
Epoch: 0, Step : 20, LR : 4.301075268817205e-07, Avg Loss : 9.5161
Epoch: 0, Step : 30, LR : 6.34920634920635e-07, Avg Loss : 11.8375
Epoch: 0, Step : 40, LR : 8.397337429595495e-07, Avg Loss : 9.0661
Epoch: 0, Step : 50, LR : 1.044546850998464e-06, Avg Loss : 8.5693
Epoch: 0, Step : 60, LR : 1.2493599590373785e-06, Avg Loss : 7.5753
Epoch: 0, Step : 70, LR : 1.454173067076293e-06, Avg Loss : 7.3283
Epoch: 0, Step : 80, LR : 1.6589861751152075e-06, Avg Loss : 4.5207
Epoch: 0, Step : 90, LR : 1.8637992831541222e-06, Avg Loss : 4.9453
Epoch: 0, Step : 100, LR : 2.0686123911930364e-06, Avg Loss : 4.6219
Epoch: 0, Step : 110, LR : 2.273425499231951e-06, Avg Loss : 3.5462
Epoch: 0, Step : 120, LR : 2.4782386072708657e-06, Avg Loss : 3.0200
Epoch: 0, Step : 130, LR : 2.6830517153097803e-06, Avg Loss : 2.8527
Epoch: 0, Step : 140, LR : 2.8878648233486946e-06, Avg Loss : 2.6360
Epoch: 0

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▃▅▆█
total_f1_score,▁▅▆██
total_pearsonr,▁▅▇██
total_train_loss,█▃▂▁▁
total_train_lr,█▆▅▃▁
total_valid_loss,█▅▃▁▁
train_loss,█▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_lr,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
valid_loss,▅▄█▆▅▃▃▅▃▄▂▂▄▃▃▁▁▂▂▂▁▂▃▂▂

0,1
epoch,5.0
total_f1_score,92.3526
total_pearsonr,94.87124
total_train_loss,0.07568
total_train_lr,0.0
total_valid_loss,0.31413
train_loss,0.05954
train_lr,0.0
valid_loss,0.32229


[34m[1mwandb[0m: Agent Starting Run: hxnsbx2v with config:
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	eps: 1e-08
[34m[1mwandb[0m: 	grad_norm: 1
[34m[1mwandb[0m: 	learning_rate: 2e-05
[34m[1mwandb[0m: 	max_length: 128
[34m[1mwandb[0m: 	train_batch_size: 8
[34m[1mwandb[0m: 	valid_batch_size: 32
[34m[1mwandb[0m: 	warm_up_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0


Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classif

****** Starting To Train Epoch #0 ******
Epoch: 0, Step : 10, LR : 2.2529441884280595e-07, Avg Loss : 8.3281
Epoch: 0, Step : 20, LR : 4.301075268817205e-07, Avg Loss : 7.8835


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


Epoch: 0, Step : 30, LR : 6.34920634920635e-07, Avg Loss : 7.9659


In [25]:
def test(model, dataloader):
    """Evaluate ``model`` on ``dataloader`` and print loss, F1 and Pearson scores.

    Predictions and labels are thresholded at 3 to obtain binary classes for
    the F1 score; Pearson correlation is computed on the raw values.

    Args:
        model: Model called as ``model(**inputs, labels=...)`` whose output
            exposes ``loss`` and ``logits``; moved to the module-level
            ``device``.
        dataloader: Iterable of ``(inputs, labels)`` batches; both items must
            support ``.to(device)``.

    Returns:
        ``(total_test_loss, f1_percent, pearson_percent)``. The same values
        are printed.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.to(device)
    model.eval()

    total_loss = 0
    batch_count = 0
    batch_loss = 0
    num_batches = 0
    # Collect per-batch arrays and concatenate once instead of re-allocating
    # with np.append on every step.
    pred_chunks = []
    label_chunks = []

    for step, batch in enumerate(dataloader):
        batch_count += 1
        num_batches += 1
        batch = tuple(item.to(device) for item in batch)

        batch_input, batch_label = batch

        with torch.no_grad():
            # Cast labels to float, as train() and validate() do, so the loss
            # is computed on the same dtype in every phase.
            outputs = model(**batch_input, labels = batch_label.float())

        loss = outputs.loss
        # reshape(-1) keeps a 1-D array even when the batch holds one sample;
        # a bare squeeze() would give a 0-d array that cannot be concatenated.
        pred_chunks.append(outputs.logits.detach().cpu().numpy().reshape(-1))
        label_chunks.append(batch_label.detach().cpu().numpy().reshape(-1))

        batch_loss += loss.item()
        total_loss += loss.item()

        if (step % 10) == 0 and step != 0:
            print('test_loss : ' ,batch_loss / batch_count)
            batch_loss, batch_count = 0, 0

    if num_batches == 0:
        raise ValueError("test() received an empty dataloader")

    total_test_loss = total_loss / num_batches

    pred_np = np.concatenate(pred_chunks)
    label_np = np.concatenate(label_chunks)

    fone_pred = np.where(pred_np >=3, 1, 0)
    fone_label = np.where(label_np >=3, 1, 0)

    # f1_score expects (y_true, y_pred).
    fone = f1_score(fone_label, fone_pred) * 100
    p_score = pearsonr(pred_np, label_np)[0] * 100
    print('total_test_loss : ' , total_test_loss, "total_f1_score : " , fone, "total_pearsonr:" , p_score)

    return total_test_loss, fone, p_score

In [30]:
# Paths of the per-epoch checkpoint files for epochs 0 through 4.
ckpt1, ckpt2, ckpt3, ckpt4, ckpt5 = (
    f'/content/drive/MyDrive/AI09/model_F.ckpt.{epoch_idx}' for epoch_idx in range(5)
)

In [31]:
all_checkpoints = [ckpt1, ckpt2, ckpt3, ckpt4, ckpt5]

# Evaluate every saved epoch checkpoint on the test set.
for checkpoint in all_checkpoints:
    # map_location lets checkpoints saved on a GPU load on whichever device is
    # active now (e.g. a CPU-only runtime).
    loaded_ckpt = torch.load(checkpoint, map_location=device)
    # The original bare tuple expression showed nothing inside a loop; print it.
    print(f"checkpoint epoch : {loaded_ckpt['epoch']}, saved valid loss : {loaded_ckpt['loss']}")
    model.load_state_dict(loaded_ckpt["model_state_dict"])
    test(model, test_dataloader)

test_loss :  0.4823445234100174
total_test_loss :  0.4942567559131 total_f1_score :  81.34831460674155 total_pearsonr: 89.38817275926439
test_loss :  0.46479722392225253
total_test_loss :  0.49530772425431957 total_f1_score :  84.2315369261477 total_pearsonr: 90.40347654920467
test_loss :  0.3967039429870596
total_test_loss :  0.42338890310288474 total_f1_score :  86.53061224489797 total_pearsonr: 92.00251292662712
test_loss :  0.32531930094422834
total_test_loss :  0.33462238605657596 total_f1_score :  85.53459119496856 total_pearsonr: 92.24971720129449
test_loss :  0.35813261458811746
total_test_loss :  0.37279451583115897 total_f1_score :  85.06224066390043 total_pearsonr: 92.32472216235828
