In [1]:
!pip install sentence-transformers transformers wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 4.7 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 20.3 MB/s 
[?25hCollecting wandb
  Downloading wandb-0.12.17-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 38.6 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 34.3 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 3.8 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.ma

In [2]:
import os
import sys
import re
import pandas as pd
import numpy as np 
import torch
import random
import tarfile
from sklearn.model_selection import train_test_split
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, random_split
import torch.nn.functional as F
from torch.optim import AdamW
from torch.nn import CosineSimilarity, MSELoss
from torch.nn.utils import clip_grad_norm_
from sklearn.metrics import f1_score

In [3]:
# Mount Google Drive (Colab only) so the CSV files and the checkpoint directory
# referenced below under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
cd /content/drive/MyDrive/NLP

/content/drive/MyDrive/NLP


In [5]:
# Load the labelled sentence pairs: `df` is split into train/validation below,
# `test` is kept aside as the held-out set.
df, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/NLP/df.csv',
        '/content/drive/MyDrive/NLP/test.csv',
    )
)

In [6]:
# Keep only the sentence pair and the two label columns in both frames.
df, test = (
    frame[['sentence1', 'sentence2', 'real-label', 'binary-label']]
    for frame in (df, test)
)

In [7]:
# Hold out 10% of the labelled pairs for validation.
# random_state pins the shuffle: without it scikit-learn draws from NumPy's
# global RNG, which has not been seeded at this point in the notebook, so the
# split (and every downstream metric) would change on each run.
# NOTE(review): the name `train` is rebound by the `train` function defined in a
# later cell; re-running the dataset cell after that one would pick up the function.
train, val = train_test_split(df, test_size=0.1, shuffle=True, random_state=42)

In [8]:
!pip install transformers
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [9]:
from transformers import AutoTokenizer, AutoModel, RobertaModel, RobertaTokenizer
from transformers import ElectraModel, ElectraTokenizer
from transformers import get_linear_schedule_with_warmup, get_constant_schedule

In [10]:
# Seed every RNG the notebook relies on so runs are repeatable.
seed = 42
random.seed(seed)
np.random.seed(seed)  # NumPy's global RNG was previously left unseeded
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [11]:
class CustomDataset(Dataset):
    """Map-style dataset over three parallel sequences.

    Attributes:
        X1: first sentence of each pair (list of str).
        X2: second sentence of each pair (list of str).
        Y: label of each pair (list of float).
    """

    def __init__(self, sentence1, sentence2, real_label):
        self.X1, self.X2, self.Y = sentence1, sentence2, real_label

    def __len__(self):
        # The first-sentence sequence defines the dataset size.
        return len(self.X1)

    def __getitem__(self, index):
        # One sample is the (sentence1, sentence2, label) triple stored at `index`.
        return tuple(column[index] for column in (self.X1, self.X2, self.Y))

In [12]:
def generate_dataset(df, flag):
    """Wrap a sentence-pair DataFrame in a CustomDataset.

    Args:
        df: DataFrame with 'sentence1' and 'sentence2' columns plus the label
            column selected by `flag`.
        flag: truthy -> targets come from 'real-label';
            falsy -> targets come from 'binary-label'.

    Returns:
        CustomDataset over the two sentence columns and the chosen label column.
    """
    # Only materialise the label column that is actually used; the other
    # column no longer has to be present in `df`.
    label_column = 'real-label' if flag else 'binary-label'
    return CustomDataset(
        df['sentence1'].tolist(),
        df['sentence2'].tolist(),
        df[label_column].tolist(),
    )

In [13]:
# Build the three datasets with flag=True, i.e. targets come from the
# 'real-label' column.
train_dataset, val_dataset, test_dataset = (
    generate_dataset(split, True) for split in (train, val, test)
)

In [14]:
# Tokenizer for the same "klue/roberta-base" checkpoint that CustomPooling loads;
# the collate functions read it as a module-level global.
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/734k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [15]:
def _collate_sentence_pairs(batch, label_divisor=None):
    """Shared collate logic for the training and evaluation loaders.

    Args:
        batch: iterable of (sentence1, sentence2, label) samples.
        label_divisor: when given, every label is divided by it; when None the
            labels are passed through unchanged.

    Returns:
        (tokenized sentence1 batch, tokenized sentence2 batch, label tensor).
        Both sentence batches are padded/truncated to 128 tokens by the
        module-level `tokenizer`.
    """
    sen_one_list, sen_two_list, label_list = [], [], []
    for sen_one, sen_two, label in batch:
        sen_one_list.append(sen_one)
        sen_two_list.append(sen_two)
        label_list.append(label if label_divisor is None else label / label_divisor)

    # Same tokenizer settings for both sides of the pair.
    tokenized_sen_one, tokenized_sen_two = (
        tokenizer(sentences, add_special_tokens=True, padding='max_length',
                  truncation=True, max_length=128, return_tensors='pt')
        for sentences in (sen_one_list, sen_two_list)
    )

    return (tokenized_sen_one, tokenized_sen_two, torch.Tensor(label_list))


def CustomCollateFn(batch):
    """Collate function for training: labels are divided by 5.0.

    The model's output is a cosine similarity, so the training targets are
    rescaled; `validate` multiplies predictions by 5 to undo this.
    """
    return _collate_sentence_pairs(batch, label_divisor=5.0)


def CustomCollateFn_dev(batch):
    """Collate function for validation/test: labels are left on their original scale."""
    return _collate_sentence_pairs(batch)

In [16]:
def mean_pooling_fn(output, attention_mask):
    """Average token embeddings over the unmasked positions of each sequence.

    Args:
        output: model output exposing `last_hidden_state` with shape
            (batch, seq_len, hidden).
        attention_mask: (batch, seq_len) mask; zero entries are excluded
            from the average.

    Returns:
        Tensor of shape (batch, hidden) holding the masked mean per sequence.
    """
    token_embeddings = output.last_hidden_state  # (batch, seq_len, hidden)
    # Broadcast the mask across the hidden dimension so it can gate every feature.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * mask, 1)  # (batch, hidden)
    # Clamp the token count so a fully masked row divides by 1e-9 instead of 0.
    counts = torch.clamp(mask.sum(1), min=1e-9)  # (batch, hidden)
    return summed / counts

In [17]:
class CustomPooling(nn.Module):
    """Scores a sentence pair by the cosine similarity of mean-pooled encodings.

    Both sentences go through the same "klue/roberta-base" encoder; the
    `cos_score` head is an identity mapping applied to the similarity.
    """

    def __init__(self):
        super().__init__()
        self.robert = AutoModel.from_pretrained("klue/roberta-base")
        self.cos_score = nn.Sequential(nn.Identity())

    def _sentence_embedding(self, encoded):
        """Encode one tokenized batch and mean-pool it over its attention mask."""
        hidden = self.robert(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
            token_type_ids=encoded['token_type_ids'],
        )
        return mean_pooling_fn(hidden, encoded['attention_mask'])

    def forward(self, senone, sentwo):
        """Return one cosine-similarity score per pair in the batch.

        Args:
            senone: tokenized first sentences (needs 'input_ids',
                'attention_mask' and 'token_type_ids').
            sentwo: tokenized second sentences, same keys.
        """
        emb_one = self._sentence_embedding(senone)
        emb_two = self._sentence_embedding(sentwo)
        return self.cos_score(torch.cosine_similarity(emb_one, emb_two))

In [18]:
def initializer(input_dataloader, epochs):
    """Build a fresh model with its AdamW optimizer and linear-warmup scheduler.

    NOTE: a later cell defines another `initializer` with the same signature;
    once that cell has run, it replaces this one.

    Args:
        input_dataloader: training loader; its length is the steps per epoch.
        epochs: number of epochs the schedule should span.

    Returns:
        (model, optimizer, scheduler)
    """
    model = CustomPooling()
    optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

    steps_per_epoch = len(input_dataloader)
    total_steps = steps_per_epoch * epochs
    print(f'total step: {total_steps}')

    # Warm up over 10% of one epoch, then decay linearly across all steps.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=round(steps_per_epoch * 0.1),
        num_training_steps=total_steps,
    )
    return model, optimizer, scheduler

In [19]:
def initializer(input_dataloader, epochs):
    """Start a W&B run and build the model, optimizer and LR scheduler.

    This definition replaces the earlier `initializer` of the same name and
    adds W&B bookkeeping: it opens a run and logs an empty error-text table.

    NOTE(review): the whole sweep definition (`sweep_config`) is passed as the
    run config, while lr and eps stay fixed at 1e-5 / 1e-8 -- confirm that is
    intended.

    Args:
        input_dataloader: training loader; its length is the steps per epoch.
        epochs: number of epochs the schedule should span.

    Returns:
        (model, optimizer, scheduler)
    """
    wandb.init(config=sweep_config)
    model = CustomPooling()

    steps_per_epoch = len(input_dataloader)
    total_steps = steps_per_epoch * epochs

    optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
    # Warm up over 10% of one epoch, then decay linearly across all steps.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=round(steps_per_epoch * 0.1),
        num_training_steps=total_steps,
    )
    print(f'total step: {total_steps}')

    # Create the (initially empty) table under a run-specific key.
    text_table = wandb.Table(columns=["epoch", "step", "text", 'true_label', 'pred_label'])
    wandb.log({f"error-text-{wandb.run.name}" : text_table})
    return model, optimizer, scheduler

In [20]:
def save_checkpoint(path, model, optimizer, scheduler, epoch, loss):
    """Write a resumable training checkpoint for `epoch`.

    The file holds the epoch number, the model/optimizer/scheduler state dicts
    and the loss value.

    Args:
        path: NOTE(review): currently ignored -- the checkpoint always goes to
            the fixed Drive directory below. Kept for interface compatibility.
        model, optimizer, scheduler: objects whose `state_dict()` is saved.
        epoch: epoch index, used as the file-name suffix.
        loss: loss value stored alongside the states.
    """
    checkpoint_dir = '/content/drive/MyDrive/data/checkpoints'
    # torch.save does not create missing directories; make sure the target exists.
    os.makedirs(checkpoint_dir, exist_ok=True)
    file_name = f'{checkpoint_dir}/sts_hyper.ckpt.{epoch}'

    torch.save({
        'epoch':epoch,
        'model_state_dict':model.state_dict(),
        'optimizer_state_dict':optimizer.state_dict(),
        'scheduler_state_dict':scheduler.state_dict(),
        'loss':loss
    }, file_name)

    print(f'SAVING EPOCH {epoch} ...')

In [21]:
def train(model, loss_fct, scheduler, optimizer, train_dataloader, valid_dataloader, epochs,
          max_grad_norm=1.0):
    """Fine-tune `model`, logging to W&B, validating and checkpointing every epoch.

    Relies on the module-level `device`, an active `wandb` run, and the
    `validate` / `save_checkpoint` helpers defined in this notebook.

    Args:
        model: module called as `model(x_batch_one, x_batch_two)`.
        loss_fct: callable `loss_fct(logit, y_batch)` returning a scalar loss.
        scheduler: LR scheduler, stepped once per batch.
        optimizer: optimizer, stepped once per batch.
        train_dataloader: yields (sentence1 batch, sentence2 batch, labels).
        valid_dataloader: loader passed to `validate`, or None to skip validation.
        epochs: number of passes over `train_dataloader`.
        max_grad_norm: gradient-norm clipping threshold (default 1.0, the
            previously hard-coded value).
    """
    model.to(device)
    # Register the W&B hooks once; this used to be repeated at the top of every epoch.
    wandb.watch(model, log="all", log_freq = 10)

    for epoch in range(epochs):
        print(f'****** STARTING TO TRAIN EPOCH #{epoch} ******')

        total_loss = 0    # summed over the whole epoch
        num_batches = 0   # batches seen this epoch
        batch_loss = 0    # summed since the last progress log
        batch_count = 0   # batches since the last progress log

        # validate() leaves the model in eval mode, so switch back each epoch.
        model.train()

        for step, batch in enumerate(train_dataloader):
            num_batches += 1
            batch_count += 1
            batch = tuple(items.to(device) for items in batch)

            (x_batch_one, x_batch_two, y_batch) = batch

            model.zero_grad()

            logit = model(x_batch_one, x_batch_two)
            loss = loss_fct(logit, y_batch)

            batch_loss += loss.item()
            total_loss += loss.item()

            loss.backward()
            clip_grad_norm_(model.parameters(), max_grad_norm)

            optimizer.step()
            scheduler.step()

            if(step % 10 == 0 and step != 0):
                wandb.log({'train_loss': batch_loss / batch_count, 'train_lr': optimizer.param_groups[0]['lr']})
                print(f"Step : {step + 1}, train Loss : {batch_loss / batch_count:.4f}")
                # reset the running window
                batch_loss, batch_count = 0,0

        # An empty loader used to fail here on the unbound loop variable `step`.
        epoch_mean_loss = total_loss / num_batches if num_batches else float('nan')

        wandb.log({'total_train_loss': epoch_mean_loss, 'total_train_lr': optimizer.param_groups[0]['lr'], "epoch" : (epoch + 1)})

        print(f"Epoch {epoch} Total Mean Loss : {epoch_mean_loss:.4f}")
        print(f"*****Epoch {epoch} Train Finish*****\n")

        if valid_dataloader is not None:
            print(f"*****Epoch {epoch} Valid Start*****")
            valid_loss, valid_pearson, valid_f1 = validate(model, loss_fct, valid_dataloader)
            print(f"Epoch {epoch} Valid Loss : {valid_loss} Valid Pearsonr : {valid_pearson} ValidF1 : {valid_f1}")
            print(f"*****Epoch {epoch} Valid Finish*****\n")

        save_checkpoint(".", model, optimizer, scheduler, epoch, epoch_mean_loss)

    print('** Train Completed! **')

In [22]:
!pip install audtorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting audtorch
  Downloading audtorch-0.6.4-py3-none-any.whl (54 kB)
[?25l[K     |██████                          | 10 kB 24.9 MB/s eta 0:00:01[K     |████████████                    | 20 kB 29.7 MB/s eta 0:00:01[K     |██████████████████              | 30 kB 19.2 MB/s eta 0:00:01[K     |████████████████████████        | 40 kB 16.3 MB/s eta 0:00:01[K     |██████████████████████████████  | 51 kB 8.8 MB/s eta 0:00:01[K     |████████████████████████████████| 54 kB 2.5 MB/s 
Collecting audiofile
  Downloading audiofile-1.1.0-py3-none-any.whl (11 kB)
Collecting audeer
  Downloading audeer-1.18.0-py3-none-any.whl (20 kB)
Collecting sox
  Downloading sox-1.4.1-py2.py3-none-any.whl (39 kB)
Installing collected packages: sox, audeer, audiofile, audtorch
Successfully installed audeer-1.18.0 audiofile-1.1.0 audtorch-0.6.4 sox-1.4.1


In [23]:
from audtorch.metrics.functional import pearsonr
from sklearn.metrics import f1_score
from scipy import stats

In [24]:
def validate(model, loss_fct, valid_dataloader):
    """Evaluate `model` on `valid_dataloader` and log loss, F1 and Pearson r to W&B.

    Predictions are multiplied by 5 before scoring, undoing the /5.0 label
    scaling applied by the training collate function; the evaluation loader
    supplies labels on their original scale.

    Relies on the module-level `device`, an active `wandb` run, and the
    `pearsonr` / `f1_process` helpers.

    Args:
        model: module called as `model(x_batch_one, x_batch_two)`.
        loss_fct: callable `loss_fct(prediction, label)` returning a scalar loss.
        valid_dataloader: yields (sentence1 batch, sentence2 batch, labels).

    Returns:
        (mean batch loss, Pearson correlation as returned by `pearsonr`, F1 score).

    Raises:
        ValueError: if `valid_dataloader` yields no batches.
    """
    model.eval()
    model.to(device)

    total_loss = 0
    num_batches = 0
    all_prediction = []
    all_reallabel = []

    for step, batch in enumerate(valid_dataloader):
        num_batches += 1
        batch = tuple(items.to(device) for items in batch)

        (x_batch_one, x_batch_two, batch_y) = batch

        with torch.no_grad():
            logit = model(x_batch_one, x_batch_two)

        # Back to the label scale before computing the loss.
        logit = logit*5
        loss = loss_fct(logit, batch_y)
        total_loss += loss.item()

        logit = logit.cpu()
        batch_y = batch_y.cpu()

        print(f'Step: {step},  Pearson: {pearsonr(logit, batch_y)}')

        # extend() keeps accumulation linear; `a = a + b` re-copied the list every batch.
        all_prediction.extend(logit.tolist())
        all_reallabel.extend(batch_y.tolist())

    if num_batches == 0:
        # Previously surfaced later as an unbound-variable error on `step`.
        raise ValueError('valid_dataloader yielded no batches; nothing to validate')

    # Pearson correlation over the whole split
    pred = torch.Tensor(all_prediction)
    real = torch.Tensor(all_reallabel)
    pearson = pearsonr(pred, real)

    # mean loss per batch
    total_loss = total_loss / num_batches

    # F1 on the binarised scores
    fone = f1_process(pred, real)

    # The F1 key used to carry a trailing space ("total_f1_score "), unlike every other metric name.
    wandb.log({'total_valid_loss': total_loss, "total_f1_score": fone, "total_pearsonr" : pearson})
    print('total_valid_loss : ', total_loss, "total_f1_score : ",  fone,  "total_pearsonr :", pearson)
    return total_loss, pearson, fone

In [25]:
def f1_process(pred, real, threshold=3.0):
    """Binarise scores at `threshold` and return the F1 of predictions vs. labels.

    A value strictly below `threshold` maps to class 0, anything else to class 1.

    Args:
        pred: indexable sequence of predicted scores.
        real: indexable sequence of reference scores; its length sets how many
            positions are compared.
        threshold: cut-off between the two classes (default 3.0, the
            previously hard-coded value).

    Returns:
        The value returned by `f1_score(binarised_real, binarised_pred)`.
    """
    positions = range(len(real))
    # Written as "0 if below else 1" to keep the original comparison semantics.
    bin_real = [0 if real[index] < threshold else 1 for index in positions]
    bin_pred = [0 if pred[index] < threshold else 1 for index in positions]
    return f1_score(bin_real, bin_pred)

In [26]:
# Prefer the GPU, but fall back to CPU so the notebook still runs on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training batches: 16 randomly ordered pairs; CustomCollateFn rescales the labels.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    sampler=RandomSampler(train_dataset),
    collate_fn=CustomCollateFn,
)

# Validation and test share one configuration: 32 pairs per batch, original
# order, labels left unscaled by CustomCollateFn_dev.
valid_dataloader, test_dataloader = (
    DataLoader(
        dataset=split,
        batch_size=32,
        sampler=SequentialSampler(split),
        collate_fn=CustomCollateFn_dev,
    )
    for split in (val_dataset, test_dataset)
)

In [27]:
# W&B sweep definition: Bayesian search that minimises the "total_valid_loss"
# metric logged by validate().
sweep_config = {
    
    "name" : "wandb-sts",   
    "method": "bayes",
    "metric": {
        "name" : "total_valid_loss", 
        "goal" : "minimize"
                },
    
    # Search space. Entries with a single value are effectively constants.
    "parameters": { 
        "epochs" : {
            "distribution" : "categorical",
            "values" : [2]},
        "learning_rate" : {
            "distribution" : "categorical",
            "values" : [1e-5, 3e-5, 5e-5]},                     
        "eps" : {
            "distribution" : "categorical",
            "values" : [1e-8]
        },
        # NOTE(review): run_sweeep uses the module-level dataloaders, so the two
        # batch-size entries below are recorded with the run but not applied.
        "train_batch_size" : {
            "distribution" : "categorical",
            "values" : [16]
        },
        "valid_batch_size" : {
            "distribution" : "categorical",
            "values" : [32]
        },
        "weight_decay" : {
            "distribution" : "categorical",
            "values" : [0, 0.01]
        },
        # Fraction of all training steps used for LR warmup.
        "warm_up_ratio" : {
            "distribution" : "categorical",
            "values" : [0, 0.1, 0.2]
        },
        # NOTE(review): the collate functions hard-code max_length=128; this
        # entry is not read by them.
        "max_length" : {
            "distribution" : "categorical",
            "values" : [128]
        },
        # NOTE(review): run_sweeep does not forward this value to train().
        "grad_norm" : {
            "distribution" : "categorical",
            "values" : [1.0]
        },
    },         
    "early_terminate" : {
        "type": "hyperband", # original note: stop early when the metric fails to improve 2+ times -- NOTE(review): confirm against Hyperband's min_iter/eta semantics
        "min_iter" : 2,
        "eta" : 2
        }
}

In [28]:
def run_sweeep(config=None):
    """Run one W&B sweep trial: build model/optimizer/scheduler from the trial config and train.

    Reads `learning_rate`, `eps`, `weight_decay`, `epochs` and `warm_up_ratio`
    from `wandb.config`. Uses the module-level `train_dataloader` and
    `valid_dataloader`.

    NOTE(review): other sweep parameters (`grad_norm`, `train_batch_size`,
    `valid_batch_size`, `max_length`) are not consumed here.

    Args:
        config: optional run config forwarded to `wandb.init`.
    """
    model = CustomPooling()
    wandb.init(config=config)
    w_config = wandb.config

    # weight_decay was swept but never handed to the optimizer, so trials that
    # differed only in weight_decay trained identically.
    optimizer = AdamW(
        model.parameters(),
        lr=w_config.learning_rate,
        eps=w_config.eps,
        weight_decay=w_config.weight_decay,
    )

    num_training_steps = w_config.epochs * len(train_dataloader)
    # The scheduler expects a whole number of warmup steps.
    num_warmup_steps = int(num_training_steps * w_config.warm_up_ratio)
    scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
    )

    loss_fct = MSELoss()
    train(model, loss_fct, scheduler, optimizer, train_dataloader, valid_dataloader, w_config.epochs)

In [29]:
!pip install wandb
!wandb login
import wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Register the sweep in the W&B project "wandb", then run trials in this
# process with run_sweeep as the trial function (count = 5).
sweep_id = wandb.sweep(sweep_config, project = "wandb")
wandb.agent(sweep_id, run_sweeep, count = 5)

Create sweep with ID: atiec07q
Sweep URL: https://wandb.ai/kdb/wandb/sweeps/atiec07q


[34m[1mwandb[0m: Agent Starting Run: 29n0zw5b with config:
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	eps: 1e-08
[34m[1mwandb[0m: 	grad_norm: 1
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	max_length: 128
[34m[1mwandb[0m: 	train_batch_size: 16
[34m[1mwandb[0m: 	valid_batch_size: 32
[34m[1mwandb[0m: 	warm_up_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0


Downloading:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for

****** STARTING TO TRAIN EPOCH #0 ******
Step : 11, train Loss : 0.1919
Step : 21, train Loss : 0.1466
Step : 31, train Loss : 0.1002
Step : 41, train Loss : 0.0473
Step : 51, train Loss : 0.0411
Step : 61, train Loss : 0.0356
Step : 71, train Loss : 0.0363
Step : 81, train Loss : 0.0314
Step : 91, train Loss : 0.0313
Step : 101, train Loss : 0.0292
Step : 111, train Loss : 0.0281
Step : 121, train Loss : 0.0310
Step : 131, train Loss : 0.0243
Step : 141, train Loss : 0.0265
Step : 151, train Loss : 0.0327
Step : 161, train Loss : 0.0294
Step : 171, train Loss : 0.0276
Step : 181, train Loss : 0.0303
Step : 191, train Loss : 0.0261
Step : 201, train Loss : 0.0256
Step : 211, train Loss : 0.0300
Step : 221, train Loss : 0.0305
Step : 231, train Loss : 0.0305
Step : 241, train Loss : 0.0308
Step : 251, train Loss : 0.0277
Step : 261, train Loss : 0.0211
Step : 271, train Loss : 0.0275
Step : 281, train Loss : 0.0301
Step : 291, train Loss : 0.0203
Step : 301, train Loss : 0.0225
Step : 3

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁█
total_f1_score,▁█
total_pearsonr,▁█
total_train_loss,█▁
total_train_lr,█▁
total_valid_loss,█▁
train_loss,█▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_lr,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁

0,1
epoch,2.0
total_f1_score,0.89988
total_pearsonr,0.92125
total_train_loss,0.0094
total_train_lr,0.0
total_valid_loss,0.52941
train_loss,0.01039
train_lr,0.0


[34m[1mwandb[0m: Agent Starting Run: 9hg8h32t with config:
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	eps: 1e-08
[34m[1mwandb[0m: 	grad_norm: 1
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	max_length: 128
[34m[1mwandb[0m: 	train_batch_size: 16
[34m[1mwandb[0m: 	valid_batch_size: 32
[34m[1mwandb[0m: 	warm_up_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.01
Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model tha

****** STARTING TO TRAIN EPOCH #0 ******
Step : 11, train Loss : 0.1953
Step : 21, train Loss : 0.1708
Step : 31, train Loss : 0.0984
Step : 41, train Loss : 0.0575
Step : 51, train Loss : 0.0387
Step : 61, train Loss : 0.0322
Step : 71, train Loss : 0.0342
Step : 81, train Loss : 0.0357
Step : 91, train Loss : 0.0293
Step : 101, train Loss : 0.0283
Step : 111, train Loss : 0.0250
Step : 121, train Loss : 0.0241
Step : 131, train Loss : 0.0253
Step : 141, train Loss : 0.0271
Step : 151, train Loss : 0.0210
Step : 161, train Loss : 0.0257
Step : 171, train Loss : 0.0242
Step : 181, train Loss : 0.0258
Step : 191, train Loss : 0.0328
Step : 201, train Loss : 0.0271
Step : 211, train Loss : 0.0311
Step : 221, train Loss : 0.0238
Step : 231, train Loss : 0.0272
Step : 241, train Loss : 0.0345
Step : 251, train Loss : 0.0221
Step : 261, train Loss : 0.0273
Step : 271, train Loss : 0.0244
Step : 281, train Loss : 0.0274
Step : 291, train Loss : 0.0209
Step : 301, train Loss : 0.0235
Step : 3

In [None]:
# Disabled experiment, kept as a bare string literal so none of it executes.
# NOTE(review): if this is ever revived, the wandb.agent line calls train(...)
# immediately and passes its result (train has no return statement) instead of
# a function, unlike the working call above that passes run_sweeep itself.
"""
loss_fct = MSELoss()
model, optimizer, scheduler = initializer(train_dataloader, 5)
sweep_id = wandb.sweep(sweep_config, project = "wandb-sts")
wandb.agent(sweep_id, train(model, loss_fct, scheduler, optimizer, train_dataloader, valid_dataloader, 5), count = 5)
"""