In [15]:
import os, sys
import logging
import argparse
import csv
import gc
from tqdm import tqdm
from collections import OrderedDict, Counter

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import LambdaLR
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from transformers import AdamW, get_scheduler, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import WEIGHTS_NAME, CONFIG_NAME

from CustomDataset import MovieDataset
from CustomModel import SentimentBertModel
from util.log import setup_default_logging
from util.metrics import AverageMeter
from util.collate import customCollate

try:
    import wandb
    has_wandb = True
except ImportError: 
    has_wandb = False

In [2]:
_logger = logging.getLogger('train')
parser = argparse.ArgumentParser(description='Train Config', add_help=False)

In [3]:
def get_dataset(data_path, tokenizer, max_len, random_seed, is_train=True):
    document, target = [], []
    with open(data_path, 'r') as f:
        lines = list(csv.reader(f, delimiter='\t'))
        header = lines.pop(0)
        for line in lines:
            document.append(line[1])
            target.append(int(line[2]))
    if is_train:
        train_doc, valid_doc, train_target, valid_target = train_test_split(
            document, target, test_size=0.1, shuffle=True, stratify=target, random_state=random_seed
        )
        train_dataset = MovieDataset(tokenizer, train_doc, train_target, max_len)
        valid_dataset = MovieDataset(tokenizer, valid_doc, valid_target, max_len)
        _logger.info(f'train dataset : {len(train_dataset)}, valid dataset : {len(valid_dataset)}')
        _logger.info(f'train dataset : {Counter(train_target)}, valid dataset : {Counter(valid_target)}')
        return train_dataset, valid_dataset
    else:
        test_dataset = MovieDataset(tokenizer, document, target, max_len)
        _logger.info(f'test dataset :{len(test_dataset)}')
        _logger.info(f'test dataset :{Counter(target)}')
        return test_dataset

In [4]:
def initalize_model(prev_model, max_len, finetune=False):
    model = SentimentBertModel.from_pretrained(prev_model, 
                                                n_classes=2,
                                                max_length=max_len)
    if finetune:
        for param in model.parameters():
            param.requires_grad = True
    else:
        for name, param in model.named_parameters():
            if 'classifier' in name:
                param.requires_grad = True
            else:
                param.requires_grad = False

    return model

In [5]:
def train_one_epoch(model, loader, device, loss_fn, optimizer):
    ### argument
    log_interval = 1
    ###

    model.train()
    
    train_loss_m = AverageMeter()
    last_idx = len(loader) - 1
    for idx, batch in tqdm(enumerate(loader), total=len(loader)):
        last_batch = idx == last_idx
        optimizer.zero_grad()    
        batch = {k: v.to(device) for k, v in batch.items()}
        logits = model(input_ids=batch['input_ids'],
                        token_type_ids=batch['token_type_ids'],
                        attention_mask=batch['attention_mask'])
        
        loss = loss_fn(logits.view(-1, 2), batch['labels'].view(-1))
        train_loss_m.update(loss.data.item(), batch['input_ids'].size(0))
        # _logger.info(f'batch_train_loss : {loss.data.item()}')
        loss.backward()

        clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        

        if last_batch or (idx+1 % log_interval == 0):
            lrl = [param_group['lr'] for param_group in optimizer.param_groups]
            avg_lr = sum(lrl)/len(lrl)
            _logger.info(f'avg_train_loss : {train_loss_m.avg}, LR : {avg_lr}')

        del batch, loss

    del loader
    gc.collect()

    metrics = OrderedDict([('loss', train_loss_m.avg)])

    return metrics

In [6]:
def train_one_epoch2(model, loader, device, loss_fn, optimizer):
    ### argument
    log_interval = 1
    ###

    model.train()
    
    train_loss_m = AverageMeter()
    last_idx = len(loader) - 1
    for idx, batch in tqdm(enumerate(loader), total=len(loader)):
        last_batch = idx == last_idx
        optimizer.zero_grad()    
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(input_ids=batch['input_ids'],
                        token_type_ids=batch['token_type_ids'],
                        attention_mask=batch['attention_mask'],
                        labels=batch['labels'])
        
        loss = outputs[0]
        train_loss_m.update(loss.data.item(), batch['input_ids'].size(0))
        # _logger.info(f'batch_train_loss : {loss.data.item()}')
        loss.backward()

        clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        

        if last_batch or (idx+1 % log_interval == 0):
            lrl = [param_group['lr'] for param_group in optimizer.param_groups]
            avg_lr = sum(lrl)/len(lrl)
            _logger.info(f'avg_train_loss : {train_loss_m.avg}, LR : {avg_lr}')

        del batch, loss

    del loader
    gc.collect()

    metrics = OrderedDict([('loss', train_loss_m.avg)])

    return metrics

In [7]:
def validation(model, loader, device, loss_fn):
    ### argument
    log_interval = 1
    ###

    model.eval()
    val_loss_m = AverageMeter()
    acc_m = AverageMeter()

    last_idx = len(loader) - 1
    with torch.no_grad():
        for idx, batch in tqdm(enumerate(loader), total=len(loader)):
            last_batch = idx == last_idx
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(input_ids=batch['input_ids'],
                                 token_type_ids=batch['token_type_ids'],
                                 attention_mask=batch['attention_mask'])
            
            loss = loss_fn(logits.view(-1, 2), batch['labels'].view(-1))
            val_loss_m.update(loss.data.item(), batch['input_ids'].size(0))
            
            acc = sum([i == j for i, j in zip(torch.argmax(logits, 1).tolist(), batch['labels'])]) / len(batch['labels'])
            acc_m.update(acc)

            del batch, loss, logits            

        if last_batch or (idx+1 % log_interval == 0):
            _logger.info(f'avg_val_loss : {val_loss_m.avg}, avg_accuracy : {acc_m.avg}')

    metrics = OrderedDict([('loss', val_loss_m.avg), ('accuracy', acc_m.avg)])

    del loader
    gc.collect()

    return metrics

In [8]:
def validation2(model, loader, device, loss_fn):
    ### argument
    log_interval = 1
    ###

    model.eval()
    # val_loss_m = AverageMeter()
    acc_m = AverageMeter()

    last_idx = len(loader) - 1
    with torch.no_grad():
        for idx, batch in tqdm(enumerate(loader), total=len(loader)):
            last_batch = idx == last_idx
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=batch['input_ids'],
                                 token_type_ids=batch['token_type_ids'],
                                 attention_mask=batch['attention_mask'])
            
            logits = outputs[0]
            # val_loss_m.update(loss.data.item(), batch['input_ids'].size(0))
            
            acc = sum([i == j for i, j in zip(torch.argmax(logits, 1).tolist(), batch['labels'])]) / len(batch['labels'])
            acc_m.update(acc)

            del batch, outputs            

        if last_batch or (idx+1 % log_interval == 0):
            _logger.info(f'avg_accuracy : {acc_m.avg}')

    metrics = OrderedDict([('accuracy', acc_m.avg)])

    del loader
    gc.collect()

    return metrics

In [9]:
def save(model, tokenizer, new_model):
    _logger.info('saving model...')
    os.makedirs(new_model, exist_ok=True)
    torch.save(model.state_dict(), new_model + WEIGHTS_NAME)
    model.config.to_json_file(new_model + CONFIG_NAME)
    tokenizer.save_pretrained(new_model)
    _logger.info(f'saved! {new_model}')

In [10]:
### argument
experiment = "sbse"
random_seed = 1234
# prev_model = "bert-base-multilingual-cased"
prev_model = "monologg/kobert"
train_data_path = "/home/ubuntu/workspace/kaist.sbse/proj/data/ratings_train.txt"
new_model = "/home/ubuntu/workspace/kaist.sbse/proj/model/bert.v4/"
max_len = 512
batch_size = 8
lr = 5e-5
epochs = 10
valid_every_n_batch = 1
save_best = "loss"
val_metric = "loss"
###

In [11]:
if has_wandb:
        wandb.init(project=experiment)
else: 
    _logger.warning("You've requested to log metrics to wandb but package not found. "
                    "Metrics not being logged to wandb, try `pip install wandb`")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhannabros[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [12]:
setup_default_logging()
if torch.cuda.is_available():
    device = torch.device("cuda")
    _logger.info(f'GPU: {torch.cuda.get_device_name(0)}')
else:
    device = torch.device("cpu")
torch.manual_seed(random_seed)

GPU: Tesla T4


<torch._C.Generator at 0x7f6f7e8fd330>

In [13]:
tokenizer = BertTokenizer.from_pretrained(prev_model, do_lower_case=False)
train_dataset, valid_dataset = get_dataset(train_data_path, tokenizer, max_len, random_seed, is_train=True)

train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=customCollate)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=customCollate)

Lock 140116741901968 acquired on /home/ubuntu/.cache/huggingface/transformers/efee434f5f4c5c89b5a7d8d5f30bbb0496f1540349fcfa21729cec5b96cfd2d1.719459e20bc981bc2093e859b02c3a3e51bab724d6b58927b23b512a3981229f.lock


Downloading:   0%|          | 0.00/77.8k [00:00<?, ?B/s]

Lock 140116741901968 released on /home/ubuntu/.cache/huggingface/transformers/efee434f5f4c5c89b5a7d8d5f30bbb0496f1540349fcfa21729cec5b96cfd2d1.719459e20bc981bc2093e859b02c3a3e51bab724d6b58927b23b512a3981229f.lock
Lock 140116734508880 acquired on /home/ubuntu/.cache/huggingface/transformers/d1c07e179f5e00959a3c8e4a150eaa4907dfe26544e4a71f2b0163982a476523.767d1b760a83978bae6c324157fad57ee513af333a7cea6986e852579f6f0dd1.lock


Downloading:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

Lock 140116734508880 released on /home/ubuntu/.cache/huggingface/transformers/d1c07e179f5e00959a3c8e4a150eaa4907dfe26544e4a71f2b0163982a476523.767d1b760a83978bae6c324157fad57ee513af333a7cea6986e852579f6f0dd1.lock
Lock 140116734245456 acquired on /home/ubuntu/.cache/huggingface/transformers/31dc8da633439f22ed80bede01f337996bc709eb8429f86f2b24e2103558b039.89a06cdfd16840fd89cc5c2493ef63cd0b6068e85f70ac988a3673e2722cab2e.lock


Downloading:   0%|          | 0.00/426 [00:00<?, ?B/s]

Lock 140116734245456 released on /home/ubuntu/.cache/huggingface/transformers/31dc8da633439f22ed80bede01f337996bc709eb8429f86f2b24e2103558b039.89a06cdfd16840fd89cc5c2493ef63cd0b6068e85f70ac988a3673e2722cab2e.lock
train dataset : 135000, valid dataset : 15000
train dataset : Counter({0: 67656, 1: 67344}), valid dataset : Counter({0: 7517, 1: 7483})


In [14]:
model = initalize_model(prev_model, max_len)
model = model.to(device)

Lock 140116790583376 acquired on /home/ubuntu/.cache/huggingface/transformers/9525d6f96682baa1f21538ea58d36263fe16a46345dd9637e3e28a4df2f9380f.ebe6e13ff204bebbffd4764cda3d5a97dc690a9c4110bde6d909ddc3ed5c4585.lock


Downloading:   0%|          | 0.00/369M [00:00<?, ?B/s]

Lock 140116790583376 released on /home/ubuntu/.cache/huggingface/transformers/9525d6f96682baa1f21538ea58d36263fe16a46345dd9637e3e28a4df2f9380f.ebe6e13ff204bebbffd4764cda3d5a97dc690a9c4110bde6d909ddc3ed5c4585.lock
Some weights of SentimentBertModel were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.0.weight', 'classifier.0.bias', 'classifier.2.bias', 'classifier.2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
### Sequence Classification
model = BertForSequenceClassification.from_pretrained(prev_model, num_labels=2)

for name, param in model.named_parameters():
    if 'classifier' in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

model.to(device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [16]:
total_steps = len(train_loader) * epochs
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=2, num_training_steps=total_steps)
# scheduler = LambdaLR(optimizer=optimizer, lr_lambda=lambda epoch: 0.9 ** epoch)
warmup_step = int(total_steps * 0.1)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=total_steps)
loss_fn = nn.CrossEntropyLoss()

In [17]:
min_val_loss = 10.0
max_accuracy = 0.0

try:
    for epoch in range(epochs):
        rowd = OrderedDict(epoch=epoch)
        gc.collect()
        _logger.info(f'Train {epoch} epoch')
        train_metrics = train_one_epoch(model, train_loader, device, loss_fn, optimizer)
        scheduler.step()
        rowd.update([('train_' + k, v) for k, v in train_metrics.items()])
        gc.collect()

        if (epoch + 1) % valid_every_n_batch == 0:
            _logger.info('validation ...')
            valid_metrics = validation(model, valid_loader, device, loss_fn)
            rowd.update([('eval_' + k, v) for k, v in valid_metrics.items()])
            # scheduler.step()

            gc.collect()

            # Save / Early Stop
            if save_best.lower().startswith('loss'):
                _logger.info(f'best loss was {min_val_loss}')
                if valid_metrics[val_metric] < min_val_loss:
                    min_val_loss = valid_metrics[val_metric]
                    _logger.info(f'best loss changed to {min_val_loss}')
                    save(model, tokenizer, new_model)
            elif save_best.lower().startswith('accuracy'):
                _logger.info(f'best accuracy was {max_accuracy}')                    
                if valid_metrics[val_metric] > max_accuracy:
                    max_accuracy = valid_metrics[val_metric]
                    _logger.info(f'best accuracy changed to {max_accuracy}')
                    save(model, tokenizer, new_model)
        wandb.log(rowd)

except KeyboardInterrupt:
    if save_best.lower().startswith('accuracy'):
        _logger.info(f'Model accuracy was {max_accuracy}')
    elif save_best.lower().startswith('loss'):
        _logger.info(f'Model valid loss was {min_val_loss}')
    elif save_best.lower().startswith('weight'):
        _logger.info(f'Model weighted valid loss was {min_val_loss}')
    _logger.info('Bye!')
    

Train 0 epoch
avg_train_loss : 0.7073715925216675, LR : 0.0
100%|█████████▉| 16874/16875 [1:23:55<00:00,  3.35it/s]avg_train_loss : 0.6945980079191703, LR : 0.0
100%|██████████| 16875/16875 [1:23:55<00:00,  3.35it/s]
validation ...
100%|██████████| 1875/1875 [08:33<00:00,  3.65it/s]
avg_val_loss : 0.6944197644233704, avg_accuracy : 0.49406668543815613
best loss was 10.0
best loss changed to 0.6944197644233704
saving model...
saved! /home/ubuntu/workspace/kaist.sbse/proj/model/bert.v4/
Train 1 epoch
  0%|          | 0/16875 [00:00<?, ?it/s]avg_train_loss : 0.7007497549057007, LR : 2.9629629629629633e-09
100%|█████████▉| 16874/16875 [1:24:04<00:00,  3.35it/s]avg_train_loss : 0.6943976277210094, LR : 2.9629629629629633e-09
100%|██████████| 16875/16875 [1:24:05<00:00,  3.34it/s]
validation ...
100%|██████████| 1875/1875 [08:34<00:00,  3.64it/s]
avg_val_loss : 0.6941889005343119, avg_accuracy : 0.4939333498477936
best loss was 0.6944197644233704
best loss changed to 0.6941889005343119
savin

In [45]:
### argument
test_data_path = "/home/ubuntu/workspace/kaist.sbse/proj/data/ratings_test.txt"
model_path = "/home/ubuntu/workspace/kaist.sbse/proj/model/bert.v2/"
###

In [23]:
test_dataset = get_dataset(test_data_path, tokenizer, max_len, random_seed, is_train=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=customCollate)

test dataset :1000
test dataset :Counter({1: 25173, 0: 24827})


In [46]:
acc_m = AverageMeter()
pred_ids, true_ids = [], []
last_idx = len(test_loader) - 1
model.eval()
with torch.no_grad():
    for idx, batch in tqdm(enumerate(test_loader), total=len(test_loader)):
        last_batch = idx == last_idx
        batch = {k: v.to(device) for k, v in batch.items()}
        logits = model(input_ids=batch['input_ids'],
                                 token_type_ids=batch['token_type_ids'],
                                 attention_mask=batch['attention_mask'])
            
        loss = loss_fn(logits.view(-1, 2), batch['labels'].view(-1))
            
        acc = sum([i == j for i, j in zip(torch.argmax(logits, 1).tolist(), batch['labels'])]) / len(batch['labels'])
        acc_m.update(acc)

        pred_ids.extend(torch.argmax(logits, 1).tolist())
        true_ids.extend(batch['labels'].tolist())

        if last_batch:
            _logger.info(f'avg_accuracy : {acc_m.avg}')

  0%|          | 0/125 [00:00<?, ?it/s]


AttributeError: 'SequenceClassifierOutput' object has no attribute 'view'

In [47]:
acc_m = AverageMeter()
pred_ids, true_ids = [], []
last_idx = len(test_loader) - 1
model.eval()
with torch.no_grad():
    for idx, batch in tqdm(enumerate(test_loader), total=len(test_loader)):
        last_batch = idx == last_idx
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(input_ids=batch['input_ids'],
                                 token_type_ids=batch['token_type_ids'],
                                 attention_mask=batch['attention_mask'])
        logits = outputs[0]
            
        acc = sum([i == j for i, j in zip(torch.argmax(logits, 1).tolist(), batch['labels'])]) / len(batch['labels'])
        acc_m.update(acc)

        pred_ids.extend(torch.argmax(logits, 1).tolist())
        true_ids.extend(batch['labels'].tolist())

        if last_batch:
            _logger.info(f'avg_accuracy : {acc_m.avg}')

 99%|█████████▉| 124/125 [00:30<00:00,  3.97it/s]avg_accuracy : 0.64000004529953
100%|██████████| 125/125 [00:30<00:00,  4.05it/s]


In [48]:
_logger.info(classification_report(true_ids, pred_ids))

              precision    recall  f1-score   support

           0       0.62      0.71      0.66       492
           1       0.67      0.57      0.62       508

    accuracy                           0.64      1000
   macro avg       0.64      0.64      0.64      1000
weighted avg       0.64      0.64      0.64      1000

