In [None]:
%matplotlib inline
from glob import glob
import os
import matplotlib.pyplot as plt
import json
import copy
from collections import defaultdict
import gc
gc.enable()

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.optimizer import Optimizer
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import (
    Dataset, DataLoader, 
    SequentialSampler, RandomSampler
)

from transformers import AutoConfig
from transformers import (
    get_cosine_schedule_with_warmup, 
    get_cosine_with_hard_restarts_schedule_with_warmup,
    get_linear_schedule_with_warmup
)
from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
from IPython.display import clear_output
from tqdm import tqdm, trange
from transformers.modeling_utils import SequenceSummary

In [None]:
import numpy as np
import pandas as pd

train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

In [None]:
train.head()

In [None]:
test.head()

In [None]:
from sklearn import model_selection
def create_folds(data, num_splits):
    data["kfold"] = -1
    kf = model_selection.KFold(n_splits=num_splits, shuffle=True, random_state=2021)
    for f, (t_, v_) in enumerate(kf.split(X=data)):
        data.loc[v_, 'kfold'] = f
    return data
train = create_folds(train, num_splits=5)


In [None]:
train

In [70]:
class csvDataset(Dataset):
    def __init__(self, df_data):
        self.data = copy.deepcopy(df_data)
        self.data['excerpt'] = self.data['excerpt'].map(lambda x : x.replace('\n', ' '))
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        self.excerpts = self.data.excerpt.values.tolist()
        if 'target' in self.data.columns:
            self.targets = self.data.target.values.tolist()
            return {'excerpts': self.excerpts[index], 'targets': self.targets[index]}
        else:
            return {'excerpts': self.excerpts[index]}
            
        
        

class BatchGenerator:
    def __init__(self, tokenizer, max_len=256):
        self.tokenizer = tokenizer
        self.max_len = max_len
    def __call__(self, batch):
        self.tokenizer.padding_side = "right"
        data_str =[item['excerpts'] for item in batch]
        data_batch = self.tokenizer.batch_encode_plus(data_str, 
                                                        padding='longest', 
                                                        max_length=self.max_len,
                                                        truncation=True, 
                                                        return_tensors='pt')
        
        if 'targets' in batch[0]:
            targets =torch.tensor([item['targets'] for item in batch])#,dtype=torch.double)
            
            return {'input_ids': data_batch.input_ids,
                'token_type_ids': data_batch.token_type_ids,
                'attention_mask': data_batch.attention_mask,
                'label':targets
                   }
        


        return {'input_ids': data_batch.input_ids,
                'token_type_ids': data_batch.token_type_ids,
                'attention_mask': data_batch.attention_mask}
        

def get_dataloader(dataset, batch_generator, batch_size=4, shuffle=True):
    data_loader = DataLoader(dataset, 
                             batch_size=batch_size, 
                             shuffle=shuffle, 
                             collate_fn=batch_generator,
                             num_workers=4,
                             pin_memory=True)
    
    return data_loader


In [93]:
from transformers.models.roberta.modeling_roberta import RobertaClassificationHead

class CommonLitModel(nn.Module):
    def __init__(
        self, 
        model_name, 
        config,  
        multisample_dropout=False,
        output_hidden_states=False
    ):
        super(CommonLitModel, self).__init__()
        self.config = config
        self.clmodel = AutoModel.from_pretrained(
            model_name, 
            output_hidden_states=output_hidden_states
        )
        self.layer_norm = nn.LayerNorm(config.hidden_size)
        if multisample_dropout:
            self.dropouts = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
        else:
            self.dropouts = nn.ModuleList([nn.Dropout(0.3)])
        self.sequence_summary = SequenceSummary(config)  
        #self.regressor = nn.Linear(config.hidden_size*2, 1)
        self.regressor = nn.Linear(config.hidden_size, 1)
        #self.classifier = RobertaClassificationHead(config)
        self._init_weights(self.layer_norm)
        self._init_weights(self.regressor)
 
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
 
    def forward(
        self, 
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None
    ):
        outputs = self.clmodel(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        sequence_output = outputs[1]
        #logits = self.classifier(sequence_output)
        output = self.sequence_summary(sequence_output)
        logits = self.regressor(output)

    
        loss = None
        if labels is not None:
            # regression task
            loss_fn = torch.nn.MSELoss()
            logits = logits.view(-1).to(labels.dtype)
            loss = torch.sqrt(loss_fn(logits, labels.view(-1)))
            #loss = loss_fn(logits, labels.view(-1))
        
        output = (logits,) + outputs[1:]
        return ((loss,) + output) if loss is not None else output

In [94]:
class Lamb(Optimizer):
    # Reference code: https://github.com/cybertronai/pytorch-lamb

    def __init__(
        self,
        params,
        lr: float = 1e-3,
        betas = (0.9, 0.999),
        eps: float = 1e-6,
        weight_decay: float = 0,
        clamp_value: float = 10,
        adam: bool = False,
        debias: bool = False,
    ):
        if lr <= 0.0:
            raise ValueError('Invalid learning rate: {}'.format(lr))
        if eps < 0.0:
            raise ValueError('Invalid epsilon value: {}'.format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError(
                'Invalid beta parameter at index 0: {}'.format(betas[0])
            )
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError(
                'Invalid beta parameter at index 1: {}'.format(betas[1])
            )
        if weight_decay < 0:
            raise ValueError(
                'Invalid weight_decay value: {}'.format(weight_decay)
            )
        if clamp_value < 0.0:
            raise ValueError('Invalid clamp value: {}'.format(clamp_value))

        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        self.clamp_value = clamp_value
        self.adam = adam
        self.debias = debias

        super(Lamb, self).__init__(params, defaults)

    def step(self, closure = None):
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    msg = (
                        'Lamb does not support sparse gradients, '
                        'please consider SparseAdam instead'
                    )
                    raise RuntimeError(msg)

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(
                        p, memory_format=torch.preserve_format
                    )
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(
                        p, memory_format=torch.preserve_format
                    )

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Decay the first and second moment running average coefficient
                # m_t
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                # v_t
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                # Paper v3 does not use debiasing.
                if self.debias:
                    bias_correction = math.sqrt(1 - beta2 ** state['step'])
                    bias_correction /= 1 - beta1 ** state['step']
                else:
                    bias_correction = 1

                # Apply bias to lr to avoid broadcast.
                step_size = group['lr'] * bias_correction

                weight_norm = torch.norm(p.data).clamp(0, self.clamp_value)

                adam_step = exp_avg / exp_avg_sq.sqrt().add(group['eps'])
                if group['weight_decay'] != 0:
                    adam_step.add_(p.data, alpha=group['weight_decay'])

                adam_norm = torch.norm(adam_step)
                if weight_norm == 0 or adam_norm == 0:
                    trust_ratio = 1
                else:
                    trust_ratio = weight_norm / adam_norm
                state['weight_norm'] = weight_norm
                state['adam_norm'] = adam_norm
                state['trust_ratio'] = trust_ratio
                if self.adam:
                    trust_ratio = 1

                p.data.add_(adam_step, alpha=-step_size * trust_ratio)

        return loss

In [95]:
def get_optimizer_params(model):
    # differential learning rate and weight decay
    param_optimizer = list(model.named_parameters())
    learning_rate = 5e-5
    no_decay = ['bias', 'gamma', 'beta']
    group1=['layer.0.','layer.1.','layer.2.','layer.3.']
    group2=['layer.4.','layer.5.','layer.6.','layer.7.']    
    group3=['layer.8.','layer.9.','layer.10.','layer.11.']
    group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.']
    optimizer_parameters = [
        {'params': [p for n, p in model.clmodel.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': 0.01},
        {'params': [p for n, p in model.clmodel.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': 0.01, 'lr': learning_rate/2.6},
        {'params': [p for n, p in model.clmodel.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': 0.01, 'lr': learning_rate},
        {'params': [p for n, p in model.clmodel.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': 0.01, 'lr': learning_rate*2.6},
        {'params': [p for n, p in model.clmodel.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': 0.0},
        {'params': [p for n, p in model.clmodel.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': 0.0, 'lr': learning_rate/2.6},
        {'params': [p for n, p in model.clmodel.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': 0.0, 'lr': learning_rate},
        {'params': [p for n, p in model.clmodel.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': 0.0, 'lr': learning_rate*2.6},
        {'params': [p for n, p in model.named_parameters() if "clmodel" not in n], 'lr':1e-3, "momentum" : 0.99},
    ]
    return optimizer_parameters

In [96]:
#param_optimizer = list(model.named_parameters())


In [97]:
def make_model(model_name='../input/xlnet-large-cased/', num_labels=1):
    tokenizer = AutoTokenizer.from_pretrained('xlnet-large-cased')
    config = AutoConfig.from_pretrained(model_name)
    config.update({'num_labels':num_labels})
    
    ### add below ###
    config.update({ 
        "attention_probs_dropout_prob": 0.0,
        "hidden_dropout_prob": 0.0
        })
    
    model = CommonLitModel(model_name, config=config)
    return model, tokenizer

def make_optimizer(model, optimizer_name="AdamW"):
    optimizer_grouped_parameters = get_optimizer_params(model)
    kwargs = {
            'lr':5e-5,
            'weight_decay':0.01,
            # 'betas': (0.9, 0.98),
            # 'eps': 1e-06
    }
    if optimizer_name == "LAMB":
        optimizer = Lamb(optimizer_grouped_parameters, **kwargs)
        return optimizer
    elif optimizer_name == "Adam":
        from torch.optim import Adam
        optimizer = Adam(optimizer_grouped_parameters, **kwargs)
        return optimizer
    elif optimizer_name == "AdamW":
        optimizer = AdamW(optimizer_grouped_parameters, **kwargs)
        return optimizer
    else:
        raise Exception('Unknown optimizer: {}'.format(optimizer_name))

def make_scheduler(optimizer, decay_name='linear', t_max=None, warmup_steps=None):
    if decay_name == 'step':
        scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=[30, 60, 90],
            gamma=0.1
        )
    elif decay_name == 'cosine':
        scheduler = lrs.CosineAnnealingLR(
            optimizer,
            T_max=t_max
        )
    elif decay_name == "cosine_warmup":
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=t_max
        )
    elif decay_name == "linear":
        scheduler = get_linear_schedule_with_warmup(
            optimizer, 
            num_warmup_steps=warmup_steps, 
            num_training_steps=t_max
        )
    else:
        raise Exception('Unknown lr scheduler: {}'.format(decay_type))    
    return scheduler    

def make_loader(
    data, 
    tokenizer, 
    max_len,
    batch_size,
    fold=0
):
    train_set, valid_set = data[data['kfold']!=fold], data[data['kfold']==fold]
    train_set=csvDataset(data)
    train_generator = BatchGenerator(tokenizer)
    train_loader = get_dataloader(train_set,train_generator,batch_size)
    
    valid_set=csvDataset(valid_set)
    valid_generator = BatchGenerator(tokenizer)
    valid_loader = get_dataloader(valid_set,valid_generator,batch_size//2)
    

    return train_loader, valid_loader

In [98]:
class Trainer:
    def __init__(self, model, optimizer, scheduler, log_interval=1, evaluate_interval=1):
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.log_interval = log_interval
        self.evaluate_interval = evaluate_interval
        self.evaluator = Evaluator(self.model)
        self.gradient_accumulation_steps = 4

    def train(self, train_loader, valid_loader, epoch, 
              result_dict, tokenizer, fold):
        count = 0
        self.model.train()
        device = 'cuda:1'
        
        
        for batch_idx, batch_data in enumerate(train_loader):
            input_ids, attention_mask, token_type_ids, labels = batch_data['input_ids'], \
                batch_data['attention_mask'], batch_data['token_type_ids'], batch_data['label']
            input_ids, attention_mask, token_type_ids, labels = \
                input_ids.to(device), attention_mask.to(device), token_type_ids.to(device), labels.to(device)
                #input_ids.cuda(), attention_mask.cuda(), token_type_ids.cuda(), labels.cuda()
          
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                labels=labels
            )

            loss, logits = outputs[:2]
            count += labels.size(0)
            
            loss = loss / self.gradient_accumulation_steps
            loss.backward()
            
            if batch_idx % self.gradient_accumulation_steps:
                self.optimizer.step()
                self.optimizer.zero_grad()
                self.scheduler.step()
    

            if batch_idx % self.log_interval == 0:
                _s = str(len(str(len(train_loader.sampler))))
                ret = [
                    ('epoch: {:0>3} [{: >' + _s + '}/{} ({: >3.0f}%)]').format(epoch, count, len(train_loader.sampler), 100 * count / len(train_loader.sampler)),
                    'train_loss: {: >4.5f}'.format(loss),
                ]
                print(', '.join(ret))
            
            if batch_idx % self.evaluate_interval == 0:
                result_dict = self.evaluator.evaluate(
                    valid_loader, 
                    epoch, 
                    result_dict, 
                    tokenizer
                )
                if result_dict['val_loss'][-1] < result_dict['best_val_loss']:
                    print("{} epoch, best epoch was updated! valid_loss: {: >4.5f}".format(epoch, result_dict['val_loss'][-1]))
                    result_dict["best_val_loss"] = result_dict['val_loss'][-1]
                    torch.save(self.model.state_dict(), f"model{fold}.bin")

        result_dict['train_loss'].append(loss)
        return result_dict

In [99]:
class Evaluator:
    def __init__(self, model):
        self.model = model
    
    def worst_result(self):
        ret = {
            'loss':float('inf'),
            'accuracy':0.0
        }
        return ret

    def result_to_str(self, result):
        ret = [
            'epoch: {epoch:0>3}',
            'loss: {loss: >4.2e}'
        ]
        for metric in self.evaluation_metrics:
            ret.append('{}: {}'.format(metric.name, metric.fmtstr))
        return ', '.join(ret).format(**result)

    def save(self, result):
        with open('result_dict.json', 'w') as f:
            f.write(json.dumps(result, sort_keys=True, indent=4, ensure_ascii=False))
    
    def load(self):
        result = self.worst_result
        if os.path.exists('result_dict.json'):
            with open('result_dict.json', 'r') as f:
                try:
                    result = json.loads(f.read())
                except:
                    pass
        return result

    def evaluate(self, data_loader, epoch, result_dict, tokenizer):
        device = 'cuda:1'        
        losses = AverageMeter()

        self.model.eval()
        total_loss = 0
        with torch.no_grad():
            for batch_idx, batch_data in enumerate(data_loader):
                input_ids, attention_mask, token_type_ids, labels = batch_data['input_ids'], \
                    batch_data['attention_mask'], batch_data['token_type_ids'], batch_data['label']
                input_ids, attention_mask, token_type_ids, labels = \
                input_ids.to(device), attention_mask.to(device), token_type_ids.to(device), labels.to(device)
                
              
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids,
                    labels=labels
                )

                loss, logits = outputs[:2]
                losses.update(loss.item(), input_ids.size(0))

        print('----Validation Results Summary----')
        print('Epoch: [{}] valid_loss: {: >4.5f}'.format(epoch, losses.avg))

        result_dict['val_loss'].append(losses.avg)        
        return result_dict

In [100]:
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.cuda.manual_seed_all(42)
epochs = 8
max_len = 1024
batch_size = 16

#model, tokenizer = make_model(model_name='xlnet-large-cased', num_labels=1)



In [101]:
from  transformers import XLNetForSequenceClassification, RobertaForSequenceClassification
#model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased', num_labels = 1)
#model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels = 1)
tokenizer = AutoTokenizer.from_pretrained('xlnet-large-cased')
device= "cuda:1"
#model = model.to(device)


In [102]:
#config = AutoConfig.from_pretrained('roberta-base')
# config.update({'num_labels':1})
# model = CommonLitModel('roberta-base', config)
model_name = 'xlnet-large-cased'
config = AutoConfig.from_pretrained(model_name)
config.update({'num_labels':1})
model = CommonLitModel(model_name, config=config)
model = model.to(device)


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetModel: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
tokenizer('hello')

In [60]:
reinit_layers = 2
_model_type = 'clmodel'
_pretrained_model = 'roberta-base'
if reinit_layers > 0:
    print(f'Reinitializing Last {reinit_layers} Layers ...')
    encoder_temp = getattr(model, _model_type)
    for layer in encoder_temp.encoder.layer[-reinit_layers:]:
        for module in layer.modules():
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=config.initializer_range)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=config.initializer_range)
                if module.padding_idx is not None:
                    module.weight.data[module.padding_idx].zero_()
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
    print('Done.!')

Reinitializing Last 2 Layers ...
Done.!


In [78]:
tokenizer('hello')

{'input_ids': [24717, 4, 3], 'token_type_ids': [0, 0, 2], 'attention_mask': [1, 1, 1]}

In [66]:
#model.load_state_dict(torch.load('model5.bin'))

In [71]:
def make_loader(
    data, 
    tokenizer, 
    max_len,
    batch_size,
    fold=0
):
    train_set, valid_set = data[data['kfold']!=fold], data[data['kfold']==fold]
    train_set=csvDataset(train_set)
    train_generator = BatchGenerator(tokenizer)
    train_loader = get_dataloader(train_set,train_generator,batch_size)
    
    valid_set=csvDataset(valid_set)
    valid_generator = BatchGenerator(tokenizer)
    valid_loader = get_dataloader(valid_set,valid_generator,batch_size//2)
    

    return train_loader, valid_loader

In [72]:
train_loader, valid_loader = make_loader(
    train, tokenizer, max_len=max_len,
    batch_size=batch_size, fold=0
)

In [73]:
next(iter(train_loader))

AttributeError: Caught AttributeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/centos/anaconda3/envs/jisu_env/lib/python3.6/site-packages/transformers/tokenization_utils_base.py", line 242, in __getattr__
    return self.data[item]
KeyError: 'token_type_ids'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/centos/anaconda3/envs/jisu_env/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 198, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/centos/anaconda3/envs/jisu_env/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "<ipython-input-70-7404f9948c45>", line 37, in __call__
    'token_type_ids': data_batch.token_type_ids,
  File "/home/centos/anaconda3/envs/jisu_env/lib/python3.6/site-packages/transformers/tokenization_utils_base.py", line 244, in __getattr__
    raise AttributeError
AttributeError


In [64]:
EPOCHS = 16
BATCH_SIZE = 8
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
                                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
                                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay':0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)

total_steps = len(train_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

In [65]:
from sklearn.metrics import mean_squared_error

def train_epoch(model, data_loader, optimizer, device, scheduler):

    model = model.train()
    losses = []
    acc = 0
    counter = 0

    for batch_idx, batch_data in enumerate(data_loader):

        input_ids, attention_mask, labels = batch_data['input_ids'], \
        batch_data['attention_mask'], batch_data['label']
        input_ids, attention_mask, labels = \
            input_ids.to(device), attention_mask.to(device), labels.to(device)
            #input_ids.cuda(), attention_mask.cuda(), token_type_ids.cuda(), labels.cuda()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            #token_type_ids=token_type_ids,
            labels=labels
        )

        loss = outputs[0]
        logits = outputs[1]

        # preds = preds.cpu().detach().numpy()
        labels = labels.cpu().detach().numpy()
        prediction = outputs[1].cpu().detach().numpy()
        accuracy=mean_squared_error(labels,prediction)

        acc += accuracy
        losses.append(loss.item())

        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        counter = counter + 1

    return acc / counter, np.mean(losses)


In [52]:
@torch.no_grad()
def evluate_epoch(model, data_loader, device):

    model = model.train()
    losses = []
    acc = 0
    counter = 0

    for batch_idx, batch_data in enumerate(data_loader):

        input_ids, attention_mask, labels = batch_data['input_ids'], \
        batch_data['attention_mask'], batch_data['label']
        input_ids, attention_mask, labels = \
            input_ids.to(device), attention_mask.to(device), labels.to(device)
            #input_ids.cuda(), attention_mask.cuda(), token_type_ids.cuda(), labels.cuda()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            #token_type_ids=token_type_ids,
            labels=labels
        )

        loss = outputs[0]
        logits = outputs[1]

        # preds = preds.cpu().detach().numpy()
        labels = labels.cpu().detach().numpy()
        prediction = outputs[1].cpu().detach().numpy()
        accuracy=mean_squared_error(labels,prediction)

        acc += accuracy
        counter = counter + 1

    return acc / counter


In [53]:
%%time
history = defaultdict(list)
best_accuracy = 999

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_loader,     
        optimizer, 
        device, 
        scheduler
    )
    valid_loss = evluate_epoch(
        model,
        valid_loader,     
        device
    )

    print(f'Train loss {train_acc} Valid loss {valid_loss}')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)    

    if valid_loss < best_accuracy:
        torch.save(model.state_dict(), "model0.bin")
        best_accuracy = valid_loss

Epoch 1/16
----------
Train loss 0.6325833246531621 Valid loss 0.3086862495877373
Epoch 2/16
----------
Train loss 0.27090589851665664 Valid loss 0.3561110838618077
Epoch 3/16
----------
Train loss 0.19169292173964878 Valid loss 0.2647916595385948
Epoch 4/16
----------
Train loss 0.13087898429850459 Valid loss 0.2808031904655443
Epoch 5/16
----------
Train loss 0.09727156159638518 Valid loss 0.3122894480614595
Epoch 6/16
----------
Train loss 0.07133061402070691 Valid loss 0.3017549442363457
Epoch 7/16
----------
Train loss 0.061249757736501556 Valid loss 0.26380499649110817
Epoch 8/16
----------
Train loss 0.050833152725138296 Valid loss 0.28153270839805333
Epoch 9/16
----------
Train loss 0.03911829486646703 Valid loss 0.2689841369193205
Epoch 10/16
----------
Train loss 0.036853269542711724 Valid loss 0.26640812895247634
Epoch 11/16
----------
Train loss 0.029461589727130994 Valid loss 0.2454002123903221
Epoch 12/16
----------
Train loss 0.031125376401075596 Valid loss 0.25306942696

In [34]:
## Loberta
# Large model 0.24114447321132035
# Base model also get 0.249

In [None]:
# test = pd.read_csv('input/test.csv')

# test_set=csvDataset(test)
# generator = BatchGenerator(tokenizer)
# test_loader = get_dataloader(test_set,generator,batch_size)

# input_ids, attention_mask, token_type_ids = test_batch['input_ids'], \
# test_batch['attention_mask'], test_batch['token_type_ids']
# input_ids, attention_mask, token_type_ids = \
#     input_ids.to(device), attention_mask.to(device), token_type_ids.to(device)


In [None]:
test_batch = next(iter(train_loader))

In [29]:
input_ids, attention_mask= test_batch['input_ids'], \
test_batch['attention_mask']
input_ids, attention_mask = \
    input_ids.to(device), attention_mask.to(device)
    #input_ids.cuda(), attention_mask.cuda(), token_type_ids.cuda(), labels.cuda()

In [30]:
model(
    input_ids=input_ids,
    attention_mask=attention_mask
)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.1450],
        [-0.0205],
        [-1.3368],
        [-1.3012],
        [-0.6730],
        [-0.8062],
        [-1.4579],
        [-0.0270],
        [-0.2827],
        [-2.2349],
        [-0.3861],
        [-0.5837],
        [-1.8859],
        [-1.2470],
        [ 0.9654],
        [-3.3354]], device='cuda:1', grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)

In [31]:
test_batch['label']

tensor([-1.1969, -0.0868, -1.2149, -1.2125, -0.8335, -0.7310, -1.3928,  0.0215,
        -0.2370, -2.2534, -0.4163, -0.3716, -1.7088, -1.0768,  1.0909, -3.3092])

In [34]:
model(
    input_ids=input_ids,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids,
)

XLNetForSequenceClassificationOutput(loss=None, logits=tensor([[-1.8539],
        [-1.7061],
        [-1.5374],
        [-1.6646],
        [-2.0674],
        [-0.8819],
        [-0.0977],
        [-2.3524],
        [-0.1410],
        [-0.3518],
        [ 1.2315],
        [-2.3641],
        [ 1.5429],
        [ 1.4368],
        [-0.1662],
        [-1.1045]], device='cuda:1', grad_fn=<AddmmBackward>), mems=None, hidden_states=None, attentions=None)