In [2]:
import transformers
from transformers import AutoModelForQuestionAnswering, AutoModel, AutoConfig, get_linear_schedule_with_warmup
from transformers.optimization import AdamW
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
import pandas as pd
import numpy as np
from pathlib import Path
import os
from itertools import compress
import utils
from sklearn.model_selection import StratifiedKFold

In [3]:
from torch.utils.data import DataLoader
from apex.optimizers.fused_lamb import FusedLAMB as Lamb
from ranger import Ranger
from fairseq import criterions 
from functools import partial
from tokenizers import BertWordPieceTokenizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from fastai.core import *
from fastai.text import *

In [4]:
path = Path.cwd()
file_dir = Path.cwd()/'data'

In [5]:
train_df = pd.read_csv(file_dir/'train.csv')
train_df['text'] = train_df['text'].apply(lambda x: str(x))
train_df['sentiment'] = train_df['sentiment'].apply(lambda x: str(x))
train_df['selected_text'] = train_df['selected_text'].apply(lambda x: str(x))

In [46]:
test_df = pd.read_csv(file_dir/'test.csv')
test_df['text'] = test_df['text'].apply(lambda x: str(x))
test_df['sentiment'] = test_df['sentiment'].apply(lambda x: str(x))

In [7]:
max_len = 128
bs = 32
tokenizer = transformers.RobertaTokenizerFast('input/roberta-base/vocab.json', 'input/roberta-base/merges.txt',
                                  lowercase=True,add_prefix_space=True).tokenizer



In [8]:
def preprocess(sentiment, tweet, selected, tokenizer, max_len):
    _input = tokenizer.encode(sentiment, tweet)
    _span = tokenizer.encode(selected, add_special_tokens=False)
    
    len_span = len(_span.ids)
    start_idx = None
    end_idx = None
    
    for ind in (i for i, e in enumerate(_input.ids) if e == _span.ids[0]):
        if _input.ids[ind: ind + len_span] == _span.ids:
            start_idx = ind
            end_idx = ind + len_span - 1
            break
    
    # Handles cases where tokenizing input & span separately produces different outputs
    if not start_idx:
        idx0 = tweet.find(selected)
        idx1 = idx0 + len(selected)
        
        char_targets = [0] * len(tweet)
        if idx0 != None and idx1 != None:
            for ct in range(idx0, idx1):
                char_targets[ct] = 1
                
        tweet_offsets = _input.offsets[4:-1]
        
        target_idx = []
        for j, (offset1, offset2) in enumerate(tweet_offsets):
            if sum(char_targets[offset1: offset2]) > 0:
                target_idx.append(j)
                
        start_idx, end_idx = target_idx[0] + 4 , target_idx[-1] + 4
        
    _input.start_target = start_idx
    _input.end_target = end_idx
    _input.tweet = tweet
    _input.sentiment = sentiment
    _input.selected = selected
    
    _input.pad(max_len)
    
    return _input

In [9]:
def reduce_loss(loss, reduction='mean'):
    return loss.mean() if reduction=='mean' else loss.sum() if reduction=='sum' else loss    

class LabelSmoothingCrossEntropy(nn.Module):
    def __init__(self, ε:float=0.1, reduction='mean'):
        super().__init__()
        self.ε,self.reduction = ε,reduction
    
    def forward(self, output, target):
        c = output.size()[-1]
        log_preds = F.log_softmax(output, dim=-1)
        loss = reduce_loss(-log_preds.sum(dim=-1), self.reduction)
        nll = F.nll_loss(log_preds, target, reduction=self.reduction)
        return torch.lerp(nll, loss/c, self.ε) 

In [10]:
class TweetDataset(Dataset):
    def __init__(self, dataset, test = None):
        self.df = dataset
        self.test = test
        
    def __getitem__(self, idx):
        if not self.test:
            sentiment, tweet, selected = (self.df[col][idx] for col in ['sentiment', 'text', 'selected_text'])
            _input = preprocess(sentiment, tweet, selected, tokenizer, max_len)
            
            yb = [torch.tensor(_input.start_target), torch.tensor(_input.end_target)]
            
        else:
            _input = tokenizer.encode(self.df.sentiment[idx], self.df.text[idx])
            _input.pad(max_len)
            
            yb = 0

        xb = [torch.LongTensor(_input.ids),
              torch.LongTensor(_input.attention_mask),
              np.array(_input.offsets)]

        return xb, yb     

    def __len__(self):
        return len(self.df)

In [11]:
roberta_path = 'input/roberta-base'
pt_model = AutoModel.from_pretrained(roberta_path)

INFO:transformers.configuration_utils:loading configuration file input/roberta-base/config.json
INFO:transformers.configuration_utils:Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 514,
  "min_length": 0,
  "model_type": "roberta",
  "no_repeat_ngram_size": 0,
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 1

In [39]:
class Mish(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x): 
        
        x = x * (torch.tanh(F.softplus(x)))

        return x

In [12]:
class SpanModel(nn.Module):
    def __init__(self,pt_model):
        super().__init__()
        self.model = pt_model
        self.drop_out = nn.Dropout(0.1)
        self.qa_outputs1c = torch.nn.Conv1d(768*2, 128, 2)
        self.qa_outputs2c = torch.nn.Conv1d(768*2, 128, 2)
        self.qa_outputs1 = nn.Linear(128, 1)
        self.qa_outputs2 = nn.Linear(128, 1)
        
#         self.qa_outputs = nn.Linear(768 * 2, 2) # update hidden size

    # could pass offsets here and not use - can grab in last_input
    def forward(self, input_ids, attention_mask, offsets = None):
        
        _, _, hidden_states = self.model(input_ids, attention_mask=attention_mask)

        out = torch.cat((hidden_states[-1], hidden_states[-2]), dim=-1)
        out = self.drop_out(out)
        out = torch.nn.functional.pad(out.transpose(1,2), (1, 0))
        
        out1 = self.qa_outputs1c(out).transpose(1,2)
        out2 = self.qa_outputs2c(out).transpose(1,2)

        start_logits = self.qa_outputs1(self.drop_out(out1)).squeeze(-1)
        end_logits = self.qa_outputs2(self.drop_out(out2)).squeeze(-1)
        return start_logits, end_logits

In [40]:
class SpanModel2(nn.Module):
    def __init__(self,pt_model):
        super().__init__()
        self.model = pt_model
        self.drop_out = nn.Dropout(0.2)
        self.qa_outputs1c = torch.nn.Conv1d(768*2, 128, 2)
        self.qa_outputs2c = torch.nn.Conv1d(768*2, 128, 2)
#         self.LRelu = torch.nn.LeakyReLU()
        self.mish = Mish()
        self.qa_outputs1 = nn.Linear(128, 1)
        self.qa_outputs2 = nn.Linear(128, 1)
        
#         self.qa_outputs = nn.Linear(768 * 2, 2) # update hidden size

    # could pass offsets here and not use - can grab in last_input
    def forward(self, input_ids, attention_mask, offsets = None):
        
        _, _, hidden_states = self.model(input_ids, attention_mask=attention_mask)

        out = torch.cat((hidden_states[-1], hidden_states[-2]), dim=-1)
        out = self.drop_out(out)
        out = torch.nn.functional.pad(out.transpose(1,2), (1, 0))
        
        out1 = self.qa_outputs1c(out).transpose(1,2)
        out1 = self.mish(out1)
        
        out2 = self.qa_outputs2c(out).transpose(1,2)
        out2 = self.mish(out2)

        start_logits = self.qa_outputs1(self.drop_out(out1)).squeeze(-1)
        end_logits = self.qa_outputs2(self.drop_out(out2)).squeeze(-1)
        return start_logits, end_logits

In [13]:
def dist_between(start_logits, end_logits, device='cpu', max_seq_len=128):
    """get dist btw. pred & ground_truth"""

    linear_func = torch.tensor(np.linspace(0, 1, max_seq_len, endpoint=False), requires_grad=False)
    linear_func = linear_func.to(device)

    start_pos = (start_logits*linear_func).sum(axis=1)
    end_pos = (end_logits*linear_func).sum(axis=1)

    diff = end_pos-start_pos

    return diff.sum(axis=0)/diff.size(0)


def dist_loss(start_logits, end_logits, start_positions, end_positions, device='cpu', max_seq_len=128, scale=1):
    """calculate distance loss between prediction's length & GT's length
    
    Input
    - start_logits ; shape (batch, max_seq_len{128})
        - logits for start index
    - end_logits
        - logits for end index
    - start_positions ; shape (batch, 1)
        - start index for GT
    - end_positions
        - end index for GT
    """
    start_logits = torch.nn.Softmax(1)(start_logits) # shape ; (batch, max_seq_len)
    end_logits = torch.nn.Softmax(1)(end_logits)
    
    start_one_hot = torch.nn.functional.one_hot(start_positions, num_classes=max_seq_len).to(device)
    end_one_hot = torch.nn.functional.one_hot(end_positions, num_classes=max_seq_len).to(device)
    
    pred_dist = dist_between(start_logits, end_logits, device, max_seq_len)
    gt_dist = dist_between(start_one_hot, end_one_hot, device, max_seq_len) # always positive
    diff = (gt_dist-pred_dist)

    rev_diff_squared = 1-torch.sqrt(diff*diff) # as diff is smaller, make it get closer to the one
    loss = -torch.log(rev_diff_squared) # by using negative log function, if argument is near zero -> inifinite, near one -> zero

    return loss*scale

In [14]:
class JaccardExpectationLoss(nn.Module):
    def __init__(self):
        super(JaccardExpectationLoss, self).__init__()
        self.softmax = nn.Softmax(dim=-1)
        self.relu = nn.ReLU()

    def forward(self, start_logits, end_logits, start_positions, end_positions):
        indexes = torch.arange(start_logits.size()[1]).unsqueeze(0).cuda()

        start_pred = torch.sum(self.softmax(start_logits) * indexes, dim=1)
        end_pred = torch.sum(self.softmax(end_logits) * indexes, dim=1)

        len_true = end_positions - start_positions + 1
        intersection = len_true - self.relu(start_pred - start_positions) - self.relu(end_positions - end_pred)
        union = len_true + self.relu(start_positions - start_pred) + self.relu(end_pred - end_positions)

        jel = 1 - intersection / union
        jel = torch.mean(jel)
        return jel

In [15]:
class join_loss(Module):
    def __init__(self, ce_loss_fn = nn.CrossEntropyLoss(), device = 'cpu'): 
        self.loss_fn = ce_loss_fn
        self.device = device
        
    def forward(self, inputs, start_targets, end_targets):
        start_logits, end_logits = inputs # assumes tuple input
        
        logits = torch.cat([start_logits, end_logits]).contiguous()
        
        targets = torch.cat([start_targets, end_targets]).contiguous()
        
        ce_loss = self.loss_fn(logits, targets)
        
        distance_loss = dist_loss(start_logits, end_logits, 
                                  start_targets, end_targets, device=device, max_seq_len =128)
        return ce_loss+distance_loss

In [16]:
class join_loss2(Module):
    def __init__(self, ce_loss_fn = nn.CrossEntropyLoss()): 
        self.loss_fn = ce_loss_fn
        self.jel = JaccardExpectationLoss()
        
    def forward(self, inputs, start_targets, end_targets):
        start_logits, end_logits = inputs # assumes tuple input
        
        logits = torch.cat([start_logits, end_logits]).contiguous()
        
        targets = torch.cat([start_targets, end_targets]).contiguous()
        
        ce_loss = self.loss_fn(logits, targets)
        
        jel_loss = self.jel(start_logits, end_logits, 
                                  start_targets, end_targets)
        
        return ce_loss+jel_loss

In [17]:
class CELoss(Module):
    def __init__(self, loss_fn = nn.CrossEntropyLoss()): 
        self.loss_fn = loss_fn
        
    def forward(self, inputs, start_targets, end_targets):
        start_logits, end_logits = inputs # assumes tuple input
        
        logits = torch.cat([start_logits, end_logits]).contiguous()
        
        targets = torch.cat([start_targets, end_targets]).contiguous()
        
        return self.loss_fn(logits, targets)

In [18]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [19]:
def get_best_start_end_idxs(start_logits, end_logits):
    max_len = len(start_logits)
    start_logits, end_logits = start_logits.cpu(),end_logits.cpu()
    
    a = np.tile(start_logits, (max_len, 1))
    b = np.tile(end_logits, (max_len, 1))
    c = np.tril(a + b.T, k=0).T
    c[c == 0] = -1000
    return np.unravel_index(c.argmax(), c.shape)

In [19]:
# Note that validation ds is by default not shuffled in fastai - so indexing like this will work for Callback
# https://forums.fast.ai/t/how-to-set-shuffle-false-of-train-and-val/33730

class JaccardScore(Callback):
    "Stores predictions and targets to perform calculations on epoch end."
    def __init__(self, valid_ds): 
        self.valid_ds = valid_ds
        self.context_text = valid_ds.df.text
        self.answer_text = valid_ds.df.selected_text
        
    def on_epoch_begin(self, **kwargs):
        self.jaccard_scores = []  
        self.valid_ds_idx = 0
        
        
    def on_batch_end(self, last_input:Tensor, last_output:Tensor, last_target:Tensor, **kwargs):
              
        input_ids = last_input[0]
        offsets = last_input[2]

        start_logits, end_logits = last_output
        
        # for id in batch of ids
        for i in range(len(input_ids)):
            
            _offsets = offsets[i]
#             start_idx, end_idx = torch.argmax(start_logits[i]), torch.argmax(end_logits[i])
            
            start_idx, end_idx = get_best_start_end_idxs(start_logits[i], end_logits[i])
            
            _answer_text = self.answer_text[self.valid_ds_idx]
            original_start, original_end = _offsets[start_idx][0], _offsets[end_idx][1]
            pred_span = self.context_text[self.valid_ds_idx][original_start : original_end]
                
            score = jaccard(pred_span, _answer_text)
            self.jaccard_scores.append(score)

            self.valid_ds_idx += 1
            
    def on_epoch_end(self, last_metrics, **kwargs):        
        res = np.mean(self.jaccard_scores)
        return add_metrics(last_metrics, res)

In [20]:
from fastai.callbacks import *

def new_on_train_begin(self, **kwargs:Any)->None:
    "Initializes the best value."
    if not hasattr(self, 'best'):
        self.best = float('inf') if self.operator == np.less else -float('inf')

SaveModelCallback.on_train_begin = new_on_train_begin

In [22]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [23]:
fold_indices = list(skf.split(train_df, train_df.sentiment))

In [23]:
device = torch.device('cuda')

def run_fold(fold):
    wd = 0.01 # high value but worked quite well
    lr = 1e-4
    
    tr_df  = train_df.iloc[fold_indices[fold][0]].reset_index(drop=True)
    val_df = train_df.iloc[fold_indices[fold][1]].reset_index(drop=True)
    
    train_ds, valid_ds = [TweetDataset(i) for i in [tr_df,val_df]]
    data = DataBunch.create(train_ds, valid_ds,bs = bs)
#     loss_fn = partial(CELoss, LabelSmoothingCrossEntropy())
#     loss_fn = join_loss(LabelSmoothingCrossEntropy(), device) # no need to initialise later - no partial 
    loss_fn = join_loss2(LabelSmoothingCrossEntropy())

    pt_model = AutoModel.from_pretrained(roberta_path)
    model = SpanModel(pt_model)
    learner = Learner(data, model, loss_func = loss_fn, opt_func = Ranger, metrics = [JaccardScore(valid_ds)])
    
    early_stop_cb = EarlyStoppingCallback(learner, monitor='jaccard_score',mode='max',patience=2)
    save_model_cb = SaveModelCallback(learner,every='improvement',monitor='jaccard_score',name=f'roberta_conv_news_fold_{fold}')
    csv_logger_cb = CSVLogger(learner, f"roberta_conv_logs_{fold}", True)

    learner.fit_fc(11, 3e-5 ,start_pct=0.7, wd=wd, callbacks=[early_stop_cb, save_model_cb, csv_logger_cb])
    
#     learner.fit_fc(3e-5, lr ,start_pct=0.7, wd=wd, callbacks=[early_stop_cb, save_model_cb, csv_logger_cb])

    learner.save(f'roberta_conv_news_after_epoch_{fold}', with_opt=False)
    del learner; del pt_model; gc.collect()

In [24]:
for fold in range(5):run_fold(fold)

INFO:transformers.configuration_utils:loading configuration file input/roberta-base/config.json
INFO:transformers.configuration_utils:Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 514,
  "min_length": 0,
  "model_type": "roberta",
  "no_repeat_ngram_size": 0,
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 1

Ranger optimizer loaded. 
Gradient Centralization usage = True
GC applied to both conv and fc layers


epoch,train_loss,valid_loss,jaccard_score,time
0,2.202166,2.171898,0.688536,09:39
1,2.070406,2.09977,0.698596,09:25
2,1.975594,2.051471,0.700281,09:22
3,1.91252,2.031664,0.707751,09:22
4,1.8935,2.041682,0.706538,09:22
5,1.827562,2.064327,0.705566,09:22


Better model found at epoch 0 with jaccard_score value: 0.6885358314943907.
Better model found at epoch 1 with jaccard_score value: 0.6985955435016602.
Better model found at epoch 2 with jaccard_score value: 0.7002808767335673.
Better model found at epoch 3 with jaccard_score value: 0.7077513993832506.
Epoch 6: early stopping
set state called


INFO:transformers.configuration_utils:loading configuration file input/roberta-base/config.json
INFO:transformers.configuration_utils:Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 514,
  "min_length": 0,
  "model_type": "roberta",
  "no_repeat_ngram_size": 0,
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 1

Ranger optimizer loaded. 
Gradient Centralization usage = True
GC applied to both conv and fc layers


epoch,train_loss,valid_loss,jaccard_score,time
0,2.265704,2.160805,0.687839,09:22
1,2.105697,2.066097,0.707562,09:22
2,1.993706,2.031105,0.712601,09:22
3,1.976372,2.01562,0.707488,09:23
4,1.890057,2.005164,0.704842,09:23


Better model found at epoch 0 with jaccard_score value: 0.6878394403681182.
Better model found at epoch 1 with jaccard_score value: 0.7075615959772799.
Better model found at epoch 2 with jaccard_score value: 0.7126013431541354.
Epoch 5: early stopping
set state called


INFO:transformers.configuration_utils:loading configuration file input/roberta-base/config.json
INFO:transformers.configuration_utils:Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 514,
  "min_length": 0,
  "model_type": "roberta",
  "no_repeat_ngram_size": 0,
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 1

Ranger optimizer loaded. 
Gradient Centralization usage = True
GC applied to both conv and fc layers


epoch,train_loss,valid_loss,jaccard_score,time
0,2.285354,2.19045,0.677029,09:22
1,2.086771,2.075686,0.707338,09:22
2,2.021292,2.032639,0.70229,09:23
3,1.979964,2.010003,0.711948,09:23
4,1.902113,2.014407,0.710595,09:22
5,1.828384,2.042181,0.710657,09:22


Better model found at epoch 0 with jaccard_score value: 0.6770294334617466.
Better model found at epoch 1 with jaccard_score value: 0.7073380955228691.
Better model found at epoch 3 with jaccard_score value: 0.7119476687892761.
Epoch 6: early stopping
set state called


INFO:transformers.configuration_utils:loading configuration file input/roberta-base/config.json
INFO:transformers.configuration_utils:Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 514,
  "min_length": 0,
  "model_type": "roberta",
  "no_repeat_ngram_size": 0,
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 1

Ranger optimizer loaded. 
Gradient Centralization usage = True
GC applied to both conv and fc layers


epoch,train_loss,valid_loss,jaccard_score,time
0,2.24622,2.184458,0.686684,09:22
1,2.11406,2.036966,0.705485,09:22
2,2.006424,2.006168,0.708894,09:22
3,1.971405,1.996889,0.704794,09:22
4,1.877421,2.018794,0.708881,09:23
5,1.818184,2.026209,0.709678,09:23
6,1.761049,2.051793,0.711111,09:23
7,1.683529,2.106042,0.707885,09:23
8,1.583682,2.146745,0.708583,09:23


Better model found at epoch 0 with jaccard_score value: 0.686683764047098.
Better model found at epoch 1 with jaccard_score value: 0.7054851424877298.
Better model found at epoch 2 with jaccard_score value: 0.7088940068433014.
Better model found at epoch 5 with jaccard_score value: 0.709678282284258.
Better model found at epoch 6 with jaccard_score value: 0.7111111413678048.
Epoch 9: early stopping
set state called


INFO:transformers.configuration_utils:loading configuration file input/roberta-base/config.json
INFO:transformers.configuration_utils:Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 514,
  "min_length": 0,
  "model_type": "roberta",
  "no_repeat_ngram_size": 0,
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 1

Ranger optimizer loaded. 
Gradient Centralization usage = True
GC applied to both conv and fc layers


epoch,train_loss,valid_loss,jaccard_score,time
0,2.199582,2.172006,0.693114,09:22
1,2.070554,2.075593,0.697418,09:22
2,1.966836,2.036586,0.704,09:22
3,1.946774,2.00242,0.708841,09:23
4,1.883796,2.037194,0.706539,09:22
5,1.793461,2.049458,0.707556,09:22


Better model found at epoch 0 with jaccard_score value: 0.693114449345234.
Better model found at epoch 1 with jaccard_score value: 0.6974181455814191.
Better model found at epoch 2 with jaccard_score value: 0.7040000105703113.
Better model found at epoch 3 with jaccard_score value: 0.708840808277601.
Epoch 6: early stopping
set state called


### Predict

In [24]:
pt_model = AutoModel.from_pretrained(roberta_path)
model = SpanModel(pt_model)

tr_df  = train_df.iloc[fold_indices[0][0]].reset_index(drop=True)
val_df = train_df.iloc[fold_indices[0][1]].reset_index(drop=True)

train_ds, valid_ds = [TweetDataset(i) for i in [tr_df, val_df]]
test_ds = TweetDataset(test_df, test = True)
loss_fn = partial(CELoss, LabelSmoothingCrossEntropy())

data = DataBunch.create(train_ds, valid_ds, test_ds, path=".", bs = 128)
# learner = Learner(data, model, loss_func = loss_fn(), path = path/'models/electra/models', model_dir=f".")
learner = Learner(data, model, loss_func = loss_fn(), path = path/'models/roberta_ranger_new_loss_fc', model_dir=f".")

INFO:transformers.configuration_utils:loading configuration file input/roberta-base/config.json
INFO:transformers.configuration_utils:Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 514,
  "min_length": 0,
  "model_type": "roberta",
  "no_repeat_ngram_size": 0,
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 1

In [28]:
learner2 = Learner(data, model, loss_func = loss_fn(), path = path/'models/jel_loss', model_dir=f".")

In [42]:
model2 = SpanModel2(pt_model)
learner3 =  Learner(data, model, loss_func = loss_fn(), path = path/'models/roberta_mish', model_dir=f".")

In [40]:
a = iter(learner.data.train_dl)

In [30]:
model0()

SpanModel(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, eleme

In [43]:
preds = []
test_df_idx = 0

with torch.no_grad():
    
    for xb,yb in tqdm(learner.data.test_dl):
        model0 = learner.load(f'roberta_conv_update_N_0').model.eval()
        start_logits0, end_logits0 = to_cpu(model0(*xb))
        start_logits0, end_logits0 = start_logits0.float(), end_logits0.float()
        
        model1 = learner.load(f'roberta_conv_update_N_1').model.eval()
        start_logits1, end_logits1 = to_cpu(model1(*xb))
        start_logits1, end_logits1 = start_logits1.float(), end_logits1.float()
        
        model2 = learner.load(f'roberta_conv_update_N_2').model.eval()
        start_logits2, end_logits2 = to_cpu(model2(*xb))
        start_logits2, end_logits2 = start_logits2.float(), end_logits2.float()
        
        model3 = learner.load(f'roberta_conv_update_N_3').model.eval()
        start_logits3, end_logits3 = to_cpu(model3(*xb))
        start_logits3, end_logits3 = start_logits3.float(), end_logits3.float()
        
        model4 = learner.load(f'roberta_conv_update_N_4').model.eval()
        start_logits4, end_logits4 = to_cpu(model4(*xb))
        start_logits4, end_logits4 = start_logits4.float(), end_logits4.float()
        
        #######################################################################
        
        model5 = learner2.load(f'roberta_conv_jel_loss_0').model.eval()
        start_logits5, end_logits5 = to_cpu(model5(*xb))
        start_logits5, end_logits5 = start_logits5.float(), end_logits5.float()
        
        model6 = learner2.load(f'roberta_conv_jel_loss_1').model.eval()
        start_logits6, end_logits6 = to_cpu(model6(*xb))
        start_logits6, end_logits6 = start_logits6.float(), end_logits6.float()
        
        model7 = learner2.load(f'roberta_conv_jel_loss_2').model.eval()
        start_logits7, end_logits7 = to_cpu(model7(*xb))
        start_logits7, end_logits7 = start_logits7.float(), end_logits7.float()
        
        model8 = learner2.load(f'roberta_conv_jel_loss_3').model.eval()
        start_logits8, end_logits8 = to_cpu(model8(*xb))
        start_logits8, end_logits8 = start_logits8.float(), end_logits8.float()
        
        model9 = learner2.load(f'roberta_conv_jel_loss_4').model.eval()
        start_logits9, end_logits9 = to_cpu(model9(*xb))
        start_logits9, end_logits9 = start_logits9.float(), end_logits9.float()
        
        #######################################################################
        
        model10 = learner3.load(f'roberta_conv_mish_0').model.eval()
        start_logits10, end_logits10 = to_cpu(model10(*xb))
        start_logits10, end_logits10 = start_logits10.float(), end_logits10.float()
        
        model11 = learner3.load(f'roberta_conv_mish_1').model.eval()
        start_logits11, end_logits11 = to_cpu(model11(*xb))
        start_logits11, end_logits11 = start_logits11.float(), end_logits11.float()
        
        model12 = learner3.load(f'roberta_conv_mish_2').model.eval()
        start_logits12, end_logits12 = to_cpu(model12(*xb))
        start_logits12, end_logits12 = start_logits12.float(), end_logits12.float()
        
        model13 = learner3.load(f'roberta_conv_mish_3').model.eval()
        start_logits13, end_logits13 = to_cpu(model13(*xb))
        start_logits13, end_logits13 = start_logits13.float(), end_logits13.float()
        
        model14 = learner3.load(f'roberta_conv_mish_4').model.eval()
        start_logits14, end_logits14 = to_cpu(model14(*xb))
        start_logits14, end_logits14 = start_logits14.float(), end_logits14.float()
        
        input_ids = to_cpu(xb[0])
        offsets = to_cpu(xb[2])
        
        start_logits = (start_logits0 + start_logits1 + start_logits2 + start_logits3 + start_logits4 + start_logits5 + start_logits6 + start_logits7 + start_logits8 + start_logits9 + start_logits10 + start_logits11 + start_logits12 + start_logits13 + start_logits14) / 15
        end_logits = (end_logits0 + end_logits1 + end_logits2 + end_logits3 + end_logits4 + end_logits5 + end_logits6 + end_logits7 + end_logits8 + end_logits9 + end_logits10 + end_logits11 + end_logits12 + end_logits13 + end_logits14) / 15
        
        for i in range(len(input_ids)):
            
            _offsets = offsets[i]
#             start_idx, end_idx = torch.argmax(start_logits[i]), torch.argmax(end_logits[i])

            start_idx, end_idx = get_best_start_end_idxs(start_logits[i], end_logits[i])
            original_start, original_end = _offsets[start_idx][0], _offsets[end_idx][1]
            pred_span = test_ds.df.text[test_df_idx][original_start : original_end]
            preds.append(pred_span)
            test_df_idx += 1

100%|██████████| 28/28 [12:20<00:00, 26.45s/it]


In [62]:
a = pd.DataFrame(preds)

In [26]:
for fold in range(5):
    learner = Learner(data, model, loss_func = loss_fn(), path = path/'models', model_dir=f".")
    model0 = learner.load(f'roberta_conv_news_fold_{fold}').model.eval()
    learner.save(f'roberta_conv_jel_loss_{fold}',with_opt=False)

In [43]:
# learner = Learner(data, model, loss_func = loss_fn(), path = path/'models/electra', model_dir=f".")
# learner.load(f'electra-qa-fold_4').save(f'electra_fold_es_4', with_opt=False)

In [47]:
out_df = test_df

In [49]:
out_df['selected_text'] = preds
#trying to predict neural
out_df['selected_text'] = out_df.apply(lambda o: o['text'] if len(o['text']) < 3 else o['selected_text'], 1)
# out_df['selected_text'] = out_df.apply(lambda o: o['text'] if o['sentiment'] == 'neutral' else o['selected_text'], 1)

In [109]:
for idx, row in test_df.iterrows():
    if row.sentiment_x == 'neutral' and row.sentiment_y =='neutral':
        if jaccard(row.text, row.selected_text) >=0.7:
            test_df.selected_text[idx] = row.text
            print(idx)

8
12
23
31
43
50
51
54
61
62
67
73
85
88
89
100
104
110
112
122
131
133
155
173
176
185
196
199
201
208
237
270
272
275
300
301
302
313
314
316
318
341
348
350
354
362
367
373
382
387
399
400
420
422
423
429
436
437
442
447
449
458
460
472
477
484
490
491
501
516
518
523
525
527
539
542
550
569
571
574
586
589
605
608
611
614
617
620
639
648
654
664
667
695
696
707
715
719
734
736
739
740
754
756
757
758
773
785
788
789
792
801
812
817
825
828
830
834
835
841
843
857
882
888
894
899
901
912
914
916
920
924
925
932
935
940
941
947
959
968
975
981
982
991
999
1011
1018
1020
1043
1044
1076
1077
1098
1099
1100
1107
1108
1125
1126
1130
1138
1142
1159
1166
1167
1170
1181
1189
1197
1224
1225
1227
1231
1238
1239
1252
1269
1271
1284
1288
1289
1309
1310
1312
1315
1327
1333
1335
1342
1349
1357
1366
1369
1372
1380
1383
1422
1428
1431
1442
1448
1457
1461
1467
1470
1478
1481
1497
1508
1509
1514
1518
1521
1523
1526
1527
1530
1535
1550
1556
1559
1560
1566
1567
1571
1586
1595
1598
1600
1606
1614
1617
1

In [33]:
out_df = out_df[['textID', 'selected_text']]
out_df.to_csv("submission_testing.csv", index=False)

In [61]:
# out_df.to_csv('test.csv')

In [57]:
pd.set_option('display.max_rows', 4000)

In [38]:
subdf

Unnamed: 0,textID,selected_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh
1,96d74cb729,exciting
2,eee518ae67,such a shame!
3,01082688c6,happy bday!
4,33987a8ee5,I like it!!
5,726e501993,that`s great!!
6,261932614e,HATES
7,afa11da83f,blocked
8,e64208b4ef,and within a short time of the last clue all ...
9,37bcad24ca,What did you get? My day is alright.. haven`...
