In [1]:
from transformers import AutoModelForQuestionAnswering, AutoModel, AutoConfig, get_linear_schedule_with_warmup
from transformers.optimization import AdamW
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
import pandas as pd
import numpy as np
from pathlib import Path
import os
from itertools import compress
import utils

In [2]:
from torch.utils.data import DataLoader
from apex.optimizers.fused_lamb import FusedLAMB as Lamb
from ranger import Ranger
from fairseq import criterions 
from functools import partial
from tokenizers import BertWordPieceTokenizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from fastai.core import *
from fastai.text import *

In [3]:
path = Path.cwd()
file_dir = Path.cwd()/'data'

In [4]:
train_df = pd.read_csv(file_dir/'train.csv')
train_df['text'] = train_df['text'].apply(lambda x: str(x))
train_df['sentiment'] = train_df['sentiment'].apply(lambda x: str(x))
train_df['selected_text'] = train_df['selected_text'].apply(lambda x: str(x))

In [5]:
fold_df = pd.read_csv(file_dir/'train_folds.csv')
fold_df['text'] = fold_df['text'].apply(lambda x: str(x))
fold_df['sentiment'] = fold_df['sentiment'].apply(lambda x: str(x))
fold_df['selected_text'] = fold_df['selected_text'].apply(lambda x: str(x))

In [6]:
test_df = pd.read_csv(file_dir/'test.csv')
test_df['text'] = test_df['text'].apply(lambda x: str(x))
test_df['sentiment'] = test_df['sentiment'].apply(lambda x: str(x))

In [34]:
max_len = 128
bs = 32
tokenizer = BertWordPieceTokenizer('input/electra-base-disc/vocab.txt', lowercase=True)

In [8]:
def preprocess(sentiment, tweet, selected, tokenizer, max_len):
    _input = tokenizer.encode(sentiment, tweet)
    _span = tokenizer.encode(selected, add_special_tokens=False)
    
    len_span = len(_span.ids)
    start_idx = None
    end_idx = None
    
    for ind in (i for i, e in enumerate(_input.ids) if e == _span.ids[0]):
        if _input.ids[ind: ind + len_span] == _span.ids:
            start_idx = ind
            end_idx = ind + len_span - 1
            break
    
    # Handles cases where Wordpiece tokenizing input & span separately produces different outputs
    if not start_idx:
        idx0 = tweet.find(selected)
        idx1 = idx0 + len(selected)
        
        char_targets = [0] * len(tweet)
        if idx0 != None and idx1 != None:
            for ct in range(idx0, idx1):
                char_targets[ct] = 1
                
        tweet_offsets = list(compress(_input.offsets, _input.type_ids))[0:-1]
        
        target_idx = []
        for j, (offset1, offset2) in enumerate(tweet_offsets):
            if sum(char_targets[offset1: offset2]) > 0:
                target_idx.append(j)
                
        start_idx, end_idx = target_idx[0] +3 , target_idx[-1] + 3
        
    _input.start_target = start_idx
    _input.end_target = end_idx
    _input.tweet = tweet
    _input.sentiment = sentiment
    _input.selected = selected
    
    _input.pad(max_len)
    
    return _input

In [9]:
def reduce_loss(loss, reduction='mean'):
    return loss.mean() if reduction=='mean' else loss.sum() if reduction=='sum' else loss    

class LabelSmoothingCrossEntropy(nn.Module):
    def __init__(self, ε:float=0.1, reduction='mean'):
        super().__init__()
        self.ε,self.reduction = ε,reduction
    
    def forward(self, output, target):
        c = output.size()[-1]
        log_preds = F.log_softmax(output, dim=-1)
        loss = reduce_loss(-log_preds.sum(dim=-1), self.reduction)
        nll = F.nll_loss(log_preds, target, reduction=self.reduction)
        return torch.lerp(nll, loss/c, self.ε) 

In [10]:
class TweetDataset(Dataset):
    def __init__(self, dataset, test = None):
        self.df = dataset
        self.test = test
        
    def __getitem__(self, idx):
        if not self.test:
            sentiment, tweet, selected = (self.df[col][idx] for col in ['sentiment', 'text', 'selected_text'])
            _input = preprocess(sentiment, tweet, selected, tokenizer, max_len)
            
            yb = [torch.tensor(_input.start_target), torch.tensor(_input.end_target)]
            
        else:
            _input = tokenizer.encode(self.df.sentiment[idx], self.df.text[idx])
            _input.pad(max_len)
            
            yb = 0

        xb = [torch.LongTensor(_input.ids),
              torch.LongTensor(_input.attention_mask),
              torch.LongTensor(_input.type_ids),
              np.array(_input.offsets)]

        return xb, yb     

    def __len__(self):
        return len(self.df)

In [11]:
config = AutoConfig.from_pretrained('google/electra-base-discriminator')
config.output_hidden_states = True
# pt_model = AutoModel.from_pretrained('google/electra-base-discriminator', config = config)
# pt_model.save_pretrained(electra_path)

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/config.json from cache at /home/jack/.cache/torch/transformers/9236d197566a7f1be2b2151f5afcc5a8e17f31e1e23c52f3cdf2340019986e78.88ba6e8e7d5a7936e86d6f2551fe19c236dc57c24da163907cd0544e9933f6ee
INFO:transformers.configuration_utils:Model config ElectraConfig {
  "_num_labels": 2,
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "embedding_size": 768,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    

In [13]:
electra_path = path/'input/electra-base-disc'
pt_model = AutoModel.from_pretrained(electra_path)

INFO:transformers.configuration_utils:loading configuration file /home/jack/Documents/DL/kaggle/tweet_qa/input/electra-base-disc/config.json
INFO:transformers.configuration_utils:Model config ElectraConfig {
  "_num_labels": 2,
  "architectures": [
    "ElectraModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "embedding_size": 768,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "electra",
  "no_repeat_ngram_size"

In [14]:
# class SpanModel(nn.Module):
#     def __init__(self, pt_model):
#         super().__init__()
#         self.model = pt_model
#         self.drop_out = nn.Dropout(0.1)
#         self.qa_outputs = nn.Linear(768 * 2, 2) # update hidden size

#     # could pass offsets here and not use - can grab in last_input
#     def forward(self, input_ids, attention_mask, token_type_ids, offsets = None):
        
#         _, hidden_states = self.model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

#         out = torch.cat((hidden_states[-1], hidden_states[-2]), dim=-1)
#         out = self.drop_out(out)
#         logits = self.qa_outputs(out)
        
#         start_logits, end_logits = logits.split(1, dim=-1)
        
#         start_logits = start_logits.squeeze(-1)
#         end_logits = end_logits.squeeze(-1)

#         return start_logits, end_logits

In [37]:
class SpanModel(nn.Module):
    def __init__(self,pt_model):
        super().__init__()
        self.model = pt_model
        self.drop_out = nn.Dropout(0.5)
        self.qa_outputs1c = torch.nn.Conv1d(768*2, 128, 2)
        self.qa_outputs2c = torch.nn.Conv1d(768*2, 128, 2)
        self.qa_outputs1 = nn.Linear(128, 1)
        self.qa_outputs2 = nn.Linear(128, 1)
        
#         self.qa_outputs = nn.Linear(768 * 2, 2) # update hidden size

    # could pass offsets here and not use - can grab in last_input
    def forward(self, input_ids, attention_mask, token_type_ids, offsets = None):
        
        _, hidden_states = self.model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        out = torch.cat((hidden_states[-1], hidden_states[-2]), dim=-1)
        out = self.drop_out(out)
        out = torch.nn.functional.pad(out.transpose(1,2), (1, 0))
        
        out1 = self.qa_outputs1c(out).transpose(1,2)
        out2 = self.qa_outputs2c(out).transpose(1,2)

        start_logits = self.qa_outputs1(self.drop_out(out1)).squeeze(-1)
        end_logits = self.qa_outputs2(self.drop_out(out2)).squeeze(-1)
        return start_logits, end_logits

In [17]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [18]:
def dist_between(start_logits, end_logits, device='cpu', max_seq_len=128):
    """get dist btw. pred & ground_truth"""

    linear_func = torch.tensor(np.linspace(0, 1, max_seq_len, endpoint=False), requires_grad=False)
    linear_func = linear_func.to(device)

    start_pos = (start_logits*linear_func).sum(axis=1)
    end_pos = (end_logits*linear_func).sum(axis=1)

    diff = end_pos-start_pos

    return diff.sum(axis=0)/diff.size(0)


def dist_loss(start_logits, end_logits, start_positions, end_positions, device='cpu', max_seq_len=128, scale=1):
    """calculate distance loss between prediction's length & GT's length
    
    Input
    - start_logits ; shape (batch, max_seq_len{128})
        - logits for start index
    - end_logits
        - logits for end index
    - start_positions ; shape (batch, 1)
        - start index for GT
    - end_positions
        - end index for GT
    """
    start_logits = torch.nn.Softmax(1)(start_logits) # shape ; (batch, max_seq_len)
    end_logits = torch.nn.Softmax(1)(end_logits)
    
    start_one_hot = torch.nn.functional.one_hot(start_positions, num_classes=max_seq_len).to(device)
    end_one_hot = torch.nn.functional.one_hot(end_positions, num_classes=max_seq_len).to(device)
    
    pred_dist = dist_between(start_logits, end_logits, device, max_seq_len)
    gt_dist = dist_between(start_one_hot, end_one_hot, device, max_seq_len) # always positive
    diff = (gt_dist-pred_dist)

    rev_diff_squared = 1-torch.sqrt(diff*diff) # as diff is smaller, make it get closer to the one
    loss = -torch.log(rev_diff_squared) # by using negative log function, if argument is near zero -> inifinite, near one -> zero

    return loss*scale

In [19]:
class join_loss(Module):
    def __init__(self, ce_loss_fn = nn.CrossEntropyLoss(), device = 'cpu'): 
        self.loss_fn = ce_loss_fn
        self.device = device
        
    def forward(self, inputs, start_targets, end_targets):
        start_logits, end_logits = inputs # assumes tuple input
        
        logits = torch.cat([start_logits, end_logits]).contiguous()
        
        targets = torch.cat([start_targets, end_targets]).contiguous()
        
        ce_loss = self.loss_fn(logits, targets)
        
        distance_loss = dist_loss(start_logits, end_logits, 
                                  start_targets, end_targets, device=device, max_seq_len =128)
        return ce_loss+distance_loss

In [20]:
class CELoss(Module):
    def __init__(self, loss_fn = nn.CrossEntropyLoss()): 
        self.loss_fn = loss_fn
        
    def forward(self, inputs, start_targets, end_targets):
        start_logits, end_logits = inputs # assumes tuple input
        
        logits = torch.cat([start_logits, end_logits]).contiguous()
        
        targets = torch.cat([start_targets, end_targets]).contiguous()
        
        return self.loss_fn(logits, targets)

In [25]:
def get_best_start_end_idxs(start_logits, end_logits):
    max_len = len(start_logits)
    start_logits, end_logits = start_logits.cpu(),end_logits.cpu()
    
    a = np.tile(start_logits, (max_len, 1))
    b = np.tile(end_logits, (max_len, 1))
    c = np.tril(a + b.T, k=0).T
    c[c == 0] = -1000
    return np.unravel_index(c.argmax(), c.shape)

In [26]:
# Note that validation ds is by default not shuffled in fastai - so indexing like this will work for Callback
# https://forums.fast.ai/t/how-to-set-shuffle-false-of-train-and-val/33730

class JaccardScore(Callback):
    "Stores predictions and targets to perform calculations on epoch end."
    def __init__(self, valid_ds): 
        self.valid_ds = valid_ds
        self.context_text = valid_ds.df.text
        self.answer_text = valid_ds.df.selected_text
        
    def on_epoch_begin(self, **kwargs):
        self.jaccard_scores = []  
        self.valid_ds_idx = 0
        
        
    def on_batch_end(self, last_input:Tensor, last_output:Tensor, last_target:Tensor, **kwargs):
              
        input_ids = last_input[0]
        attention_masks = last_input[1].bool()
        token_type_ids = last_input[2].bool()
        offsets = last_input[3]

        start_logits, end_logits = last_output
        
        # for id in batch of ids
        for i in range(len(input_ids)):
            
            _offsets = offsets[i]
#             start_idx, end_idx = torch.argmax(start_logits[i]), torch.argmax(end_logits[i])

            start_idx, end_idx = get_best_start_end_idxs(start_logits[i], end_logits[i])
            _answer_text = self.answer_text[self.valid_ds_idx]
            original_start, original_end = _offsets[start_idx][0], _offsets[end_idx][1]
            pred_span = self.context_text[self.valid_ds_idx][original_start : original_end]
                
            score = jaccard(pred_span, _answer_text)
            self.jaccard_scores.append(score)

            self.valid_ds_idx += 1
            
    def on_epoch_end(self, last_metrics, **kwargs):        
        res = np.mean(self.jaccard_scores)
        return add_metrics(last_metrics, res)

In [27]:
from fastai.callbacks import *

def new_on_train_begin(self, **kwargs:Any)->None:
    "Initializes the best value."
    if not hasattr(self, 'best'):
        self.best = float('inf') if self.operator == np.less else -float('inf')

SaveModelCallback.on_train_begin = new_on_train_begin

In [29]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [30]:
fold_indices = list(skf.split(train_df, train_df.sentiment))

In [35]:
device = torch.device('cuda')

def run_fold(fold):
    wd = 0.01 # high value but worked quite well
    lr = 1e-4
    
    tr_df  = train_df.iloc[fold_indices[fold][0]].reset_index(drop=True)
    val_df = train_df.iloc[fold_indices[fold][1]].reset_index(drop=True)
    
    train_ds, valid_ds = [TweetDataset(i) for i in [tr_df,val_df]]
    data = DataBunch.create(train_ds, valid_ds,bs = bs)
#     loss_fn = partial(CELoss, LabelSmoothingCrossEntropy())
    loss_fn = join_loss(LabelSmoothingCrossEntropy(), device) # no need to initialise later - no partial 

    pt_model = AutoModel.from_pretrained(electra_path)
    model = SpanModel(pt_model)
    learner = Learner(data, model, loss_func = loss_fn, opt_func = Ranger, metrics = [JaccardScore(valid_ds)])
    
    early_stop_cb = EarlyStoppingCallback(learner, monitor='jaccard_score',mode='max',patience=2)
    save_model_cb = SaveModelCallback(learner,every='improvement',monitor='jaccard_score',name=f'electra_conv_fold_{fold}')
    csv_logger_cb = CSVLogger(learner, f"electra_conv_logs_{fold}", True)

#     learner.fit_fc(4, 1e-4,start_pct=0.6, wd=wd, callbacks=[early_stop_cb, save_model_cb, csv_logger_cb]) 
    learner.fit_fc(8, 3e-5,start_pct=0.7, wd=wd, callbacks=[early_stop_cb, save_model_cb, csv_logger_cb]) 

    learner.save(f'electra_conv_after_epoch_{fold}', with_opt=False)
    del learner; del pt_model; gc.collect()

In [38]:
for fold in range(5):run_fold(fold)

INFO:transformers.configuration_utils:loading configuration file /home/jack/Documents/DL/kaggle/tweet_qa/input/electra-base-disc/config.json
INFO:transformers.configuration_utils:Model config ElectraConfig {
  "_num_labels": 2,
  "architectures": [
    "ElectraModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "embedding_size": 768,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "electra",
  "no_repeat_ngram_size"

Ranger optimizer loaded. 
Gradient Centralization usage = True
GC applied to both conv and fc layers


epoch,train_loss,valid_loss,jaccard_score,time
0,1.832117,1.705358,0.667233,09:14
1,1.617944,1.603257,0.691322,09:19
2,1.568793,1.584891,0.699648,09:16
3,1.527305,1.580717,0.697125,09:15
4,1.481543,1.599735,0.698195,09:15


Better model found at epoch 0 with jaccard_score value: 0.6672326750175284.
Better model found at epoch 1 with jaccard_score value: 0.6913219122151314.
Better model found at epoch 2 with jaccard_score value: 0.6996481322798078.
Epoch 5: early stopping
set state called


INFO:transformers.configuration_utils:loading configuration file /home/jack/Documents/DL/kaggle/tweet_qa/input/electra-base-disc/config.json
INFO:transformers.configuration_utils:Model config ElectraConfig {
  "_num_labels": 2,
  "architectures": [
    "ElectraModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "embedding_size": 768,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "electra",
  "no_repeat_ngram_size"

Ranger optimizer loaded. 
Gradient Centralization usage = True
GC applied to both conv and fc layers


epoch,train_loss,valid_loss,jaccard_score,time
0,1.850087,1.694955,0.663015,09:15
1,1.673517,1.571152,0.697689,09:15
2,1.583052,1.54562,0.704074,09:15
3,1.534307,1.533585,0.708425,09:15
4,1.480048,1.556394,0.7048,09:15
5,1.419722,1.597224,0.705492,09:15


Better model found at epoch 0 with jaccard_score value: 0.6630148612618942.
Better model found at epoch 1 with jaccard_score value: 0.6976891396552062.
Better model found at epoch 2 with jaccard_score value: 0.7040738114311231.
Better model found at epoch 3 with jaccard_score value: 0.7084253853538849.
Epoch 6: early stopping
set state called


INFO:transformers.configuration_utils:loading configuration file /home/jack/Documents/DL/kaggle/tweet_qa/input/electra-base-disc/config.json
INFO:transformers.configuration_utils:Model config ElectraConfig {
  "_num_labels": 2,
  "architectures": [
    "ElectraModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "embedding_size": 768,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "electra",
  "no_repeat_ngram_size"

Ranger optimizer loaded. 
Gradient Centralization usage = True
GC applied to both conv and fc layers


epoch,train_loss,valid_loss,jaccard_score,time
0,1.823091,1.704397,0.671826,09:15
1,1.641701,1.583322,0.697209,09:15
2,1.560581,1.567448,0.710517,09:15
3,1.512531,1.564653,0.703947,09:15
4,1.473287,1.572921,0.708385,09:15


Better model found at epoch 0 with jaccard_score value: 0.6718264005391787.
Better model found at epoch 1 with jaccard_score value: 0.6972091060838617.
Better model found at epoch 2 with jaccard_score value: 0.7105167074074102.
Epoch 5: early stopping
set state called


INFO:transformers.configuration_utils:loading configuration file /home/jack/Documents/DL/kaggle/tweet_qa/input/electra-base-disc/config.json
INFO:transformers.configuration_utils:Model config ElectraConfig {
  "_num_labels": 2,
  "architectures": [
    "ElectraModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "embedding_size": 768,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "electra",
  "no_repeat_ngram_size"

Ranger optimizer loaded. 
Gradient Centralization usage = True
GC applied to both conv and fc layers


epoch,train_loss,valid_loss,jaccard_score,time
0,1.860298,1.705582,0.663785,09:15
1,1.639119,1.579458,0.69854,09:15
2,1.55961,1.559442,0.709788,09:15
3,1.524054,1.546332,0.707801,09:15
4,1.485781,1.574294,0.702412,09:15


Better model found at epoch 0 with jaccard_score value: 0.6637853009669433.
Better model found at epoch 1 with jaccard_score value: 0.698540301217468.
Better model found at epoch 2 with jaccard_score value: 0.7097875917887245.
Epoch 5: early stopping
set state called


INFO:transformers.configuration_utils:loading configuration file /home/jack/Documents/DL/kaggle/tweet_qa/input/electra-base-disc/config.json
INFO:transformers.configuration_utils:Model config ElectraConfig {
  "_num_labels": 2,
  "architectures": [
    "ElectraModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "embedding_size": 768,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "electra",
  "no_repeat_ngram_size"

Ranger optimizer loaded. 
Gradient Centralization usage = True
GC applied to both conv and fc layers


epoch,train_loss,valid_loss,jaccard_score,time
0,1.886956,1.72508,0.658966,09:15
1,1.676846,1.606192,0.691776,09:15
2,1.60252,1.584073,0.695335,09:15
3,1.536227,1.574947,0.701367,09:15
4,1.493362,1.598774,0.697292,09:15
5,1.414896,1.614001,0.703241,09:15
6,1.360013,1.656392,0.697167,09:18


Better model found at epoch 0 with jaccard_score value: 0.6589658119631354.
Better model found at epoch 1 with jaccard_score value: 0.6917764978574378.
Better model found at epoch 2 with jaccard_score value: 0.6953353885112826.
Better model found at epoch 3 with jaccard_score value: 0.701366854351195.
Better model found at epoch 5 with jaccard_score value: 0.7032409662423725.
set state called


KeyboardInterrupt: 

In [19]:
for fold in range(5): run_fold(fold)

INFO:transformers.configuration_utils:loading configuration file /home/jack/Documents/DL/kaggle/tweet_qa/input/electra-base-disc/config.json
INFO:transformers.configuration_utils:Model config ElectraConfig {
  "_num_labels": 2,
  "architectures": [
    "ElectraModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "embedding_size": 768,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "electra",
  "no_repeat_ngram_size"

epoch,train_loss,valid_loss,jaccard_score,time
0,1.704172,1.671664,0.675085,08:47
1,1.605431,1.596937,0.691833,08:45


Better model found at epoch 0 with jaccard_score value: 0.6750852608829276.
Better model found at epoch 1 with jaccard_score value: 0.6918329359688112.


epoch,train_loss,valid_loss,jaccard_score,time
0,1.538568,1.595869,0.694469,08:48
1,1.561806,1.580986,0.694178,08:47
2,1.496153,1.585295,0.697243,08:47
3,1.489867,1.582347,0.699513,08:47
4,1.456984,1.584616,0.701502,08:47


Better model found at epoch 0 with jaccard_score value: 0.6944688959777262.
Better model found at epoch 2 with jaccard_score value: 0.6972432103015209.
Better model found at epoch 3 with jaccard_score value: 0.6995133383703219.
Better model found at epoch 4 with jaccard_score value: 0.7015015486818422.


INFO:transformers.configuration_utils:loading configuration file /home/jack/Documents/DL/kaggle/tweet_qa/input/electra-base-disc/config.json
INFO:transformers.configuration_utils:Model config ElectraConfig {
  "_num_labels": 2,
  "architectures": [
    "ElectraModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "embedding_size": 768,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "electra",
  "no_repeat_ngram_size"

epoch,train_loss,valid_loss,jaccard_score,time
0,1.550042,1.504974,0.719661,08:47
1,1.468588,1.478897,0.72985,08:47


Better model found at epoch 0 with jaccard_score value: 0.7196607447413469.
Better model found at epoch 1 with jaccard_score value: 0.7298503022462369.


epoch,train_loss,valid_loss,jaccard_score,time
0,1.41138,1.480441,0.734387,08:46
1,1.411632,1.478839,0.732568,08:47
2,1.371329,1.487647,0.732334,08:47


Better model found at epoch 0 with jaccard_score value: 0.7343868986105336.
Epoch 3: early stopping


INFO:transformers.configuration_utils:loading configuration file /home/jack/Documents/DL/kaggle/tweet_qa/input/electra-base-disc/config.json
INFO:transformers.configuration_utils:Model config ElectraConfig {
  "_num_labels": 2,
  "architectures": [
    "ElectraModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "embedding_size": 768,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "electra",
  "no_repeat_ngram_size"

epoch,train_loss,valid_loss,jaccard_score,time
0,1.47208,1.404751,0.756665,08:47
1,1.422424,1.381425,0.764549,08:46


Better model found at epoch 0 with jaccard_score value: 0.756664935576462.
Better model found at epoch 1 with jaccard_score value: 0.7645492397229302.


epoch,train_loss,valid_loss,jaccard_score,time
0,1.373208,1.382638,0.762302,08:46
1,1.36491,1.377253,0.766926,08:47
2,1.337119,1.378769,0.766391,08:46
3,1.303695,1.384787,0.762903,08:46


Better model found at epoch 1 with jaccard_score value: 0.7669257510881144.
Epoch 4: early stopping


INFO:transformers.configuration_utils:loading configuration file /home/jack/Documents/DL/kaggle/tweet_qa/input/electra-base-disc/config.json
INFO:transformers.configuration_utils:Model config ElectraConfig {
  "_num_labels": 2,
  "architectures": [
    "ElectraModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "embedding_size": 768,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "electra",
  "no_repeat_ngram_size"

epoch,train_loss,valid_loss,jaccard_score,time
0,1.417454,1.304638,0.791627,08:46
1,1.347521,1.283855,0.801031,08:46


Better model found at epoch 0 with jaccard_score value: 0.791626891998231.
Better model found at epoch 1 with jaccard_score value: 0.8010307930575726.


epoch,train_loss,valid_loss,jaccard_score,time
0,1.276845,1.273421,0.805131,08:46
1,1.276553,1.270953,0.805076,08:46
2,1.257756,1.271808,0.80669,08:46
3,1.236173,1.269701,0.805958,08:46
4,1.19579,1.273628,0.808509,08:47


Better model found at epoch 0 with jaccard_score value: 0.8051309754636492.
Better model found at epoch 2 with jaccard_score value: 0.8066904553436521.
Better model found at epoch 4 with jaccard_score value: 0.8085087238848403.


INFO:transformers.configuration_utils:loading configuration file /home/jack/Documents/DL/kaggle/tweet_qa/input/electra-base-disc/config.json
INFO:transformers.configuration_utils:Model config ElectraConfig {
  "_num_labels": 2,
  "architectures": [
    "ElectraModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "embedding_size": 768,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "electra",
  "no_repeat_ngram_size"

epoch,train_loss,valid_loss,jaccard_score,time
0,1.314602,1.186479,0.842287,08:46
1,1.233285,1.117247,0.871059,08:46


Better model found at epoch 0 with jaccard_score value: 0.8422866209307865.
Better model found at epoch 1 with jaccard_score value: 0.8710594645665488.


epoch,train_loss,valid_loss,jaccard_score,time
0,1.179185,1.110049,0.871128,08:46
1,1.188191,1.11364,0.870893,08:46
2,1.151463,1.113099,0.864643,08:46


Better model found at epoch 0 with jaccard_score value: 0.8711278636170685.
Epoch 3: early stopping


In [41]:
for fold in range(5):
    learner = Learner(data, model, loss_func = loss_fn(), path = path/'models', model_dir=f".")
    model0 = learner.load(f'electra_conv_fold_{fold}').model.eval()
    learner.save(f'electra_conv_{fold}',with_opt=False)

### Predict

In [40]:
model = SpanModel(pt_model)
tr_df  = fold_df[fold_df.kfold != 0].reset_index(drop=True)
val_df = fold_df[fold_df.kfold == 0].reset_index(drop=True)
train_ds, valid_ds = [TweetDataset(i) for i in [tr_df, val_df]]
test_ds = TweetDataset(test_df, test = True)
loss_fn = partial(CELoss, LabelSmoothingCrossEntropy())

data = DataBunch.create(train_ds, valid_ds, test_ds, path=".", bs = bs)
learner = Learner(data, model, loss_func = loss_fn(), path = path/'models', model_dir=f".")

In [61]:
preds = []
test_df_idx = 0

with torch.no_grad():
    
    for xb,yb in tqdm(learner.data.test_dl):
        model0 = learner.load(f'electra_fold_es_0').model.eval()
        start_logits0, end_logits0 = to_cpu(model0(*xb))
        start_logits0, end_logits0 = start_logits0.float(), end_logits0.float()
        
        model1 = learner.load(f'electra_fold_es_1').model.eval()
        start_logits1, end_logits1 = to_cpu(model1(*xb))
        start_logits1, end_logits1 = start_logits1.float(), end_logits1.float()
        
        model2 = learner.load(f'electra_fold_es_2').model.eval()
        start_logits2, end_logits2 = to_cpu(model2(*xb))
        start_logits2, end_logits2 = start_logits2.float(), end_logits2.float()
        
        model3 = learner.load(f'electra_fold_es_3').model.eval()
        start_logits3, end_logits3 = to_cpu(model3(*xb))
        start_logits3, end_logits3 = start_logits3.float(), end_logits3.float()
        
        model4 = learner.load(f'electra_fold_es_4').model.eval()
        start_logits4, end_logits4 = to_cpu(model4(*xb))
        start_logits4, end_logits4 = start_logits4.float(), end_logits4.float()
        
        input_ids = to_cpu(xb[0])
        attention_masks = to_cpu(xb[1].bool())
        token_type_ids = to_cpu(xb[2].bool())
        offsets = to_cpu(xb[3])
        
        start_logits = (start_logits1 + start_logits2 + start_logits3 + start_logits4) / 4
        end_logits = (end_logits1 + end_logits2 + end_logits3 + end_logits4) / 4
        
        for i in range(len(input_ids)):
            
            _offsets = offsets[i]
            start_idx, end_idx = torch.argmax(start_logits[i]), torch.argmax(end_logits[i])
            original_start, original_end = _offsets[start_idx][0], _offsets[end_idx][1]
            pred_span = test_ds.df.text[test_df_idx][original_start : original_end]
            preds.append(pred_span)
            test_df_idx += 1

100%|██████████| 89/89 [04:29<00:00,  3.03s/it]


In [62]:
a = pd.DataFrame(preds)

In [32]:
learner = Learner(data, model, loss_func = loss_fn(), path = path/'models/electra', model_dir=f".")

In [33]:
model0 = learner.load(f'electra-qa-fold_0').model.eval()
# model0.sa

# model1 = learner.load(f'electra-qa-fold_1').model.eval()
# start_logits1, end_logits1 = to_cpu(model1(*xb))
# start_logits1, end_logits1 = start_logits1.float(), end_logits1.float()

# model2 = learner.load(f'electra-qa-fold_2').model.eval()
# start_logits2, end_logits2 = to_cpu(model2(*xb))
# start_logits2, end_logits2 = start_logits2.float(), end_logits2.float()

# model3 = learner.load(f'electra-qa-fold_3').model.eval()
# start_logits3, end_logits3 = to_cpu(model3(*xb))
# start_logits3, end_logits3 = start_logits3.float(), end_logits3.float()

# model4 = learner.load(f'electra-qa-fold_4').model.eval()
# start_logits4, end_logits4 = to_cpu(model4(*xb))
# start_logits4, end_logits4 = start_logits4.float(), end_logits4.float()

In [43]:
# learner = Learner(data, model, loss_func = loss_fn(), path = path/'models/electra', model_dir=f".")
# learner.load(f'electra-qa-fold_4').save(f'electra_fold_es_4', with_opt=False)

In [22]:
out_df = test_df

In [23]:
out_df['selected_text'] = preds
out_df['selected_text'] = out_df.apply(lambda o: o['text'] if len(o['text']) < 3 else o['selected_text'], 1)
out_df['selected_text'] = out_df.apply(lambda o: o['text'] if o['sentiment'] == 'neutral' else o['selected_text'], 1)

In [24]:
subdf = out_df[['textID', 'selected_text']]
# subdf.to_csv("submission1.csv", index=False)