# Library Setup

In [1]:
!pip install iterative-stratification==0.1.7

Collecting iterative-stratification==0.1.7
  Downloading iterative_stratification-0.1.7-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.7
[0m

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import transformers
import os
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import glob
import os
import wandb
import random
import matplotlib.pyplot as plt
import numpy as np
from transformers import AdamW
from text_unidecode import unidecode
import torch
from transformers import get_cosine_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, AutoConfig
import re
from torch.nn import Module
import torch.nn as nn
from collections import Counter, defaultdict
from tqdm import tqdm
import unicodedata
from copy import deepcopy
import sys
import gc
import codecs
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [3]:
CFG1 = {
    "model_name": "google/bigbird-roberta-large",
    "type": "Other Models",
    "targets": ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
    "weights": "../input/downloading-ell-bigbird",
    "max_length": 4096,
    "seed": 42,
    "folds": 4,
    "lr": 2e-5, 
    "batch_size": 16,
    "num_warmup_steps": 0.0,
    "grad_accum": 1,
    "pooler": None,
    "weight_decay": 0.3,
    "grad_norm": 1000,
    "dropout": 0.0,
    "multisample": False,
    "optimizer": "AdamW",
    "scheduler": "linear",
}
CFG1["tokenizer"] = AutoTokenizer.from_pretrained(CFG1["model_name"])

Downloading:   0%|          | 0.00/943 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/826k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/775 [00:00<?, ?B/s]

In [4]:
CFG2 = {
    "model_name": "microsoft/deberta-v3-base",
    "type": "Other Models",
    "targets": ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
    "weights": "../input/downloading-ell-debertav3base-notebooks",
    "max_length": 512,
    "seed": 42,
    "folds": 4,
    "lr": 2e-5, 
    "batch_size": 16,
    "num_warmup_steps": 0.0,
    "grad_accum": 1,
    "pooler": None,
    "weight_decay": 0.3,
    "grad_norm": 1000,
    "dropout": 0.0,
    "multisample": False,
    "optimizer": "AdamW",
    "scheduler": "linear",
}
CFG2["tokenizer"] = AutoTokenizer.from_pretrained(CFG2["model_name"])

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
CFG3 = {
    "model_name": "microsoft/deberta-v3-large",
    "type": "Other Models",
    "targets": ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
    "weights": "../input/debertav3large-ell-download",
    "max_length": 512,
    "seed": 42,
    "folds": 4,
    "lr": 2e-5, 
    "batch_size": 16,
    "epochs": 20,
    "num_warmup_steps": 0.0,
    "patience": 6,
    "grad_accum": 1,
    "pooler": None,
    "weight_decay": 0.3,
    "grad_norm": 1000,
    "dropout": 0.0,
    "multisample": False,
    "optimizer": "AdamW",
    "scheduler": "linear",
}
CFG3["tokenizer"] = AutoTokenizer.from_pretrained(CFG3["model_name"])

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
CFG4 = {
    "model_name": "allenai/longformer-base-4096",
    "type": "Full Input",
    "targets": ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
    "weights": "../input/downloading-ell-longformer",
    "max_length": 4096,
    "seed": 42,
    "folds": 4,
    "lr": 2e-5, 
    "batch_size": 16,
    "epochs": 32,
    "num_warmup_steps": 0.0,
    "patience": 6,
    "grad_accum": 8,
    "pooler": None,
    "weight_decay": 0.3,
    "grad_norm": 1000,
    "dropout": 0.0,
    "multisample": False,
    "optimizer": "AdamW",
    "scheduler": "linear",
}
CFG4["tokenizer"] = AutoTokenizer.from_pretrained(CFG4["model_name"])

Downloading:   0%|          | 0.00/694 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [7]:
CFG5 = {
    "model_name": "microsoft/deberta-v3-large",
    "type": "Other Models",
    "targets": ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
    "weights": "../input/ell-pseudo-debertav3large-download",
    "max_length": 512,
    "seed": 42,
    "folds": 4,
    "lr": 2e-5, 
    "batch_size": 16,
    "epochs": 20,
    "num_warmup_steps": 0.0,
    "patience": 6,
    "grad_accum": 1,
    "pooler": None,
    "weight_decay": 0.3,
    "grad_norm": 1000,
    "dropout": 0.0,
    "multisample": False,
    "optimizer": "AdamW",
    "scheduler": "linear",
}
CFG5["tokenizer"] = AutoTokenizer.from_pretrained(CFG5["model_name"])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
CFG6 = {
    "model_name": "microsoft/deberta-v3-base",
    "type": "Attention Regression Head",
    "targets": ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
    "weights": "../input/downloading-attention-regression-head-debertav3b",
    "max_length": 512,
    "seed": 42,
    "folds": 4,
    "lr": 2e-5, 
    "batch_size": 16,
    "epochs": 20,
    "num_warmup_steps": 0.0,
    "patience": 6,
    "grad_accum": 1,
    "pooler": "attention",
    "layer_start": 1,
    "weight_decay": 0.3,
    "grad_norm": 1000,
    "hidden_dim": 128,
    "dropout": 0.0,
    "multisample": False,
    "optimizer": "AdamW",
    "scheduler": "linear",
}
CFG6["tokenizer"] = AutoTokenizer.from_pretrained(CFG6["model_name"])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
CFG7 = {
    "model_name": "microsoft/deberta-v3-base",
    "type": "Weighted Regression Head",
    "targets": ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
    "weights": "../input/downloading-weighted-head-debertav3base",
    "max_length": 512,
    "seed": 42,
    "folds": 4,
    "lr": 2e-5, 
    "batch_size": 16,
    "epochs": 20,
    "num_warmup_steps": 0.0,
    "patience": 6,
    "grad_accum": 1,
    "pooler": "weighted",
    "layer_start": 9,
    "weight_decay": 0.3,
    "grad_norm": 1000,
    "dropout": 0.0,
    "multisample": False,
    "optimizer": "AdamW",
    "scheduler": "linear",
}
CFG7["tokenizer"] = AutoTokenizer.from_pretrained(CFG7["model_name"])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
CFG8 = {
    "model_name": "microsoft/deberta-v3-base",
    "type": "Attention Regression Head + Multisample Dropout",
    "targets": ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
    "weights": "../input/downloading-attention-multisample-debertav3base",
    "max_length": 512,
    "seed": 42,
    "folds": 4,
    "lr": 2e-5, 
    "batch_size": 16,
    "epochs": 20,
    "num_warmup_steps": 0.0,
    "patience": 6,
    "grad_accum": 1,
    "pooler": "attention",
    "layer_start": 1,
    "weight_decay": 0.3,
    "grad_norm": 1000,
    "hidden_dim": 128,
    "dropout": 0.3,
    "multisample": True,
    "optimizer": "AdamW",
    "scheduler": "linear",
}
CFG8["tokenizer"] = AutoTokenizer.from_pretrained(CFG8["model_name"])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
CFG9 = {
    "model_name": "microsoft/deberta-v3-base",
    "type": "Baseline L2 Loss",
    "targets": ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
    "weights": "../input/downloading-debertav3b-l2",
    "max_length": 512,
    "seed": 42,
    "folds": 4,
    "lr": 2e-5, 
    "batch_size": 16,
    "epochs": 20,
    "num_warmup_steps": 0.0,
    "patience": 6,
    "grad_accum": 1,
    "pooler": None,
    "layer_start": 9,
    "weight_decay": 0.3,
    "dropout": 0.0,
    "grad_norm": 1000,
    "multisample": False,
    "optimizer": "AdamW",
    "scheduler": "linear",
}
CFG9["tokenizer"] = AutoTokenizer.from_pretrained(CFG9["model_name"])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Model Definition

In [12]:
class WeightedLayerPooling(torch.nn.Module):
    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super(WeightedLayerPooling, self).__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        self.layer_weights = layer_weights if layer_weights is not None \
            else nn.Parameter(
                torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float)
            )

    def forward(self, all_hidden_states):
        all_layer_embedding = all_hidden_states[self.layer_start:, :, :, :]
        weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size())
        weighted_average = (weight_factor*all_layer_embedding).sum(dim=0) / self.layer_weights.sum()
        return weighted_average
    

In [13]:
class AttentionPooling(nn.Module):
    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        self.out_features = hidden_dim

    def forward(self, features):
        att = torch.tanh(self.W(features))
        score = self.V(att)
        attention_weights = torch.softmax(score, dim=0)
        context_vector = attention_weights * features
        context_vector = torch.sum(context_vector, dim=0)

        return context_vector

In [14]:
#OBVIOUSLY, CHANGE THIS AS YOU NEED. USE SELF.LOG FOR ALL IMPORTANT METRICS
class Model(nn.Module):
    def __init__(self, config, vocab_length, data_loader_len):
        super(Model, self).__init__()
        self.config = config
        self.vocab_length = vocab_length
        self.base_model = AutoModel.from_pretrained(self.config['model_name'], output_hidden_states = True)
        self.base_model.resize_token_embeddings(vocab_length)

        if self.config["pooler"] == "weighted":
            self.pooler = WeightedLayerPooling(self.base_model.config.num_hidden_layers, layer_start = self.config["layer_start"])  
            self._init_weights(self.pooler.layer_weights)
            
        elif self.config["pooler"] == "attention":
            self.pooler = AttentionPooling(self.base_model.config.hidden_size, config["hidden_dim"])

        if self.config["multisample"]:
            self.dropout1 = nn.Dropout(0.1)
            self.dropout2 = nn.Dropout(0.2)
            self.dropout3 = nn.Dropout(0.3)
            self.dropout4 = nn.Dropout(0.4)
            self.dropout5 = nn.Dropout(0.5)
            
        self.dropout = nn.Dropout(self.config["dropout"])
        self.fc = nn.Linear(self.base_model.config.hidden_size, 6)

        self._init_weights(self.fc)
        self.data_loader_len = data_loader_len

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.base_model.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.base_model.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):

        if self.config["pooler"] == "weighted":
            input_ids, attention_mask = inputs["input_ids"], inputs["attention_mask"]
        
            x = self.base_model(input_ids = input_ids, attention_mask = attention_mask)["hidden_states"]

            x = torch.stack(x)
            cls_embeddings = self.pooler(x)[:, 0]

            return cls_embeddings
            
        else:
            input_ids, attention_mask = inputs["input_ids"], inputs["attention_mask"]
        
            x = self.base_model(input_ids = input_ids, attention_mask = attention_mask)["last_hidden_state"]

            return x[:, 0, :]

            
    def forward(self, inputs):
        
        features = self.feature(inputs)
        
        if self.config["multisample"]:
            logits1 = self.fc(self.dropout1(features))
            logits2 = self.fc(self.dropout2(features))
            logits3 = self.fc(self.dropout3(features))
            logits4 = self.fc(self.dropout4(features))
            logits5 = self.fc(self.dropout5(features))

            logits = (logits1 + logits2 + logits3 + logits4 + logits5) / 5

        else:
            logits = self.fc(features)
            
        return logits

# Dataset Classes

In [15]:
class TestData(Dataset):
    def __init__(self, df, config, indices, special_tokens = None):
        self.df = df
        self.esc_chars = ['\"', "\\", "\n", "\r", "\t", "\b", "\f", "\v", ":)", ";)", ":(", "uwu", "owo", "xd", ":3", ":-)", ":D", ">:(", "\xa0", "\x92", "\x93", "\x91", "\x94", "\x97", "x\B4", "\x96", "\x82", "\x84"]
        self.data = self.df.iloc[indices]
        self.data["full_text"] = self.data["full_text"].apply(lambda text: self.resolve_encodings_and_normalize(text))

        codecs.register_error("replace_encoding_with_utf8", self.replace_encoding_with_utf8)
        codecs.register_error("replace_decoding_with_cp1252", self.replace_decoding_with_cp1252)

    def replace_encoding_with_utf8(self, error):
        return error.object[error.start : error.end].encode("utf-8"), error.end


    def replace_decoding_with_cp1252(self, error):
        return error.object[error.start : error.end].decode("cp1252"), error.end


    def resolve_encodings_and_normalize(self, text: str) -> str:
        text = (
            text.encode("raw_unicode_escape")
            .decode("utf-8", errors="replace_decoding_with_cp1252")
            .encode("cp1252", errors="replace_encoding_with_utf8")
            .decode("utf-8", errors="replace_decoding_with_cp1252")
        )
        
        text = unidecode(text)
        
        return self.remove_esc_chars(text)

    def remove_esc_chars(self, text):
        txt = deepcopy(text)
        for char in self.esc_chars:
            if char == '\"':
                txt = txt.replace(char, '"')
            elif char == "\x92" or char == "\x91" or char == "\xB4":
                txt = txt.replace(char, "'")
            elif char == "\0x93" or char == "\0x94":
                txt = txt.replace(char, '"')
            elif char == "\0x97" or char == "\0x96":
                txt = txt.replace(char, '-')
            elif char == "\0x82" or char == "\0x84":
                txt = txt.replace(char, ',')
            else:
                txt = txt.replace(char, ' ')
        return txt

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data.iloc[idx]["full_text"]

In [16]:
def construct_collate_fn(config):
    def collate_dynamic_padding(batch):
        # Dynamic Padding tokenization
        sentences = config["tokenizer"](batch, padding=True, max_length = config["max_length"], truncation = True, return_token_type_ids = False, return_tensors="pt")
        return sentences
    
    return collate_dynamic_padding

In [17]:
#CHANGE AS NEEDED. MOST OF THE TIME, PYTORCH'S DEFAULT COLLATOR IS ENOUGH.
class DataModule():
    def __init__(self, config, test, collate_fn):
        self.config = config
        self.test = test
        self.collate_fn = collate_fn

    def test_dataloader(self):
        test_loader = DataLoader(self.test, batch_size = self.config["batch_size"], collate_fn = self.collate_fn)      
        return test_loader

# Inferencing

In [18]:
def predict(model, loader):
    outputs = []
    device = torch.device('cuda')
    model = model.to(device).eval()
    with torch.no_grad():
        for inputs in tqdm(loader):
            for key, value in inputs.items():
                inputs[key] = value.to(device)
            predictions = model(inputs)
            outputs.append(predictions)
    return torch.cat(tuple(outputs))

In [19]:
def get_predictions(data, num_preds, path, config, save_path):
    predictions = np.zeros((len(data), 6))
    mlskf = MultilabelStratifiedKFold(n_splits=CFG["folds"], shuffle=True, random_state=CFG["seed"])
    for fold, (train_index, val_index) in enumerate(mlskf.split(data, data[config["targets"]])):
        
        test = TestData(data, config, val_index)
        dataset = DataModule(config, test, construct_collate_fn(config))
        loader = dataset.test_dataloader()
        
        model = Model(config, len(config["tokenizer"]), len(loader))
        checkpoint = torch.load(f"{path}/fold-{fold}.pt", map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint['model_state_dict'], strict = False)
        
        results = predict(model, loader)
        
        predictions[val_index] = results.cpu().numpy()
        
        del model, checkpoint; gc.collect()
        torch.cuda.empty_cache()
    
    df = pd.DataFrame(predictions, columns = CFG["targets"])
    df.to_csv(save_path)

In [20]:
CFGS = [CFG1, CFG2, CFG3, CFG4, CFG5, CFG6, CFG7, CFG8, CFG9]

In [21]:
models = ['bigbird-roberta-large', 'deberta-v3-baseline', 'deberta-v3-large', 'longformer-base', 'deberta-v3-large-psuedo', 'deberta-v3-base-attention-head', 'deberta-v3-base-weighted-head', 'deberta-v3-base-attention-multisample', 'deberta-v3-baseline-L2']
for num, CFG in enumerate(CFGS):
    data = pd.read_csv(f"../input/feedback-prize-english-language-learning/train.csv")

    oof_name = f"{models[num]}.csv"

    print(oof_name)

    get_predictions(data, num_preds = 4, path=CFG["weights"], config = CFG, save_path = oof_name)

bigbird-roberta-large.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at google/bigbird-roberta-large were not used when initializing BigBirdModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  * num_indices_to_pick_from
 10%|▉         | 6/62 [00:14<02:11,  2.35s/it]Attention type 'block_sparse

deberta-v3-baseline.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Downloading:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 62/6

deberta-v3-large.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Downloading:   0%|          | 0.00/833M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 62/

longformer-base.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Downloading:   0%|          | 0.00/570M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 62/62 [01:05<00:00,  1.06s/it]
Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.lay

deberta-v3-large-psuedo.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreT

deberta-v3-base-attention-head.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTr

deberta-v3-base-weighted-head.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTr

deberta-v3-base-attention-multisample.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTr

deberta-v3-baseline-L2.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTr

In [22]:
%debug