# Library Setup

In [1]:
!pip install --force-reinstall --no-deps --no-index /kaggle/input/transformers-422/transformers-4.24.0-py3-none-any.whl

Processing /kaggle/input/transformers-422/transformers-4.24.0-py3-none-any.whl
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.20.1
    Uninstalling transformers-4.20.1:
      Successfully uninstalled transformers-4.20.1
Successfully installed transformers-4.24.0
[0m

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import transformers
import os
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import glob
import os
import wandb
import random
import matplotlib.pyplot as plt
import numpy as np
from transformers import AdamW
from text_unidecode import unidecode
import torch
from transformers import get_cosine_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, AutoConfig
import re
from torch.nn import Module
import torch.nn as nn
from collections import Counter, defaultdict
from tqdm import tqdm
import unicodedata
from copy import deepcopy
import sys
import gc
import codecs

In [3]:
transformers.__version__

'4.24.0'

In [4]:
# Settings for the bigbird-roberta-large fold checkpoints (max_length 4096).
CFG1 = dict(
    model_name="../input/bigbirdrobertalarge/bigbird-roberta-large",
    type="Other Models",
    targets=["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"],
    weights="../input/downloading-ell-bigbird",
    max_length=4096,
    seed=42,
    folds=4,
    lr=2e-5,
    batch_size=16,
    num_warmup_steps=0.0,
    grad_accum=1,
    pooler=None,
    weight_decay=0.3,
    grad_norm=1000,
    dropout=0.0,
    multisample=False,
    optimizer="AdamW",
    scheduler="linear",
)
# The tokenizer is stored in the config so the collate function can reach it.
CFG1.update(tokenizer=AutoTokenizer.from_pretrained(CFG1["model_name"]))

In [5]:
# Settings for the deberta-v3-base baseline fold checkpoints (max_length 512).
CFG2 = dict(
    model_name="../input/debertav3base",
    type="Other Models",
    targets=["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"],
    weights="../input/downloading-ell-debertav3base-notebooks",
    max_length=512,
    seed=42,
    folds=4,
    lr=2e-5,
    batch_size=16,
    num_warmup_steps=0.0,
    grad_accum=1,
    pooler=None,
    weight_decay=0.3,
    grad_norm=1000,
    dropout=0.0,
    multisample=False,
    optimizer="AdamW",
    scheduler="linear",
)
# The tokenizer is stored in the config so the collate function can reach it.
CFG2.update(tokenizer=AutoTokenizer.from_pretrained(CFG2["model_name"]))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Settings for the deberta-v3-large fold checkpoints (max_length 512).
CFG3 = dict(
    model_name="../input/deberta-v3-large/deberta-v3-large",
    type="Other Models",
    targets=["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"],
    weights="../input/debertav3large-ell-download",
    max_length=512,
    seed=42,
    folds=4,
    lr=2e-5,
    batch_size=16,
    epochs=20,
    num_warmup_steps=0.0,
    patience=6,
    grad_accum=1,
    pooler=None,
    weight_decay=0.3,
    grad_norm=1000,
    dropout=0.0,
    multisample=False,
    optimizer="AdamW",
    scheduler="linear",
)
# The tokenizer is stored in the config so the collate function can reach it.
CFG3.update(tokenizer=AutoTokenizer.from_pretrained(CFG3["model_name"]))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Settings for the longformer fold checkpoints (max_length 4096).
CFG4 = dict(
    model_name="../input/allenailongformerbase4096/longformer",
    type="Full Input",
    targets=["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"],
    weights="../input/downloading-ell-longformer",
    max_length=4096,
    seed=42,
    folds=4,
    lr=2e-5,
    batch_size=16,
    epochs=32,
    num_warmup_steps=0.0,
    patience=6,
    grad_accum=8,
    pooler=None,
    weight_decay=0.3,
    grad_norm=1000,
    dropout=0.0,
    multisample=False,
    optimizer="AdamW",
    scheduler="linear",
)
# The tokenizer is stored in the config so the collate function can reach it.
CFG4.update(tokenizer=AutoTokenizer.from_pretrained(CFG4["model_name"]))

In [8]:
# Settings for the deberta-v3-large checkpoints stored under "ell-pseudo-debertav3large-download".
CFG5 = dict(
    model_name="../input/deberta-v3-large/deberta-v3-large",
    type="Other Models",
    targets=["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"],
    weights="../input/ell-pseudo-debertav3large-download",
    max_length=512,
    seed=42,
    folds=4,
    lr=2e-5,
    batch_size=16,
    epochs=20,
    num_warmup_steps=0.0,
    patience=6,
    grad_accum=1,
    pooler=None,
    weight_decay=0.3,
    grad_norm=1000,
    dropout=0.0,
    multisample=False,
    optimizer="AdamW",
    scheduler="linear",
)
# The tokenizer is stored in the config so the collate function can reach it.
CFG5.update(tokenizer=AutoTokenizer.from_pretrained(CFG5["model_name"]))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# Settings for the deberta-v3-base checkpoints trained with pooler "attention".
CFG6 = dict(
    model_name="../input/debertav3base",
    type="Attention Regression Head",
    targets=["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"],
    weights="../input/downloading-attention-regression-head-debertav3b",
    max_length=512,
    seed=42,
    folds=4,
    lr=2e-5,
    batch_size=16,
    epochs=20,
    num_warmup_steps=0.0,
    patience=6,
    grad_accum=1,
    pooler="attention",
    layer_start=1,
    weight_decay=0.3,
    grad_norm=1000,
    hidden_dim=128,
    dropout=0.0,
    multisample=False,
    optimizer="AdamW",
    scheduler="linear",
)
# The tokenizer is stored in the config so the collate function can reach it.
CFG6.update(tokenizer=AutoTokenizer.from_pretrained(CFG6["model_name"]))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
# Settings for the deberta-v3-base checkpoints trained with pooler "weighted".
CFG7 = dict(
    model_name="../input/debertav3base",
    type="Weighted Regression Head",
    targets=["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"],
    weights="../input/downloading-weighted-head-debertav3base",
    max_length=512,
    seed=42,
    folds=4,
    lr=2e-5,
    batch_size=16,
    epochs=20,
    num_warmup_steps=0.0,
    patience=6,
    grad_accum=1,
    pooler="weighted",
    layer_start=9,
    weight_decay=0.3,
    grad_norm=1000,
    dropout=0.0,
    multisample=False,
    optimizer="AdamW",
    scheduler="linear",
)
# The tokenizer is stored in the config so the collate function can reach it.
CFG7.update(tokenizer=AutoTokenizer.from_pretrained(CFG7["model_name"]))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Settings for the deberta-v3-base checkpoints with pooler "attention" and multisample dropout enabled.
CFG8 = dict(
    model_name="../input/debertav3base",
    type="Attention Regression Head + Multisample Dropout",
    targets=["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"],
    weights="../input/downloading-attention-multisample-debertav3base",
    max_length=512,
    seed=42,
    folds=4,
    lr=2e-5,
    batch_size=16,
    epochs=20,
    num_warmup_steps=0.0,
    patience=6,
    grad_accum=1,
    pooler="attention",
    layer_start=1,
    weight_decay=0.3,
    grad_norm=1000,
    hidden_dim=128,
    dropout=0.3,
    multisample=True,
    optimizer="AdamW",
    scheduler="linear",
)
# The tokenizer is stored in the config so the collate function can reach it.
CFG8.update(tokenizer=AutoTokenizer.from_pretrained(CFG8["model_name"]))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
# Settings for the deberta-v3-base checkpoints labelled "Baseline L2 Loss".
CFG9 = dict(
    model_name="../input/debertav3base",
    type="Baseline L2 Loss",
    targets=["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"],
    weights="../input/downloading-debertav3b-l2",
    max_length=512,
    seed=42,
    folds=4,
    lr=2e-5,
    batch_size=16,
    epochs=20,
    num_warmup_steps=0.0,
    patience=6,
    grad_accum=1,
    pooler=None,
    layer_start=9,
    weight_decay=0.3,
    dropout=0.0,
    grad_norm=1000,
    multisample=False,
    optimizer="AdamW",
    scheduler="linear",
)
# The tokenizer is stored in the config so the collate function can reach it.
CFG9.update(tokenizer=AutoTokenizer.from_pretrained(CFG9["model_name"]))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# Settings for the deberta-v3-large checkpoints labelled "Smooth L1 Loss".
CFG10 = dict(
    model_name="../input/deberta-v3-large/deberta-v3-large",
    type="Smooth L1 Loss",
    targets=["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"],
    weights="/kaggle/input/downloading-debertav3large-smoothl1",
    max_length=512,
    seed=42,
    folds=4,
    lr=2e-5,
    batch_size=16,
    epochs=20,
    num_warmup_steps=0.0,
    patience=6,
    grad_accum=1,
    pooler=None,
    layer_start=9,
    weight_decay=0.3,
    grad_norm=1000,
    dropout=0.0,
    multisample=False,
    oof_path="deberta-v3-large-L1.csv",
    optimizer="AdamW",
    scheduler="linear",
)
# The tokenizer is stored in the config so the collate function can reach it.
CFG10.update(tokenizer=AutoTokenizer.from_pretrained(CFG10["model_name"]))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
# Settings for the deberta-v3-large checkpoints labelled "L2 Loss".
CFG11 = dict(
    model_name="../input/deberta-v3-large/deberta-v3-large",
    type="L2 Loss",
    targets=["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"],
    weights="/kaggle/input/downloading-debertav3large-l2",
    max_length=512,
    seed=42,
    folds=4,
    lr=2e-5,
    batch_size=16,
    epochs=20,
    num_warmup_steps=0.0,
    patience=6,
    grad_accum=1,
    pooler=None,
    layer_start=9,
    weight_decay=0.3,
    dropout=0.0,
    grad_norm=1000,
    multisample=False,
    oof_path="deberta-v3-large-L2.csv",
    optimizer="AdamW",
    scheduler="linear",
)
# The tokenizer is stored in the config so the collate function can reach it.
CFG11.update(tokenizer=AutoTokenizer.from_pretrained(CFG11["model_name"]))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
# Settings for the xlnet-large-cased fold checkpoints (max_length 512).
CFG12 = dict(
    model_name="/kaggle/input/transformers/xlnet-large-cased",
    type="Other Models",
    targets=["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"],
    weights="/kaggle/input/downloading-xlnet-large-l1",
    max_length=512,
    seed=42,
    folds=4,
    lr=2e-5,
    batch_size=16,
    epochs=20,
    num_warmup_steps=0.0,
    patience=6,
    grad_accum=1,
    pooler=None,
    dropout=0.0,
    weight_decay=0.3,
    grad_norm=1000,
    multisample=False,
    optimizer="AdamW",
    scheduler="linear",
)
# The tokenizer is stored in the config so the collate function can reach it.
CFG12.update(tokenizer=AutoTokenizer.from_pretrained(CFG12["model_name"]))

In [16]:
# Settings for the ernie-2.0-large-en fold checkpoints (max_length 512).
CFG13 = dict(
    model_name="/kaggle/input/ernie20largeen/nghuyong/ernie-2.0-large-en",
    type="Other Models",
    targets=["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"],
    weights="/kaggle/input/downloading-ernie-2-0-large",
    max_length=512,
    seed=42,
    folds=4,
    lr=2e-5,
    batch_size=16,
    epochs=20,
    num_warmup_steps=0.0,
    patience=6,
    grad_accum=1,
    pooler=None,
    dropout=0.0,
    weight_decay=0.3,
    grad_norm=1000,
    multisample=False,
    optimizer="AdamW",
    scheduler="linear",
)
# The tokenizer is stored in the config so the collate function can reach it.
CFG13.update(tokenizer=AutoTokenizer.from_pretrained(CFG13["model_name"]))

In [17]:
# Settings for the deberta-v3-large checkpoints trained with pooler "DS Last".
CFG14 = dict(
    model_name="../input/deberta-v3-large/deberta-v3-large",
    type="DS Last",
    targets=["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"],
    weights="/kaggle/input/downloading-debertav3large-l1-ds-last",
    max_length=512,
    seed=42,
    folds=4,
    lr=2e-5,
    batch_size=4,
    epochs=20,
    num_warmup_steps=0.0,
    patience=6,
    grad_accum=4,
    pooler="DS Last",
    layer_start=9,
    weight_decay=0.3,
    grad_norm=1000,
    aux_weight=0.5,
    dropout=0.1,
    multisample=False,
    oof_path="deberta-v3-large-L1-DS-Last.csv",
    optimizer="AdamW",
    scheduler="linear",
)
# The tokenizer is stored in the config so the collate function can reach it.
CFG14.update(tokenizer=AutoTokenizer.from_pretrained(CFG14["model_name"]))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:
# Settings for the deberta-v3-large checkpoints trained with pooler "DS All".
CFG15 = dict(
    model_name="../input/deberta-v3-large/deberta-v3-large",
    type="DS All",
    targets=["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"],
    weights="/kaggle/input/downloading-debertav3large-l1-ds-all",
    max_length=512,
    seed=42,
    folds=4,
    lr=2e-5,
    batch_size=4,
    epochs=20,
    num_warmup_steps=0.0,
    patience=6,
    grad_accum=4,
    pooler="DS All",
    layer_start=9,
    weight_decay=0.3,
    grad_norm=1000,
    aux_weight=0.5,
    dropout=0.1,
    multisample=False,
    oof_path="deberta-v3-large-L1-DS-All.csv",
    optimizer="AdamW",
    scheduler="linear",
)
# The tokenizer is stored in the config so the collate function can reach it.
CFG15.update(tokenizer=AutoTokenizer.from_pretrained(CFG15["model_name"]))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [19]:
# Settings for the deberta-v3-large checkpoints trained with pooler "DS MaxPool".
CFG16 = dict(
    model_name="../input/deberta-v3-large/deberta-v3-large",
    type="DS MaxPool",
    targets=["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"],
    weights="/kaggle/input/downloading-debertav3large-l1-maxpool",
    max_length=512,
    seed=42,
    folds=4,
    lr=2e-5,
    batch_size=4,
    epochs=20,
    num_warmup_steps=0.0,
    patience=6,
    grad_accum=4,
    pooler="DS MaxPool",
    layer_start=9,
    weight_decay=0.3,
    grad_norm=1000,
    aux_weight=0.5,
    dropout=0.1,
    multisample=False,
    oof_path="deberta-v3-large-DS-MaxPool.csv",
    optimizer="AdamW",
    scheduler="linear",
)
# The tokenizer is stored in the config so the collate function can reach it.
CFG16.update(tokenizer=AutoTokenizer.from_pretrained(CFG16["model_name"]))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
# Settings for the funnel-transformer-large fold checkpoints (max_length 512).
CFG17 = dict(
    model_name="/kaggle/input/transformers/funnel-transformer-large",
    type="Smooth L1 Loss",
    targets=["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"],
    weights="/kaggle/input/downloading-funnel-l1",
    max_length=512,
    seed=42,
    folds=4,
    lr=2e-5,
    batch_size=16,
    epochs=20,
    num_warmup_steps=0.0,
    patience=6,
    grad_accum=1,
    pooler=None,
    layer_start=9,
    weight_decay=0.3,
    dropout=0.0,
    grad_norm=1000,
    multisample=False,
    oof_path="funnel-L1.csv",
    optimizer="AdamW",
    scheduler="linear",
)
# The tokenizer is stored in the config so the collate function can reach it.
CFG17.update(tokenizer=AutoTokenizer.from_pretrained(CFG17["model_name"]))

# Model Definition

In [21]:
class WeightedLayerPooling(torch.nn.Module):
    """Weighted average over a stack of per-layer hidden states.

    Args:
        num_hidden_layers: number of transformer layers; the stack is expected
            to hold ``num_hidden_layers + 1`` entries.
        layer_start: index of the first stacked entry included in the average.
        layer_weights: optional pre-built weights, one per included entry.
            When omitted, a learnable vector of ones is created.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            # One weight for every stacked entry from layer_start onwards.
            n_used = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.tensor([1] * n_used, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states):
        """Average the entries from ``layer_start`` on along dim 0.

        ``all_hidden_states`` must be 4-D; the leading dimension indexes layers.
        """
        selected = all_hidden_states[self.layer_start:, :, :, :]
        # Reshape the 1-D weights to (n, 1, 1, 1) and spread them over `selected`.
        weights = self.layer_weights[:, None, None, None].expand(selected.size())
        total = (weights * selected).sum(dim=0)
        return total / self.layer_weights.sum()
    

In [22]:
class AttentionPooling(nn.Module):
    """Additive-attention pooling that collapses dim 0 of its input.

    Scores are computed as ``V(tanh(W(features)))``, normalised with a softmax
    over dim 0, and used to take a weighted sum of ``features`` along dim 0.
    """

    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        self.out_features = hidden_dim

    def forward(self, features):
        """Return the attention-weighted sum of ``features`` over dim 0."""
        scores = self.V(torch.tanh(self.W(features)))
        # NOTE(review): the softmax and the sum both run over dim 0. If callers
        # pass (batch, ...) tensors this mixes samples — confirm intended axis.
        weights = torch.softmax(scores, dim=0)
        return torch.sum(weights * features, dim=0)

In [23]:
class Model(nn.Module):
    """Transformer backbone with a 6-output regression head, used for inference.

    Config keys read by this class: ``model_name``, ``pooler``, ``dropout`` and
    ``multisample``; additionally ``layer_start`` when ``pooler == "weighted"``
    and ``hidden_dim`` when ``pooler == "attention"``.

    Args:
        config: dict of settings (see above).
        vocab_length: size the backbone's token embeddings are resized to.
        data_loader_len: stored on the instance; not used by this class.
    """

    def __init__(self, config, vocab_length, data_loader_len):
        super(Model, self).__init__()
        self.config = config
        self.vocab_length = vocab_length
        self.base_model = AutoModel.from_pretrained(self.config['model_name'], output_hidden_states = True)
        self.base_model.resize_token_embeddings(vocab_length)

        # Column permutation applied to the "DS All" outputs in forward().
        # Registered as a non-persistent buffer so it follows the module across
        # devices (no hard-coded .cuda()) without adding a state_dict key.
        self.register_buffer("order", torch.LongTensor([5, 0, 1, 2, 3, 4]), persistent=False)

        if self.config["pooler"] == "weighted":
            self.pooler = WeightedLayerPooling(self.base_model.config.num_hidden_layers, layer_start = self.config["layer_start"])

        elif self.config["pooler"] == "attention":
            # NOTE(review): this pooler is constructed but feature()/forward()
            # never call it; the "attention" configs use the plain first-token
            # path. Presumably kept so checkpoint keys still match — confirm.
            self.pooler = AttentionPooling(self.base_model.config.hidden_size, config["hidden_dim"])

        if self.config["multisample"]:
            self.dropout1 = nn.Dropout(0.1)
            self.dropout2 = nn.Dropout(0.2)
            self.dropout3 = nn.Dropout(0.3)
            self.dropout4 = nn.Dropout(0.4)
            self.dropout5 = nn.Dropout(0.5)

        # Dropout rate comes from the config passed in, not from a global.
        self.dropout = nn.Dropout(self.config["dropout"])
        self.fc = nn.Linear(self.base_model.config.hidden_size, 6)
        self._init_weights(self.fc)

        # Single-output per-layer heads. Six are built for "DS All" and
        # "DS MaxPool", seven for every other pooler; the counts are left
        # unchanged so the module's parameter names stay as they were.
        num_heads = 6 if config["pooler"] in ("DS All", "DS MaxPool") else 7
        self.fcs = nn.ModuleList([])
        for _ in range(num_heads):
            layer = nn.Linear(self.base_model.config.hidden_size, 1)
            self._init_weights(layer)
            self.fcs.append(layer)

        self.data_loader_len = data_loader_len

    def _init_weights(self, module):
        """Initialise ``module`` using the backbone's ``initializer_range``.

        Handles nn.Linear, nn.Embedding and nn.LayerNorm; any other module type
        is left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.base_model.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.base_model.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and return the features forward() expects.

        Args:
            inputs: mapping with ``input_ids`` and ``attention_mask``.

        Returns:
            * "weighted": first-token slice of the weighted layer average.
            * "DS Last": stack of the last 7 hidden-state entries.
            * "DS All" / "DS MaxPool": stack of the last 6 hidden-state entries.
            * anything else: first-token slice of ``last_hidden_state``.
        """
        input_ids, attention_mask = inputs["input_ids"], inputs["attention_mask"]
        outputs = self.base_model(input_ids = input_ids, attention_mask = attention_mask)
        pooler = self.config["pooler"]

        if pooler == "weighted":
            hidden_states = torch.stack(outputs["hidden_states"])
            return self.pooler(hidden_states)[:, 0]

        if pooler == "DS Last":
            # Last 7 entries; forward() only uses the final one for the prediction.
            return torch.stack(outputs["hidden_states"])[-7:, :, :, :]

        if pooler in ("DS All", "DS MaxPool"):
            # Last 6 entries, one per head in self.fcs.
            return torch.stack(outputs["hidden_states"])[-6:, :, :, :]

        return outputs["last_hidden_state"][:, 0, :]

    def forward(self, inputs):
        """Return the model output for a batch; 6 values per sample.

        The multisample check runs first, so a config with ``multisample`` set
        never reaches the pooler-specific branches.
        """
        features = self.feature(inputs)
        pooler = self.config["pooler"]

        if self.config["multisample"]:
            # Average the head output over five dropout rates.
            logits1 = self.fc(self.dropout1(features))
            logits2 = self.fc(self.dropout2(features))
            logits3 = self.fc(self.dropout3(features))
            logits4 = self.fc(self.dropout4(features))
            logits5 = self.fc(self.dropout5(features))
            return (logits1 + logits2 + logits3 + logits4 + logits5) / 5

        if pooler == "DS Last":
            # Only the final layer's first token feeds the returned prediction;
            # per-layer head outputs were computed and discarded before, so
            # they are no longer evaluated.
            return self.fc(self.dropout(features[-1][:, 0, :]))

        if pooler == "DS MaxPool":
            # Element-wise max over the first-token vectors of the 6 layers.
            cls_per_layer = torch.stack([self.dropout(layer[:, 0, :]) for layer in features])
            final_cls = torch.max(cls_per_layer, dim = 0)[0]
            return self.fc(final_cls)

        if pooler == "DS All":
            # One single-output head per layer, then reorder the columns.
            outputs = torch.stack([
                self.fcs[layer_num](self.dropout(layer[:, 0, :]))
                for layer_num, layer in enumerate(features)
            ])
            return torch.index_select(outputs.squeeze(-1).transpose(0,1), 1, self.order)

        return self.fc(features)
            

# Dataset Classes

In [24]:
class TestData(Dataset):
    """Dataset yielding the cleaned ``full_text`` string of each row of ``df``.

    Construction normalises the ``full_text`` column and writes the result back
    into the DataFrame that was passed in (the caller's frame is modified).

    Args:
        df: DataFrame with a ``full_text`` column.
        config: accepted for interface compatibility; not used here.
        special_tokens: accepted for interface compatibility; not used here.
    """

    # Substrings rewritten by remove_esc_chars, applied in this order.
    ESC_CHARS = ['\"', "\\", "\n", "\r", "\t", "\b", "\f", "\v", ":)", ";)", ":(", "uwu", "owo", "xd", ":3", ":-)", ":D", ">:(", "\xa0", "\x92", "\x93", "\x91", "\x94", "\x97", "\xB4", "\x96", "\x82", "\x84"]

    # Entries that map to something other than a single space.
    _ESC_REPLACEMENTS = {
        '"': '"',
        "\x92": "'", "\x91": "'", "\xB4": "'",
        "\x93": '"', "\x94": '"',
        "\x97": "-", "\x96": "-",
        "\x82": ",", "\x84": ",",
    }

    def __init__(self, df, config, special_tokens = None):
        self.df = df
        self.esc_chars = list(self.ESC_CHARS)

        # The handlers must exist before any text is normalised, otherwise the
        # first string that triggers one raises LookupError.
        codecs.register_error("replace_encoding_with_utf8", self.replace_encoding_with_utf8)
        codecs.register_error("replace_decoding_with_cp1252", self.replace_decoding_with_cp1252)

        self.df["full_text"] = self.df["full_text"].apply(self.resolve_encodings_and_normalize)

    def replace_encoding_with_utf8(self, error):
        """codecs error handler: encode the offending span as UTF-8."""
        return error.object[error.start : error.end].encode("utf-8"), error.end


    def replace_decoding_with_cp1252(self, error):
        """codecs error handler: decode the offending span as cp1252."""
        return error.object[error.start : error.end].decode("cp1252"), error.end


    def resolve_encodings_and_normalize(self, text: str) -> str:
        """Repair mixed encodings, transliterate, then strip escape sequences."""
        text = (
            text.encode("raw_unicode_escape")
            .decode("utf-8", errors="replace_decoding_with_cp1252")
            .encode("cp1252", errors="replace_encoding_with_utf8")
            .decode("utf-8", errors="replace_decoding_with_cp1252")
        )

        text = unidecode(text)

        return self.remove_esc_chars(text)

    def remove_esc_chars(self, text):
        """Replace every entry of ``self.esc_chars`` found in ``text``.

        Entries listed in ``_ESC_REPLACEMENTS`` become the mapped quote, dash
        or comma; every other entry becomes a single space.

        The original comparisons used literals such as ``"\\0x93"`` (NUL
        followed by ``x93``), which never matched, so the quote/dash/comma
        branches were unreachable.
        NOTE(review): in resolve_encodings_and_normalize this runs after
        unidecode, so the non-ASCII entries are presumably already gone by
        then and pipeline output should be unchanged — confirm.
        """
        txt = text
        for char in self.esc_chars:
            txt = txt.replace(char, self._ESC_REPLACEMENTS.get(char, ' '))
        return txt

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return self.df.iloc[idx]["full_text"]

In [25]:
def construct_collate_fn(config):
    """Build a collate function that tokenizes a batch of raw texts.

    The returned callable looks up ``config["tokenizer"]`` and
    ``config["max_length"]`` on every call and pads each batch only to its own
    longest sequence (``padding=True``), truncating at ``max_length``.
    """
    def collate_dynamic_padding(batch):
        tokenize = config["tokenizer"]
        tokenizer_kwargs = dict(
            padding=True,
            max_length=config["max_length"],
            truncation=True,
            return_token_type_ids=False,
            return_tensors="pt",
        )
        return tokenize(batch, **tokenizer_kwargs)

    return collate_dynamic_padding

In [26]:
class DataModule():
    """Bundles a dataset with the settings needed to build its DataLoader.

    Args:
        config: dict; only ``batch_size`` is read.
        test: dataset handed to the DataLoader.
        collate_fn: batch collation callable handed to the DataLoader.
    """

    def __init__(self, config, test, collate_fn):
        self.config, self.test, self.collate_fn = config, test, collate_fn

    def test_dataloader(self):
        """Return a DataLoader over ``self.test`` using the DataLoader's default (unshuffled) order."""
        return DataLoader(
            self.test,
            batch_size=self.config["batch_size"],
            collate_fn=self.collate_fn,
        )

# Inference

In [27]:
def predict(model, loader):
    """Run ``model`` over every batch of ``loader`` and concatenate the outputs.

    Args:
        model: module called as ``model(inputs)`` with a mapping of tensors.
        loader: iterable of mappings whose values support ``.to(device)``.

    Returns:
        Tensor of all batch outputs concatenated along dim 0, left on the
        device used for inference.

    Raises:
        Whatever ``torch.cat`` raises for an empty sequence if ``loader``
        yields no batches.
    """
    # Use the GPU when one is present instead of assuming CUDA exists.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device).eval()
    outputs = []
    with torch.no_grad():
        for inputs in tqdm(loader):
            # Move the batch in place so its container type is preserved.
            for key, value in inputs.items():
                inputs[key] = value.to(device)
            outputs.append(model(inputs))
    return torch.cat(tuple(outputs))

In [28]:
def get_predictions(loader, num_preds, path, config, save_path):
    """Average the predictions of ``num_preds`` fold checkpoints and save them.

    For each fold a fresh ``Model`` is built, ``{path}/fold-{fold}.pt`` is
    loaded into it, and ``predict`` is run over ``loader``. The per-fold
    outputs are averaged and written to ``save_path`` as CSV, with the
    DataFrame index as the first column.

    Args:
        loader: DataLoader passed to ``predict``.
        num_preds: number of fold checkpoints to load (folds 0..num_preds-1).
        path: directory containing the ``fold-N.pt`` files.
        config: model config; ``tokenizer`` and ``targets`` are read here.
        save_path: destination CSV path.
    """
    predictions = []
    for fold in range(num_preds):

        model = Model(config, len(config["tokenizer"]), len(loader))
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        checkpoint = torch.load(f"{path}/fold-{fold}.pt", map_location=torch.device('cpu'))
        # strict=False silently ignores missing or unexpected keys.
        model.load_state_dict(checkpoint['model_state_dict'], strict = False)

        results = predict(model, loader)

        predictions.append(results.cpu().numpy())

        # Release the fold's weights before the next checkpoint is loaded.
        del model, checkpoint; gc.collect()
        torch.cuda.empty_cache()

    predictions = np.mean(np.array(predictions), axis = 0)
    # Column names come from the config argument, not from a module-level CFG.
    df = pd.DataFrame(predictions, columns = config["targets"])
    df.to_csv(save_path)

    del predictions, df; gc.collect()

In [29]:
# All base-model configs, in the order their predictions are generated.
CFGS = [
    CFG1, CFG2, CFG3, CFG4, CFG5, CFG6,
    CFG7, CFG8, CFG9, CFG10, CFG11, CFG12,
    CFG13, CFG14, CFG15, CFG16, CFG17,
]

In [30]:
# Output-file stem for each entry of CFGS, in the same order.
models = [
    'bigbird-roberta-large',
    'deberta-v3-baseline',
    'deberta-v3-large',
    'longformer-base',
    'deberta-v3-large-psuedo',
    'deberta-v3-base-attention-head',
    'deberta-v3-base-weighted-head',
    'deberta-v3-base-attention-multisample',
    'deberta-v3-baseline-L2',
    "deberta-v3-large-L1",
    "deberta-v3-large-L2",
    "xlnet-large-cased-L1",
    "ernie-2",
    "deberta-v3-large-L1-Last",
    "deberta-v3-large-L1-DS-All",
    "deberta-v3-large-DS-MaxPool",
    "funnel-L1",
]

# The loop variable is deliberately named CFG: other cells may read the
# module-level name `CFG`, so do not rename it.
for num, CFG in enumerate(CFGS):
    # Re-read the CSV each time because TestData rewrites `full_text` in place.
    data = pd.read_csv("../input/feedback-prize-english-language-learning/test.csv")

    test = TestData(data, CFG)
    dataset = DataModule(CFG, test, construct_collate_fn(CFG))
    loader = dataset.test_dataloader()

    oof_name = models[num] + ".csv"

    get_predictions(loader, num_preds = 4, path = CFG["weights"], config = CFG, save_path = oof_name)

    del data, test, dataset, loader
    gc.collect()

Some weights of the model checkpoint at ../input/bigbirdrobertalarge/bigbird-roberta-large were not used when initializing BigBirdModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  * num_indices_to_pick_from
100%|██████████| 1/1 [00:01<00:00,  1.29s/it]
Some w

# Stacking

In [31]:
# Stacker settings. "models" fixes the order in which base-model predictions
# are fed to the stacker.
CFG = dict(
    model_name="6x Linear",
    models=[
        "deberta-v3-large-L1",
        "deberta-v3-baseline-L2",
        "deberta-v3-large",
        "longformer-base",
        "deberta-v3-large-psuedo",
        "deberta-v3-baseline",
        "deberta-v3-base-attention-head",
        "deberta-v3-base-attention-multisample",
        "bigbird-roberta-large",
        "deberta-v3-base-weighted-head",
        "ernie-2",
        "funnel-L1",
        "xlnet-large-cased-L1",
        "deberta-v3-large-DS-MaxPool",
        "deberta-v3-large-L1-Last",
        "deberta-v3-large-L1-DS-All",
        "deberta-v3-large-L2",
    ],
    type="Stacking",
    seed=42,
    lr=1e-3,
    final_div_factor=1e8,
    batch_size=4,
    epochs=20,
    num_warmup_steps=0.0,
    patience=5,
    grad_accum=1,
    weight_decay=0.01,
    optimizer="one_cycle",
    scheduler="linear",
)

In [32]:
class Stacker(nn.Module):
    """Single linear layer mapping one value per base model to one output.

    Args:
        config: dict; ``len(config["models"])`` sets the input width.
        data_loader_len: stored on the instance; not used by this class.
    """

    def __init__(self, config, data_loader_len):
        super().__init__()
        self.config = config
        self.data_loader_len = data_loader_len

        n_models = len(config["models"])
        self.fc = nn.Linear(n_models, 1)

    def forward(self, inputs):
        """Apply the linear layer over the last dimension of ``inputs``."""
        return self.fc(inputs)

In [33]:
class Data(Dataset):
    """Dataset that stacks the per-row predictions of several base models.

    Args:
        base_model_dfs: mapping of model name -> DataFrame of predictions. The
            first column of each frame is skipped when a row is read.
        config: accepted for interface compatibility; not used here.
    """

    # Frame whose length defined the dataset size in the original code.
    _LENGTH_REFERENCE = "deberta-v3-large"

    def __init__(self, base_model_dfs, config):
        self.base_model_dfs = base_model_dfs

    def __len__(self):
        """Number of rows, without requiring one specific model to be present.

        The reference frame is used when it exists (unchanged behaviour);
        otherwise the first frame is used, and an empty mapping has length 0.
        """
        frames = self.base_model_dfs
        if self._LENGTH_REFERENCE in frames:
            return len(frames[self._LENGTH_REFERENCE])
        for frame in frames.values():
            return len(frame)
        return 0

    def __getitem__(self, idx):
        """Return a float array of shape (n_columns - 1, n_models) for row ``idx``."""
        rows = [
            frame.iloc[idx][1:].values.astype(float)  # [1:] skips the first column
            for frame in self.base_model_dfs.values()
        ]
        return np.array(rows).T

In [34]:
def collate_dynamic_padding(batch):
    """Stack a batch of per-sample arrays into one tensor (no padding is done).

    A list of numpy arrays is stacked in numpy first: calling ``torch.tensor``
    directly on such a list is the slow path and emits a UserWarning. Any other
    input is converted with ``torch.tensor`` as before.
    """
    if len(batch) > 0 and all(isinstance(item, np.ndarray) for item in batch):
        return torch.from_numpy(np.stack(batch))

    return torch.tensor(batch)

In [35]:
# Everything in test.csv except the essay text; prediction columns are
# attached to this frame later.
text_id = pd.read_csv("../input/feedback-prize-english-language-learning/test.csv").drop(columns="full_text")

# Load each base model's saved predictions, keyed by model name and kept in
# the order given by CFG["models"].
base_model_dfs = {}
for model in CFG["models"]:
    path = "./" + model + ".csv"
    base_model_dfs[model] = pd.read_csv(path)

In [36]:
# Despite the name, `train` wraps the base-model prediction frames loaded from
# the CSVs above; it is only used to build the stacker's DataLoader.
train = Data(base_model_dfs, CFG)
dataset = DataModule(CFG, train, collate_dynamic_padding)

In [37]:
# Batches of stacked base-model predictions, batch size CFG["batch_size"].
test_loader = dataset.test_dataloader()

In [38]:
# Linear stacker with one input per entry of CFG["models"].
stacker = Stacker(CFG, len(test_loader))

In [39]:
# Load the stacker checkpoint onto the CPU.
# NOTE: torch.load unpickles the file; only load trusted checkpoints.
checkpoint = torch.load(f"../input/downloading-stacking-linear6x/stacker-6x.pt", map_location=torch.device('cpu'))

In [40]:
# strict=False tolerates missing/unexpected keys; check the displayed result
# to confirm that every key matched.
stacker.load_state_dict(checkpoint['model_state_dict'], strict = False)

<All keys matched successfully>

In [41]:
# Use the GPU when one is present; otherwise run the stacker on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [42]:
# Move the stacker to the inference device and switch it to eval mode.
stacker = stacker.to(device).eval()

In [43]:
# Run the stacker over every batch of base-model predictions.
final_predictions = []
with torch.no_grad():
    for batch in test_loader:
        inputs = batch.to(device).to(torch.float32)

        # Drop only the trailing size-1 output dimension. A bare .squeeze()
        # would also drop the batch dimension when the last batch holds a
        # single sample, which breaks the torch.cat over the batches.
        y_hat = stacker(inputs).squeeze(-1)

        final_predictions.append(y_hat)

  This is separate from the ipykernel package so we can avoid doing imports until


In [44]:
# Inspect the per-batch stacker outputs.
final_predictions

[tensor([[2.8891, 2.7693, 3.0555, 3.0300, 2.7272, 2.7012],
         [2.6773, 2.4880, 2.7181, 2.4632, 2.1987, 2.7133],
         [3.5766, 3.4281, 3.5467, 3.5945, 3.4269, 3.4317]], device='cuda:0')]

In [45]:
# Join the per-batch outputs along dim 0 and convert to a numpy array on the CPU.
submission_preds = torch.cat(final_predictions).cpu().numpy()

In [46]:
# Inspect the stacked predictions as a numpy array.
submission_preds

array([[2.8891196, 2.7692761, 3.055546 , 3.0299673, 2.7271621, 2.7011757],
       [2.6773233, 2.487992 , 2.7180629, 2.4631908, 2.1986609, 2.71327  ],
       [3.576643 , 3.4281452, 3.5466895, 3.594492 , 3.426944 , 3.4317079]],
      dtype=float32)

In [47]:
# Take an explicit copy: pd.DataFrame(text_id) does not copy a DataFrame
# input, so columns added to `submission` could also show up in `text_id`.
submission = text_id.copy()

In [48]:
# Attach one prediction column per target, in the column order of submission_preds.
for col_idx, col_name in enumerate(["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]):
    submission[col_name] = submission_preds[:, col_idx]

In [49]:
# Inspect the submission frame before writing it out.
submission

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.88912,2.769276,3.055546,3.029967,2.727162,2.701176
1,000BAD50D026,2.677323,2.487992,2.718063,2.463191,2.198661,2.71327
2,00367BB2546B,3.576643,3.428145,3.54669,3.594492,3.426944,3.431708


In [50]:
# Write the submission file without the DataFrame index.
submission.to_csv("submission.csv", index = False)