In [None]:
# The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
# This must be done before importing transformers
# The cell overwrites three source files inside the installed transformers package
# with the versions shipped in the input dataset.
import shutil
from pathlib import Path

# Installed transformers package that gets patched (path is hard-coded for a
# Python 3.7 conda environment)
transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")

# Dataset directory holding the replacement source files
input_dir = Path("../input/deberta-v2-3-fast-tokenizer")

# Replacement for transformers/convert_slow_tokenizer.py
convert_file = input_dir / "convert_slow_tokenizer.py"

# Destination of the replacement inside the package
conversion_path = transformers_path/convert_file.name

# Remove the installed file first so the copy below always lands on a clean path
if conversion_path.exists():
    conversion_path.unlink()

# Copy convert file to transformers path (keeps the same file name)
shutil.copy(convert_file, transformers_path)

# Package directory of the DeBERTa v2 model code
deberta_v2_path = transformers_path / "models" / "deberta_v2"

# Replace both tokenizer modules (slow and fast) with the dataset versions
for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py']:
    filepath = deberta_v2_path/filename
    
    if filepath.exists():
        filepath.unlink()

    shutil.copy(input_dir/filename, filepath)

In [None]:
import os
import re
import gc
import ast
import sys
import copy
import json
import math
import string
import pickle
import random
import itertools
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
%env TOKENIZERS_PARALLELISM=true

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
def seed_everything(seed=42):
    """
    Seed every random number generator used by the notebook.

    Covers Python's `random`, the PYTHONHASHSEED environment variable, NumPy and
    PyTorch. When CUDA is available the CUDA generators are seeded as well and
    cuDNN is switched to deterministic mode with benchmarking disabled.

    Args:
        seed (int): Seed value applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # Nothing GPU-specific to configure on a CPU-only machine
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # cuDNN needs both flags set for repeatable results
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Fix the seed once for the whole notebook
seed_everything(seed=42)

In [None]:
def micro_f1(preds, truths):
    """
    Compute a single F1 score over all instances pooled together.

    Args:
        preds (list of lists of ints): Binary predictions, one sequence per instance.
        truths (list of lists of ints): Binary ground truths, one sequence per instance.

    Returns:
        float: F1 score of the concatenated sequences.
    """
    # Pool every instance into one long vector before scoring
    flat_preds = np.concatenate(preds)
    flat_truths = np.concatenate(truths)

    return f1_score(flat_truths, flat_preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans as [start, end) index pairs.
        length (int, optional): Length of the returned array. When omitted, the
            largest index found in `spans` is used, or 0 if `spans` is empty.

    Returns:
        np array [length]: Binarized spans (1.0 inside a span, 0.0 elsewhere).
    """
    if length is None:
        # np.max raises on an empty sequence, so treat "no spans" as length 0
        length = np.max(spans) if len(spans) else 0
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1

    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Each (prediction, truth) pair is binarized to a common length before all
    pairs are pooled; pairs where both sides are empty are left out.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    binarized_pairs = []
    for pred, truth in zip(preds, truths):
        if len(pred) == 0 and len(truth) == 0:
            continue
        # Both arrays must share one length: the largest index on either side
        pred_extent = np.max(pred) if len(pred) else 0
        truth_extent = np.max(truth) if len(truth) else 0
        length = max(pred_extent, truth_extent)
        binarized_pairs.append(
            (spans_to_binary(pred, length), spans_to_binary(truth, length))
        )

    bin_preds = [bin_pred for bin_pred, _ in binarized_pairs]
    bin_truths = [bin_truth for _, bin_truth in binarized_pairs]

    return micro_f1(bin_preds, bin_truths)

In [None]:
def create_labels_for_scoring(df):
    """
    Build ground-truth character spans from the 'location' column of `df`.

    Side effect: adds (or overwrites) a 'location_for_create_labels' column on `df`.
    Rows are addressed with df.loc[i, ...] for i in range(len(df)), so the index
    labels must be 0..len(df)-1.

    Args:
        df (pd.DataFrame): Frame with a 'location' column. Each entry is joined
            with ';' and then parsed as whitespace-separated "start end" pairs,
            so it is presumably a list of strings like '0 1' - TODO confirm.

    Returns:
        list of lists of [int, int]: One list of [start, end] spans per row;
        empty for rows without locations.
    """
    # example: ['0 1', '3 4'] -> ['0 1;3 4']
    # Every row starts out referencing the same empty list object
    df['location_for_create_labels'] = [ast.literal_eval(f'[]')] * len(df)
    for i in range(len(df)):
        # Get the location list for the current row
        lst = df.loc[i, 'location']
        if lst:
            # Join the location list with semicolons
            new_lst = ';'.join(lst)
            # NOTE(review): the assigned value is a doubly nested list, while the
            # reading loop below takes element [0] and calls .split on it. This
            # relies on how pandas stores a nested list assigned to one cell - confirm.
            df.loc[i, 'location_for_create_labels'] = ast.literal_eval(f'[["{new_lst}"]]')
    # create labels
    truths = []
    # Iterate over the location list for each row
    for location_list in df['location_for_create_labels'].values:
        truth = []
        # If the location list is not empty
        if len(location_list) > 0:
            # Get the joined "start end;start end" string
            location = location_list[0]
            # Split into individual "start end" pairs, then into their two fields
            for loc in [s.split() for s in location.split(';')]:
                # Convert the two fields to integers
                start, end = int(loc[0]), int(loc[1])
                # Append the start and end as a list to the truth list
                truth.append([start, end])
        # Append the truth list to the truths list
        truths.append(truth)
        
    return truths


def get_char_probs(texts, predictions, tokenizer):
    """
    Spread token-level predictions over the characters of each text.

    Args:
        texts (sequence of str): Texts to tokenize.
        predictions (sequence of sequences of float): Per-token values, one
            sequence per text, aligned with the tokenizer output for that text.
        tokenizer (callable): Called as tokenizer(text, add_special_tokens=True,
            return_offsets_mapping=True); must return a mapping with an
            'offset_mapping' entry of (start, end) character offsets.

    Returns:
        list of np arrays: One array per text with len(text) entries; characters
        not covered by any token offset stay 0.
    """
    results = [np.zeros(len(text)) for text in texts]
    for per_char, text, token_preds in zip(results, texts, predictions):
        encoded = tokenizer(text,
                            add_special_tokens=True,
                            return_offsets_mapping=True)
        # Later tokens overwrite earlier ones where their offsets overlap
        for (start, end), pred in zip(encoded['offset_mapping'], token_preds):
            per_char[start:end] = pred

    return results


def get_results(char_probs, th=0.5):
    """
    Turn per-character probabilities into "start end;start end" span strings.

    Args:
        char_probs (iterable of np arrays): Per-character probabilities, one array per text.
        th (float): Characters with probability >= th are selected.

    Returns:
        list of str: One string per text. Each run of consecutive selected
        characters becomes "first last", where both values are the character
        indices shifted by +1; runs are joined with ';'. Empty string when
        nothing is selected.
    """
    span_strings = []
    for char_prob in char_probs:
        # Selected character positions, shifted by one
        hits = np.where(char_prob >= th)[0] + 1
        if len(hits) == 0:
            span_strings.append("")
            continue
        # A new run starts wherever two neighbouring positions are not adjacent
        run_starts = np.where(np.diff(hits) != 1)[0] + 1
        runs = np.split(hits, run_starts)
        span_strings.append(";".join(f"{run[0]} {run[-1]}" for run in runs))

    return span_strings


def get_predictions(results):
    """
    Parse "start end;start end" span strings into lists of integer pairs.

    Args:
        results (iterable of str): Span strings; an empty string means no spans.

    Returns:
        list of lists of [int, int]: One list of [start, end] pairs per input string.
    """
    predictions = []
    for result in results:
        if result == "":
            predictions.append([])
            continue
        spans = []
        for chunk in result.split(';'):
            fields = chunk.split()
            spans.append([int(fields[0]), int(fields[1])])
        predictions.append(spans)

    return predictions


def get_score(y_true, y_pred):
    """
    Score predicted spans against ground-truth spans with span_micro_f1.

    Args:
        y_true (list of lists of two ints): Ground truth spans.
        y_pred (list of lists of two ints): Predicted spans.

    Returns:
        float: Micro f1 score.
    """
    # NOTE(review): span_micro_f1 is declared as (preds, truths), so y_true lands
    # in its `preds` slot here. Binary F1 should be symmetric in its two inputs,
    # so the score is presumably unaffected - confirm before relying on it.
    return span_micro_f1(y_true, y_pred)

In [None]:
def process_feature_text(text):
    """
    Normalize a feature description string.

    Applies three literal replacements in order: 'I-year' -> '1-year',
    '-OR-' -> ' or ', then every remaining '-' -> ' '.

    Args:
        text (str): Raw feature text.

    Returns:
        str: Normalized feature text.
    """
    replacements = (
        ('I-year', '1-year'),
        ('-OR-', " or "),
        ('-', ' '),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text


def clean_spaces(txt):
    """
    Collapse whitespace in a string.

    Newlines, tabs and carriage returns become spaces, then every run of
    whitespace is reduced to a single space. Leading and trailing whitespace
    is collapsed but not stripped.

    Args:
        txt (str): Input text.

    Returns:
        str: Text with normalized whitespace.
    """
    # Line breaks and tabs first, in a single pass
    spaced = re.sub(r'[\n\t\r]', ' ', txt)
    # Then squeeze every whitespace run down to one space
    return re.sub(r'\s+', ' ', spaced)


In [None]:
# Root directory of the competition input files. File paths are built from it by
# plain string concatenation, so the trailing slash is required.
main_dir="../input/nbme-score-clinical-patient-notes/"

def preprocess_features(features):
    """
    Overwrite the feature text of the row labelled 27 with a fixed string.

    The frame is modified in place and also returned.

    Args:
        features (pd.DataFrame): Features table with a 'feature_text' column.

    Returns:
        pd.DataFrame: The same `features` object.
    """
    row_label, column = 27, 'feature_text'
    features.loc[row_label, column] = "Last-Pap-smear-1-year-ago"
    return features


# Read test, submission, features, and patient notes from main_dir
test = pd.read_csv(main_dir+'test.csv')
submission = pd.read_csv(main_dir+'sample_submission.csv')
features = pd.read_csv(main_dir+'features.csv')
patient_notes = pd.read_csv(main_dir+'patient_notes.csv')

# Apply the fixed feature-text correction (modifies `features` in place)
features = preprocess_features(features)

# Print shapes and display head of each dataframe as a quick sanity check
print(f"test.shape: {test.shape}")
display(test.head())
print(f"features.shape: {features.shape}")
display(features.head())
print(f"patient_notes.shape: {patient_notes.shape}")
display(patient_notes.head())

In [None]:
# Attach the feature columns to each test row; how='left' keeps every test row
# even when no matching (feature_num, case_num) exists in `features`
test = test.merge(features, on=['feature_num', 'case_num'], how='left')

# Attach the patient-note columns the same way, keyed on (pn_num, case_num)
test = test.merge(patient_notes, on=['pn_num', 'case_num'], how='left')

# Display the first few rows of the merged dataframe
display(test.head())

In [None]:
def prepare_input_fast(cfg, text, feature_text, batch_max_len):
    """
    Tokenize a (text, feature_text) pair and convert the result to long tensors.

    Args:
        cfg: Object exposing a `tokenizer` callable.
        text (str): First sequence passed to the tokenizer.
        feature_text (str): Second sequence passed to the tokenizer.
        batch_max_len (int): Passed as `max_length`; with padding="max_length"
            the tokenizer pads the encoding up to this length.

    Returns:
        The tokenizer output, with every value replaced by a torch.long tensor.
    """
    encoded = cfg.tokenizer(
        text,
        feature_text,
        add_special_tokens=True,
        max_length=batch_max_len,
        padding="max_length",
        return_offsets_mapping=False,
    )
    # Replace the values in place so the tokenizer's own container type is returned
    for key in list(encoded.keys()):
        encoded[key] = torch.tensor(encoded[key], dtype=torch.long)
    return encoded


class TestDatasetFast(Dataset):
    """
    Inference dataset that tokenizes one (patient note, feature text) pair per item.

    Each row carries its own padding length in the 'batch_max_length' column,
    which is forwarded to prepare_input_fast as the max length for that item.
    """

    def __init__(self, cfg, df):
        """
        Args:
            cfg: Configuration object, forwarded to prepare_input_fast.
            df (pd.DataFrame): Frame with 'feature_text', 'pn_history' and
                'batch_max_length' columns.
        """
        self.cfg = cfg
        # Column values are stored as arrays and indexed by position
        self.feature_texts = df['feature_text'].values
        self.pn_historys = df['pn_history'].values
        self.batch_max_len = df['batch_max_length'].values

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.feature_texts)

    def __getitem__(self, item):
        """Return the tokenized inputs for the row at position `item`."""
        return prepare_input_fast(
            self.cfg,
            text=self.pn_historys[item],
            feature_text=self.feature_texts[item],
            batch_max_len=self.batch_max_len[item],
        )


In [None]:
class CustomModel(nn.Module):
    """
    Transformer backbone followed by dropout and a linear layer with one output
    per hidden state.

    The submodule attribute names (`model`, `fc_dropout`, `fc`) determine the
    keys of the state dict loaded with load_state_dict, so they must not change.
    """

    # Initialize the model with configuration, optional config path, and pretrained flag
    def __init__(self, cfg, config_path=None, pretrained=False):
        """
        Args:
            cfg: Configuration object; `cfg.model` and `cfg.fc_dropout` are read here.
            config_path (str, optional): File loaded with torch.load to obtain the
                backbone config. When None, the config is fetched with
                AutoConfig.from_pretrained(cfg.model, output_hidden_states=True).
            pretrained (bool): If True, load backbone weights with
                AutoModel.from_pretrained; otherwise build the backbone from the
                config alone.
        """
        super().__init__()
        # Set the configuration
        self.cfg = cfg
        # If config path is not provided, load the configuration from the model
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        # Otherwise, load the configuration from the provided path
        else:
            # NOTE(review): torch.load unpickles the file - only use config files
            # from a trusted source.
            self.config = torch.load(config_path)
        # If pretrained is True, load the model from the pretrained model
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        # Otherwise, build the architecture from the configuration without
        # loading pretrained backbone weights
        else:
            self.model = AutoModel.from_config(self.config)
        # Dropout applied to the hidden states before the linear layer
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        # Linear layer mapping each hidden state to a single value
        self.fc = nn.Linear(self.config.hidden_size, 1)
        # Initialize the weights of the linear layer
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        """
        Initialize `module` in place according to its type.

        Within this class it is only called on `self.fc` (an nn.Linear), so the
        Embedding and LayerNorm branches are not reached from __init__.
        """
        # If the module is a linear layer
        if isinstance(module, nn.Linear):
            # Draw the weights from a normal distribution with mean 0 and the
            # standard deviation given by the configuration
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # If the module has a bias, set it to zero
            if module.bias is not None:
                module.bias.data.zero_()
        # If the module is an embedding layer
        elif isinstance(module, nn.Embedding):
            # Draw the weights from a normal distribution with mean 0 and the
            # standard deviation given by the configuration
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # If the module has a padding index, set the corresponding weight to zero
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        # If the module is a layer normalization layer
        elif isinstance(module, nn.LayerNorm):
            # Set the bias to zero
            module.bias.data.zero_()
            # Set the weight to one
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        """Run the backbone on `inputs` (unpacked as keyword arguments) and return
        the first element of its output."""
        # Pass the inputs through the model and get the outputs
        outputs = self.model(**inputs)
        # First output element; presumably the last hidden states of shape
        # (batch, seq_len, hidden_size) - TODO confirm
        last_hidden_states = outputs[0]
        # Return the last hidden states
        return last_hidden_states

    def forward(self, inputs):
        """Return the linear-layer output for `inputs`; no sigmoid is applied here."""
        # Pass the inputs through the feature method and get the feature
        feature = self.feature(inputs)
        # Pass the feature through the dropout layer and the linear layer
        output = self.fc(self.fc_dropout(feature))
        # Return the output
        return output

In [None]:
def inference_fn_fast(test_loader, model, device, max_len=None):
    """
    Run `model` over `test_loader` and return sigmoid outputs padded to a fixed length.

    Args:
        test_loader: Iterable of batches; each batch is a mapping of tensors that
            contains 'input_ids'. Must support len().
        model (nn.Module): Model called as model(inputs); its output is passed
            through a sigmoid.
        device (torch.device): Device the model and the inputs are moved to.
        max_len (int, optional): Size of the second axis of the returned array.
            Defaults to CFG.max_len, looked up when the function is called, so
            existing callers keep their behavior. Passing it explicitly avoids
            depending on whichever CFG class is currently defined in the notebook.

    Returns:
        np.ndarray: Array of shape (n_samples, max_len, 1). Positions beyond a
        batch's own sequence length are left at 0.

    Raises:
        ValueError: From NumPy, if a batch is longer than `max_len`.
    """
    if max_len is None:
        max_len = CFG.max_len
    preds = []
    # Evaluation mode disables dropout
    model.eval()
    model.to(device)
    for inputs in tqdm(test_loader, total=len(test_loader)):
        bs = len(inputs['input_ids'])
        # Fixed-width buffer so batches padded to different lengths can be concatenated
        pred_w_pad = np.zeros((bs, max_len, 1))
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        y_preds = y_preds.sigmoid().to('cpu').numpy()
        # Copy into the leading positions; the remainder stays zero
        pred_w_pad[:, :y_preds.shape[1]] = y_preds
        preds.append(pred_w_pad)
    predictions = np.concatenate(preds)
    return predictions

## DeBERTa-V3-Large-3

In [None]:
# class CFG:
#     num_workers=4
#     path="../input/nbme-deberta-v3-large-3/"
#     config_path=path+'config.pth'
#     model="microsoft/deberta-v3-large"
#     batch_size=32
#     fc_dropout=0.2
#     max_len=354
#     seed=42
#     n_fold=5
#     trn_fold=[0, 1, 2, 3, 4]

# from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast

# tokenizer = DebertaV2TokenizerFast.from_pretrained('../input/deberta-tokenizer')
# CFG.tokenizer = tokenizer

# input_lengths = []
# tk0 = tqdm(zip(test['pn_history'].fillna("").values, test['feature_text'].fillna("").values), total=len(test))
# for text, feature_text in tk0:
#     length = len(CFG.tokenizer(text, feature_text, add_special_tokens=True)['input_ids'])
#     input_lengths.append(length)
# test['input_lengths'] = input_lengths
# length_sorted_idx = np.argsort([-len_ for len_ in input_lengths])

# # sort dataframe
# sort_df = test.iloc[length_sorted_idx]

# # calc max_len per batch
# sorted_input_length = sort_df['input_lengths'].values
# batch_max_length = np.zeros_like(sorted_input_length)
# bs = CFG.batch_size
# for i in range((len(sorted_input_length)//bs)+1):
#     batch_max_length[i*bs:(i+1)*bs] = np.max(sorted_input_length[i*bs:(i+1)*bs])    
# sort_df['batch_max_length'] = batch_max_length

# test_dataset = TestDatasetFast(CFG, sort_df)
# test_loader = DataLoader(test_dataset,
#                       batch_size=CFG.batch_size,
#                       shuffle=False,
#                       num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
# predictions = []
# for fold in CFG.trn_fold:
#     model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    
#     state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
#                        map_location=torch.device('cpu'))
       
#     model.load_state_dict(state['model'])
#     prediction = inference_fn_fast(test_loader, model, device)
#     prediction = prediction.reshape((len(test), CFG.max_len))
    
#     ## data re-sort ## 
#     prediction = prediction[np.argsort(length_sorted_idx)]
    
#     char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
#     predictions.append(char_probs)
#     del model, state, prediction, char_probs
#     gc.collect()
#     torch.cuda.empty_cache()
    
# predictions_v3_large_1 = np.mean(predictions, axis=0)

## DeBERTa-V3-Large-6

In [None]:
class CFG:
    """Inference settings for the DeBERTa-v3-large checkpoints stored under `path`."""
    # Configuration for the model
    # DataLoader worker processes
    num_workers=4
    # Directory holding the fold checkpoints and config.pth
    path="../input/nbme-v3-large-6/"
    # Saved backbone config, loaded by CustomModel with torch.load
    config_path=path+'config.pth'
    # Model name; also used (with '/' replaced by '-') as the checkpoint file prefix
    model="microsoft/deberta-v3-large"
    batch_size=32
    # Dropout probability before the final linear layer of CustomModel
    fc_dropout=0.2
    # Width to which every prediction row is zero-padded during inference
    max_len=512
    # seed and n_fold are not referenced by the inference cells visible here
    seed=42
    n_fold=7
    # Fold ids whose checkpoints are loaded and averaged.
    # NOTE(review): id 1 is skipped and id 7 is present although n_fold=7 -
    # presumably these index a larger training split; confirm against training.
    trn_fold=[0, 2, 3, 4, 5, 6, 7]

from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast

# Fast DeBERTa tokenizer loaded from the local tokenizer dataset, attached to the
# config so the dataset and get_char_probs use the same instance
tokenizer = DebertaV2TokenizerFast.from_pretrained('../input/deberta-tokenizer')
CFG.tokenizer = tokenizer

# Token count of every (patient note, feature text) pair, special tokens included
input_lengths = []
tk0 = tqdm(zip(test['pn_history'].fillna("").values, test['feature_text'].fillna("").values), total=len(test))
for text, feature_text in tk0:
    length = len(CFG.tokenizer(text, feature_text, add_special_tokens=True)['input_ids'])
    input_lengths.append(length)
test['input_lengths'] = input_lengths

# Row positions ordered from longest to shortest input, so each batch holds
# inputs of similar length and needs little padding
length_sorted_idx = np.argsort([-len_ for len_ in input_lengths])

# .copy() makes the sorted frame independent of `test`, so adding a column to it
# below is a plain assignment rather than a write to a slice
sort_df = test.iloc[length_sorted_idx].copy()

# Padding length of each row = longest input in the batch the row falls into
sorted_input_length = sort_df['input_lengths'].values
batch_max_length = np.zeros_like(sorted_input_length)
bs = CFG.batch_size
# ceil() gives exactly one iteration per non-empty batch. Using `len // bs + 1`
# here would add an empty trailing batch whenever the row count is an exact
# multiple of bs, and np.max() of an empty slice raises ValueError.
for i in range(math.ceil(len(sorted_input_length) / bs)):
    batch_max_length[i*bs:(i+1)*bs] = np.max(sorted_input_length[i*bs:(i+1)*bs])
sort_df['batch_max_length'] = batch_max_length

# Dataset over the length-sorted rows
test_dataset = TestDatasetFast(CFG, sort_df)
# shuffle=False keeps the length-sorted order, which the per-batch padding
# lengths and the re-sorting step below both depend on
test_loader = DataLoader(test_dataset,
                      batch_size=CFG.batch_size,
                      shuffle=False,
                      num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
# Per-fold character probabilities
predictions = []
# One checkpoint per fold id
for fold in CFG.trn_fold:
    # Build the architecture from the saved config; weights come from the checkpoint
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # Load the checkpoint onto the CPU; inference_fn_fast moves the model to `device`
    state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))
    # The weights are stored under the 'model' key of the checkpoint
    model.load_state_dict(state['model'])
    prediction = inference_fn_fast(test_loader, model, device)
    # Drop the trailing singleton axis: one row of CFG.max_len values per test row
    prediction = prediction.reshape((len(test), CFG.max_len))
    
    # argsort of the sorting permutation is its inverse: back to the row order of `test`
    prediction = prediction[np.argsort(length_sorted_idx)]
    
    # Spread token probabilities over the characters of each patient note
    char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
    # Append the character probabilities to the predictions list
    predictions.append(char_probs)
    # Drop references before the next fold so the memory can be reclaimed
    del model, state, prediction, char_probs
    # Collect garbage
    gc.collect()
    # Empty the cache
    torch.cuda.empty_cache()
    
# Average over folds.
# NOTE(review): each fold entry is a list of arrays with one length per note, so
# this averages a possibly ragged nested list; newer NumPy releases refuse to
# build ragged arrays implicitly - confirm on the NumPy version in use.
predictions_v3_large_2 = np.mean(predictions, axis=0)

## DeBERTa-Large

In [None]:
# class CFG:
#     num_workers=4
#     path="../input/nbme-deberta-large-fold-5/"
#     config_path=path+'config.pth'
#     model="microsoft/deberta-large"
#     batch_size=24
#     fc_dropout=0.2
#     max_len=466
#     seed=42
#     n_fold=5
#     trn_fold=[0, 1, 2, 3, 4]

# CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path+'tokenizer/')

# input_lengths = []
# tk0 = tqdm(zip(test['pn_history'].fillna("").values, test['feature_text'].fillna("").values), total=len(test))
# for text, feature_text in tk0:
#     length = len(CFG.tokenizer(text, feature_text, add_special_tokens=True)['input_ids'])
#     input_lengths.append(length)
# test['input_lengths'] = input_lengths
# length_sorted_idx = np.argsort([-len_ for len_ in input_lengths])

# # sort dataframe
# sort_df = test.iloc[length_sorted_idx]

# # calc max_len per batch
# sorted_input_length = sort_df['input_lengths'].values
# batch_max_length = np.zeros_like(sorted_input_length)
# bs = CFG.batch_size
# for i in range((len(sorted_input_length)//bs)+1):
#     batch_max_length[i*bs:(i+1)*bs] = np.max(sorted_input_length[i*bs:(i+1)*bs])    
# sort_df['batch_max_length'] = batch_max_length

# test_dataset = TestDatasetFast(CFG, sort_df)
# test_loader = DataLoader(test_dataset,
#                       batch_size=CFG.batch_size,
#                       shuffle=False,
#                       num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
# predictions = []
# for fold in CFG.trn_fold:
#     model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
#     state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
#                        map_location=torch.device('cpu'))
#     model.load_state_dict(state['model'])
#     prediction = inference_fn_fast(test_loader, model, device)
#     prediction = prediction.reshape((len(test), CFG.max_len))
    
#     prediction = prediction[np.argsort(length_sorted_idx)]
    
#     char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
#     predictions.append(char_probs)
#     del model, state, prediction, char_probs; gc.collect()
#     torch.cuda.empty_cache()
# predictions_v1_l = np.mean(predictions, axis=0)

## DeBERTa-Large-4

In [None]:
class CFG:
    """Inference settings for the DeBERTa-large checkpoints stored under `path`."""
    # Configuration for the model
    # DataLoader worker processes
    num_workers=4
    # Directory holding the fold checkpoints, config.pth and the tokenizer/ folder
    path="../input/nbme-large-4/"
    # Saved backbone config, loaded by CustomModel with torch.load
    config_path=path+'config.pth'
    # Model name; also used (with '/' replaced by '-') as the checkpoint file prefix
    model="microsoft/deberta-large"
    batch_size=24
    # Dropout probability before the final linear layer of CustomModel
    fc_dropout=0.2
    # Width to which every prediction row is zero-padded during inference
    max_len=466
    # seed and n_fold are not referenced by the inference cells visible here
    seed=42
    n_fold=6
    # Fold ids whose checkpoints are loaded and averaged.
    # NOTE(review): ids 1 and 4 are skipped and id 7 is present although n_fold=6 -
    # presumably these index a larger training split; confirm against training.
    trn_fold=[0, 2, 3, 5, 6, 7]

# Tokenizer saved next to the checkpoints, attached to the config so the dataset
# and get_char_probs use the same instance
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path + 'tokenizer/')

# Token count of every (patient note, feature text) pair, special tokens included
input_lengths = []
tk0 = tqdm(zip(test['pn_history'].fillna("").values, test['feature_text'].fillna("").values), total=len(test))
for text, feature_text in tk0:
    length = len(CFG.tokenizer(text, feature_text, add_special_tokens=True)['input_ids'])
    input_lengths.append(length)
test['input_lengths'] = input_lengths

# Row positions ordered from longest to shortest input, so each batch holds
# inputs of similar length and needs little padding
length_sorted_idx = np.argsort([-len_ for len_ in input_lengths])

# .copy() makes the sorted frame independent of `test`, so adding a column to it
# below is a plain assignment rather than a write to a slice
sort_df = test.iloc[length_sorted_idx].copy()

# Padding length of each row = longest input in the batch the row falls into
sorted_input_length = sort_df['input_lengths'].values
batch_max_length = np.zeros_like(sorted_input_length)
bs = CFG.batch_size
# ceil() gives exactly one iteration per non-empty batch. Using `len // bs + 1`
# here would add an empty trailing batch whenever the row count is an exact
# multiple of bs, and np.max() of an empty slice raises ValueError.
for i in range(math.ceil(len(sorted_input_length) / bs)):
    batch_max_length[i*bs:(i+1)*bs] = np.max(sorted_input_length[i*bs:(i+1)*bs])
sort_df['batch_max_length'] = batch_max_length

# Dataset over the length-sorted rows
test_dataset = TestDatasetFast(CFG, sort_df)
# shuffle=False keeps the length-sorted order, which the per-batch padding
# lengths and the re-sorting step below both depend on
test_loader = DataLoader(test_dataset,
                      batch_size=CFG.batch_size,
                      shuffle=False,
                      num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
# One checkpoint per fold id
for fold in CFG.trn_fold:
    # Build the architecture from the saved config; weights come from the checkpoint
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # Load the checkpoint onto the CPU; inference_fn_fast moves the model to `device`
    state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))
    # The weights are stored under the 'model' key of the checkpoint
    model.load_state_dict(state['model'])
    # Sigmoid outputs, zero-padded to CFG.max_len
    prediction = inference_fn_fast(test_loader, model, device)
    # Drop the trailing singleton axis: one row of CFG.max_len values per test row
    prediction = prediction.reshape((len(test), CFG.max_len))
    # argsort of the sorting permutation is its inverse: back to the row order of `test`
    prediction = prediction[np.argsort(length_sorted_idx)]
    # Spread token probabilities over the characters of each patient note
    char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
    # Append the character probabilities to the predictions list
    predictions.append(char_probs)
    # Drop references before the next fold so the memory can be reclaimed
    del model, state, prediction, char_probs; gc.collect()
    # Empty the cache
    torch.cuda.empty_cache()
# Average over folds.
# NOTE(review): each fold entry is a list of arrays with one length per note, so
# this averages a possibly ragged nested list; newer NumPy releases refuse to
# build ragged arrays implicitly - confirm on the NumPy version in use.
predictions_v1_2 = np.mean(predictions, axis=0)

## DeBERTa-Large-MNLI

In [None]:
# class CFG:
#     num_workers=4
#     path="../input/nbme-deberta-large-mnli/"
#     config_path=path+'config.pth'
#     model="microsoft/deberta-large-mnli"
#     batch_size=24
#     fc_dropout=0.2
#     max_len=466
#     seed=1024
#     n_fold=5
#     trn_fold=[0, 1, 2, 3, 4]

# CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path+'tokenizer/')

# input_lengths = []
# tk0 = tqdm(zip(test['pn_history'].fillna("").values, test['feature_text'].fillna("").values), total=len(test))
# for text, feature_text in tk0:
#     length = len(CFG.tokenizer(text, feature_text, add_special_tokens=True)['input_ids'])
#     input_lengths.append(length)
# test['input_lengths'] = input_lengths
# length_sorted_idx = np.argsort([-len_ for len_ in input_lengths])

# # sort dataframe
# sort_df = test.iloc[length_sorted_idx]

# # calc max_len per batch
# sorted_input_length = sort_df['input_lengths'].values
# batch_max_length = np.zeros_like(sorted_input_length)
# bs = CFG.batch_size
# for i in range((len(sorted_input_length)//bs)+1):
#     batch_max_length[i*bs:(i+1)*bs] = np.max(sorted_input_length[i*bs:(i+1)*bs])    
# sort_df['batch_max_length'] = batch_max_length

# test_dataset = TestDatasetFast(CFG, sort_df)
# test_loader = DataLoader(test_dataset,
#                       batch_size=CFG.batch_size,
#                       shuffle=False,
#                       num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
# predictions = []
# for fold in CFG.trn_fold:
#     model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
#     state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
#                        map_location=torch.device('cpu'))
#     model.load_state_dict(state['model'])
#     prediction = inference_fn_fast(test_loader, model, device)
#     prediction = prediction.reshape((len(test), CFG.max_len))
    
#     prediction = prediction[np.argsort(length_sorted_idx)]
    
#     char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
#     predictions.append(char_probs)
#     del model, state, prediction, char_probs; gc.collect()
#     torch.cuda.empty_cache()
# predictions_large_mnli_1 = np.mean(predictions, axis=0)

## DeBERTa-Large-MNLI-3

In [None]:
class CFG:
    """Inference settings for the DeBERTa-large-MNLI checkpoints stored under `path`."""
    # Configuration for the model
    # DataLoader worker processes
    num_workers=4
    # Directory holding the fold checkpoints, config.pth and the tokenizer/ folder
    path="../input/nbme-large-mnli-3/"
    # Saved backbone config, loaded by CustomModel with torch.load
    config_path=path+'config.pth'
    # Model name; also used (with '/' replaced by '-') as the checkpoint file prefix
    model="microsoft/deberta-large-mnli"
    batch_size=24
    # Dropout probability before the final linear layer of CustomModel
    fc_dropout=0.2
    # Width to which every prediction row is zero-padded during inference
    max_len=466
    # seed and n_fold are not referenced by the inference cells visible here
    seed=1024
    n_fold=7
    # Fold ids whose checkpoints are loaded and averaged.
    # NOTE(review): id 1 is skipped and id 7 is present although n_fold=7 -
    # presumably these index a larger training split; confirm against training.
    trn_fold=[0, 2, 3, 4, 5, 6, 7]

# Tokenizer saved next to the checkpoints, attached to the config so the dataset
# and get_char_probs use the same instance
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path + 'tokenizer/')

# Token count of every (patient note, feature text) pair, special tokens included
input_lengths = []
tk0 = tqdm(zip(test['pn_history'].fillna("").values, test['feature_text'].fillna("").values), total=len(test))
for text, feature_text in tk0:
    length = len(CFG.tokenizer(text, feature_text, add_special_tokens=True)['input_ids'])
    input_lengths.append(length)
test['input_lengths'] = input_lengths

# Row positions ordered from longest to shortest input, so each batch holds
# inputs of similar length and needs little padding
length_sorted_idx = np.argsort([-len_ for len_ in input_lengths])

# .copy() makes the sorted frame independent of `test`, so adding a column to it
# below is a plain assignment rather than a write to a slice
sort_df = test.iloc[length_sorted_idx].copy()

# Padding length of each row = longest input in the batch the row falls into
sorted_input_length = sort_df['input_lengths'].values
batch_max_length = np.zeros_like(sorted_input_length)
bs = CFG.batch_size
# ceil() gives exactly one iteration per non-empty batch. Using `len // bs + 1`
# here would add an empty trailing batch whenever the row count is an exact
# multiple of bs, and np.max() of an empty slice raises ValueError.
for i in range(math.ceil(len(sorted_input_length) / bs)):
    batch_max_length[i*bs:(i+1)*bs] = np.max(sorted_input_length[i*bs:(i+1)*bs])
sort_df['batch_max_length'] = batch_max_length

# Dataset over the length-sorted rows
test_dataset = TestDatasetFast(CFG, sort_df)
# shuffle=False keeps the length-sorted order, which the per-batch padding
# lengths and the re-sorting step below both depend on
test_loader = DataLoader(test_dataset,
                      batch_size=CFG.batch_size,
                      shuffle=False,
                      num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
# One checkpoint per fold id
for fold in CFG.trn_fold:
    # Build the architecture from the saved config; weights come from the checkpoint
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # Load the checkpoint onto the CPU; inference_fn_fast moves the model to `device`
    state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))
    # The weights are stored under the 'model' key of the checkpoint
    model.load_state_dict(state['model'])
    # Sigmoid outputs, zero-padded to CFG.max_len
    prediction = inference_fn_fast(test_loader, model, device)
    # Drop the trailing singleton axis: one row of CFG.max_len values per test row
    prediction = prediction.reshape((len(test), CFG.max_len))
    # argsort of the sorting permutation is its inverse: back to the row order of `test`
    prediction = prediction[np.argsort(length_sorted_idx)]
    # Spread token probabilities over the characters of each patient note
    char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
    # Append the character probabilities to the predictions list
    predictions.append(char_probs)
    # Drop references before the next fold so the memory can be reclaimed
    del model, state, prediction, char_probs; gc.collect()
    # Empty the cache
    torch.cuda.empty_cache()
# Average over folds.
# NOTE(review): each fold entry is a list of arrays with one length per note, so
# this averages a possibly ragged nested list; newer NumPy releases refuse to
# build ragged arrays implicitly - confirm on the NumPy version in use.
predictions_large_mnli_2 = np.mean(predictions, axis=0)

## DeBERTa-XLarge

In [None]:
class CFG:
    # Inference settings for the DeBERTa-XLarge checkpoints

    # Folder the config, tokenizer and fold checkpoints are read from
    path = "../input/nbme-xlarge-3/"
    config_path = path + 'config.pth'
    # Model identifier; also used to build the checkpoint file names
    model = "microsoft/deberta-xlarge"

    # Data loading
    num_workers = 4
    batch_size = 24
    max_len = 466

    # Model head / reproducibility
    fc_dropout = 0.2
    seed = 42

    # Folds whose checkpoints are ensembled.
    # NOTE(review): n_fold is 6 while trn_fold lists ids up to 7 - presumably
    # six folds picked from a run with more folds; confirm.
    n_fold = 6
    trn_fold = [0, 2, 3, 5, 6, 7]

# Load the tokenizer stored under CFG.path
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path + 'tokenizer/')

# Token count of every (pn_history, feature_text) pair, special tokens included;
# missing texts are treated as empty strings
text_pairs = zip(test['pn_history'].fillna("").values, test['feature_text'].fillna("").values)
input_lengths = [
    len(CFG.tokenizer(text, feature_text, add_special_tokens=True)['input_ids'])
    for text, feature_text in tqdm(text_pairs, total=len(test))
]
# Keep the lengths on the test dataframe
test['input_lengths'] = input_lengths

# Row positions ordered from the longest input to the shortest
length_sorted_idx = np.argsort(np.negative(input_lengths))

# Test rows rearranged into that order
sort_df = test.iloc[length_sorted_idx]

# Longest tokenized input inside each batch of the length-sorted frame
sorted_input_length = sort_df['input_lengths'].values
# One slot per row; every row receives the maximum length of its own batch
batch_max_length = np.zeros_like(sorted_input_length)
# Batches are formed exactly as the DataLoader will form them
bs = CFG.batch_size
# Walk over batch start offsets. Stepping by `bs` never produces an empty
# trailing slice, so np.max cannot fail when the number of rows is an exact
# multiple of the batch size (or when the frame is empty).
for start in range(0, len(sorted_input_length), bs):
    batch_max_length[start:start+bs] = np.max(sorted_input_length[start:start+bs])
# Attach the column through assign() so the write lands on a fresh frame
# rather than on the slice taken from `test`
sort_df = sort_df.assign(batch_max_length=batch_max_length)

# Wrap the length-sorted frame in the inference dataset
test_dataset = TestDatasetFast(CFG, sort_df)
# Keep the sorted row order (shuffle=False) and every row (drop_last=False)
# so the outputs can later be reshaped to one entry per test row
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)
# Per-fold character-level probabilities, one list entry per fold
predictions = []
# Inverse of the length-sort permutation: puts rows back in the order of `test`.
# It does not depend on the fold, so it is computed once outside the loop.
restore_order = np.argsort(length_sorted_idx)
# Run every selected fold checkpoint over the same loader
for fold in CFG.trn_fold:
    # Build the model from the stored config (pretrained=False)
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # Read this fold's best checkpoint onto the CPU
    state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    # Predict on the length-sorted loader
    prediction = inference_fn_fast(test_loader, model, device)
    # One row per test sample, CFG.max_len values per row
    prediction = prediction.reshape((len(test), CFG.max_len))
    # Undo the length sort so row i again belongs to test row i
    prediction = prediction[restore_order]
    # Convert the per-row predictions into per-character probabilities
    char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
    predictions.append(char_probs)
    # Release the model and intermediates before the next fold is loaded
    del model, state, prediction, char_probs; gc.collect()
    torch.cuda.empty_cache()
# Average the folds one sample at a time. The per-sample arrays presumably
# differ in length (one value per character of each note - confirm against
# get_char_probs), and np.mean on such a ragged nested list depends on
# implicit object-array creation, which newer NumPy versions reject.
# The result is a plain list holding one averaged array per test row.
predictions_xl = [np.mean(fold_probs, axis=0) for fold_probs in zip(*predictions)]

## DeBERTa-Base

In [None]:
# # ====================================================
# # CFG
# # ====================================================
# class CFG:
#     num_workers=4
#     path="../input/nbme-deberta-base/"
#     config_path=path+'config.pth'
#     model="microsoft/deberta-base"
#     batch_size=24
#     fc_dropout=0.2
#     max_len=466
#     seed=42
#     n_fold=5
#     trn_fold=[0, 1, 2, 3, 4]

# CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path+'tokenizer/')

# input_lengths = []
# tk0 = tqdm(zip(test['pn_history'].fillna("").values, test['feature_text'].fillna("").values), total=len(test))
# for text, feature_text in tk0:
#     length = len(CFG.tokenizer(text, feature_text, add_special_tokens=True)['input_ids'])
#     input_lengths.append(length)
# test['input_lengths'] = input_lengths
# length_sorted_idx = np.argsort([-len_ for len_ in input_lengths])

# # sort dataframe
# sort_df = test.iloc[length_sorted_idx]

# # calc max_len per batch
# sorted_input_length = sort_df['input_lengths'].values
# batch_max_length = np.zeros_like(sorted_input_length)
# bs = CFG.batch_size
# for i in range((len(sorted_input_length)//bs)+1):
#     batch_max_length[i*bs:(i+1)*bs] = np.max(sorted_input_length[i*bs:(i+1)*bs])    
# sort_df['batch_max_length'] = batch_max_length

# test_dataset = TestDatasetFast(CFG, sort_df)
# test_loader = DataLoader(test_dataset,
#                       batch_size=CFG.batch_size,
#                       shuffle=False,
#                       num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
# predictions = []
# for fold in CFG.trn_fold:
#     model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
#     state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
#                        map_location=torch.device('cpu'))
#     model.load_state_dict(state['model'])
#     prediction = inference_fn_fast(test_loader, model, device)
#     prediction = prediction.reshape((len(test), CFG.max_len))
#     ## data re-sort ## 
#     prediction = prediction[np.argsort(length_sorted_idx)]

#     char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
#     predictions.append(char_probs)
#     del model, state, prediction, char_probs; gc.collect()
#     torch.cuda.empty_cache()
# predictions_b_1 = np.mean(predictions, axis=0)

## Ensemble

In [None]:
# Ensemble weights. Each weight multiplies the prediction that sits at the
# same position in `ensemble_inputs` below:
#   w1 -> predictions_v3_large_2
#   w2 -> predictions_xl
#   w3 -> predictions_large_mnli_2
#   w4 -> predictions_v1_2
# The weights are used as given and are not normalised; several of the
# logged trials below sum to more than 1.

# ---- Experiment notes (kept for reference) ----
# 0.55-0.40-0.10-0.15
# Earlier trial, superseded by the final values at the bottom. The model
# labels are the original notes and do NOT match the current pairing for
# w2 / w4 listed above:
# w1 = 0.35     # v3 large    8825, 882
# w2 = 0.25     # large       8783, 880
# w3 = 0.35     # large-mnli  8795, 882
# w4 = 0.25     # xlarge      8785, 882

# Scores noted per prediction set (two numbers each; meaning not stated here):
# predictions_v3_large_2      0.8826    0.882
# predictions_v1_l            0.8783    0.88
# predictions_v1_2            0.8789    0.883
# predictions_large_mnli_2    0.8806    0.882
# predictions_xl              0.8785    0.882
# predictions_b_1

# w1, w2, w3, w4 = 0.55, 0.15, 0.35, 0.15              0.886
# w1, w2, w3, w4, w5 = 0.55, 0.15, 0.25, 0.15, 0.1     0.885
# w1, w2, w3, w4, w5 = 0.45, 0.15, 0.35, 0.15, 0.1     0.885
# w1, w2, w3, w4, w5 = 0.45, 0.15, 0.35, 0.15, 0.05    0.886
# w1, w2, w3, w4, w5 = 0.55, 0.10, 0.40, 0.10, 0.05    0.885
# w1, w2, w3, w4 = 0.65, 0.15, 0.25, 0.15              0.886
# w1, w2, w3, w4 = 0.65, 0.10, 0.35, 0.10              0.885
# w1, w2, w3, w4 = 0.75, 0.10, 0.25, 0.10              0.885
# w1, w2, w3, w4 = 0.55, 0.15, 0.35, 0.15              0.886
# w1, w2, w3, w4 = 0.65, 0.15, 0.25, 0.15              0.886
# w1, w2, w3, w4 = 0.65, 0.15, 0.15, 0.10              0.885

# w1, w2, w3 = 0.55, 0.35, 0.20                        0.887

# 0.886
# w1, w2, w3, w4 = 0.55, 0.25, 0.15, 0.10

# ---- Final weights (noted score: 0.887) ----
w1, w2, w3, w4 = 0.55, 0.20, 0.15, 0.10

# Prediction sets in the order their weights are applied
ensemble_inputs = (predictions_v3_large_2, predictions_xl, predictions_large_mnli_2, predictions_v1_2)
# zip() stops at the shortest input without complaint, so make sure every
# model produced the same number of samples before blending
assert len({len(p) for p in ensemble_inputs}) == 1, "ensemble inputs differ in number of samples"

# Weighted sum of the per-sample character probabilities
predictions = [
    w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4
    for p1, p2, p3, p4 in zip(*ensemble_inputs)
]

In [None]:
# Convert the ensembled predictions into the values for the location column
results = get_results(predictions)
submission['location'] = results
# Preview the first rows of the submission
display(submission.head())
# Write only the id and location columns, without the index
submission.loc[:, ['id', 'location']].to_csv('submission.csv', index=False)