# Imports and Utils

The fast tokenizer is borrowed from Thanh Nguyen's DeBERTa fast-tokenizer datasets; the files are available [here](https://www.kaggle.com/datasets/thanhns/deberta-v2-3-fast-tokenizer) and [here](https://www.kaggle.com/datasets/thanhns/deberta-tokenizer).

In [None]:
# The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
# This must be done before importing transformers
# NOTE(review): `transformers` is imported right below to locate the package on disk, so
# presumably the real requirement is "before the DeBERTa tokenizer modules are imported" - confirm.
import shutil
from pathlib import Path
import transformers

# Path.parent works with any path separator, unlike stripping a literal "/__init__.py"
transformers_path = Path(transformers.__file__).parent

input_dir = Path("../input/deberta-v2-3-fast-tokenizer")
deberta_v2_path = transformers_path / "models" / "deberta_v2"

convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path / convert_file.name

# (source, destination) pairs of every file that replaces one shipped with transformers
patched_files = [(convert_file, conversion_path)]
for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py']:
    patched_files.append((input_dir / filename, deberta_v2_path / filename))

for source_file, filepath in patched_files:
    # Remove the installed copy first, then drop the patched file in its place
    if filepath.exists():
        filepath.unlink()
    shutil.copy(source_file, filepath)

In [None]:
import os
import gc
import sys
import ast
import random
from datetime import datetime

import numpy as np # linear algebra
import pandas as pd # data processing
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1300)
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast
from transformers import RobertaTokenizerFast, AutoModel, AutoConfig
from transformers import AdamW, get_scheduler
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F

# IPython magic (notebook-only syntax): sets TOKENIZERS_PARALLELISM in the kernel's environment
%env TOKENIZERS_PARALLELISM=true
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class ModelParams:
    """Notebook-wide hyper-parameters and data-loader settings (plain class attributes)."""

    # --- batch sizes ---
    pt_batch_size = 4        # MLM pre-training: transformer layers are trained, GPU RAM only fits batches of 4
    init_batch_size = 16     # model trained on the initial supervised data only
    fin_batch_size = 32      # model trained with the semi-supervised data
    eval_batch_size = 40     # evaluation (no backward pass)

    # --- tokenization ---
    max_length = 460         # max length found from previous tokenizations

    # --- reproducibility / data loading ---
    rand_seed = 42
    num_workers = 0
    pin_mem = num_workers > 0   # memory is pinned only when worker processes are used

    # --- training schedule ---
    num_epochs = 12
    n_folds = 5              # number of folds for cross validation
#     model_type = 'roberta'  # Fine tune the different models on separate notebooks

In [None]:
def seed_everything(seed=0):
    '''
    Seed every random number generator the notebook relies on (REPRODUCIBILITY).

    Args:
        seed (int): Value handed to Python's `random`, NumPy, PyTorch and PYTHONHASHSEED.
    '''
    # Fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)

    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed all random number generators once with the notebook-wide seed
seed_everything(seed=ModelParams.rand_seed)

# Dataset creation

Data helper functions

In [None]:
def print_gpu_memory():
    '''
    Print the total, free and used memory (in bytes) of the current CUDA device.

    The figures come from `torch.cuda.mem_get_info()`, which reports device-wide
    free/total memory; "used" is derived as total - free. When CUDA is not
    available a short notice is printed instead of raising.
    '''
    if not torch.cuda.is_available():
        print('CUDA is not available, no GPU memory to report')
        return
    free, total = torch.cuda.mem_get_info()
    print(f'total    : {total}')
    print(f'free     : {free}')
    print(f'used     : {total - free}')

def write_to_file(file_path, print_string, option='a'):
    '''
    Write `print_string` to `file_path` and release the file handle immediately.

    Args:
        file_path (str): File to write to.
        print_string (str): Text to write.
        option (str): Mode passed to `open` (default 'a', append).
    '''
    # The context manager closes the handle even if the write raises
    with open(file_path, option) as f:
        f.write(print_string)
    # Need to release the handle eagerly due to Colab's synchronization with Drive
    gc.collect()

def only_save_best_models(best_losses, epoch, loss, best_loss, fold_folder, model, optimizer=None):
    '''
    Checkpoint bookkeeping: keep at most the three lowest-loss epochs on disk.

    `best_losses` (a list of (epoch, loss) tuples) is updated in place. The current
    epoch is saved under "{fold_folder}/epoch_{epoch+1}" unless it is itself the entry
    that falls out of the top three; the folder of an evicted older epoch is removed.

    Returns: min(loss, best_loss)
    '''
    best_losses.append((epoch, loss))
    best_losses.sort(key=lambda entry: entry[1])
    epoch_folder = f"{fold_folder}/epoch_{epoch+1}"

    keep_current = True
    # Only want to keep the three best models to save storage
    if len(best_losses) > 3:
        evicted_epoch = best_losses.pop(3)[0]
        keep_current = evicted_epoch != epoch
        if keep_current:
            shutil.rmtree(f"{fold_folder}/epoch_{evicted_epoch+1}")

    if keep_current:
        if not os.path.exists(epoch_folder):
            os.makedirs(epoch_folder)
        torch.save(model.state_dict(), epoch_folder+"/model.pth")
        if optimizer is not None:
            torch.save(optimizer.state_dict(), epoch_folder+"/optim.pth")

    return min(loss, best_loss)

def create_k_fold_col(train_df):
    '''
    Add a stratified cross-validation fold id to every row of `train_df` (in place).

    Rows are stratified on the concatenation of "case_num", "feature_num" and, when
    present, "gold". Two columns are written: "stratify_on" (the key) and "fold"
    (0 .. ModelParams.n_folds-1).

    Returns:
        The same DataFrame, with the two columns added.
    '''
    skf = StratifiedKFold(n_splits=ModelParams.n_folds, shuffle=True, random_state=ModelParams.rand_seed)

    stratify_key = train_df["case_num"].astype(str) + train_df["feature_num"].astype(str)
    if "gold" in train_df.columns:
        stratify_key = stratify_key + train_df["gold"].astype(str)
    train_df["stratify_on"] = stratify_key

    train_df["fold"] = -1      # Create a column to remember the fold
    fold_col = train_df.columns.get_loc("fold")
    for fold, (_, valid_idx) in enumerate(skf.split(np.zeros(len(train_df)), y=train_df["stratify_on"])):
        # skf.split yields positions, not index labels, so assign positionally;
        # label-based .loc is only correct when the index is a default RangeIndex
        train_df.iloc[valid_idx, fold_col] = fold
    return train_df

def location_list(string_locs):
    '''
    Flatten location strings into a list of (start, end) int tuples.

    example: ['0 1', '3 4', '4 4;5 6'] -> [(0, 1), (3, 4), (4, 4), (5, 6)]
    '''
    spans = []
    for entry in string_locs:
        # ';' separates the spans of one annotation, whitespace separates start and end
        pairs = (segment.split() for segment in entry.split(';'))
        spans.extend((int(lo), int(hi)) for lo, hi in pairs)
    return spans

def calculate_f1_metrics(predictions, references, attention_mask):
    '''
    Count the F1 building blocks (with respect to label 1) for one batch.

    Predictions and references are first multiplied by the attention mask, so masked
    positions contribute to none of the three counts.

    Args:
        predictions (tensor - list of list of ints): Predictions for each token in each sentence of the batch
        references (tensor - list of list of ints): Labels for each token in each sentence of the batch
        attention_mask (tensor - list of list of ints): Attention mask for tokens in the batch

    Returns:
        true_positive, false_positive, false_negative: ints for each of these values
    '''
    masked_preds = predictions * attention_mask
    masked_refs = references * attention_mask
    per_token_counts = (
        masked_preds * masked_refs,          # predicted 1, labelled 1
        masked_preds * (1 - masked_refs),    # predicted 1, labelled 0
        (1 - masked_preds) * masked_refs,    # predicted 0, labelled 1
    )
    # 0.1 is added before int() truncates the sum
    true_positive, false_positive, false_negative = (
        int(torch.sum(counts) + 0.1) for counts in per_token_counts
    )
    return true_positive, false_positive, false_negative

def convert_to_labels(offset_mapping, loc_list):
    '''
    Build the binary token labels for one example: a token gets label 1 when its
    span lies inside one of the character spans in `loc_list`. A token may start
    one character before a span's start.

    Args:
        offset_mapping (list of tuples of two ints): Token spans.
        loc_list (list of tuples of two ints): Char spans in sentence with positive feature label.

    Returns:
        torch tensor [ModelParams.max_length]: Binarized token label.
    '''
    labels = torch.zeros(ModelParams.max_length, dtype=torch.int64)
    if len(loc_list) == 0:
        return labels

    # Outer bounds taken from the first span's start and the last span's end
    earliest_start = loc_list[0][0] - 1
    latest_end = loc_list[-1][1]

    for tok_idx in range(ModelParams.max_length):
        tok_start = offset_mapping[tok_idx][0]
        if tok_start < earliest_start:
            continue
        tok_end = offset_mapping[tok_idx][1]
        if tok_end > latest_end:
            continue
        for span in loc_list:
            # Stop scanning at the first span that begins after this token
            if tok_start < span[0] - 1:
                break
            if tok_end <= span[1]:
                labels[tok_idx] = 1
    return labels

def get_char_probs(texts, predictions, offset_mappings):
    '''
    Spread token probabilities over characters: every character covered by a token's
    offset span receives that token's probability; uncovered characters stay 0.

    Returns:
        list of numpy arrays, one per text, each of length len(text).
    '''
    char_level = [np.zeros(len(text)) for text in texts]
    for row, (token_probs, offsets) in enumerate(zip(predictions, offset_mappings)):
        target = char_level[row]
        # Only as many predictions as there are offset tuples are used
        for prob, span in zip(token_probs[:len(offsets)], offsets):
            target[span[0]:span[1]] = prob
    return char_level

def get_results(prediction, th=0.5):
    '''
    From the probabilities for individual characters, generate locations for words.

    Args:
        prediction (DataFrame): Needs the columns "text", "predictions" (per-character
            probabilities), "pn_num" and "feature_num". The columns "word_spans",
            "location" and "id" are added to this frame in place.
        th (float): A word is selected when the mean probability of its characters is >= th.

    Returns:
        DataFrame with the columns "id" ("{pn_num:05}_{feature_num:03}") and "location"
        (';'-joined "start end" spans, empty string when nothing is selected).
    '''

    def return_word_spans(text): # _Standard_ tokenization function for returning spans of individual words and punctuation
        good_punctuation = set(';')
        punctuation = set([',', '.', '?', '!', '/', ':', '-', '>', '<', '~', '[', ']', '_', ')', '*', '\\', '=', '(', '&', '#', '$', '@', '+', '\n', '\t'])
        word_spans = []
        prev_punc = -1   # index of the last delimiter seen; the pending word starts right after it
        for i, char in enumerate(text):
            if char in punctuation:
                # Punctuation closes the pending word and is a span of its own
                if (i - prev_punc) != 1: word_spans.append((prev_punc+1, i))
                word_spans.append((i, i+1))
                prev_punc = i
            elif char.isspace() or char in good_punctuation:
                # Whitespace and ';' close the pending word but produce no span themselves
                if (i - prev_punc) != 1: word_spans.append((prev_punc+1, i))
                prev_punc = i
            elif char == '"':
                # A quote directly followed by '.' stays attached to the pending word
                if i+1 != len(text) and text[i+1] == '.': continue
                if (i - prev_punc) != 1: word_spans.append((prev_punc+1, i))
                prev_punc = i
            elif char == "'" and not (i-1 >= 0 and text[i-1].isalpha() and i+1 < len(text) and text[i+1].isalpha()):
                # An apostrophe between two letters is part of the word, any other one is a delimiter
                if (i - prev_punc) != 1: word_spans.append((prev_punc+1, i))
                word_spans.append((i, i+1))
                prev_punc = i
        # Flush the trailing word: without this, text that does not end on a delimiter
        # loses its last word and that word could never be part of a location
        if prev_punc + 1 < len(text):
            word_spans.append((prev_punc+1, len(text)))
        return word_spans

    def merge_spans(predictions, word_spans): # allows word spans to include spaces by merging consecutive word spans that are both inside the label
        results = []
        prev_word_inside = False
        for word in word_spans:
            label_prob = np.mean(predictions[word[0]:word[1]])
            if label_prob >= th:
                # Extend the previous span when the preceding word was selected too
                if prev_word_inside: results.append((results.pop()[0], word[1]))
                else: results.append(word)
                prev_word_inside = True
            else: prev_word_inside = False
        return ";".join(f"{tup[0]} {tup[1]}" for tup in results)

    prediction["word_spans"] = prediction["text"].apply(return_word_spans)
    prediction["location"] = prediction.apply(lambda x: merge_spans(x["predictions"], x["word_spans"]), axis=1)
    prediction["id"] =  prediction.apply(lambda x: str(x["pn_num"]).rjust(5, '0') + '_' + str(x["feature_num"]).rjust(3, '0'), axis=1)
    prediction = prediction[['id', 'location']]

    return prediction

def get_char_locations(x, text):
    '''
    Gold character labels as a numpy array (len of text).

    Args:
        x (list of str): Location strings in the format accepted by `location_list`.
        text (str): The text the locations refer to.

    Returns:
        numpy array of length len(text): 1 inside an annotated span, 0 elsewhere.
    '''
    actual_locations = np.zeros(len(text))
    for start, end in location_list(x):
        # The array is 1-D: slice it directly (the former `[i]` index was undefined)
        actual_locations[start:end] = 1
    return actual_locations

def produce_same_len_predictions(model_type, dataset, training_section="initial", reference_predictions=None):
    '''
    Load a model's saved character-level predictions.

    Predictions of the public Kaggle model are padded to the length of the reference
    predictions, as that model has a different max_length.

    Args:
        model_type (str): Model name; any name containing 'kaggle' selects the public Kaggle model.
        dataset (str): "train" reads "train_predictions.csv", anything else "predictions.csv".
        training_section (str): Folder suffix of the non-Kaggle models.
        reference_predictions (sequence of arrays, optional): Per-sample predictions whose
            lengths the Kaggle predictions are padded to. When omitted, the module-level
            `predictions[0]` is used, as before.

    Returns:
        (max_len_preds, prediction_df): the per-sample prediction arrays and the loaded frame.
    '''
    prefix = "train_" if dataset == "train" else ""

    def parse_array(text):
        # The CSV stores each array as its string form, e.g. "[0.1 0.2 0.3]"
        return np.array([float(i) for i in text[1:-1].split()])

    if 'kaggle' not in model_type:
        prediction_df = pd.read_csv(f"../public_models/{model_type}_{training_section}/{prefix}predictions.csv")
        prediction_df["predictions"] = prediction_df["predictions"].apply(parse_array)
        max_len_preds = np.array(prediction_df["predictions"])
    else:
        prediction_df = pd.read_csv(f"../public_models/deberta-v3-large-5-folds-public/{prefix}predictions.csv")
        prediction_df["predictions"] = prediction_df["predictions"].apply(parse_array)
        max_len_preds = list(prediction_df["predictions"])
        if reference_predictions is None:
            reference_predictions = predictions[0]   # module-level list built by the calling cell
        for index, row in enumerate(reference_predictions):
            newlen = len(row) - len(max_len_preds[index])
            max_len_preds[index] = np.concatenate([max_len_preds[index], np.ones(newlen)*0.335]) # 0.335 is ~ threshold value for Kaggle predictions
    return max_len_preds, prediction_df

### Dataset annotation error cleanup

In [None]:
# This function just cleans up some of the clear annotation errors
def clean_datasets(symptoms, pn_notes, train_df):
    '''
    Apply the manual annotation fixes to the loaded frames.

    As written, only `symptoms` is changed (in place): the "feature_text" of the row
    with index label 27 is overwritten. `pn_notes` and `train_df` are returned
    untouched; the `train_df` location fixes below are commented out and kept as a
    record of what was applied at training time.

    Returns:
        (symptoms, pn_notes, train_df): the same three objects that were passed in.
    '''
    symptoms.loc[27, 'feature_text'] = "last pap smear 1 year ago"

# When training these lines were run
#     train_df.loc[[338], 'location'] = '["764 783"]'
#     train_df.loc[[621], 'location'] = '["77 100", "398 420"]'
#     train_df.loc[[655], 'location'] = '["285 292;301 312", "285 287;296 312"]'
#     train_df.loc[[1262], 'location'] = '["551 557;565 580"]'
#     train_df.loc[[1265], 'location'] = '["131 135;181 212"]'
#     train_df.loc[[1424], 'location'] = '["74 88;109 129"]'
#     train_df.loc[[1591], 'location'] = '["176 184;201 212"]'
#     train_df.loc[[1615], 'location'] = '["249 257;271 288"]'
#     train_df.loc[[1664], 'location'] = '["822 824;907 924"]'
#     train_df.loc[[1714], 'location'] = '["101 129"]'
#     train_df.loc[[1929], 'location'] = '["531 539;549 561"]'
#     train_df.loc[[2134], 'location'] = '["540 560;581 593"]'
#     train_df.loc[[2191], 'location'] = '["32 57"]'
#     train_df.loc[[2553], 'location'] = '["308 317;376 384"]'
#     train_df.loc[[3124], 'location'] = '["549 557"]'
#     train_df.loc[[3858], 'location'] = '["102 123", "102 112;125 141", "102 112;143 157", "102 112;159 171"]'
#     train_df.loc[[4373], 'location'] = '["33 45"]'
#     train_df.loc[[4763], 'location'] = '["5 16"]'
#     train_df.loc[[4782], 'location'] = '["175 194"]'
#     train_df.loc[[4908], 'location'] = '["700 723"]'
#     train_df.loc[[6016], 'location'] = '["225 250"]'
#     train_df.loc[[6192], 'location'] = '["46 69", "197 218;236 260"]'
#     train_df.loc[[6380], 'location'] = '["480 482;507 519", "480 482;499 503;512 519", "480 482;521 531", "480 482;533 545", "480 482;564 582"]'
#     train_df.loc[[6562], 'location'] = '["290 320;327 337", "290 320;342 358"]'
#     train_df.loc[[6862], 'location'] = '["288 296;324 363"]'
#     train_df.loc[[7022], 'location'] = '["133 182"]'
#     train_df.loc[[7422], 'location'] = '["102 125"]'
#     train_df.loc[[8876], 'location'] = '["481 483;533 552"]'
#     train_df.loc[[9027], 'location'] = '["92 102", "123 164"]'
#     train_df.loc[[9875], 'location'] = '["151 156", "163 171", "221 225"]'
#     train_df.loc[[9938], 'location'] = '["89 117", "122 138"]'
#     train_df.loc[[9973], 'location'] = '["344 361"]'
#     train_df.loc[[10513], 'location'] = '["600 623"]'
#     train_df.loc[[11551], 'location'] = '["386 400;433 461"]'
#     train_df.loc[[11677], 'location'] = '["160 201"]'
#     train_df.loc[[12124], 'location'] = '["325 337;349 366"]'
#     train_df.loc[[12279], 'location'] = '["405 459;488 524"]'
#     train_df.loc[[12289], 'location'] = '["353 400;488 524"]'
#     train_df.loc[[13238], 'location'] = '["293 307", "321 331"]'
#     train_df.loc[[13297], 'location'] = '["182 221", "182 213;225 234"]'
#     train_df.loc[[13299], 'location'] = '["71 96"]'
#     train_df.loc[[13845], 'location'] = '["86 94;230 256", "86 94;237 256"]'
#     train_df.loc[[14083], 'location'] = '["56 64;156 179"]'

    return symptoms, pn_notes, train_df

### Data loading and torch dataset creation

In [None]:
# Root folder of the competition data
DATA_DIR = '../input/nbme-score-clinical-patient-notes/'

# Feature descriptions, patient notes and the labelled training rows
symptoms, pn_notes, train_df = (
    pd.read_csv(DATA_DIR + csv_name)
    for csv_name in ('features.csv', 'patient_notes.csv', 'train.csv')
)

train_df.drop(columns=['id', 'annotation'], inplace=True)

# Normalize the feature text: dashes become spaces, " OR " becomes "; ", everything lower-cased
symptoms['feature_text'] = (
    symptoms['feature_text']
    .str.replace('-', ' ')
    .str.replace(" OR ", "; ")
    .apply(lambda text: text.lower())
)
pn_notes["pn_history"] = pn_notes["pn_history"].apply(lambda text: text.lower())

# Clean up some clear annotation errors
symptoms, pn_notes, train_df = clean_datasets(symptoms, pn_notes, train_df)

# train_df['location'][index] is stored as the string form of a list, e.g. "["1 2"]"
train_df['location'] = train_df['location'].apply(ast.literal_eval)

roberta_tokenizer = RobertaTokenizerFast.from_pretrained('roberta-large')
deberta_tokenizer = DebertaV2TokenizerFast.from_pretrained('../input/deberta-tokenizer')
# Note: the offset mappings of these tokenizers are the same, but the DeBERTa tokenizer decodes
# single tokens without a space and multiple tokens with a space between them

# Every feature paired with every patient note of the same case
train_dataset = pd.merge(symptoms, pn_notes, how='outer', on=['case_num'])

# Model for token-wise classification
class TokenModel(nn.Module):
    '''
    Transformer backbone with a frozen encoder and a small two-layer head that
    emits 2 logits per token.

    Args:
        cfg (str): Model name/path handed to `AutoConfig.from_pretrained`; a name
            containing "roberta" or "deberta" gets its backbone frozen.
        config_path (str, optional): When given, pretrained weights are loaded from here;
            otherwise the backbone is built from the config alone (random weights).
    '''

    def __init__(self, cfg, config_path=None):
        super().__init__()

        self.config = AutoConfig.from_pretrained(cfg, output_hidden_states=True)
        if config_path is not None:
            self.model = AutoModel.from_pretrained(config_path, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)

        # Freeze the backbone. AutoModel returns the bare encoder, which has no
        # `.roberta` / `.deberta` sub-module (only task-specific wrappers do), so fall
        # back to the model itself when that attribute is missing.
        for prefix in ("roberta", "deberta"):
            if prefix in cfg:
                backbone = getattr(self.model, prefix, self.model)
                for param in backbone.parameters():
                    param.requires_grad = False
                break

        self.proj_size = 84   # hidden size between 2 linear layers
        self.fc1 = nn.Linear(self.config.hidden_size, self.proj_size)
        self.fc2 = nn.Linear(self.proj_size, 2)
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()

    def feature(self, inputs):
        '''Run the backbone on the tokenizer outputs and return its first output (last hidden states).'''
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]

        return last_hidden_states

    def forward(self, inputs):
        '''
        Return the token logits with dimensions 1 and 2 swapped, so the 2 classes sit in
        dimension 1 (presumably (batch, 2, seq_len) - the layout nn.Softmax(dim=1) and
        cross-entropy expect; confirm against the training code).
        '''
        feature = self.feature(inputs)
        hidden = self.relu(self.fc1(self.dropout(feature)))
        output = self.fc2(self.dropout(hidden))

        return output.transpose(1, 2)


# Dataset class for test-set creation
class TestDataset(Dataset):
    '''
    Inference dataset: tokenizes (patient note, feature text) pairs.

    Args:
        train_dataset (DataFrame): Notes joined with features; must hold "case_num",
            "pn_num", "feature_num", "pn_history" and "feature_text".
        test_samples (DataFrame): The (case_num, pn_num, feature_num) rows to predict.
        tokenizer: Fast tokenizer able to return offset mappings.
        public_kaggle_model (bool): Use the public Kaggle model's sequence length (354)
            instead of ModelParams.max_length.
    '''

    def __init__(self, train_dataset, test_samples, tokenizer, public_kaggle_model=False):
        self.data = train_dataset.merge(test_samples, how='inner', on=['case_num', 'pn_num', 'feature_num'])
        self.data.reset_index(inplace=True, drop=True)
        self.tokenizer = tokenizer
        self.public_kaggle_model = public_kaggle_model

    def __getitem__(self, index):
        '''
        Returns a dict with "input_ids", "attention_mask" (int8), "offset_mapping"
        (int16) and "text". All tensors keep the full padded length so the default
        DataLoader collate can stack them; attention mask and offsets are zeroed after
        the first separator token so only the patient note is attended to and mapped
        back to characters.
        '''
        pn_note = self.data['pn_history'][index]

        if self.public_kaggle_model: length = 354
        else: length = ModelParams.max_length

        tokenized_text = self.tokenizer(pn_note, self.data['feature_text'][index],
                                           max_length=length,
                                           padding='max_length',
                                           return_offsets_mapping=True,
                                           return_tensors='pt')

        # returned outputs from tokenizer have batch size 1, need to reshape
        for k in list(tokenized_text.keys()):
            tokenized_text[k] = tokenized_text[k].view(*tokenized_text[k].shape[1:])

        # Want to zero out the second sentence: find the first separator token.
        # The fallback is the last position of *this* sequence length.
        sep_positions = (tokenized_text['input_ids'] == self.tokenizer.sep_token_id).nonzero()
        sep_tok_index = int(sep_positions[0]) if len(sep_positions) > 0 else length - 1

        tokenized_text['attention_mask'][sep_tok_index+1:] = 0
        # Zero instead of slice: a (0, 0) offset covers no character, and the tensor
        # keeps a fixed shape so that samples can be batched
        tokenized_text['offset_mapping'][sep_tok_index+1:] = 0

        return {'input_ids': tokenized_text['input_ids'], \
                'attention_mask': tokenized_text['attention_mask'].type(torch.int8), \
                'offset_mapping': tokenized_text['offset_mapping'].type(torch.int16), \
                'text': pn_note}

    def __len__(self):
        return len(self.data)
    
class ScoringModel(nn.Module):
    '''
    DeBERTa-v3-large backbone followed by dropout and a linear layer that emits
    one logit per token.

    Args:
        config_path (str, optional): File holding a saved config object; when omitted the
            config is fetched for "microsoft/deberta-v3-large".
        pretrained (bool): Load the pretrained backbone weights instead of building the
            backbone from the config alone.
    '''

    def __init__(self, config_path=None, pretrained=False):
        super().__init__()

        if config_path is not None:
            # NOTE: torch.load unpickles the file - only point this at trusted config files
            self.config = torch.load(config_path)
        else:
            self.config = AutoConfig.from_pretrained("microsoft/deberta-v3-large", output_hidden_states=True)

        self.model = (
            AutoModel.from_pretrained("microsoft/deberta-v3-large", config=self.config)
            if pretrained
            else AutoModel.from_config(self.config)
        )
        self.fc_dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(self.config.hidden_size, 1)

    def feature(self, inputs):
        '''Return the first output of the backbone (the last hidden states).'''
        return self.model(**inputs)[0]

    def forward(self, inputs):
        '''Apply dropout and the linear head to the backbone's hidden states.'''
        hidden_states = self.feature(inputs)
        return self.fc(self.fc_dropout(hidden_states))

# print([i for i in list(globals().keys()) if len(i) > 0 and i[0] != '_'])
# del globals()['symptoms'], globals()['pn_notes']

Produce predictions for examples not in the original training set for semi-supervised training.

In [None]:
# All samples we have not classified so far, this is something that needs to be changed for the submission test set
test_df = pd.read_csv(DATA_DIR + 'test.csv')
test_samples = pd.merge(train_dataset, test_df, on=['case_num', 'pn_num', 'feature_num'], how='inner')

# errors='ignore': 'location' is only present when labelled rows are merged in
test_samples.drop(columns=['feature_text', 'pn_history', 'location'], inplace=True, errors='ignore')

softmax = nn.Softmax(dim=1)

models = ['roberta', 'deberta', 'deberta-kaggle']

for model_type in models:
    # The Kaggle Public Model uses a different max length and model architecture so must be treated differently
    is_kaggle_model = "kaggle" in model_type

    # Change directory names
    if model_type == 'roberta':
        test_tokenizer = roberta_tokenizer
        init_outfolder = '../input/nbme-roberta-large-folds'
    elif model_type == 'deberta':
        test_tokenizer = deberta_tokenizer
        init_outfolder = '../input/nbme-deberta-large-folds'
    else:
        test_tokenizer = deberta_tokenizer
        init_outfolder = '../input/deberta-v3-large-5-folds-public'

    length = 354 if is_kaggle_model else ModelParams.max_length

    test_dataset = TestDataset(train_dataset, test_samples, test_tokenizer, public_kaggle_model=is_kaggle_model)
    test_loader = DataLoader(dataset=test_dataset, batch_size=ModelParams.eval_batch_size, num_workers=ModelParams.num_workers, pin_memory=ModelParams.pin_mem)

    predictions = []   # one list of per-sample character probabilities per fold
    folds = list(range(ModelParams.n_folds)) if is_kaggle_model else [1, 2]
    for fold in folds:
        if model_type == 'roberta':
            initial_model = TokenModel('roberta-large')
        elif model_type == 'deberta':
            initial_model = TokenModel('microsoft/deberta-v3-large')
        else:
            initial_model = ScoringModel(config_path=init_outfolder+'/config.pth', pretrained=False)

        # map_location='cpu': load onto the host first, the model is moved to `device` below
        if is_kaggle_model:
            weights = torch.load(f"{init_outfolder}/deberta-v3-large_fold{fold}_best.pth", map_location='cpu')['model']
        else:
            weights = torch.load(f"{init_outfolder}/fold_{fold}_weights/model.pth", map_location='cpu')
        initial_model.load_state_dict(weights)
        del weights

        # Half precision only on the GPU
        if device.type == 'cuda':
            initial_model.half()

        torch.cuda.empty_cache()

        preds = []
        offset_mappings = []
        test_texts = []
        initial_model.eval()
        initial_model.to(device)

        for inputs in test_loader:
            offset_mappings.append(inputs['offset_mapping'].numpy())
            test_texts += inputs['text']
            del inputs['offset_mapping'], inputs['text']
            inputs = {k: v.to(device, non_blocking=True) for k, v in inputs.items()}
            with torch.no_grad():
                y_preds = initial_model(inputs)
                if is_kaggle_model:
                    temp = y_preds.sigmoid().to('cpu').numpy()
                else:
                    temp = softmax(y_preds)[:, 1, :].to('cpu').numpy()
                preds.append(temp)
        offset_mappings = np.concatenate(offset_mappings)
        token_prediction = np.concatenate(preds)

        token_prediction = token_prediction.reshape((len(test_samples), length))
        char_probs = get_char_probs(test_texts, token_prediction, offset_mappings)
        predictions.append(char_probs)
        del initial_model, token_prediction, char_probs
        gc.collect()
        torch.cuda.empty_cache()

    # Average the folds sample by sample: texts differ in length, so the per-sample
    # arrays cannot be stacked into one rectangular array
    predictions = [np.mean(fold_probs, axis=0) for fold_probs in zip(*predictions)]
    test_samples["predictions"] = pd.Series(predictions, index=test_samples.index)
    test_samples["text"] = test_texts
    # NOTE(review): arrays are written as their string form; numpy abbreviates arrays with more
    # than 1000 elements by default - confirm that no text is that long
    test_samples.to_csv(f"{model_type}_predictions.csv", index=False)

# Generate word level labels (locations) given predictions from multiple models

In [None]:
# Ensemble weight of each model
best_model_weights = {'roberta': 0.13,
                      'deberta': 0.27,
                      'deberta-kaggle': 0.6}

# Each model's predictions are multiplied by 0.5 / weight_averages[model] before weighting
weight_averages = {'roberta': 0.7944,
                   'deberta': 0.787,
                   'deberta-kaggle': 0.5}

predictions = []   # one list of per-sample character-probability arrays per model
for model_type, w in best_model_weights.items():
    prediction = pd.read_csv(f"{model_type}_predictions.csv")
    prediction["predictions"] = prediction["predictions"].apply(lambda x: np.array([float(i) for i in x[1:-1].split()]))
    max_len_preds = list(prediction["predictions"])
    if 'kaggle' in model_type:
        # Pad the Kaggle predictions to the length of the first model's predictions
        for index, row in enumerate(predictions[0]):
            newlen = len(row) - len(max_len_preds[index])
            max_len_preds[index] = np.concatenate([max_len_preds[index], np.ones(newlen)*0.335])
    # Scale array by array: multiplying a Python list by a float raises TypeError
    scale = w*0.5/weight_averages[model_type]
    predictions.append([sample_preds*scale for sample_preds in max_len_preds])

# Sum the models sample by sample (arrays of different samples differ in length)
predictions = [np.sum(model_preds, axis=0) for model_preds in zip(*predictions)]
prediction["predictions"] = pd.Series(predictions, index=prediction.index)
semi_supervised_submissions = get_results(prediction)
semi_supervised_submissions.to_csv('submission.csv', index=False)