In [None]:
import os
from tqdm import tqdm
import itertools
import pickle
import random
import datetime
import json

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
torch.set_grad_enabled(False)

from transformers import DataCollatorForTokenClassification
from transformers import AutoModel, AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers import AutoTokenizer, DebertaV2Tokenizer
from transformers import AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification

# Batch size used by the inference DataLoader.
BS = 4

# Backbone architectures (tokenizer + model definition), one entry per ensemble member.
CHECKPOINTS = [
    '../input/deberta-v3-large',  # https://www.kaggle.com/datasets/jonathanchan/deberta-v3-large
    '../input/microsoft-deberta-large',  # https://www.kaggle.com/datasets/shinomoriaoshi/microsoft-deberta-large
    '../input/debertabase',  # https://www.kaggle.com/datasets/chenjinbridge/debertabase
]

# Directories holding the fine-tuned fold weights; PATHS[i] belongs to CHECKPOINTS[i].
PATHS = [
    '../input/my-deberta-v3-large',
    '../input/my-deberta-large',
    '../input/my-deberta-base',
]

# Blend weights; the entry for CHECKPOINTS[i] is looked up under the key f'w{i}'.
WEIGHTS = dict(w0=0.6, w1=0.25, w2=0.15)

In [None]:
test_df = pd.read_csv('../input/nbme-score-clinical-patient-notes/test.csv') # 测试数据
features = preprocess_features(pd.read_csv('../input/nbme-score-clinical-patient-notes/features.csv')) # features
pn = pd.read_csv('../input/nbme-score-clinical-patient-notes/patient_notes.csv') # notes

# 合并到一张表
test_df = test_df.merge(pn, on='pn_num', how='left') 
test_df = test_df.merge(features, on='feature_num', how='left')
test_df['len'] = test_df['pn_history'].apply(len) + test_df['feature_text'].apply(len)
test_df = test_df.sort_values(by=['len']).reset_index(drop=True)
test_df

In [None]:
# DataSet
def preprocess_features(features):
    features.loc[27, 'feature_text'] = "Last-Pap-smear-1-year-ago" # 修正feature_text
    return features


def prepare_input(tokenizer, text, feature_text):
    """Tokenize one (note, feature) pair as a single paired input.

    Args:
        tokenizer: callable tokenizer; receives the note first and the
            feature text second.
        text: patient-note text.
        feature_text: feature description paired with the note.

    Returns:
        Whatever the tokenizer returns for the pair. Special tokens are
        requested; character offset mappings are not.
    """
    tokenize_options = {
        'add_special_tokens': True,       # e.g. [CLS] / [SEP]
        'return_offsets_mapping': False,  # offsets are not needed for model input
    }
    return tokenizer(text, feature_text, **tokenize_options)



class NBMEDatasetInfer(Dataset):
    """Inference dataset yielding one tokenized (note, feature) pair per row.

    Args:
        tokenizer: tokenizer forwarded to ``prepare_input``.
        df: DataFrame providing ``pn_history`` (note text) and
            ``feature_text`` columns.
    """

    def __init__(self, tokenizer, df):
        self.tokenizer = tokenizer
        # Keep plain arrays so __getitem__ indexes by position.
        self.feature_texts = df['feature_text'].values
        self.pn_historys = df['pn_history'].values

    def __len__(self):
        """Number of samples (one per DataFrame row)."""
        return len(self.feature_texts)

    def __getitem__(self, item):
        """Return the tokenizer output for the note/feature pair at ``item``."""
        note = self.pn_historys[item]
        feature = self.feature_texts[item]
        return prepare_input(self.tokenizer, note, feature)

In [None]:
# Model
class NBMEModel(nn.Module):
    """Pretrained transformer backbone with a single-logit head on every token.

    Args:
        checkpoint: path or name handed to ``AutoConfig`` / ``AutoModel``.
    """

    def __init__(self, checkpoint):
        super().__init__()
        # This config is created with output_hidden_states=True, but it is only
        # read here for hidden_size and initializer_range; it is not passed to
        # the backbone, which is loaded from `checkpoint` on its own.
        self.config = AutoConfig.from_pretrained(checkpoint, output_hidden_states=True)
        self.backbone = AutoModel.from_pretrained(checkpoint)
        self.dropout = nn.Dropout(0.1)
        # Per-token linear head producing one logit.
        self.classifier = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.classifier)

    def _init_weights(self, module):
        """Re-initialise ``module`` in place according to its layer type.

        Linear and Embedding weights are drawn from N(0, initializer_range);
        Linear biases and the Embedding padding row are zeroed; LayerNorm is
        reset to weight 1 / bias 0. Any other module type is left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                # The padding embedding stays at zero.
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, **inputs):
        """Run the backbone and the token head.

        Args:
            **inputs: keyword tensors for the backbone. An optional ``label``
                entry is withheld from the backbone and used for the loss.

        Returns:
            TokenClassifierOutput with ``logits`` (the classifier output on the
            backbone's first output) and ``loss``: the mean BCE-with-logits over
            positions whose label is not -100, or None when no ``label`` is given.
        """
        backbone_kwargs = {name: value for name, value in inputs.items() if name != 'label'}
        # The first backbone output is the last hidden state: [bs, seq_len, hidden_size].
        hidden = self.backbone(**backbone_kwargs)[0]
        logits = self.classifier(self.dropout(hidden))

        if 'label' not in inputs:
            return TokenClassifierOutput(loss=None, logits=logits)

        targets = inputs['label'].view(-1, 1)
        per_token_loss = nn.BCEWithLogitsLoss(reduction="none")(logits.view(-1, 1), targets.float())
        # Positions labelled -100 are excluded before averaging.
        loss = torch.masked_select(per_token_loss, targets != -100).mean()
        return TokenClassifierOutput(loss=loss, logits=logits)

In [None]:
def get_char_logits(texts, predictions, tokenizer):
    """Spread token-level predictions onto the characters of each note.

    Args:
        texts: note texts, one per sample (a note repeats when it is paired
            with several features).
        predictions: per-sample sequences of token-level values.
        tokenizer: callable returning a mapping with an ``offset_mapping``
            entry of (start, end) character offsets per token.

    Returns:
        List of 1-D float arrays, one per text, each as long as its text.
        Characters covered by no token keep the value 0.
    """
    results = [np.zeros(len(t)) for t in texts]
    for char_scores, text, prediction in zip(results, texts, predictions):
        # Only the note itself is tokenized here; offsets index into `text`.
        offsets = tokenizer(
            text,
            add_special_tokens=True,
            return_offsets_mapping=True,
        )['offset_mapping']
        # zip stops at the shorter sequence, so predictions beyond the note's
        # own tokens are ignored. Zero-width offsets write nothing.
        for (start, end), pred in zip(offsets, prediction):
            char_scores[start:end] = pred
    return results

def _trim_spaces(text, start, end):
    """Shrink the inclusive range [start, end] past spaces on both sides.

    Returns the trimmed (start, end), or None when the range holds only spaces.
    """
    # Both loops are bounded by the opposite edge so they can neither run off
    # the text nor cross each other.
    while start <= end and text[start] == ' ':
        start += 1
    while end >= start and text[end] == ' ':
        end -= 1
    if start > end:
        return None
    return start, end


def my_get_results(char_logits, texts, th=0):
    """Convert character-level scores into span strings, one per sample.

    Characters scoring strictly above ``th`` are grouped into runs of
    consecutive indices. Each run is stripped of leading/trailing spaces and
    written as "start end" with an exclusive end; runs of one sample are
    joined with ';'. A run consisting only of spaces is dropped rather than
    producing an inverted span or indexing outside the text.

    Args:
        char_logits: per-sample 1-D arrays of character scores.
        texts: the matching texts, indexable by sample position.
        th: score threshold (exclusive).

    Returns:
        List of strings such as ['0 5;64 72', '91 99', ''].
    """
    results = []
    for i, char_prob in enumerate(char_logits):
        hit_idx = np.where(char_prob > th)[0]
        spans = []
        # Consecutive indices share the same (index - position) difference, e.g.
        # [0, 1, 90, 91, 92] -> [0, 1] and [90, 91, 92].
        for _, run in itertools.groupby(enumerate(hit_idx), key=lambda pair: pair[1] - pair[0]):
            run = [int(idx) for _, idx in run]
            trimmed = _trim_spaces(texts[i], run[0], run[-1])
            if trimmed is None:
                continue
            s, e = trimmed
            spans.append(f"{s} {e + 1}")
        results.append(";".join(spans))
    return results

def get_predictions(results):
    """Parse span strings into nested lists of integer pairs.

    Example:
        ['0 5;64 72', '91 99', '128 134']
        -> [[[0, 5], [64, 72]], [[91, 99]], [[128, 134]]]

    Args:
        results: span strings, with spans separated by ';' and the two numbers
            of a span separated by whitespace. An empty string means no spans.

    Returns:
        One list of [start, end] pairs per input string.
    """
    predictions = []
    for result in results:
        if result == "":
            # No spans for this sample.
            predictions.append([])
            continue
        spans = []
        for chunk in result.split(';'):
            fields = chunk.split()
            spans.append([int(fields[0]), int(fields[1])])
        predictions.append(spans)
    return predictions

In [None]:
# Running weighted sum of character-level logits, one array per test row.
char_logits_blend = [np.zeros(len(text)) for text in test_df.pn_history.values]
model = None
for i, ckpt in enumerate(CHECKPOINTS):  # one pass per ensemble member
    model_path = PATHS[i]  # directory with this member's fold weights
    w = WEIGHTS[f'w{i}']  # blend weight of this member
    print(f'{model_path} - weight = {w}')
    # NOTE(review): trim_offsets=False presumably keeps leading spaces inside
    # token offsets so character spans stay contiguous -- confirm.
    tokenizer = AutoTokenizer.from_pretrained(ckpt, trim_offsets=False)
    test_dataset = NBMEDatasetInfer(tokenizer, test_df)
    # Longest tokenized sample; every batch is padded to this width below.
    maxlen = max([len(x['input_ids']) for x in test_dataset])
    test_dataloader = DataLoader(test_dataset, batch_size=BS, shuffle=False, collate_fn=DataCollatorForTokenClassification(tokenizer), pin_memory=True)
    # Drop the previous member's model before building the next one so two
    # backbones never sit on the GPU at the same time.
    model = None
    torch.cuda.empty_cache()
    model = NBMEModel(ckpt).cuda()
    preds_folds = []

    for fold in range(5):
        model.load_state_dict(torch.load(os.path.join(model_path, f'{fold}.pt')))  # fold weights
        model.eval()
        fold_preds = []
        # len(test_dataloader) is the exact batch count; len(dataset)//BS+1 is
        # one too many whenever the dataset size is a multiple of BS.
        for b in tqdm(test_dataloader, total=len(test_dataloader)):
            b = {k: v.cuda() for k, v in b.items()}
            pred = model(**b).logits  # [bs, seq_len, 1]
            pred = pred.view(pred.shape[0], pred.shape[1])  # [bs, seq_len]
            # Right-pad with -100 up to maxlen so batches can be concatenated.
            pred = F.pad(input=pred, pad=(0, maxlen-pred.shape[1]), mode='constant', value=-100).cpu().numpy()
            fold_preds.append(pred)
        fold_preds = np.concatenate(fold_preds, axis=0)  # [n, maxlen]
        preds_folds.append(fold_preds)
    preds_folds = np.stack(preds_folds)  # [n_folds, n, maxlen]
    print('preds_folds shape:', preds_folds.shape)
    preds = np.mean(preds_folds, axis=0)  # average the folds
    char_logits = get_char_logits(test_df['pn_history'].values, preds, tokenizer)  # token -> character level
    for j in range(len(test_df)):
        char_logits_blend[j] += w * char_logits[j]  # weighted sum over members

In [None]:
# Threshold the blended character logits at 0 and render each row's spans as a
# "start end;start end" string.
results = my_get_results(char_logits_blend, test_df['pn_history'].values, th=0)
test_df['location'] = results

# Build submission.csv: take the ids from the sample submission and attach the
# predicted locations by id.
sample_sub = pd.read_csv("../input/nbme-score-clinical-patient-notes/sample_submission.csv")
sub = sample_sub[['id']].merge(test_df[['id', "location"]], how="left", on="id")
sub.to_csv('submission.csv', index=False)
sub