In [None]:
!pip install seqeval evaluate -q

In [None]:
import argparse
import gc
import json
import pickle as pkl
import re
from functools import partial
from itertools import chain

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, features
from scipy.special import softmax
from sklearn.model_selection import KFold
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification

## Обучение моделей
В решении были использованы модели https://huggingface.co/microsoft/deberta-v3-large и https://huggingface.co/microsoft/deberta-v3-base. Они обучались для определения большей части меток. Другая часть меток была найдена с помощью регулярных выражений. При обучении моделей был использован взвешенный log loss с весами, пропорциональными частоте меток в обучающем наборе. Обучающий набор был дополнен данными, которые описаны в https://www.kaggle.com/competitions/pii-detection-removal-from-educational-data/discussion/473139, таким образом, чтобы для каждого класса содержалось хотя бы 100 примеров.

Далее показано как обучалась одна модель на одном фолде.

In [None]:
old_data = json.load(open("/kaggle/input/pii-detection-removal-from-educational-data/train.json"))

p = []
n = []

for d in old_data:
    if any(np.array(d["labels"]) != "O"):
        p.append(d)
    else:
        n.append(d)
more_1 = json.load(open("/kaggle/input/more-pii-data-2/data_1.json"))
more_2 = json.load(open("/kaggle/input/more-pii-data-2/data_2.json"))

data = p + more_1[:275] + more_2[:250]
data = data + n[:(3 * len(data)) // 2]
print("sum datapoints: ", len(data))

no_labels = {"B-PHONE_NUM", "I-PHONE_NUM", "B-EMAIL"}
for d in data:
    for i in range(len(d["labels"])):
        if d["labels"][i] in no_labels:
            d["labels"][i] = "O"

In [None]:
all_labels = sorted(list(set(chain(*[x["labels"] for x in data]))))
label2id = {l: i for i,l in enumerate(all_labels)}
id2label = {v: k for k,v in label2id.items()}
target = list(list(label2id.keys())[:-1])

In [None]:
def tokenize(example, tokenizer, label2id):
    text = []

    labels = [] # Метки побуквеннно с пробелами
    targets = [] 

    for t, l, ws in zip(example["tokens"], example["provided_labels"], example["trailing_whitespace"]):

        text.append(t)
        labels.extend([l]*len(t))
        
        if l in target:
            targets.append(1)
        else:
            targets.append(0)
        if ws:
            text.append(" ")
            labels.append("O")


    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=TRAINING_MAX_LENGTH)
    
    target_num = sum(targets)
    labels = np.array(labels)

    text = "".join(text)
    token_labels = []

    for start_idx, end_idx in tokenized.offset_mapping:
        if start_idx == 0 and end_idx == 0: 
            token_labels.append(label2id["O"])
            continue
        if text[start_idx].isspace():
            start_idx += 1

        token_labels.append(label2id[labels[start_idx]])

    length = len(tokenized.input_ids)

    return {
        **tokenized,
        "labels": token_labels,
        "length": length,
        "target_num": target_num,
        "group": 1 if target_num > 0 else 0
    }

In [None]:
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)

ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in data], # Полный текст
    "document": [str(x["document"]) for x in data], # Номер документа
    "tokens": [x["tokens"] for x in data], # Исходные токены 
    "trailing_whitespace": [x["trailing_whitespace"] for x in data], # Есть ли пробел после исходного токена
    "provided_labels": [x["labels"] for x in data], # Исходные метки
})

In [None]:
ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer, "label2id": label2id}, num_proc=1)
ds = ds.class_encode_column("group")
print(ds)

In [None]:
nfolds = 3
seed = 42
fold = 0

ds = ds.shuffle(seed=seed)
train_idxs, test_idxs = list(KFold(n_splits=nfolds, random_state=seed, shuffle=True).split(ds))[fold]
train_ds = Dataset.from_dict(ds[train_idxs])
test_ds = Dataset.from_dict(ds[test_idxs])
old_docs = {d["document"] for d in old_data}
idxs = []
for i, d in enumerate(test_ds):
    if d["document"].isdigit() and int(d["document"]) in old_docs:
        idxs.append(i)

test_ds = Dataset.from_dict(test_ds[idxs])

del ds
gc.collect()

In [None]:
from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

def compute_metrics(p, all_labels):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)
    beta = 5
    f_score = (1 + beta ** 2) * recall * precision / (beta ** 2 * precision + recall + 1e-9)
    
    results = {
        'recall': recall,
        'precision': precision,
        'f_beta': f_score
    }
    return results

In [None]:
model = AutoModelForTokenClassification.from_pretrained(
    TRAINING_MODEL_PATH,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

In [None]:
FREEZE_EMBEDDINGS = True
FREEZE_LAYERS = 7

if FREEZE_EMBEDDINGS:
    print('Freezing embeddings.')
    for param in model.deberta.embeddings.parameters():
        param.requires_grad = False
        
if FREEZE_LAYERS > 0:
    print(f'Freezing {FREEZE_LAYERS} layers.')
    for layer in model.deberta.encoder.layer[:FREEZE_LAYERS]:
        for param in layer.parameters():
            param.requires_grad = False

In [None]:
args = TrainingArguments(
    output_dir=OUTPUT_DIR, 
    fp16=True,
    # optim="adafactor",
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=2e-5,
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    report_to="none",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    gradient_checkpointing_kwargs={'use_reentrant': False},
    save_total_limit=1,
    overwrite_output_dir=True,
    load_best_model_at_end=True,
    lr_scheduler_type='cosine',
    metric_for_best_model="f1",
    greater_is_better=True,
    weight_decay=0.01
)

In [None]:
import torch.nn.functional as F
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss

loss_fn = CrossEntropyLoss(weight=torch.Tensor([7, 5, 7, 7, 7, 7, 5, 7, 7, 1]).to(device='cuda:0'))

def sent_loss(logits, labels):
    l = 1.0
    return F.cross_entropy(logits, labels)
    

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = torch.flatten(inputs.pop("labels"), start_dim=0, end_dim=1)
        outputs = model(**inputs)
        ignore_index = labels.int()

        flat_outputs = outputs.logits.squeeze()[ignore_index != -100]
        flat_labels = labels.squeeze()[ignore_index != -100]
        loss = loss_fn(flat_outputs, flat_labels)

        return (loss, outputs) if return_outputs else loss

In [None]:
trainer = CustomTrainer(
    model=model, 
    args=args, 
    train_dataset=train_ds, 
    eval_dataset=test_ds, 
    data_collator=collator, 
    tokenizer=tokenizer,
    compute_metrics=partial(compute_metrics, all_labels=all_labels),
)

In [None]:
%%time
# Run fine-tuning with the trainer configured above.
trainer.train()

### Значения метрик на кросс-валидации для deberta-v3-large
Метрики были посчитаны для изначального набора данных, без добавленных.

|   | fold 0   | fold 1 | fold 2 | mean |
|-----|----------|----------|----------|---|
| F5    | $0.966247943171328$   |  $0.963711492042711$  | $0.962475756809174$ | $0.9641450640077377$ |

### Значения метрик на кросс-валидации для deberta-v3-base

|   | fold 0   | fold 1 | fold 2 | mean |
|-----|----------|----------|----------|---|
| F5    | $0.956667331074347$   |  $0.954059829059829$  | $0.956017079136094$ | $0.95558141309009$ |

## Посылка решения
Веса для предсказаний моделей были выбраны пропорционально значениям метрик на кросс-валидации. Усреднение меток между моделями было произведено на токенах, требуемых в задаче.

In [None]:
max_len = 3500

def tokenize(example, tokenizer):
    text = []
    token_map = []
    idx = 0
    for t, ws in zip(example["tokens"], example["trailing_whitespace"]):
        text.append(t)
        token_map.extend([idx]*len(t))
        if ws:
            text.append(" ")
            token_map.append(-1)
        idx += 1
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=max_len)
    return {
        **tokenized,
        "token_map": token_map,
    }

In [None]:
data = json.load(open("/kaggle/input/pii-detection-removal-from-educational-data/test.json"))
ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in data],
    "document": [x["document"] for x in data],
    "tokens": [x["tokens"] for x in data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data],
})

model_paths = {
    '/kaggle/input/debertav3-base-cl-fold-0' : 0.956667331074347,
    '/kaggle/input/debertav3-base-cl-fold-1': 0.954059829059829,
    '/kaggle/input/debertav3-base-cl-fold-2' : 0.956017079136094
}

first_model_path = list(model_paths.keys())[0]
tokenizer = AutoTokenizer.from_pretrained(first_model_path)
ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=1)

all_preds = []

total_weight = sum(model_paths.values())

for model_path, weight in model_paths.items():
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)
    
    args = TrainingArguments(
        ".", 
        per_device_eval_batch_size=1, 
        report_to="none",
    )
    
    trainer = Trainer(
        model=model, 
        args=args, 
        data_collator=collator, 
        tokenizer=tokenizer,
    )

    if len(all_preds) == 0:
        all_preds = softmax(trainer.predict(ds).predictions, axis = -1) * weight
    else:
        all_preds += softmax(trainer.predict(ds).predictions, axis = -1) * weight
    
    del model, trainer
    torch.cuda.empty_cache()
    gc.collect()

weighted_average_predictions = all_preds / total_weight
final_predictions = []


In [None]:
for p, token_map, offsets, tokens, doc in zip(weighted_average_predictions, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]):
    pairs = set()
    current_predictions = {}
    for prob, (start_idx, end_idx) in zip(p, offsets):
        if start_idx + end_idx == 0: 
            continue
        if token_map[start_idx] == -1:
            start_idx += 1
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1
        if start_idx >= len(token_map): 
            break
        token_id = token_map[start_idx]
        pair = (doc, token_id)
        if pair in pairs:
            continue
        current_predictions[token_id] = prob
        pairs.add(pair)
    final_predictions.append(current_predictions)

In [None]:
# Release the tokenized dataset and the probability arrays of this pass; the
# per-token results needed later were stored in final_predictions.
del ds, weighted_average_predictions, all_preds
gc.collect()

In [None]:
ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in data],
    "document": [x["document"] for x in data],
    "tokens": [x["tokens"] for x in data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data],
})

del data
gc.collect()

model_paths = {
    '/kaggle/input/debertav3-large-cl-fold-0': 0.966247943171328,
    '/kaggle/input/debertav3-large-cl-fold-1': 0.963711492042711,
    '/kaggle/input/debertav3-large-cl-fold-2': 0.962475756809174
}

first_model_path = list(model_paths.keys())[0]
tokenizer = AutoTokenizer.from_pretrained(first_model_path)
ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=1)

all_preds = []

total_weight = sum(model_paths.values())

for model_path, weight in model_paths.items():
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)
    
    args = TrainingArguments(
        ".", 
        per_device_eval_batch_size=1, 
        report_to="none",
    )
    
    trainer = Trainer(
        model=model, 
        args=args, 
        data_collator=collator, 
        tokenizer=tokenizer,
    )

    if len(all_preds) == 0:
        all_preds = softmax(trainer.predict(ds).predictions, axis =-1) * weight
    else:
        all_preds += softmax(trainer.predict(ds).predictions, axis=-1) * weight
    
    del model, trainer
    torch.cuda.empty_cache()
    gc.collect()

weighted_average_predictions = all_preds / total_weight

In [None]:
idx = 0
w_1 = 0.955581413 / (0.964145064 + 0.955581413)
w_2 = 1.0 - w_1

for p, token_map, offsets, tokens, doc in zip(weighted_average_predictions, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]):
    pairs = set()
    for prob, (start_idx, end_idx) in zip(p, offsets):
        if start_idx + end_idx == 0: 
            continue
        if token_map[start_idx] == -1:
            start_idx += 1
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1
        if start_idx >= len(token_map): 
            break
        token_id = token_map[start_idx]
        pair = (doc, token_id)
        if pair in pairs:
            continue
        if token_id not in final_predictions[idx]:
            final_predictions[idx][token_id] = prob
        else:
            final_predictions[idx][token_id] = w_1 * final_predictions[idx][token_id] + w_2 * prob
        pairs.add(pair)
    idx += 1

In [None]:
# Drop the un-normalised probability accumulator of the second inference pass.
del all_preds
gc.collect()

In [None]:
config = json.load(open("/kaggle/input/debertav3-base-cl-fold-0/config.json"))
id2label = config["id2label"]

In [None]:
document, token, label, token_str = [], [], [], []
url_regex = re.compile("^https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)$")
urls_blacklist = ["wikipedia.org", "coursera.org", "google.com", "cyberleninka", "arxiv.org"]

for p, doc, tokens in zip(final_predictions, ds["document"], ds["tokens"]):
    for k in p:
        
        label_pred = id2label[str(np.argmax(p[k]))]
        if "B-URL" in label_pred:
            f = re.fullmatch(url_regex, tokens[k]) is not None
            for u in urls_blacklist:
                f = f and (u not in tokens[k])
            if not f:
                continue
        if ("B-NAME" in label_pred) and not tokens[k].title() == tokens[k]:
            continue

        if label_pred != "O":
            document.append(doc)
            token.append(k)
            label.append(label_pred)
            token_str.append(tokens[k])

Часть меток была найдена с помощью регулярных выражений.

In [None]:
email_regex = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
phone_num_regex = re.compile(r"(\(\d{3}\)\d{3}\-\d{4}\w*|\d{3}\.\d{3}\.\d{4})\s")

for d in ds:
    for i, t in enumerate(d["tokens"]):
        if re.fullmatch(email_regex, t) is not None:
            document.append(d["document"])
            token.append(i)
            label.append("B-EMAIL")
            token_str.append(t)
    
    matches = phone_num_regex.findall(d["full_text"])
    if matches:
        for match in matches:
            i = 0
            while i < len(d["tokens"]):
                if match.startswith(d["tokens"][i]):
                    cur_nums = [{"document": d["document"], "token": i, "label": "B-PHONE_NUM", "token_str": d["tokens"][i]}]
                    res = d["tokens"][i]
                    i += 1
                    while i < len(d["tokens"]) and d["tokens"][i] in match:
                        cur_nums.append({"document": d["document"], "token": i, "label": "I-PHONE_NUM", "token_str": d["tokens"][i]})
                        res += d["tokens"][i]
                        i += 1
                    if res == match:
                        for n in cur_nums:
                            document.append(n["document"])
                            token.append(n["token"])
                            label.append(n["label"])
                            token_str.append(n["token_str"])
                else:
                    i += 1

In [None]:
df = pd.DataFrame({
    "document": document,
    "token": token,
    "label": label,
    "token_str": token_str
})

df["row_id"] = list(range(len(df)))
display(df.head(100))
df[["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)