In [1]:
# like HF-3 but with multi sample dropout
# NOTE(review): no multi-sample-dropout code is visible in this notebook (the
# model below is a stock AutoModelForMaskedLM) - confirm this note is current.

# Experiment identifier: used for the output directory and for the W&B run
# name and group.
exp_name = "HF-pret-7"
# Extra W&B tags appended to the per-fold tags when each run is initialised.
extra_tags = ['pretraining']

In [2]:
# Export Weights & Biases settings as environment variables for this kernel.
# Keep comments on their own lines: text after a %env value would presumably
# become part of the value.
%env WANDB_PROJECT=fbck
%env WANDB_SILENT=true

env: WANDB_PROJECT=fbck
env: WANDB_SILENT=true


In [3]:
# Flip to True for a quick smoke run: fewer folds, a single epoch, a 2000-row
# sample of the training CSV and a "debug" tag on the W&B runs.
DEBUG = False
# The membership guard keeps this cell idempotent: re-running it must not keep
# appending duplicate "debug" tags to the shared list.
if DEBUG and 'debug' not in extra_tags:
    extra_tags += ['debug']
k_folds = 2 if DEBUG else 5
n_epochs = 1 if DEBUG else 10

In [4]:
# Central configuration for data preparation and training.
# NOTE(review): "padding", "stride", "dropout" and cfg["k_folds"] are not read
# anywhere in the visible cells (the fold loop uses the global `k_folds`) -
# confirm whether they are still needed.
cfg = {
    # worker processes for the datasets `.map` calls
    "num_proc": 2,
    "k_folds": k_folds,
    # truncation length passed to the tokenizer
    "max_length": 2048,
    "padding": False,
    "stride": 0,
    # root folder that contains the `train/<essay_id>.txt` files
    "data_dir": "../input/fbck2021",
    # path prefix of a previously saved tokenized dataset; None => rebuild it
    "load_from_disk": None,
    # forwarded to the collator as `pad_to_multiple_of`
    "pad_multiple": 8,
    "model_name_or_path": "microsoft/deberta-v3-large",
    "dropout": 0.1,
    # passed verbatim to transformers.TrainingArguments
    "trainingargs": {
        # base path; the fold loop appends "-fold<k>" for each fold
        "output_dir": f"../output/{exp_name}",
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 8,
        "per_device_eval_batch_size": 4,
        "learning_rate": 2e-5,
        "weight_decay": 0.01,
        "num_train_epochs": n_epochs,
        "warmup_ratio": 0.1,
        "optim": 'adamw_torch',
        "logging_steps": 50,
        "save_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "report_to": "wandb",
        "group_by_length": True,
        "save_total_limit": 1,
        # best checkpoint = lowest evaluation loss
        "metric_for_best_model": "loss",
        "greater_is_better": False,
        # also used to seed the notebook via set_seed
        "seed": 42,
        "fp16": True,
        "gradient_checkpointing": True,
        "gradient_accumulation_steps": 1,
    }
}

In [5]:
import re
import pickle
import codecs
import warnings
import logging
from functools import partial
from pathlib import Path
from itertools import chain
from text_unidecode import unidecode
from typing import Any, Optional, Tuple

import pandas as pd
from sklearn.model_selection import KFold
from transformers import AutoTokenizer, set_seed

from datasets import Dataset, load_from_disk

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: emit the unencodable slice as UTF-8 bytes.

    Returns the replacement bytes together with the position at which
    encoding should resume (the end of the offending slice).
    """
    offending = error.object[error.start : error.end]
    resume_at = error.end
    return offending.encode("utf-8"), resume_at

def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the undecodable byte slice as cp1252.

    Returns the replacement text together with the position at which
    decoding should resume (the end of the offending slice).
    """
    offending = error.object[error.start : error.end]
    resume_at = error.end
    return offending.decode("cp1252"), resume_at

# Register both handlers under the names that the `errors=` arguments in
# resolve_encodings_and_normalize refer to.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed UTF-8 / cp1252 text, then transliterate it with unidecode.

    The text is round-tripped through bytes; at each step the custom codec
    error handlers registered in this notebook substitute the alternative
    encoding for any slice that fails to encode or decode.
    """
    as_bytes = text.encode("raw_unicode_escape")
    first_pass = as_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    recoded = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = recoded.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)

def read_text_files(example, data_dir):
    """Attach the normalised essay text to a dataset example.

    Parameters
    ----------
    example : dict
        Row that provides ``example["essay_id"]``.
    data_dir : pathlib.Path
        Folder that contains ``train/<essay_id>.txt``.

    Returns
    -------
    dict
        The same ``example`` with a new ``"text"`` entry holding the essay
        after ``resolve_encodings_and_normalize``.
    """
    id_ = example["essay_id"]
    essay_path = data_dir / "train" / f"{id_}.txt"

    # Pin the encoding: without it, open() falls back to the locale's default,
    # so the same file could decode differently from one machine to another.
    with open(essay_path, "r", encoding="utf-8") as fp:
        example["text"] = resolve_encodings_and_normalize(fp.read())

    return example

# Seed via transformers.set_seed with the same seed that is handed to the
# Trainer through cfg["trainingargs"].
set_seed(cfg["trainingargs"]["seed"])

# Silence Python warnings and every log record at WARNING level or below for
# the rest of the notebook.
# NOTE(review): this also hides potentially useful library warnings.
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Folder holding the raw essay text files.
data_dir = Path(cfg["data_dir"])

# Either reload a previously tokenized dataset (plus the pickled grouped
# dataframe), or rebuild the per-discourse dataframe with the essay text
# attached to every row.
if cfg["load_from_disk"]:
    # Normalise the configured path so it always carries the ".dataset" suffix.
    if not cfg["load_from_disk"].endswith(".dataset"):
        cfg["load_from_disk"] += ".dataset"
    ds = load_from_disk(cfg["load_from_disk"])
    
    # The grouped dataframe is stored next to the dataset as "<prefix>_pkl".
    pkl_file = f"{cfg['load_from_disk'][:-len('.dataset')]}_pkl"
    # NOTE(review): pickle.load can execute arbitrary code from the file; only
    # point this at pickles written by this notebook.
    with open(pkl_file, "rb") as fp: 
        grouped = pickle.load(fp)
        
    print("loading from saved files")
else:
    # NOTE(review): this CSV path is hard-coded and not derived from
    # cfg["data_dir"].
    train_df = pd.read_csv("../input/2021_data_for_pseudo_mlm.csv")
    
    # Debug runs work on a random 2000-row sample.
    if DEBUG: train_df = train_df.sample(n=2000)
    
    # One row per essay; the text files are read in parallel worker processes.
    text_ds = Dataset.from_dict({"essay_id": train_df.essay_id.unique()})
    
    text_ds = text_ds.map(
        partial(read_text_files, data_dir=data_dir),
        num_proc=cfg["num_proc"],
        batched=False,
        desc="Loading text files",
    )
    
    text_df = text_ds.to_pandas()
    
    # Apply the same normalisation to the discourse snippets as to the essays,
    # since find_positions later searches for the snippets inside the essay.
    train_df["discourse_text"] = [
        resolve_encodings_and_normalize(x) for x in train_df["discourse_text"]
    ]
    
    # Attach the full essay text to every discourse row.
    train_df = train_df.merge(text_df, on="essay_id", how="left")
    
# Discourse element types. The order matters: it fixes the order in which the
# marker tokens are later added to the tokenizer.
disc_types = [
    "Claim", "Concluding Statement", "Counterclaim", "Evidence",
    "Lead", "Position", "Rebuttal",
]

# Opening / closing marker token for each discourse type.
cls_tokens_map = dict((name, f"[CLS_{name.upper()}]") for name in disc_types)
end_tokens_map = dict((name, f"[END_{name.upper()}]") for name in disc_types)

# Effectiveness label -> class index.
label2id = dict(zip(("Adequate", "Effective", "Ineffective"), range(3)))

# Load the tokenizer and register the discourse marker tokens as additional
# special tokens.
tokenizer = AutoTokenizer.from_pretrained(cfg["model_name_or_path"])
tokenizer.add_special_tokens(
    {"additional_special_tokens": list(cls_tokens_map.values())+list(end_tokens_map.values())}
)

# Look the ids up directly. The previous `tokenizer.encode(tkn)[1]` only worked
# for tokenizers that prepend exactly one special token and emit the marker as
# a single piece right after it.
cls_id_map = {
    label: tokenizer.convert_tokens_to_ids(tkn)
    for label, tkn in cls_tokens_map.items()
}

end_id_map = {
    label: tokenizer.convert_tokens_to_ids(tkn)
    for label, tkn in end_tokens_map.items()
}



Loading text files #0:   0%|                                                | 0/7797 [00:00<?, ?ex/s]
Loading text files #0:   3%|▉                                   | 209/7797 [00:00<00:03, 2089.47ex/s][A
Loading text files #0:   5%|█▉                                  | 419/7797 [00:00<00:03, 2091.19ex/s][A
Loading text files #0:   8%|██▉                                 | 629/7797 [00:00<00:03, 2070.61ex/s][A
Loading text files #0:  11%|███▊                                | 839/7797 [00:00<00:03, 2079.02ex/s][A
Loading text files #0:  13%|████▋                              | 1047/7797 [00:00<00:03, 1901.84ex/s][A
Loading text files #0:  16%|█████▋                             | 1266/7797 [00:00<00:03, 1988.88ex/s][A
Loading text files #0:  19%|██████▋                            | 1497/7797 [00:00<00:03, 2086.84ex/s][A
Loading text files #0:  22%|███████▋                           | 1713/7797 [00:00<00:02, 2106.85ex/s][A
Loading text files #0:  25%|████████▋                     

In [7]:
# Ids of all discourse marker tokens (CLS_* first, then END_*); the MLM
# collator reads this list to force-mask them.
special_tokens = [*set(cls_id_map.values()), *set(end_id_map.values())]

In [8]:
def find_positions(example):
    """Locate every discourse text inside its essay.

    Parameters
    ----------
    example : mapping
        Must provide ``example["text"]`` (a sequence whose first item is the
        essay string) and ``example["discourse_text"]`` (an iterable of
        strings).

    Returns
    -------
    list
        One entry per discourse text, in input order: ``[start, end]``
        character offsets into the essay, or ``[-1]`` when the stripped
        discourse text does not occur in the essay.
    """
    text = example["text"][0]

    # End offset of the most recently located discourse text. Using the END
    # (not the start) is what stops a repeated discourse text from being
    # matched to the same span a second time.
    min_idx = 0

    # stores start and end indexes of discourse_texts
    idxs = []

    for dt in example["discourse_text"]:
        # calling strip is essential (presumably the CSV snippets carry
        # surrounding whitespace that is not present verbatim in the essay)
        matches = list(re.finditer(re.escape(dt.strip()), text))

        if not matches:
            idxs.append([-1])  # will filter out later
            continue

        # Take the first occurrence at or past the previous discourse text;
        # if there is none, fall back to the last occurrence.
        m = next((c for c in matches if c.start() >= min_idx), matches[-1])

        idxs.append([m.start(), m.end()])
        min_idx = m.end()

    return idxs

def tokenize(example):
    """Build the marker-annotated essay string and tokenize it for MLM.

    Each located discourse text is wrapped in its ``[CLS_<TYPE>]`` /
    ``[END_<TYPE>]`` marker tokens; text lying between discourse texts is kept
    as is. The pieces are joined with single spaces and tokenized with
    truncation at ``cfg["max_length"]``.

    No ``labels`` are produced here: for masked-LM pretraining the collator
    derives them from ``input_ids``. The per-token effectiveness labels that
    used to be computed here were never stored, so that work has been removed.

    Relies on the notebook-level ``find_positions``, ``cls_tokens_map``,
    ``end_tokens_map``, ``tokenizer`` and ``cfg``.

    Parameters
    ----------
    example : dict
        Grouped essay row with ``"text"``, ``"discourse_text"`` and
        ``"discourse_type"`` lists. It is mutated: ``example["idxs"]`` is set
        to the output of ``find_positions``.

    Returns
    -------
    The tokenizer output for the annotated essay.
    """
    example["idxs"] = find_positions(example)

    # '\n' -> '|' is a same-length substitution, so the character offsets that
    # were computed on the untouched text remain valid.
    text = example["text"][0].replace('\n', '|')
    chunks = []
    prev = 0

    for idxs, disc_type in zip(example["idxs"], example["discourse_type"]):
        # when the discourse_text wasn't found
        if idxs == [-1]:
            continue

        s, e = idxs

        # text in between discourse_texts
        if s != prev:
            chunks.append(text[prev:s])

        chunks.append(cls_tokens_map[disc_type])
        chunks.append(text[s:e])
        chunks.append(end_tokens_map[disc_type])

        prev = e

    # NOTE(review): text after the last discourse element is not appended -
    # confirm that dropping it is intended.
    tokenized = tokenizer(
        " ".join(chunks),
        padding=False,
        truncation=True,
        max_length=cfg["max_length"],
        add_special_tokens=True,
    )

    return tokenized

In [9]:
# I frequently restart my notebook, so to reduce time
# you can set cfg["load_from_disk"] to just load the tokenized dataset from disk.
# It gets loaded in the data-loading cell above; the check here only
# skips tokenizing. The test mirrors that cell's truthiness check so that an
# empty string cannot skip both branches and leave `ds` undefined.
if not cfg["load_from_disk"]:

    # make lists of discourse_text, discourse_effectiveness
    # for each essay
    grouped = train_df.groupby(["essay_id"]).agg(list)
    # every row of an essay carries the same fold, keep a single value
    grouped['fold'] = [x[0] for x in grouped['fold']]

    ds = Dataset.from_pandas(grouped)

    ds = ds.map(
        tokenize,
        batched=False,
        num_proc=cfg["num_proc"],
        desc="Tokenizing",
    )

    # Persist both artefacts next to each other: "<output_dir>.dataset" and
    # "<output_dir>_pkl" (the names the loading branch expects).
    save_dir = f"{cfg['trainingargs']['output_dir']}"
    ds.save_to_disk(f"{save_dir}.dataset")
    with open(f"{save_dir}_pkl", "wb") as fp:
        pickle.dump(grouped, fp)
    print("Saving dataset to disk:", cfg['trainingargs']['output_dir'])

Tokenizing #0:   6%|██▊                                          | 498/7797 [00:03<00:46, 157.37ex/s]
Tokenizing #0:   7%|██▉                                          | 515/7797 [00:03<00:45, 159.67ex/s][A
Tokenizing #0:   7%|███                                          | 532/7797 [00:03<00:44, 161.64ex/s][A
Tokenizing #0:   7%|███▏                                         | 552/7797 [00:03<00:43, 167.90ex/s][A
Tokenizing #0:   7%|███▎                                         | 570/7797 [00:03<00:42, 170.75ex/s][A
Tokenizing #0:   8%|███▍                                         | 588/7797 [00:03<00:42, 170.03ex/s][A
Tokenizing #0:   8%|███▍                                         | 606/7797 [00:03<00:41, 172.82ex/s][A
Tokenizing #0:   8%|███▌                                         | 624/7797 [00:03<00:41, 172.92ex/s][A
Tokenizing #0:   8%|███▋                                         | 642/7797 [00:03<00:42, 170.34ex/s][A
Tokenizing #0:   8%|███▊                                  

Saving dataset to disk: ../output/HF-pret-7


In [10]:
# Show the dataset schema and row count as a quick sanity check.
ds

Dataset({
    features: ['discourse_id', 'discourse_text', 'discourse_type', 'discourse_effectiveness', 'fold', 'text', 'essay_id', 'idxs', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 15594
})

In [11]:
# bad_matches = []
# cls_ids = set(list(cls_id_map.values()))
# for id_, l, ids, dt in zip(ds["essay_id"], ds["labels"], ds["input_ids"], grouped.discourse_text):
    
#     # count number of labels (ignoring -100)
#     num_cls_label = sum([x!=-100 for x in l])
#     # count number of cls ids
#     num_cls_id = sum([x in cls_ids for x in ids])
#     # true number of discourse_texts
#     num_dt = len(dt)
    
#     if num_cls_label != num_dt or num_cls_id != num_dt:
#         bad_matches.append((id_, l, ids, dt))
        
# print("Num bad matches", len(bad_matches))
# # temp = train_df[train_df["essay_id"]==bad_matches[0][0]]
# # temp_txt = temp.text.values[0]
# # print(temp_txt)
# # print("*"*100)
# # print([x for x in temp.discourse_text if x.strip() not in temp_txt])

In [12]:
# Eyeball the first essay: its discourse texts, the decoded model input
# (with marker tokens) and the normalised essay text.
banner = "*" * 100
first_example = ds[0]

for t in first_example["discourse_text"]:
    print(t, "\n")

print(banner)
print(tokenizer.decode(first_example["input_ids"]))
print(banner)
print(first_example["text"][0])

Some people belive that the so called "face" on mars was created by life on mars. This is not the case. The face on Mars is a naturally occuring land form called a mesa.  

It was not created by aliens, and there is no consiracy to hide alien lifeforms on mars. There is no evidence that NASA has found that even suggests that this face was created by aliens.  

A mesa is a naturally occuring rock formation, that is found on Mars and Earth.  

This "face" on mars only looks like a face because humans tend to see faces wherever we look, humans are obviously extremely social, which is why our brain is designed to recognize faces.  

Many conspiracy theorists believe that NASA is hiding life on Mars from the rest of the world.  

These people would be very wrong. If NASA found life on Mars, then they would get millions of people's attention.  

NASA's budget would increase drasticly, which means that their workers would get paid more. There is no good reason that NASA would hide life on Mar

In [13]:
# add "special_tokens_mask" to dataset .... and remove labels from it...

In [14]:
import random
import warnings
from collections.abc import Mapping
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union

from transformers.data.data_collator import DataCollatorForLanguageModeling

class MyMLMCollator(DataCollatorForLanguageModeling):
    """MLM collator that always selects the discourse marker tokens.

    Reads the notebook-level ``special_tokens`` list of token ids.
    """

    def torch_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.

        Ordinary tokens are selected with probability ``self.mlm_probability``,
        tokenizer special tokens with probability 0, and every id listed in
        ``special_tokens`` with probability 1. Returns ``(inputs, labels)``;
        ``inputs`` is modified in place and ``labels`` is -100 at every
        position that was not selected.
        """
        import torch

        labels = inputs.clone()

        # Per-position selection probability, starting from the MLM rate.
        select_prob = torch.full(labels.shape, self.mlm_probability)

        if special_tokens_mask is not None:
            special_tokens_mask = special_tokens_mask.bool()
        else:
            per_row_masks = [
                self.tokenizer.get_special_tokens_mask(row, already_has_special_tokens=True)
                for row in labels.tolist()
            ]
            special_tokens_mask = torch.tensor(per_row_masks, dtype=torch.bool)

        # Tokenizer special tokens are never selected ...
        select_prob.masked_fill_(special_tokens_mask, value=0.0)

        # ... except the discourse markers, which are always selected. This
        # must run after the fill above, which zeroes them as well.
        for forced_id in special_tokens:
            select_prob = torch.where(labels == forced_id, 1., select_prob)

        selected = torch.bernoulli(select_prob).bool()
        # We only compute loss on masked tokens
        labels[~selected] = -100

        # 80% of the selected positions become the mask token
        to_mask = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & selected
        inputs[to_mask] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # half of the remainder (10% overall) become a random token
        to_randomize = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & selected & ~to_mask
        random_ids = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[to_randomize] = random_ids[to_randomize]

        # the final 10% keep their original token
        return inputs, labels

In [15]:
import gc
import torch
from transformers import Trainer, TrainingArguments, AutoConfig, AutoModelForTokenClassification, DataCollatorForTokenClassification
from torch.utils.checkpoint import checkpoint
import wandb
from transformers import AutoModelForMaskedLM


# One TrainingArguments object is shared by all folds; only its output_dir is
# rewritten per fold.
args = TrainingArguments(**cfg["trainingargs"])

# if using longformer pad to multiple of 512
# for others pad to multiple of 8

collator = MyMLMCollator(
    tokenizer=tokenizer, pad_to_multiple_of=cfg["pad_multiple"]
)

output = args.output_dir
for fold in range(k_folds):

    args.output_dir = f"{output}-fold{fold}"

    model_config = AutoConfig.from_pretrained(
        cfg["model_name_or_path"],
    )
    model_config.update(
        {
            "cls_tokens": list(cls_id_map.values()),
        }
    )

    model = AutoModelForMaskedLM.from_pretrained(cfg["model_name_or_path"], config=model_config)

    # need to resize embeddings because of added tokens
    model.resize_token_embeddings(len(tokenizer))

    # split dataset to train and eval: rows whose fold equals the current fold
    # are evaluated on, all other rows are trained on
    keep_cols = {"input_ids", "attention_mask"}
    drop_cols = [c for c in ds.column_names if c not in keep_cols]
    train_dataset = ds.filter(lambda example: example["fold"] != fold).remove_columns(drop_cols)
    eval_dataset = ds.filter(lambda example: example["fold"] == fold).remove_columns(drop_cols)

    print(len(train_dataset), len(eval_dataset))

    wandb.init(project="fbck",
           name=f"{exp_name}_fold_{fold}",
           tags=["HF", f"fold_{fold}"]+extra_tags,
           group=f"{exp_name}")

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=collator,
    )

    # Close the W&B run even when training raises, so a failed fold does not
    # leave a dangling run that the next wandb.init would collide with.
    try:
        trainer.train()
    finally:
        wandb.finish()

    # The trainer holds its own reference to the model (and the optimizer
    # state); deleting only `model` would free nothing before the next fold.
    del trainer, model
    gc.collect()
    torch.cuda.empty_cache()

100%|████████████████████████████████████████████████████████████████| 16/16 [00:09<00:00,  1.63ba/s]
100%|████████████████████████████████████████████████████████████████| 16/16 [00:09<00:00,  1.62ba/s]


14756 838


Epoch,Training Loss,Validation Loss
1,2.3,2.311811
2,1.6517,1.707427
3,1.4758,1.559386
4,1.3861,1.459124
5,1.3252,1.390096
6,1.3072,1.331887
7,1.2148,1.302294
8,1.1409,1.293744
9,1.1831,1.268446
10,1.1974,1.272851


100%|████████████████████████████████████████████████████████████████| 16/16 [00:09<00:00,  1.65ba/s]
100%|████████████████████████████████████████████████████████████████| 16/16 [00:09<00:00,  1.66ba/s]

14760 834





Epoch,Training Loss,Validation Loss
1,2.2327,2.242669
2,1.678,1.709234
3,1.4684,1.542061
4,1.3705,1.461067
5,1.2998,1.38401
6,1.2599,1.341524
7,1.2172,1.287704
8,1.182,1.26495
9,1.1357,1.26187
10,1.1527,1.248705


100%|████████████████████████████████████████████████████████████████| 16/16 [00:09<00:00,  1.66ba/s]
100%|████████████████████████████████████████████████████████████████| 16/16 [00:09<00:00,  1.67ba/s]

14751 843





Epoch,Training Loss,Validation Loss
1,2.2458,2.29516
2,1.6819,1.719467
3,1.4919,1.566225
4,1.3479,1.443473
5,1.3005,1.400023
6,1.2589,1.341576
7,1.1891,1.311184
8,1.1625,1.286844
9,1.1289,1.274059
10,1.1372,1.248969


100%|████████████████████████████████████████████████████████████████| 16/16 [00:09<00:00,  1.66ba/s]
100%|████████████████████████████████████████████████████████████████| 16/16 [00:09<00:00,  1.66ba/s]

14756 838





Epoch,Training Loss,Validation Loss
1,2.299,2.287766
2,1.6688,1.734881
3,1.5051,1.565792
4,1.378,1.468693
5,1.3204,1.398813
6,1.2644,1.367329
7,1.2152,1.308802
8,1.1654,1.295953
9,1.1548,1.291261
10,1.1431,1.286047


100%|████████████████████████████████████████████████████████████████| 16/16 [00:09<00:00,  1.67ba/s]
100%|████████████████████████████████████████████████████████████████| 16/16 [00:09<00:00,  1.67ba/s]

14756 838





Epoch,Training Loss,Validation Loss
1,2.2756,2.296321
2,1.6372,1.724932
3,1.4838,1.556997
4,1.3933,1.461208
5,1.2752,1.373111
6,1.2372,1.341725
7,1.2059,1.315659
8,1.2262,1.2874
9,1.1671,1.251884
10,1.1496,1.262574


In [16]:
# !rm -rf ../output/HF-pret-1-fold0/

In [23]:
import json


def _checkpoint_step(path):
    """Return the numeric step of a ``checkpoint-<step>`` folder, or -1."""
    suffix = path.name.rsplit("-", 1)[-1]
    return int(suffix) if suffix.isdigit() else -1


# Best eval metric and best checkpoint path per fold, read from the
# trainer_state.json of each fold's most recent checkpoint.
best_metrics = []
best_checkpoints = []

for fold in range(k_folds):
    folder = Path(f"../output/{exp_name}-fold{fold}")
    candidates = list(folder.glob("checkpoint*"))
    if not candidates:
        raise FileNotFoundError(f"no checkpoint folder found in {folder}")
    # Compare by numeric step: a plain string sort would rank
    # "checkpoint-9000" after "checkpoint-18450".
    checkpoint = max(candidates, key=lambda p: (_checkpoint_step(p), p.name))
    with open(checkpoint/"trainer_state.json", "r") as fp:
        data = json.load(fp)
        best_metrics.append(data["best_metric"])
        best_checkpoints.append(data["best_model_checkpoint"])

print(best_metrics)
average = sum(best_metrics)/len(best_metrics)
average

[1.2684463262557983, 1.2487046718597412, 1.2489689588546753, 1.2860469818115234, 1.2518844604492188]


1.2608102798461913

In [24]:
# Show the best checkpoint path recorded for each fold.
best_checkpoints

['../output/HF-pret-7-fold0/checkpoint-16605',
 '../output/HF-pret-7-fold1/checkpoint-18450',
 '../output/HF-pret-7-fold2/checkpoint-18440',
 '../output/HF-pret-7-fold3/checkpoint-18450',
 '../output/HF-pret-7-fold4/checkpoint-16605']

In [18]:
# for fold in range(5):
#     folder = best_checkpoints[fold]
#     !~/gdrive upload {folder}/pytorch_model.bin --name pytorch_model_{fold}.bin