In [1]:
exp_name = 'HF-43'
import json
from pathlib import Path


def _checkpoint_step(path):
    """Return the step number encoded in a ``checkpoint-<step>`` directory name.

    Names that do not end in ``-<digits>`` get -1, so they sort before every
    numbered checkpoint.
    """
    suffix = path.name.rsplit("-", 1)[-1]
    return int(suffix) if suffix.isdigit() else -1


# Per fold: the best metric and the best checkpoint path, both read from the
# trainer_state.json of that fold's highest-step checkpoint directory.
best_metrics = []
best_checkpoints = []

for fold in range(5):
    folder = Path(f"../output/{exp_name}-fold{fold}")
    # Order by step number rather than by name: as strings, "checkpoint-950"
    # sorts after "checkpoint-2100", which would pick an older trainer state.
    checkpoints = sorted(folder.glob("checkpoint*"), key=_checkpoint_step)
    if not checkpoints:
        raise FileNotFoundError(f"no checkpoint directories found in {folder}")
    checkpoint = checkpoints[-1]
    with open(checkpoint/"trainer_state.json", "r") as fp:
        data = json.load(fp)
    best_metrics.append(data["best_metric"])
    best_checkpoints.append(data["best_model_checkpoint"])

print(best_metrics)
# Mean of the per-fold best metrics, displayed as the cell result
average = sum(best_metrics)/len(best_metrics)
average

[0.5772432088851929, 0.5771093368530273, 0.5714321732521057, 0.5780883431434631, 0.5812585949897766]


0.5770263314247132

In [2]:
# Best checkpoint path of each fold, as read from trainer_state.json above
best_checkpoints

['../output/HF-43-fold0/checkpoint-2100',
 '../output/HF-43-fold1/checkpoint-1950',
 '../output/HF-43-fold2/checkpoint-2050',
 '../output/HF-43-fold3/checkpoint-1800',
 '../output/HF-43-fold4/checkpoint-2000']

In [3]:
# When True, the data-loading cell works on a 100-row sample of the CSV.
DEBUG = False

# Keyword arguments unpacked into transformers.TrainingArguments later on.
trainer_kwargs = dict(
    output_dir=f"../output/{exp_name}",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    learning_rate=1.2e-5,
    # label_smoothing_factor=0.05,
    weight_decay=0.01,
    num_train_epochs=2.2,
    warmup_ratio=0.1,
    optim='adamw_torch',
    logging_steps=25,
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    eval_delay=600,
    report_to="wandb",
    group_by_length=True,
    save_total_limit=1,
    metric_for_best_model="loss",
    greater_is_better=False,
    seed=42,
    fp16=True,
    gradient_checkpointing=True,
    gradient_accumulation_steps=1,
)

# Notebook-wide settings; the nested "trainingargs" entry is the dict above.
cfg = dict(
    num_proc=2,
    aug_prob=0.05,
    k_folds=5,
    max_length=2048,
    padding=False,
    stride=0,
    data_dir="../input/fbck2021",
    load_from_disk=None,
    pad_multiple=8,
    model_name_or_path="../output/HF-43-fold0/checkpoint-2100/",
    dropout=0.0,
    trainingargs=trainer_kwargs,
)

In [4]:
import re
import pickle
import codecs
import warnings
import logging
from functools import partial
from pathlib import Path
from itertools import chain
from text_unidecode import unidecode
from typing import Any, Optional, Tuple

import pandas as pd
from sklearn.model_selection import KFold
from transformers import AutoTokenizer, set_seed

from datasets import Dataset, load_from_disk

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler that substitutes the UTF-8 bytes of the failing slice.

    Returns the replacement bytes together with ``error.end``, the position at
    which the codec resumes.
    """
    failing_text = error.object[error.start : error.end]
    return failing_text.encode("utf-8"), error.end

def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler that decodes the failing byte slice as cp1252.

    Returns the replacement text together with ``error.end``, the position at
    which the codec resumes.
    """
    failing_bytes = error.object[error.start : error.end]
    return failing_bytes.decode("cp1252"), error.end

# Make both handlers available by name to str.encode / bytes.decode(errors=...).
for _handler_name, _handler in (
    ("replace_encoding_with_utf8", replace_encoding_with_utf8),
    ("replace_decoding_with_cp1252", replace_decoding_with_cp1252),
):
    codecs.register_error(_handler_name, _handler)

def resolve_encodings_and_normalize(text: str) -> str:
    """Round-trip ``text`` through UTF-8 / cp1252 and pass it to ``unidecode``.

    The steps, in order: encode with ``raw_unicode_escape``, decode as UTF-8,
    encode as cp1252, decode as UTF-8 again. Each step falls back to the
    custom error handler named in its ``errors=`` argument. The repaired
    string is then handed to ``unidecode`` and the result returned.
    """
    escaped_bytes = text.encode("raw_unicode_escape")
    first_pass = escaped_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)

def read_text_files(example, data_dir):
    """Attach the normalized essay text to ``example`` under the key "text".

    The essay is read from ``data_dir / "train" / "<essay_id>.txt"``, where
    ``essay_id`` comes from ``example["essay_id"]``. ``data_dir`` has to
    support the ``/`` path operator. The same ``example`` object is returned.
    """
    essay_path = data_dir / "train" / f"{example['essay_id']}.txt"

    with open(essay_path, "r") as handle:
        raw_text = handle.read()

    example["text"] = resolve_encodings_and_normalize(raw_text)
    return example

# Seed with the same value that is passed to the training arguments.
seed = cfg["trainingargs"]["seed"]
set_seed(seed)

# Silence Python warnings and every log record at WARNING level or below.
warnings.simplefilter(action='ignore')
logging.disable(level=logging.WARNING)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Root of the input data; essay files are read from data_dir / "train".
data_dir = Path(cfg["data_dir"])

# Two ways to get the data:
#   * cfg["load_from_disk"] set  -> reload a previously saved tokenized dataset (`ds`)
#     and its pickled per-essay frame (`grouped`);
#   * otherwise                  -> build `train_df` from the CSV and the essay files.
if cfg["load_from_disk"]:
    # Accept the path with or without the ".dataset" suffix.
    if not cfg["load_from_disk"].endswith(".dataset"):
        cfg["load_from_disk"] += ".dataset"
    ds = load_from_disk(cfg["load_from_disk"])
    
    # The pickle sits next to the dataset: "<path without .dataset>_pkl".
    pkl_file = f"{cfg['load_from_disk'][:-len('.dataset')]}_pkl"
    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # files written by this notebook.
    with open(pkl_file, "rb") as fp: 
        grouped = pickle.load(fp)
        
    print("loading from saved files")
else:
    train_df = pd.read_csv("../input/2021_data_for_pseudo_mlm.csv")
    
    # Drop two specific discourse ids (one compared as a string, one as a number).
    # NOTE(review): the reason for excluding them is not visible in this file.
    train_df = train_df[train_df.discourse_id != '56744a66949a'].reset_index(drop=True)
    train_df = train_df[train_df.discourse_id != 1623258656795].reset_index(drop=True)
    
    # Debug runs work on a 100-row random sample.
    if DEBUG: train_df = train_df.sample(n=100).reset_index(drop=True)
    
    # One row per unique essay, so each essay file is read only once.
    text_ds = Dataset.from_dict({"essay_id": train_df.essay_id.unique()})
    
    text_ds = text_ds.map(
        partial(read_text_files, data_dir=data_dir),
        num_proc=cfg["num_proc"],
        batched=False,
        desc="Loading text files",
    )
    
    text_df = text_ds.to_pandas()
    
    # Apply the same normalization to the discourse texts as to the essay text,
    # since the discourse texts are later searched for inside the essay text.
    train_df["discourse_text"] = [
        resolve_encodings_and_normalize(x) for x in train_df["discourse_text"]
    ]
    
    # Attach the essay text to every discourse row of that essay.
    train_df = train_df.merge(text_df, on="essay_id", how="left")
    
def _marker(prefix, disc_type):
    """Build a marker token such as "[CLS_CLAIM]" from a prefix and a discourse type."""
    return f"[{prefix}_{disc_type.upper()}]"


# Discourse types; their order here is the key order of the two maps below.
disc_types = [
    "Claim",
    "Concluding Statement",
    "Counterclaim",
    "Evidence",
    "Lead",
    "Position",
    "Rebuttal",
]

# Discourse type -> token placed in front of / behind a discourse span.
cls_tokens_map = {disc_type: _marker("CLS", disc_type) for disc_type in disc_types}
end_tokens_map = {disc_type: _marker("END", disc_type) for disc_type in disc_types}

# Effectiveness class name -> integer class id.
label2id = {
    name: class_id
    for class_id, name in enumerate(("Adequate", "Effective", "Ineffective"))
}

tokenizer = AutoTokenizer.from_pretrained(cfg["model_name_or_path"])
# Register every [CLS_*] and [END_*] marker as a special token.
tokenizer.add_special_tokens(
    {"additional_special_tokens": list(cls_tokens_map.values())+list(end_tokens_map.values())}
)

# Discourse type -> vocabulary id of its [CLS_*] marker.
# Look the id up directly instead of taking position 1 of an encoded sequence:
# that position is only the marker when the tokenizer prepends exactly one
# special token.
cls_id_map = {
    label: tokenizer.convert_tokens_to_ids(tkn)
    for label, tkn in cls_tokens_map.items()
}

# Every marker must have its own id; a clash or an unknown-token id would make
# the label alignment in `tokenize` silently wrong.
assert len(set(cls_id_map.values())) == len(cls_id_map), "marker tokens share an id"
assert tokenizer.unk_token_id not in cls_id_map.values(), "a marker token maps to the unknown token"

Loading text files #0:   0%|                                                  | 0/7797 [00:00<?, ?ex/s]
Loading text files #0:   3%|█                                     | 209/7797 [00:00<00:03, 2085.83ex/s][A
Loading text files #0:   5%|██                                    | 419/7797 [00:00<00:03, 2094.06ex/s][A
Loading text files #0:   8%|███                                   | 637/7797 [00:00<00:03, 2124.58ex/s][A
Loading text files #0:  11%|████▏                                 | 850/7797 [00:00<00:03, 2096.62ex/s][A
Loading text files #0:  14%|█████                                | 1060/7797 [00:00<00:03, 1907.59ex/s][A
Loading text files #0:  16%|██████                               | 1280/7797 [00:00<00:03, 1999.89ex/s][A
Loading text files #0:  19%|███████▏                             | 1511/7797 [00:00<00:02, 2095.60ex/s][A
Loading text files #0:  22%|████████▏                            | 1723/7797 [00:00<00:02, 2095.15ex/s][A
Loading text files #0:  25%|█████████▏  

In [6]:
def find_positions(example):
    """Locate each discourse text inside its essay.

    ``example["text"][0]`` is the essay and ``example["discourse_text"]`` the
    discourse texts to find. Returns one entry per discourse text, in order:
    ``[start, end]`` character offsets of the chosen match, or ``[-1]`` when
    the stripped discourse text does not occur in the essay.
    """
    essay = example["text"][0]

    # Start offset of the most recently located discourse text.
    # NOTE(review): this is the previous match's start, not its end, so an
    # identical discourse text that follows resolves to the same occurrence.
    # Kept as-is; confirm whether that is intended.
    search_from = 0

    spans = []

    for discourse in example["discourse_text"]:
        # The discourse text is stripped before searching.
        pattern = re.escape(discourse.strip())
        hits = [hit for hit in re.finditer(pattern, essay)]

        if not hits:
            spans.append([-1])  # filtered out by the caller
            continue

        # First occurrence at or after `search_from`; if there is none, the
        # last occurrence is used (a single occurrence is always taken).
        chosen = next((hit for hit in hits if hit.start() >= search_from), hits[-1])

        spans.append([chosen.start(), chosen.end()])
        search_from = chosen.start()

    return spans

def tokenize(example):
    """Tokenize one essay with marker tokens around every located discourse text.

    Stores the spans from ``find_positions`` in ``example["idxs"]``, wraps each
    located discourse text in its [CLS_*] / [END_*] tokens, joins all pieces
    with single spaces and tokenizes the result (truncated to
    ``cfg["max_length"]``). The returned tokenizer output gains a "labels"
    list as long as ``input_ids``: the effectiveness class id at every [CLS_*]
    token and -100 everywhere else.
    """
    example["idxs"] = find_positions(example)

    # Newlines become "|", a one-for-one swap, so the character offsets
    # computed on the original text are still valid.
    essay = example["text"][0].replace('\n', '|')

    pieces = []
    span_labels = []
    cursor = 0  # end offset of the previously emitted discourse text

    rows = zip(
        example["idxs"],
        example["discourse_type"],
        example["discourse_effectiveness"],
    )
    for span, disc_type, disc_effect in rows:
        # discourse texts that were not found are skipped entirely
        if span == [-1]:
            continue

        start, end = span

        # text lying between the previous discourse text and this one
        if start != cursor:
            pieces.append(essay[cursor:start])

        pieces += [
            cls_tokens_map[disc_type],
            essay[start:end],
            end_tokens_map[disc_type],
        ]
        cursor = end

        span_labels.append(label2id[disc_effect])

    tokenized = tokenizer(
        " ".join(pieces),
        padding=False,
        truncation=True,
        max_length=cfg["max_length"],
        add_special_tokens=True,
    )

    # Expand the per-discourse labels to one label per token: the n-th [CLS_*]
    # token receives the n-th label, every other token gets -100 so that the
    # loss function ignores it.
    marker_ids = cls_id_map.values()
    final_labels = []
    used = 0
    for token_id in tokenized["input_ids"]:
        if token_id not in marker_ids:
            final_labels.append(-100)
            continue
        final_labels.append(span_labels[used])
        used += 1

    tokenized["labels"] = final_labels

    return tokenized

In [7]:
# I frequently restart my notebook, so to reduce time
# you can set cfg["load_from_disk"] to just load the tokenized dataset from disk.
# The cell that reads the data loads it; the check here only skips tokenizing.
# The test is a truthiness check, matching the one in that cell, so that any
# falsy value (None or "") sends both cells down the "build from CSV" path.
if not cfg["load_from_disk"]:

    # make lists of discourse_text, discourse_effectiveness
    # for each essay
    grouped = train_df.groupby(["essay_id"]).agg(list)

    ds = Dataset.from_pandas(grouped)

    ds = ds.map(
        tokenize,
        batched=False,
        num_proc=cfg["num_proc"],
        desc="Tokenizing",
    )

    # Written as "<output_dir>.dataset" and "<output_dir>_pkl", the names the
    # loading branch expects.
    save_dir = f"{cfg['trainingargs']['output_dir']}"
    ds.save_to_disk(f"{save_dir}.dataset")
    with open(f"{save_dir}_pkl", "wb") as fp:
        pickle.dump(grouped, fp)
    print("Saving dataset to disk:", cfg['trainingargs']['output_dir'])


Tokenizing #0:   6%|███                                            | 501/7797 [00:03<00:44, 162.16ex/s]
Tokenizing #0:   7%|███                                            | 518/7797 [00:03<00:44, 163.96ex/s][A
Tokenizing #0:   7%|███▏                                           | 535/7797 [00:03<00:43, 165.58ex/s][A
Tokenizing #0:   7%|███▎                                           | 554/7797 [00:03<00:42, 171.85ex/s][A
Tokenizing #0:   7%|███▍                                           | 572/7797 [00:03<00:42, 171.96ex/s][A
Tokenizing #0:   8%|███▌                                           | 590/7797 [00:03<00:41, 172.89ex/s][A
Tokenizing #0:   8%|███▋                                           | 609/7797 [00:03<00:40, 175.73ex/s][A
Tokenizing #0:   8%|███▊                                           | 627/7797 [00:03<00:41, 172.42ex/s][A
Tokenizing #0:   8%|███▉                                           | 645/7797 [00:03<00:42, 167.66ex/s][A
Tokenizing #0:   9%|███▉                

Saving dataset to disk: ../output/HF-43


In [8]:
# Essays whose label count or [CLS_*] token count differs from their number
# of discourse texts.
bad_matches = []
cls_ids = set(cls_id_map.values())

essay_rows = zip(ds["discourse_id"], ds["essay_id"], ds["labels"], ds["input_ids"], grouped.discourse_text)
for disc_ids, essay_id, label_seq, token_ids, disc_texts in essay_rows:

    # true number of discourse texts for this essay
    expected = len(disc_texts)
    # labels that are not the ignore value -100
    labelled = sum(1 for lab in label_seq if lab != -100)
    # tokens that are one of the [CLS_*] markers
    markers = sum(1 for tok in token_ids if tok in cls_ids)

    if not (labelled == expected and markers == expected):
        bad_matches.append((disc_ids, essay_id, label_seq, token_ids, disc_texts))

print("Num bad matches", len(bad_matches))

Num bad matches 0


In [9]:
# Display the dataset summary (feature names and row count)
ds

Dataset({
    features: ['discourse_id', 'discourse_text', 'discourse_type', 'discourse_effectiveness', 'fold', 'text', 'essay_id', 'idxs', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 15594
})

In [10]:
# Columns kept for inspection; everything else is removed before the
# conversion to pandas.
keep_df = {"discourse_id", "essay_id", "discourse_text", "discourse_type", "discourse_effectiveness", "labels", "fold"}
drop_cols = [col for col in ds.column_names if col not in keep_df]
test_df = ds.remove_columns(drop_cols).to_pandas()
test_df.head()

Unnamed: 0,discourse_id,discourse_text,discourse_type,discourse_effectiveness,fold,essay_id,labels
0,"[1617734767734.0, 1617734782429.0, 16177348077...","[Some people belive that the so called ""face"" ...","[Position, Evidence, Evidence, Claim, Counterc...","[Adequate, Adequate, Adequate, Adequate, Adequ...","[-1, -1, -1, -1, -1, -1, -1, -1]",0000D23A521A,"[-100, 0, -100, -100, -100, -100, -100, -100, ..."
1,"[1621104238021.0, 1621104245981.0, 16211043488...",[Driverless cars are exaclty what you would ex...,"[Lead, Position, Claim, Evidence, Claim, Evide...","[Adequate, Adequate, Adequate, Adequate, Adequ...","[2, 2, 2, 2, 2, 2, 2, 2, 2]",00066EA9880D,"[-100, 0, -100, -100, -100, -100, -100, -100, ..."
2,"[1617296637311.0, 1617296650644.0, 16172966674...","[I am arguing against the policy change , even...","[Position, Counterclaim, Rebuttal, Evidence, C...","[Adequate, Adequate, Adequate, Adequate, Adequ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]",000E6DE9E817,"[-100, -100, -100, -100, -100, -100, 0, -100, ..."
3,"[1622844028582.0, 1622844050451.0, 16228440600...",[Would you be able to give your car up? Having...,"[Lead, Evidence, Claim, Claim, Evidence, Claim...","[Adequate, Adequate, Adequate, Adequate, Adequ...","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",001552828BD0,"[-100, 0, -100, -100, -100, -100, -100, -100, ..."
4,"[1621080957958.0, 1621081369014.0, 16210813821...",[I think that students would benefit from lear...,"[Position, Claim, Claim, Claim, Claim, Evidenc...","[Adequate, Adequate, Adequate, Adequate, Adequ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]",0016926B079C,"[-100, 0, -100, -100, -100, -100, -100, -100, ..."


In [11]:
import gc
import torch
from torch.utils.checkpoint import checkpoint
import numpy as np
from tqdm import tqdm
from transformers import Trainer, TrainingArguments, AutoConfig, AutoModelForTokenClassification, DataCollatorForTokenClassification
import sklearn

args = TrainingArguments(**cfg["trainingargs"])

# if using longformer pad to multiple of 512
# for others pad to multiple of 8

collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, pad_to_multiple_of=cfg["pad_multiple"], padding=True
)

# Base output directory; each fold uses "<output>-fold<k>".
output = args.output_dir

# One prediction frame per fold, in fold order.
fold_dfs = []

# Every fold's model scores the same, complete dataset, so the model inputs
# are prepared once instead of once per fold.
keep_cols = {"input_ids", "attention_mask", "labels"}
eval_dataset = ds.remove_columns([c for c in ds.column_names if c not in keep_cols])
full_eval = ds
assert(len(eval_dataset) == len(full_eval))

for fold in range(cfg["k_folds"]):
    
    args.output_dir = f"{output}-fold{fold}"
    
    model_config = AutoConfig.from_pretrained(
        cfg["model_name_or_path"],
    )
    model_config.update(
        {
            "num_labels": 3,
            "cls_tokens": list(cls_id_map.values()),
            "label2id": label2id,
            "id2label": {v:k for k, v in label2id.items()},
        }
    )
    
    model = AutoModelForTokenClassification.from_pretrained(cfg["model_name_or_path"], config=model_config)
    
    # need to resize embeddings because of added tokens
    model.resize_token_embeddings(len(tokenizer))
    
    # Replace the weights with those of this fold's best checkpoint.
    PATH = f'{best_checkpoints[fold]}/pytorch_model.bin'
    
    # map_location="cpu" stops torch.load from materialising the checkpoint
    # tensors on the device they were saved from.
    model.load_state_dict(torch.load(PATH, map_location="cpu"))
    
    trainer = Trainer(
        model=model,
        args=args,
        tokenizer=tokenizer,
        data_collator=collator,
    )
    
    preds = trainer.predict(eval_dataset)
    preds_torch = torch.tensor(preds.predictions, dtype=torch.float32)
    
    all_preds = []
    all_logits = []
    all_labels = []

    # Keep only the positions that carry a real label (i.e. not -100); the
    # predictions are cut to the row's own length before masking.
    for i in tqdm(range(len(eval_dataset))):
        row_labels = np.array(eval_dataset[i]['labels'])
        indices = row_labels != -100
        mylabls = torch.tensor(row_labels)[indices]
        mylogits = preds_torch[i][:len(indices),:][indices]
        mypreds = torch.nn.functional.softmax(mylogits, dim=-1)
        all_preds.append(mypreds)
        all_logits.append(mylogits)
        all_labels.append(mylabls)

    all_preds = torch.cat(all_preds, dim=0).numpy()
    all_logits = torch.cat(all_logits, dim=0).numpy()
    all_labels = torch.cat(all_labels, dim=0).numpy()

    # One row per labelled discourse element. The per-essay lists are
    # flattened in dataset order, the same order the predictions are in.
    df = pd.DataFrame()
    df['discourse_id'] = [x for z in full_eval['discourse_id'] for x in z]
    # softmax probabilities, indexed by class id (see label2id)
    df['preds'] = [x for x in all_preds]
    # raw logits, one column per class; column index follows label2id
    df['Ineffective'] = all_logits[:,2]
    df['Adequate'] = all_logits[:,0]
    df['Effective'] = all_logits[:,1]
    df['labels'] = all_labels
    df['discourse_type'] = [x for z in full_eval['discourse_type'] for x in z]
    df['discourse_effectiveness'] = [x for z in full_eval['discourse_effectiveness'] for x in z]
    df['discourse_text'] = [x for z in full_eval['discourse_text'] for x in z]
    # per-row log loss of the probabilities against the row's label
    df['loss'] = [sklearn.metrics.log_loss(np.expand_dims(np.array(x), 0), np.expand_dims(y, 0), labels=[0,1,2]) for x,y in zip(df.labels.values, np.stack(df.preds.values))]

    fold_dfs.append(df)
    
    # The Trainer holds its own reference to the model, so both names have to
    # go before the memory can be reclaimed for the next fold.
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()

100%|███████████████████████████████████████████████████████████| 15594/15594 [00:21<00:00, 740.30it/s]


100%|███████████████████████████████████████████████████████████| 15594/15594 [00:20<00:00, 747.65it/s]


100%|███████████████████████████████████████████████████████████| 15594/15594 [00:21<00:00, 735.05it/s]
100%|███████████████████████████████████████████████████████████| 15594/15594 [00:21<00:00, 740.03it/s]


100%|███████████████████████████████████████████████████████████| 15594/15594 [00:21<00:00, 735.25it/s]


In [17]:
# Row count of each fold's prediction frame
list(map(len, fold_dfs))

[144292, 144292, 144292, 144292, 144292]

In [18]:
# Prediction columns of a fold frame: three logit columns and the
# probability-vector column.
colsBmod = ['Ineffective', 'Adequate', 'Effective', 'preds']
fold = 0
# Preview of the experiment- and fold-prefixed names for fold 0
fold_prefix = f'{exp_name}_fold{fold}_'
colsAmod = [fold_prefix + col for col in colsBmod]
colsAmod

['HF-43_fold0_Ineffective',
 'HF-43_fold0_Adequate',
 'HF-43_fold0_Effective',
 'HF-43_fold0_preds']

In [14]:
# Start from fold 0's frame and drop its un-prefixed prediction columns; the
# remaining columns (ids, labels, type, text, and fold 0's loss) are kept.
pseudo = fold_dfs[0].copy()
for c in colsBmod: del pseudo[c]
# Add every fold's prediction columns under "<exp_name>_fold<k>_<column>".
# Iterating over fold_dfs itself keeps this in step with however many folds
# were actually scored, instead of a hard-coded count.
for fold, fold_df in enumerate(fold_dfs):
    colsAmod = [f'{exp_name}_fold{fold}_{x}' for x in colsBmod]
    pseudo[colsAmod] = fold_df[colsBmod]
pseudo.columns

Index(['discourse_id', 'labels', 'discourse_type', 'discourse_effectiveness',
       'discourse_text', 'loss', 'HF-43_fold0_Ineffective',
       'HF-43_fold0_Adequate', 'HF-43_fold0_Effective', 'HF-43_fold0_preds',
       'HF-43_fold1_Ineffective', 'HF-43_fold1_Adequate',
       'HF-43_fold1_Effective', 'HF-43_fold1_preds', 'HF-43_fold2_Ineffective',
       'HF-43_fold2_Adequate', 'HF-43_fold2_Effective', 'HF-43_fold2_preds',
       'HF-43_fold3_Ineffective', 'HF-43_fold3_Adequate',
       'HF-43_fold3_Effective', 'HF-43_fold3_preds', 'HF-43_fold4_Ineffective',
       'HF-43_fold4_Adequate', 'HF-43_fold4_Effective', 'HF-43_fold4_preds'],
      dtype='object')

In [15]:
# Transposed preview: the first five rows are shown as columns, so each of the
# many per-fold columns gets its own line
pseudo.head().T

Unnamed: 0,0,1,2,3,4
discourse_id,1617734767734.0,1617734782429.0,1617734807715.0,1617734792635.0,1617734817866.0
labels,0,0,0,0,0
discourse_type,Position,Evidence,Evidence,Claim,Counterclaim
discourse_effectiveness,Adequate,Adequate,Adequate,Adequate,Adequate
discourse_text,"Some people belive that the so called ""face"" o...","It was not created by aliens, and there is no ...","A mesa is a naturally occuring rock formation,...","This ""face"" on mars only looks like a face bec...",Many conspiracy theorists believe that NASA is...
loss,0.35426,0.35111,0.121578,0.257545,0.188954
HF-43_fold0_Ineffective,-2.316406,0.636719,-0.23999,-0.729492,-0.552246
HF-43_fold0_Adequate,0.944824,1.583984,2.0625,1.124023,1.588867
HF-43_fold0_Effective,-0.005062,-1.832031,-1.46875,-0.863281,-0.813965
HF-43_fold0_preds,"[0.7016922, 0.27140403, 0.026903715]","[0.7039066, 0.023118429, 0.27297497]","[0.88552177, 0.025917724, 0.088560574]","[0.7729471, 0.10594349, 0.121109486]","[0.82782465, 0.07488618, 0.097289205]"


In [20]:
# Columns that are not written to the pseudo-label file
cols_to_delete = ['labels', 'discourse_effectiveness', 'discourse_text', 'loss']
pseudo = pseudo.drop(columns=cols_to_delete)

In [21]:
# Columns that remain and will be written to the CSV
pseudo.columns

Index(['discourse_id', 'discourse_type', 'HF-43_fold0_Ineffective',
       'HF-43_fold0_Adequate', 'HF-43_fold0_Effective', 'HF-43_fold0_preds',
       'HF-43_fold1_Ineffective', 'HF-43_fold1_Adequate',
       'HF-43_fold1_Effective', 'HF-43_fold1_preds', 'HF-43_fold2_Ineffective',
       'HF-43_fold2_Adequate', 'HF-43_fold2_Effective', 'HF-43_fold2_preds',
       'HF-43_fold3_Ineffective', 'HF-43_fold3_Adequate',
       'HF-43_fold3_Effective', 'HF-43_fold3_preds', 'HF-43_fold4_Ineffective',
       'HF-43_fold4_Adequate', 'HF-43_fold4_Effective', 'HF-43_fold4_preds'],
      dtype='object')

In [22]:
# Write the pseudo-label table to ../output/<exp_name>_pseudo.csv without the index.
# NOTE(review): the "*_preds" columns hold one array per row, which ends up in
# the CSV as text; the per-class logit columns are plain numbers. Confirm that
# downstream readers only rely on the latter or parse the former.
pseudo.to_csv(f'../output/{exp_name}_pseudo.csv', index=False)

In [23]:
# psamed = pd.read_csv('../input/psl_deberta_xlarge.csv')

In [107]:
# sel = ['essay_id', 'labels', 'fold_k_5_seed_42', 'discourse_id',
#        'fold2_Ineffective', 'fold2_Adequate', 'fold2_Effective',
#        'fold4_Ineffective', 'fold4_Adequate', 'fold4_Effective',
#        'fold0_Ineffective', 'fold0_Adequate', 'fold0_Effective',
#        'fold1_Ineffective', 'fold1_Adequate', 'fold1_Effective',
#        'fold3_Ineffective', 'fold3_Adequate', 'fold3_Effective']

# join = pd.merge(pseudo, psamed[sel], how='left', on='discourse_id')

In [112]:
# join.to_csv('../input/hf_39_amed_pseudo.csv', index=False)