In [2]:
import json
from pathlib import Path

exp_name = 'HF-43b'


def _checkpoint_step(path):
    """Return the integer suffix of a `checkpoint-<step>` directory, or -1 if there is none."""
    _, _, suffix = path.name.rpartition("-")
    return int(suffix) if suffix.isdigit() else -1


# Per-fold best metric and best checkpoint path, as recorded by the Trainer.
best_metrics = []
best_checkpoints = []

for fold in range(5):
    folder = Path(f"../output/{exp_name}-fold{fold}")

    # Order by numeric step: a plain string sort puts "checkpoint-975" after
    # "checkpoint-1000" and would pick the older trainer state.
    checkpoints = sorted(
        folder.glob("checkpoint*"),
        key=lambda p: (_checkpoint_step(p), p.name),
    )
    if not checkpoints:
        raise FileNotFoundError(f"No checkpoint* directory found in {folder}")
    checkpoint = checkpoints[-1]

    with open(checkpoint / "trainer_state.json", "r") as fp:
        data = json.load(fp)
    best_metrics.append(data["best_metric"])
    best_checkpoints.append(data["best_model_checkpoint"])

print(best_metrics)
average = sum(best_metrics) / len(best_metrics)
average

[0.5762104988098145, 0.5742188692092896, 0.5772905349731445, 0.5760306715965271, 0.5808395147323608]


0.5769180178642273

In [4]:
# Point every fold at its `swa` sub-directory (presumably averaged weights -- confirm)
# instead of the `best_model_checkpoint` paths read from trainer_state.json.
# The paths are derived from `exp_name` so they follow the experiment being evaluated.
best_checkpoints = [f'../output/{exp_name}-fold{x}/swa' for x in range(5)]
best_checkpoints

['../output/HF-43b-fold0/swa',
 '../output/HF-43b-fold1/swa',
 '../output/HF-43b-fold2/swa',
 '../output/HF-43b-fold3/swa',
 '../output/HF-43b-fold4/swa']

In [5]:
# When True, the data-loading cell keeps only a 100-row sample of train.csv.
DEBUG = False
# Central configuration read by the cells below.
# Keys "aug_prob", "padding", "stride" and "dropout" are not referenced by any
# cell visible in this notebook.
cfg = {
    # Worker processes used by the `Dataset.map` calls.
    "num_proc": 2,
    "aug_prob": 0.05,
    # Number of folds iterated by the prediction loop.
    "k_folds": 5,
    # Truncation length handed to the tokenizer.
    "max_length": 2048,
    "padding": False,
    "stride": 0,
    # Folder containing train.csv and the train/ essay text files.
    "data_dir": "../input/feedback-prize-effectiveness",
    # Path of a previously saved tokenized dataset; None means tokenize from scratch.
    "load_from_disk": None,
    # Passed to the data collator as `pad_to_multiple_of`.
    "pad_multiple": 8,
    # Source of the tokenizer, the model config and the initial model weights.
    "model_name_or_path": "../output/HF-pret-7-fold0/checkpoint-16605/",
    "dropout": 0.0,
    # Unpacked verbatim into `TrainingArguments(**...)`.
    "trainingargs": {
        # Base output path; the prediction loop appends "-fold{n}" to it.
        "output_dir": f"../output/{exp_name}",
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 8,
        "per_device_eval_batch_size": 4,
        "learning_rate": 1.2e-5,
        # "label_smoothing_factor": 0.05,
        "weight_decay": 0.01,
        "num_train_epochs": 2.2,
        "warmup_ratio": 0.1,
        "optim": 'adamw_torch',
        "logging_steps": 25,
        "save_strategy": "steps",
        "save_steps": 25,
        "evaluation_strategy": "steps",
        "eval_steps": 25,
        "eval_delay": 600,
        "report_to": "wandb",
        "group_by_length": True,
        "save_total_limit": 1,
        "metric_for_best_model": "loss",
        "greater_is_better": False,
        # Also used to seed the notebook via `set_seed`.
        "seed": 42,
        "fp16": True,
        "gradient_checkpointing": True,
        "gradient_accumulation_steps": 1,
    }
}

In [6]:
import re
import pickle
import codecs
import warnings
import logging
from functools import partial
from pathlib import Path
from itertools import chain
from text_unidecode import unidecode
from typing import Any, Optional, Tuple

import pandas as pd
from sklearn.model_selection import KFold
from transformers import AutoTokenizer, set_seed

from datasets import Dataset, load_from_disk

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: substitute the unencodable slice with its UTF-8 bytes.

    Returns the replacement bytes together with the index at which the codec
    should resume (the end of the offending slice).
    """
    begin, stop = error.start, error.end
    offending = error.object[begin:stop]
    return offending.encode("utf-8"), stop

def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the undecodable byte slice as cp1252 instead.

    Returns the replacement text together with the index at which the codec
    should resume (the end of the offending slice).
    """
    begin, stop = error.start, error.end
    offending = error.object[begin:stop]
    return offending.decode("cp1252"), stop

# Register both handlers under the names that resolve_encodings_and_normalize
# passes as `errors=` to str.encode / bytes.decode.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed UTF-8 / cp1252 text and transliterate it with `unidecode`.

    The string is round-tripped through bytes three times; whenever a step hits
    bytes or characters it cannot handle, the custom error handlers registered
    in this file fall back to the other encoding instead of raising.
    """
    escaped_bytes = text.encode("raw_unicode_escape")
    first_pass = escaped_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)

def read_text_files(example, data_dir):
    """Attach the cleaned essay text to a dataset row.

    Args:
        example: mapping with an "essay_id" entry; it is updated in place.
        data_dir: path object such that `data_dir / "train" / "<essay_id>.txt"`
            is the essay file.

    Returns:
        The same `example`, with a new "text" entry holding the file contents
        after `resolve_encodings_and_normalize`.
    """
    id_ = example["essay_id"]

    # Explicit encoding: without it `open` falls back to the locale's preferred
    # encoding, so the decoded text could differ between machines.
    with open(data_dir / "train" / f"{id_}.txt", "r", encoding="utf-8") as fp:
        example["text"] = resolve_encodings_and_normalize(fp.read())

    return example

# Seed via transformers.set_seed, using the same seed as the training arguments.
set_seed(cfg["trainingargs"]["seed"])

# Silence Python warnings and suppress log records at WARNING level and below.
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Fold assignments stored in a CSV with `essay_id` and `fold_k_5_seed_42` columns.
essay_folds = pd.read_csv('../input/feedback-folds/df_folds.csv')

# essay_id -> fold index, looked up by add_fold.
# (The former bare `essay_folds.head()` was dropped: it was not the last
# expression of the cell, so it never displayed anything.)
essay_folds_dict = dict(
    zip(essay_folds["essay_id"].tolist(), essay_folds["fold_k_5_seed_42"].tolist())
)

In [8]:
# Root folder holding train.csv and the train/ essay text files.
data_dir = Path(cfg["data_dir"])

if not cfg["load_from_disk"]:
    # Fresh run: assemble the per-discourse frame from the raw files.
    train_df = pd.read_csv(data_dir / "train.csv")

    # Exclude one specific discourse row by its id.
    keep_rows = train_df["discourse_id"] != '56744a66949a'
    train_df = train_df[keep_rows].reset_index(drop=True)

    if DEBUG:
        train_df = train_df.sample(n=100).reset_index(drop=True)

    # One dataset row per distinct essay; each row gets its text read from disk.
    text_ds = Dataset.from_dict({"essay_id": train_df.essay_id.unique()})
    load_essay = partial(read_text_files, data_dir=data_dir)
    text_ds = text_ds.map(
        load_essay,
        num_proc=cfg["num_proc"],
        batched=False,
        desc="Loading text files",
    )
    text_df = text_ds.to_pandas()

    # Apply the same clean-up to the discourse snippets as to the essay text.
    train_df["discourse_text"] = list(
        map(resolve_encodings_and_normalize, train_df["discourse_text"])
    )

    # Put the full essay text next to every discourse row.
    train_df = train_df.merge(text_df, on="essay_id", how="left")
else:
    # Cached run: reload the tokenized dataset and the pickled grouped frame.
    if not cfg["load_from_disk"].endswith(".dataset"):
        cfg["load_from_disk"] += ".dataset"
    ds = load_from_disk(cfg["load_from_disk"])

    dataset_stem = cfg['load_from_disk'][:-len('.dataset')]
    pkl_file = f"{dataset_stem}_pkl"
    # NOTE: pickle.load can execute arbitrary code; only load files written by
    # this notebook.
    with open(pkl_file, "rb") as fp:
        grouped = pickle.load(fp)

    print("loading from saved files")
    
# Discourse element types; each one gets its own pair of marker tokens.
disc_types = [
    "Claim",
    "Concluding Statement",
    "Counterclaim",
    "Evidence",
    "Lead",
    "Position",
    "Rebuttal",
]

# Marker tokens wrapped around each discourse span, e.g. "[CLS_CLAIM]" ... "[END_CLAIM]".
cls_tokens_map = {label: f"[CLS_{label.upper()}]" for label in disc_types}
end_tokens_map = {label: f"[END_{label.upper()}]" for label in disc_types}

# Effectiveness class name -> integer label.
label2id = {
    "Adequate": 0,
    "Effective": 1,
    "Ineffective": 2,
}

tokenizer = AutoTokenizer.from_pretrained(cfg["model_name_or_path"])
tokenizer.add_special_tokens(
    {"additional_special_tokens": list(cls_tokens_map.values())+list(end_tokens_map.values())}
)

# Look the marker ids up directly in the vocabulary. Taking `encode(tkn)[1]`
# only works when the tokenizer prepends exactly one special token.
cls_id_map = {
    label: tokenizer.convert_tokens_to_ids(tkn)
    for label, tkn in cls_tokens_map.items()
}

# Fail fast if a marker did not make it into the vocabulary: labels are later
# attached by matching these ids, so a collision would silently mislabel tokens.
assert len(set(cls_id_map.values())) == len(disc_types), "CLS marker ids are not unique"
assert tokenizer.unk_token_id not in cls_id_map.values(), "a CLS marker resolved to the unknown token"

Loading text files #0:   0%|                                                  | 0/2096 [00:00<?, ?ex/s]
Loading text files #0:   9%|███▌                                  | 194/2096 [00:00<00:00, 1935.70ex/s][A
Loading text files #0:  19%|███████                               | 388/2096 [00:00<00:00, 1921.25ex/s][A
Loading text files #0:  28%|██████████▋                           | 590/2096 [00:00<00:00, 1961.02ex/s][A
Loading text files #1:  28%|██████████▌                           | 583/2095 [00:00<00:00, 1932.02ex/s][A
Loading text files #0:  46%|█████████████████▌                    | 970/2096 [00:00<00:00, 1798.68ex/s][A
Loading text files #0:  55%|████████████████████▎                | 1151/2096 [00:00<00:00, 1783.07ex/s][A
Loading text files #0:  65%|████████████████████████             | 1364/2096 [00:00<00:00, 1890.09ex/s][A
Loading text files #0:  76%|███████████████████████████▉         | 1584/2096 [00:00<00:00, 1984.76ex/s][A
Loading text files #0:  86%|████████████

In [9]:
def find_positions(example):
    """Locate every discourse text inside the essay text.

    `example["text"][0]` is searched for each (stripped) entry of
    `example["discourse_text"]`, in order.

    Returns a list with one item per discourse text: `[start, end]` character
    offsets of the chosen match, or `[-1]` when the text does not occur.
    """
    essay = example["text"][0]

    # Start offset of the most recently located discourse text.
    cursor = 0
    spans = []

    for discourse in example["discourse_text"]:
        # Stripping is essential for the literal search to succeed.
        pattern = re.escape(discourse.strip())
        found = list(re.finditer(pattern, essay))

        if not found:
            # Not present in the essay; flagged so callers can skip it.
            spans.append([-1])
            continue

        if len(found) == 1:
            hit = found[0]
        else:
            # Several occurrences: take the first one at or after the cursor,
            # falling back to the last occurrence when none qualifies.
            hit = next((cand for cand in found if cand.start() >= cursor), found[-1])

        spans.append([hit.start(), hit.end()])
        cursor = hit.start()

    return spans

def tokenize(example):
    """Build the marker-annotated essay string, tokenize it and attach labels.

    Args:
        example: one essay-level row whose "text", "discourse_text",
            "discourse_type" and "discourse_effectiveness" entries are parallel
            lists. It gains an "idxs" entry as a side effect.

    Returns:
        The tokenizer output with an extra "labels" list aligned to
        "input_ids": the effectiveness id at every CLS marker token and -100
        everywhere else.
    """
    example["idxs"] = find_positions(example)

    # Same-length substitution, so the offsets computed above remain valid.
    text = example["text"][0].replace('\n', '|')

    chunks = []
    labels = []
    prev = 0

    zipped = zip(
        example["idxs"],
        example["discourse_type"],
        example["discourse_effectiveness"],
    )
    for idxs, disc_type, disc_effect in zipped:
        # The discourse_text was not found in the essay.
        if idxs == [-1]:
            continue

        s, e = idxs

        # Keep any essay text sitting between the previous span and this one.
        if s != prev:
            chunks.append(text[prev:s])

        # Wrap the discourse span in its type-specific marker tokens. (The old
        # `if s == prev` guard was always true at this point and is gone.)
        chunks.append(cls_tokens_map[disc_type])
        chunks.append(text[s:e])
        chunks.append(end_tokens_map[disc_type])

        prev = e

        labels.append(label2id[disc_effect])

    tokenized = tokenizer(
        " ".join(chunks),
        padding=False,
        truncation=True,
        max_length=cfg["max_length"],
        add_special_tokens=True,
    )

    # `labels` has one entry per discourse span, not per token. Expand it so
    # that only CLS marker tokens carry a label; -100 is ignored by the loss.
    # The id set is built once instead of scanning dict values for every token.
    cls_ids = set(cls_id_map.values())
    idx = 0  # position in `labels`
    final_labels = []
    for id_ in tokenized["input_ids"]:
        if id_ in cls_ids:
            final_labels.append(labels[idx])
            idx += 1
        else:
            final_labels.append(-100)

    tokenized["labels"] = final_labels

    return tokenized

In [10]:
def add_fold(example):
    """Store the essay's fold index under "fold" and return the updated row."""
    essay_id = example["essay_id"]
    example.update(fold=essay_folds_dict[essay_id])
    return example

In [11]:
# I frequently restart my notebook, so to reduce time
# you can set cfg["load_from_disk"] to load the tokenized dataset from disk.
# It gets loaded in the data-loading cell above; the check here only skips
# tokenizing. The test mirrors that cell's truthiness check, so an empty
# string no longer skips both branches and leaves `ds` undefined.
if not cfg["load_from_disk"]:

    # make lists of discourse_text, discourse_effectiveness
    # for each essay
    grouped = train_df.groupby(["essay_id"]).agg(list)

    ds = Dataset.from_pandas(grouped)

    ds = ds.map(
        tokenize,
        batched=False,
        num_proc=cfg["num_proc"],
        desc="Tokenizing",
    )

    # Persist both artefacts so a later run can set cfg["load_from_disk"]:
    # the dataset goes to "<output_dir>.dataset", the grouped frame to "<output_dir>_pkl".
    save_dir = cfg['trainingargs']['output_dir']
    ds.save_to_disk(f"{save_dir}.dataset")
    with open(f"{save_dir}_pkl", "wb") as fp:
        pickle.dump(grouped, fp)
    print("Saving dataset to disk:", cfg['trainingargs']['output_dir'])


Tokenizing #0:   5%|██▎                                            | 105/2096 [00:00<00:15, 127.71ex/s]
Tokenizing #0:   6%|██▋                                            | 119/2096 [00:00<00:15, 129.85ex/s][A
Tokenizing #0:   6%|██▉                                            | 133/2096 [00:01<00:15, 129.31ex/s][A
Tokenizing #0:   7%|███▎                                           | 149/2096 [00:01<00:14, 137.98ex/s][A
Tokenizing #0:   8%|███▊                                           | 168/2096 [00:01<00:12, 150.47ex/s][A
Tokenizing #0:   9%|████▏                                          | 185/2096 [00:01<00:12, 154.16ex/s][A
Tokenizing #0:  10%|████▌                                          | 201/2096 [00:01<00:12, 147.85ex/s][A
Tokenizing #0:  10%|████▉                                          | 220/2096 [00:01<00:11, 158.80ex/s][A
Tokenizing #0:  11%|█████▎                                         | 237/2096 [00:01<00:11, 158.19ex/s][A
Tokenizing #0:  12%|█████▋              

Saving dataset to disk: ../output/HF-43b


In [12]:
# Tag every essay row with its fold index.
ds = ds.map(add_fold)

100%|█████████████████████████████████████████████████████████████| 4191/4191 [00:05<00:00, 769.20ex/s]


In [13]:
# Sanity check: for every essay, the number of labelled positions and the number
# of CLS marker tokens must both equal the number of discourse texts.
cls_ids = set(cls_id_map.values())
bad_matches = []

rows = zip(ds["essay_id"], ds["labels"], ds["input_ids"], grouped.discourse_text)
for essay_id, label_seq, token_ids, discourse_texts in rows:
    # true number of discourse_texts
    expected = len(discourse_texts)
    # labels that are not the ignore value
    labelled = sum(1 for lab in label_seq if lab != -100)
    # CLS marker tokens present in the input
    marked = sum(1 for tok in token_ids if tok in cls_ids)

    if not (labelled == expected and marked == expected):
        bad_matches.append((essay_id, label_seq, token_ids, discourse_texts))

print("Num bad matches", len(bad_matches))

Num bad matches 0


In [14]:
# Display the dataset's columns and row count.
ds

Dataset({
    features: ['discourse_id', 'discourse_text', 'discourse_type', 'discourse_effectiveness', 'text', 'essay_id', 'idxs', 'input_ids', 'token_type_ids', 'attention_mask', 'labels', 'fold'],
    num_rows: 4191
})

In [15]:
# Columns to retain when flattening the dataset into a pandas frame.
keep_df = {
    "discourse_id",
    "essay_id",
    "discourse_text",
    "discourse_type",
    "discourse_effectiveness",
    "labels",
    "fold",
}
columns_to_drop = [name for name in ds.column_names if name not in keep_df]
test_df = ds.remove_columns(columns_to_drop).to_pandas()
test_df.head()

Unnamed: 0,discourse_id,discourse_text,discourse_type,discourse_effectiveness,essay_id,labels,fold
0,"[fe6dfbd53216, ca9e1b60c9fb, 6cf2157f4f19, d92...",[Driverless cars are exaclty what you would ex...,"[Lead, Position, Claim, Evidence, Claim, Evide...","[Adequate, Effective, Effective, Effective, Ef...",00066EA9880D,"[-100, 0, -100, -100, -100, -100, -100, -100, ...",2
1,"[695d181861a1, cd97ee1cc0ad, 1b775274990b, 567...","[I am arguing against the policy change , even...","[Position, Counterclaim, Rebuttal, Evidence, C...","[Adequate, Adequate, Adequate, Adequate, Adequ...",000E6DE9E817,"[-100, -100, -100, -100, -100, -100, 0, -100, ...",2
2,"[89304284cef1, 4f2e871a4908, a885c3aa214b, 953...",[I think that students would benefit from lear...,"[Position, Claim, Claim, Claim, Claim, Evidenc...","[Adequate, Adequate, Adequate, Adequate, Adequ...",0016926B079C,"[-100, 0, -100, -100, -100, -100, -100, -100, ...",3
3,"[a713d0f6dc68, 2fd9bb2bfedf, 0e5ecdf1516e, 499...",[It is every student's dream to be able to lou...,"[Lead, Position, Evidence, Counterclaim, Rebut...","[Effective, Effective, Effective, Adequate, Ef...",00203C45FC55,"[-100, 1, -100, -100, -100, -100, -100, -100, ...",3
4,"[1082de1aa198, e425994b2124, bf086f9911f6, 29c...",[I heard you are considering changing the scho...,"[Lead, Position, Claim, Evidence, Counterclaim...","[Adequate, Effective, Ineffective, Adequate, A...",0029F4D19C3F,"[-100, -100, -100, -100, -100, -100, 0, -100, ...",3


In [16]:
import gc
import torch
from torch.utils.checkpoint import checkpoint
import numpy as np
from tqdm import tqdm
from transformers import Trainer, TrainingArguments, AutoConfig, AutoModelForTokenClassification, DataCollatorForTokenClassification
import sklearn



args = TrainingArguments(**cfg["trainingargs"])

# if using longformer pad to multiple of 512
# for others pad to multiple of 8

collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, pad_to_multiple_of=cfg["pad_multiple"], padding=True
)

# Base output path; each fold writes to "<output>-fold<k>".
output = args.output_dir

# One frame of out-of-fold predictions per fold.
fold_dfs = []

# Only these columns are fed to the model.
keep_cols = {"input_ids", "attention_mask", "labels"}

for fold in range(cfg["k_folds"]):

    args.output_dir = f"{output}-fold{fold}"

    model_config = AutoConfig.from_pretrained(
        cfg["model_name_or_path"],
    )
    model_config.update(
        {
            "num_labels": 3,
            "cls_tokens": list(cls_id_map.values()),
            "label2id": label2id,
            "id2label": {v:k for k, v in label2id.items()},
        }
    )

    model = AutoModelForTokenClassification.from_pretrained(cfg["model_name_or_path"], config=model_config)

    # need to resize embeddings because of added tokens
    model.resize_token_embeddings(len(tokenizer))

    PATH = f'{best_checkpoints[fold]}/pytorch_model.bin'

    # Load onto the CPU first: the model itself is still on the CPU here, so
    # there is no reason to materialise a second copy of the weights on the GPU.
    model.load_state_dict(torch.load(PATH, map_location="cpu"))

    # Select this fold's essays once, then derive the model inputs from that
    # selection (previously the same filter ran twice per fold).
    full_eval = ds.filter(lambda example: example["fold"] == fold)
    eval_dataset = full_eval.remove_columns([c for c in full_eval.column_names if c not in keep_cols])

    trainer = Trainer(
        model=model,
        args=args,
        tokenizer=tokenizer,
        data_collator=collator,
    )

    preds = trainer.predict(eval_dataset)
    preds_torch = torch.tensor(preds.predictions, dtype=torch.float32)

    all_preds = []
    all_logits = []
    all_labels = []

    for i in tqdm(range(len(eval_dataset))):
        # Fetch the row's labels once; every dataset access decodes the row.
        row_labels = np.array(eval_dataset[i]['labels'])
        # True at the CLS marker positions, i.e. wherever a real label is set.
        indices = row_labels != -100
        mylabls = torch.tensor(row_labels)[indices]
        # Drop the padding appended by the collator before masking.
        mylogits = preds_torch[i][:len(indices),:][indices]
        mypreds = torch.nn.functional.softmax(mylogits, dim=-1)
        all_preds.append(mypreds)
        all_logits.append(mylogits)
        all_labels.append(mylabls)

    all_preds = torch.cat(all_preds, dim=0).numpy()
    all_logits = torch.cat(all_logits, dim=0).numpy()
    all_labels = torch.cat(all_labels, dim=0).numpy()

    # Flatten the per-essay lists so each row is one discourse element.
    discourse_ids = [x for z in full_eval['discourse_id'] for x in z]
    assert len(discourse_ids) == len(all_labels), (
        f"fold {fold}: {len(discourse_ids)} discourse ids but {len(all_labels)} labelled positions"
    )

    df = pd.DataFrame()
    df['discourse_id'] = discourse_ids
    df['preds'] = [x for x in all_preds]
    # Logit columns follow label2id: 0=Adequate, 1=Effective, 2=Ineffective.
    df['Ineffective'] = all_logits[:,2]
    df['Adequate'] = all_logits[:,0]
    df['Effective'] = all_logits[:,1]
    df['labels'] = all_labels
    df['discourse_type'] = [x for z in full_eval['discourse_type'] for x in z]
    df['discourse_effectiveness'] = [x for z in full_eval['discourse_effectiveness'] for x in z]
    df['discourse_text'] = [x for z in full_eval['discourse_text'] for x in z]
    # Per-row log loss of the softmax probabilities against the true label.
    df['loss'] = [sklearn.metrics.log_loss(np.expand_dims(np.array(x), 0), np.expand_dims(y, 0), labels=[0,1,2]) for x,y in zip(df.labels.values, np.stack(df.preds.values))]

    fold_dfs.append(df)

    # The Trainer keeps its own reference to the model, so it has to be dropped
    # too; otherwise the weights stay alive while the next fold's model loads.
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()

full_df = pd.concat(fold_dfs).reset_index(drop=True)
full_df.head()

100%|████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.49ba/s]


100%|███████████████████████████████████████████████████████████████| 838/838 [00:01<00:00, 671.94it/s]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.46ba/s]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.54ba/s]


100%|███████████████████████████████████████████████████████████████| 834/834 [00:01<00:00, 693.81it/s]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.51ba/s]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.57ba/s]


100%|███████████████████████████████████████████████████████████████| 843/843 [00:01<00:00, 680.81it/s]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.50ba/s]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.55ba/s]


100%|███████████████████████████████████████████████████████████████| 838/838 [00:01<00:00, 667.50it/s]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.48ba/s]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.58ba/s]


100%|███████████████████████████████████████████████████████████████| 838/838 [00:01<00:00, 718.82it/s]
100%|████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.50ba/s]


Unnamed: 0,discourse_id,preds,Ineffective,Adequate,Effective,labels,discourse_type,discourse_effectiveness,discourse_text,loss
0,4bbb368a6ffd,"[0.050281677, 0.94874567, 0.00097270764]",-4.128906,-0.183594,2.753906,1,Lead,Effective,In life all of us suffer many trials and obsta...,0.052615
1,d4bb753babd0,"[0.11812812, 0.8779847, 0.0038872708]",-3.539062,-0.125,1.880859,1,Claim,Effective,"it could help you explore different mindsets,",0.130126
2,62ececba9b36,"[0.12580502, 0.86456245, 0.009632601]",-2.894531,-0.324951,1.602539,1,Claim,Effective,"get an outside unbiased opinion,",0.145532
3,4a70f8078d80,"[0.12156934, 0.86958325, 0.008847398]",-3.0,-0.379639,1.587891,1,Claim,Effective,give you a chance to express and organize your...,0.139741
4,60861279dee4,"[0.0697352, 0.9289554, 0.0013094506]",-4.035156,-0.060059,2.529297,1,Evidence,Effective,Talking to someone to get an outside opinion c...,0.073695


In [17]:
# Log loss over the concatenated out-of-fold predictions of all folds.
sklearn.metrics.log_loss(full_df.labels.values, np.stack(full_df.preds.values), labels=[0,1,2])

0.5774768610028812

In [22]:
# Display the experiment name used in the output paths.
exp_name

'HF-43b'

In [23]:
# Write the out-of-fold predictions to "<exp_name>-OOF.csv" without the index column.
full_df.to_csv(f'../output/{exp_name}-OOF.csv', index=False)

In [24]:
# Display the checkpoint folders about to be uploaded.
best_checkpoints

['../output/HF-43b-fold0/swa',
 '../output/HF-43b-fold1/swa',
 '../output/HF-43b-fold2/swa',
 '../output/HF-43b-fold3/swa',
 '../output/HF-43b-fold4/swa']

In [25]:
for fold in range(5):
    folder = best_checkpoints[fold]
    !~/gdrive upload {folder}/pytorch_model.bin --name pytorch_model_{fold}.bin

Uploading ../output/HF-43b-fold0/swa/pytorch_model.bin
Uploaded 1B7jZh1UCE7jZZy_XbFDZDD1_kJNHaBah at 27.6 MB/s, total 1.7 GB
Uploading ../output/HF-43b-fold1/swa/pytorch_model.bin
Uploaded 1GQ_YW-N6tlKmZlSPD-6JmV09J1vlPoeX at 25.6 MB/s, total 1.7 GB
Uploading ../output/HF-43b-fold2/swa/pytorch_model.bin
Uploaded 1835oX2lM4gv2C9Hyg9x5_ptUPdcf7PhU at 28.4 MB/s, total 1.7 GB
Uploading ../output/HF-43b-fold3/swa/pytorch_model.bin
Uploaded 1_a1O0CiKQgsMcL7ixFRelT4V1yfUzzq2 at 28.4 MB/s, total 1.7 GB
Uploading ../output/HF-43b-fold4/swa/pytorch_model.bin
Uploaded 1cS4FbiU_ny0tNG4syrZ-Ra3eRbyhNHf9 at 26.0 MB/s, total 1.7 GB
