In [55]:
import json
from pathlib import Path

exp_name = 'HF-39'


def _checkpoint_step(path):
    """Return the integer step in a ``checkpoint-<step>`` name, or -1 if there is none."""
    suffix = path.name.rpartition("-")[2]
    return int(suffix) if suffix.isdigit() else -1


# Per-fold best metric and best-checkpoint path, both read from the
# trainer_state.json stored inside the most recent checkpoint directory.
best_metrics = []
best_checkpoints = []

for fold in range(5):
    folder = Path(f"../output/{exp_name}-fold{fold}")
    candidates = list(folder.glob("checkpoint*"))
    if not candidates:
        raise FileNotFoundError(f"no checkpoint directory found in {folder}")
    # Pick the highest step numerically. A plain string sort would rank
    # "checkpoint-1000" before "checkpoint-900" and read a stale state file.
    checkpoint = max(candidates, key=lambda p: (_checkpoint_step(p), p.name))
    with open(checkpoint/"trainer_state.json", "r") as fp:
        data = json.load(fp)
    best_metrics.append(data["best_metric"])
    best_checkpoints.append(data["best_model_checkpoint"])

print(best_metrics)
# Mean of the per-fold best metrics; displayed as the cell result.
average = sum(best_metrics)/len(best_metrics)
average

[0.5916528105735779, 0.5912548899650574, 0.5870948433876038, 0.5914664268493652, 0.5966428518295288]


0.5916223645210266

In [56]:
# Show the "best_model_checkpoint" path collected for each fold, in fold order.
best_checkpoints

['../output/HF-39-fold0/checkpoint-850',
 '../output/HF-39-fold1/checkpoint-900',
 '../output/HF-39-fold2/checkpoint-850',
 '../output/HF-39-fold3/checkpoint-900',
 '../output/HF-39-fold4/checkpoint-750']

In [57]:
# When True, the data-loading cell samples 100 rows of the training frame.
DEBUG = False
# Notebook-wide configuration. Keys marked "not referenced" are not read by
# any cell visible in this notebook section.
cfg = {
    "num_proc": 2,  # worker count passed to Dataset.map
    "aug_prob": 0.05,  # not referenced
    "k_folds": 5,  # number of folds iterated by the inference loop
    "max_length": 2048,  # tokenizer max_length (truncation is enabled)
    "padding": False,  # not referenced
    "stride": 0,  # not referenced
    "data_dir": "../input/fbck2021",  # essays are read from <data_dir>/train/<essay_id>.txt
    "load_from_disk": None,  # path of a saved dataset to reuse instead of re-tokenizing
    "pad_multiple": 8,  # pad_to_multiple_of for the data collator
    # Source of the tokenizer, the model config and the initial model weights.
    "model_name_or_path": "../output/HF-39-fold0/checkpoint-850/",
    "dropout": 0.0,  # not referenced
    # Unpacked as keyword arguments into transformers.TrainingArguments.
    "trainingargs": {
        # Base path: the fold loop appends "-fold<k>", and the tokenized
        # dataset is saved under this prefix.
        "output_dir": f"../output/{exp_name}",
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 8,
        "per_device_eval_batch_size": 4,
        "learning_rate": 1.2e-5,
        # "label_smoothing_factor": 0.05,
        "weight_decay": 0.01,
        "num_train_epochs": 2.2,
        "warmup_ratio": 0.1,
        "optim": 'adamw_torch',
        "logging_steps": 25,
        "save_strategy": "steps",
        "save_steps": 25,
        "evaluation_strategy": "steps",
        "eval_steps": 25,
        "eval_delay": 600,
        "report_to": "wandb",
        "group_by_length": True,
        "save_total_limit": 1,
        "metric_for_best_model": "loss",
        "greater_is_better": False,
        "seed": 42,  # also passed to set_seed
        "fp16": True,
        "gradient_checkpointing": True,
        "gradient_accumulation_steps": 1,
    }
}

In [58]:
import re
import pickle
import codecs
import warnings
import logging
from functools import partial
from pathlib import Path
from itertools import chain
from text_unidecode import unidecode
from typing import Any, Optional, Tuple

import pandas as pd
from sklearn.model_selection import KFold
from transformers import AutoTokenizer, set_seed

from datasets import Dataset, load_from_disk

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: encode the offending slice as UTF-8.

    Returns the UTF-8 bytes of ``error.object[error.start:error.end]`` together
    with ``error.end``, the position at which encoding should resume.
    """
    offending = error.object[error.start : error.end]
    resume_at = error.end
    return offending.encode("utf-8"), resume_at

def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the offending byte slice as cp1252.

    Returns the cp1252 decoding of ``error.object[error.start:error.end]``
    together with ``error.end``, the position at which decoding should resume.
    """
    offending = error.object[error.start : error.end]
    resume_at = error.end
    return offending.decode("cp1252"), resume_at

# Register both handlers under the names that resolve_encodings_and_normalize
# passes as the `errors=` argument of str.encode / bytes.decode.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed UTF-8 / cp1252 text and pass the result through ``unidecode``.

    The text goes through an encode/decode round trip. Each step that fails
    falls back to the custom handlers registered with ``codecs.register_error``:
    undecodable bytes are read as cp1252, and characters that cp1252 cannot
    encode are written as UTF-8.
    """
    escaped_bytes = text.encode("raw_unicode_escape")
    first_pass = escaped_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)

def read_text_files(example, data_dir):
    """Load the essay text for one example.

    Reads ``<data_dir>/train/<essay_id>.txt``, normalizes it with
    ``resolve_encodings_and_normalize`` and stores the result under
    ``example["text"]``.

    Args:
        example: mapping with an ``"essay_id"`` entry; it is updated in place.
        data_dir: base directory; must support the ``/`` path operator.

    Returns:
        The same ``example`` object, with ``"text"`` added.
    """
    essay_path = data_dir / "train" / f"{example['essay_id']}.txt"
    with open(essay_path, "r") as fp:
        example["text"] = resolve_encodings_and_normalize(fp.read())
    return example

# Seed via transformers.set_seed, using the same seed that is passed to the training arguments.
set_seed(cfg["trainingargs"]["seed"])

# Silence Python warnings and disable log records at WARNING level and below
# for the rest of the session.
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)

In [88]:
# Base directory holding the essay text files (read from <data_dir>/train).
data_dir = Path(cfg["data_dir"])

# Path 1: reuse a previously tokenized dataset plus the pickled grouped frame.
if cfg["load_from_disk"]:
    # Normalize the configured path so it always carries the ".dataset" suffix
    # (note: this rewrites the cfg entry in place).
    if not cfg["load_from_disk"].endswith(".dataset"):
        cfg["load_from_disk"] += ".dataset"
    ds = load_from_disk(cfg["load_from_disk"])
    
    # The grouped frame lives next to the dataset as "<prefix>_pkl".
    pkl_file = f"{cfg['load_from_disk'][:-len('.dataset')]}_pkl"
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load pickles that this notebook wrote itself.
    with open(pkl_file, "rb") as fp: 
        grouped = pickle.load(fp)
        
    print("loading from saved files")
# Path 2: build the training frame from the CSV and the raw essay files.
# `ds` and `grouped` are not created on this path; the tokenizing cell does that.
else:
    train_df = pd.read_csv("../input/2021_data_for_pseudo_mlm.csv")
    
    # Drop two specific discourse ids (the reason is not recorded in this notebook).
    train_df = train_df[train_df.discourse_id != '56744a66949a'].reset_index(drop=True)
    train_df = train_df[train_df.discourse_id != 1623258656795].reset_index(drop=True)
    
    # Debug mode works on a 100-row sample.
    if DEBUG: train_df = train_df.sample(n=100).reset_index(drop=True)
    
    # One dataset row per unique essay; the map call below attaches its text.
    text_ds = Dataset.from_dict({"essay_id": train_df.essay_id.unique()})
    
    text_ds = text_ds.map(
        partial(read_text_files, data_dir=data_dir),
        num_proc=cfg["num_proc"],
        batched=False,
        desc="Loading text files",
    )
    
    text_df = text_ds.to_pandas()
    
    # Apply the same normalization to the discourse snippets that was applied
    # to the essay text.
    train_df["discourse_text"] = [
        resolve_encodings_and_normalize(x) for x in train_df["discourse_text"]
    ]
    
    # Attach the essay text to every discourse row of that essay.
    train_df = train_df.merge(text_df, on="essay_id", how="left")
    
# Discourse element types; each one gets its own opening and closing marker token.
disc_types = [
    "Claim", "Concluding Statement", "Counterclaim", "Evidence",
    "Lead", "Position", "Rebuttal",
]

# Marker placed before / after a discourse span of the given type.
cls_tokens_map = {name: f"[CLS_{name.upper()}]" for name in disc_types}
end_tokens_map = {name: f"[END_{name.upper()}]" for name in disc_types}

# Effectiveness class name -> integer class id.
label2id = {
    name: class_id
    for class_id, name in enumerate(("Adequate", "Effective", "Ineffective"))
}

# Load the tokenizer from the configured checkpoint and register every
# opening/closing marker as an additional special token.
tokenizer = AutoTokenizer.from_pretrained(cfg["model_name_or_path"])
tokenizer.add_special_tokens(
    {"additional_special_tokens": list(cls_tokens_map.values())+list(end_tokens_map.values())}
)

# Discourse type -> token id of its opening marker.
# NOTE(review): taking index 1 assumes tokenizer.encode() prepends exactly one
# special token before the marker. Confirm for the tokenizer in use.
cls_id_map = {
    label: tokenizer.encode(tkn)[1]
    for label, tkn in cls_tokens_map.items()
}

Loading text files #0:   0%|                                                | 0/7797 [00:00<?, ?ex/s]
Loading text files #0:   2%|▉                                   | 193/7797 [00:00<00:03, 1924.93ex/s][A
Loading text files #1:   3%|█▏                                  | 266/7797 [00:00<00:02, 2656.84ex/s][A
Loading text files #0:   5%|█▊                                  | 386/7797 [00:00<00:03, 1872.27ex/s][A
Loading text files #0:  10%|███▌                                | 784/7797 [00:00<00:03, 1942.14ex/s][A
Loading text files #0:  13%|████▌                               | 979/7797 [00:00<00:03, 1887.04ex/s][A
Loading text files #0:  15%|█████▏                             | 1169/7797 [00:00<00:03, 1803.81ex/s][A
Loading text files #0:  17%|██████                             | 1364/7797 [00:00<00:03, 1847.32ex/s][A
Loading text files #0:  20%|██████▉                            | 1559/7797 [00:00<00:03, 1877.39ex/s][A
Loading text files #0:  22%|███████▊                      

In [90]:
def find_positions(example):
    """Locate every discourse text inside the essay text.

    Args:
        example: mapping where ``example["text"][0]`` is the essay text and
            ``example["discourse_text"]`` is an iterable of discourse
            snippets, expected in essay order.

    Returns:
        A list with one entry per discourse text: ``[start, end]`` character
        offsets into the essay, or ``[-1]`` when the stripped snippet does not
        occur in the essay (callers filter those out).
    """
    text = example["text"][0]

    # Offset where the most recently located discourse text ends. Later
    # snippets that occur several times are resolved at or after this point.
    min_idx = 0

    # stores start and end indexes of discourse_texts
    idxs = []

    for dt in example["discourse_text"]:
        # calling strip is essential
        matches = list(re.finditer(re.escape(dt.strip()), text))

        # If no matches are found
        if not matches:
            idxs.append([-1])  # will filter out later
            continue

        # Take the first occurrence that starts at or after the end of the
        # previous discourse text. If there is none (this includes a single
        # match that lies earlier), fall back to the last occurrence.
        m = next((c for c in matches if c.start() >= min_idx), matches[-1])

        idxs.append([m.start(), m.end()])

        # Advance past the END of this match. Tracking only the start let a
        # repeated snippet resolve to the same occurrence twice, and let a
        # short snippet match inside the previous span.
        min_idx = m.end()

    return idxs

def tokenize(example):
    """Tokenize one essay with marker tokens around each located discourse text.

    The essay is rebuilt as a sequence of chunks: any text between discourse
    spans, then for each located span its opening marker, the span text and
    its closing marker. Chunks are joined with single spaces and tokenized.
    Every opening-marker token gets the effectiveness class id of its span as
    label; every other token gets -100.

    Side effect: stores the located offsets under ``example["idxs"]``.

    Returns:
        The tokenizer output with an added ``"labels"`` list aligned to
        ``"input_ids"``.
    """
    example["idxs"] = find_positions(example)

    # Newlines are swapped for '|' (same length, so the offsets stay valid).
    essay = example["text"][0].replace('\n', '|')

    pieces = []
    effect_ids = []
    cursor = 0  # end offset of the previously emitted discourse span

    for span, disc_type, disc_effect in zip(
        example["idxs"],
        example["discourse_type"],
        example["discourse_effectiveness"],
    ):
        # discourse_text that could not be located
        if span == [-1]:
            continue

        start, end = span

        # Emit the text sitting between the previous span and this one.
        if start != cursor:
            pieces.append(essay[cursor:start])

        # Wrap the span itself in its type-specific markers.
        pieces.append(cls_tokens_map[disc_type])
        pieces.append(essay[start:end])
        pieces.append(end_tokens_map[disc_type])

        cursor = end
        effect_ids.append(label2id[disc_effect])

    tokenized = tokenizer(
        " ".join(pieces),
        padding=False,
        truncation=True,
        max_length=cfg["max_length"],
        add_special_tokens=True,
    )

    # Expand the per-span labels to one label per token: opening-marker
    # tokens consume the next span label in order, all other tokens get -100
    # so the loss function ignores them.
    marker_ids = set(cls_id_map.values())
    next_label = 0
    token_labels = []
    for token_id in tokenized["input_ids"]:
        if token_id not in marker_ids:
            token_labels.append(-100)
            continue
        token_labels.append(effect_ids[next_label])
        next_label += 1

    tokenized["labels"] = token_labels

    return tokenized

In [91]:
# I frequently restart my notebook, so to reduce time
# you can set cfg["load_from_disk"] to just load the tokenized dataset from disk.
# The loading itself happens in the data-loading cell; the check here only
# skips tokenizing. It uses the same truthiness test as that cell, so an empty
# string behaves like None instead of leaving `ds` undefined.
if not cfg["load_from_disk"]:

    # make lists of discourse_text, discourse_effectiveness
    # for each essay
    grouped = train_df.groupby(["essay_id"]).agg(list)

    ds = Dataset.from_pandas(grouped)

    ds = ds.map(
        tokenize,
        batched=False,
        num_proc=cfg["num_proc"],
        desc="Tokenizing",
    )

    # Persist both artifacts under the output_dir prefix so a later run can
    # point cfg["load_from_disk"] at it: "<prefix>.dataset" and "<prefix>_pkl".
    save_dir = f"{cfg['trainingargs']['output_dir']}"
    ds.save_to_disk(f"{save_dir}.dataset")
    with open(f"{save_dir}_pkl", "wb") as fp:
        pickle.dump(grouped, fp)
    print("Saving dataset to disk:", cfg['trainingargs']['output_dir'])


Tokenizing #0:   8%|███▋                                         | 645/7797 [00:04<00:43, 164.86ex/s]
Tokenizing #0:   9%|███▊                                         | 663/7797 [00:04<00:43, 164.71ex/s][A
Tokenizing #0:   9%|███▉                                         | 680/7797 [00:04<00:44, 161.50ex/s][A
Tokenizing #0:   9%|████                                         | 700/7797 [00:04<00:41, 169.91ex/s][A
Tokenizing #0:   9%|████▏                                        | 718/7797 [00:04<00:42, 166.61ex/s][A
Tokenizing #0:   9%|████▏                                        | 736/7797 [00:04<00:41, 168.52ex/s][A
Tokenizing #0:  10%|████▎                                        | 755/7797 [00:04<00:40, 173.50ex/s][A
Tokenizing #0:  10%|████▍                                        | 773/7797 [00:04<00:41, 170.90ex/s][A
Tokenizing #0:  10%|████▌                                        | 793/7797 [00:04<00:39, 178.53ex/s][A
Tokenizing #0:  10%|████▋                                 

Saving dataset to disk: ../output/HF-39


In [92]:
# Sanity check: every essay must have exactly one label and one opening-marker
# token per discourse text. Rows that do not are collected in bad_matches.
cls_ids = set(cls_id_map.values())
bad_matches = []
for did_, id_, l, ids, dt in zip(
    ds["discourse_id"],
    ds["essay_id"],
    ds["labels"],
    ds["input_ids"],
    grouped.discourse_text,
):
    # true number of discourse_texts
    num_dt = len(dt)
    # number of real labels (everything except the -100 placeholder)
    num_cls_label = sum(x != -100 for x in l)
    # number of opening-marker token ids
    num_cls_id = sum(x in cls_ids for x in ids)

    if not (num_cls_label == num_dt and num_cls_id == num_dt):
        bad_matches.append((did_, id_, l, ids, dt))

print("Num bad matches", len(bad_matches))

Num bad matches 0


In [93]:
# Show the tokenized dataset summary (columns and row count).
ds

Dataset({
    features: ['discourse_id', 'discourse_text', 'discourse_type', 'discourse_effectiveness', 'fold', 'text', 'essay_id', 'idxs', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 15594
})

In [94]:
# Despite its name, keep_df is a set of column names to retain, not a frame.
keep_df = {"discourse_id", "essay_id", "discourse_text", "discourse_type", "discourse_effectiveness", "labels", "fold"}
# Drop every other column and convert to pandas for inspection.
test_df = ds.remove_columns([c for c in ds.column_names if c not in keep_df]).to_pandas()
test_df.head()

Unnamed: 0,discourse_id,discourse_text,discourse_type,discourse_effectiveness,fold,essay_id,labels
0,"[1617734767734.0, 1617734782429.0, 16177348077...","[Some people belive that the so called ""face"" ...","[Position, Evidence, Evidence, Claim, Counterc...","[Adequate, Adequate, Adequate, Adequate, Adequ...","[-1, -1, -1, -1, -1, -1, -1, -1]",0000D23A521A,"[-100, 0, -100, -100, -100, -100, -100, -100, ..."
1,"[1621104238021.0, 1621104245981.0, 16211043488...",[Driverless cars are exaclty what you would ex...,"[Lead, Position, Claim, Evidence, Claim, Evide...","[Adequate, Adequate, Adequate, Adequate, Adequ...","[2, 2, 2, 2, 2, 2, 2, 2, 2]",00066EA9880D,"[-100, 0, -100, -100, -100, -100, -100, -100, ..."
2,"[1617296637311.0, 1617296650644.0, 16172966674...","[I am arguing against the policy change , even...","[Position, Counterclaim, Rebuttal, Evidence, C...","[Adequate, Adequate, Adequate, Adequate, Adequ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]",000E6DE9E817,"[-100, -100, -100, -100, -100, -100, 0, -100, ..."
3,"[1622844028582.0, 1622844050451.0, 16228440600...",[Would you be able to give your car up? Having...,"[Lead, Evidence, Claim, Claim, Evidence, Claim...","[Adequate, Adequate, Adequate, Adequate, Adequ...","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",001552828BD0,"[-100, 0, -100, -100, -100, -100, -100, -100, ..."
4,"[1621080957958.0, 1621081369014.0, 16210813821...",[I think that students would benefit from lear...,"[Position, Claim, Claim, Claim, Claim, Evidenc...","[Adequate, Adequate, Adequate, Adequate, Adequ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]",0016926B079C,"[-100, 0, -100, -100, -100, -100, -100, -100, ..."


In [95]:
import gc
import torch
from torch.utils.checkpoint import checkpoint
import numpy as np
from tqdm import tqdm
from transformers import Trainer, TrainingArguments, AutoConfig, AutoModelForTokenClassification, DataCollatorForTokenClassification
import sklearn

args = TrainingArguments(**cfg["trainingargs"])

# if using longformer pad to multiple of 512
# for others pad to multiple of 8

collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, pad_to_multiple_of=cfg["pad_multiple"], padding=True
)

# Base output directory; each fold writes to "<output>-fold<k>".
output = args.output_dir

# One prediction frame per fold, in fold order.
fold_dfs = []

# Every fold model predicts on the FULL dataset (no train/eval split here),
# so the model inputs are fold-independent and are built once.
keep_cols = {"input_ids", "attention_mask", "labels"}
eval_dataset = ds.remove_columns([c for c in ds.column_names if c not in keep_cols])

full_eval = ds
assert(len(eval_dataset) == len(full_eval))

for fold in range(cfg["k_folds"]):

    args.output_dir = f"{output}-fold{fold}"

    model_config = AutoConfig.from_pretrained(
        cfg["model_name_or_path"],
    )
    model_config.update(
        {
            "num_labels": 3,
            "cls_tokens": list(cls_id_map.values()),
            "label2id": label2id,
            "id2label": {v:k for k, v in label2id.items()},
        }
    )

    model = AutoModelForTokenClassification.from_pretrained(cfg["model_name_or_path"], config=model_config)

    # need to resize embeddings because of added tokens
    model.resize_token_embeddings(len(tokenizer))

    # Overwrite the weights with this fold's best checkpoint.
    PATH = f'{best_checkpoints[fold]}/pytorch_model.bin'

    # map_location="cpu" keeps the deserialized tensors off the GPU;
    # load_state_dict copies them into the model's own parameters.
    model.load_state_dict(torch.load(PATH, map_location="cpu"))

    trainer = Trainer(
        model=model,
        args=args,
        tokenizer=tokenizer,
        data_collator=collator,
    )

    preds = trainer.predict(eval_dataset)
    preds_torch = torch.tensor(preds.predictions, dtype=torch.float32)

    all_preds = []
    all_logits = []
    all_labels = []

    for i in tqdm(range(len(eval_dataset))):
        # Decode the row once; only positions with a real label (not -100),
        # i.e. the opening-marker tokens, are kept.
        row_labels = np.array(eval_dataset[i]['labels'])
        indices = row_labels != -100
        mylabls = torch.tensor(row_labels)[indices]
        # Cut the padded prediction row back to the unpadded length first.
        mylogits = preds_torch[i][:len(indices),:][indices]
        mypreds = torch.nn.functional.softmax(mylogits, dim=-1)
        all_preds.append(mypreds)
        all_logits.append(mylogits)
        all_labels.append(mylabls)

    all_preds = torch.cat(all_preds, dim=0).numpy()
    all_logits = torch.cat(all_logits, dim=0).numpy()
    all_labels = torch.cat(all_labels, dim=0).numpy()

    # Flatten the per-essay lists so that each row is one discourse element.
    # This relies on every discourse text having exactly one labelled marker
    # token, in order (see the bad-match check).
    df = pd.DataFrame()
    df['discourse_id'] = [x for z in full_eval['discourse_id'] for x in z]
    df['preds'] = [x for x in all_preds]
    # Logit columns follow label2id: 0=Adequate, 1=Effective, 2=Ineffective.
    df['Ineffective'] = all_logits[:,2]
    df['Adequate'] = all_logits[:,0]
    df['Effective'] = all_logits[:,1]
    df['labels'] = all_labels
    df['discourse_type'] = [x for z in full_eval['discourse_type'] for x in z]
    df['discourse_effectiveness'] = [x for z in full_eval['discourse_effectiveness'] for x in z]
    df['discourse_text'] = [x for z in full_eval['discourse_text'] for x in z]
    # Per-row log loss of the softmax probabilities against the stored label.
    df['loss'] = [sklearn.metrics.log_loss(np.expand_dims(np.array(x), 0), np.expand_dims(y, 0), labels=[0,1,2]) for x,y in zip(df.labels.values, np.stack(df.preds.values))]

    fold_dfs.append(df)

    # The trainer holds its own reference to the model, so both names must go
    # before the memory can be reclaimed for the next fold.
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()

100%|█████████████████████████████████████████████████████████| 15594/15594 [00:21<00:00, 726.36it/s]


100%|█████████████████████████████████████████████████████████| 15594/15594 [00:21<00:00, 716.60it/s]


100%|█████████████████████████████████████████████████████████| 15594/15594 [00:21<00:00, 722.60it/s]


100%|█████████████████████████████████████████████████████████| 15594/15594 [00:21<00:00, 728.39it/s]


100%|█████████████████████████████████████████████████████████| 15594/15594 [00:21<00:00, 713.12it/s]


In [96]:
# Row count of each fold's prediction frame.
[len(x) for x in fold_dfs]

[144292, 144292, 144292, 144292, 144292]

In [97]:
# Per-fold prediction columns, and a preview of their fold-prefixed names for fold 0.
colsBmod = ['Ineffective', 'Adequate', 'Effective', 'loss', 'preds']
fold = 0
colsAmod = [f'HF39_fold{fold}_{x}' for x in colsBmod]
colsAmod

['HF39_fold0_Ineffective',
 'HF39_fold0_Adequate',
 'HF39_fold0_Effective',
 'HF39_fold0_loss',
 'HF39_fold0_preds']

In [98]:
# Start from fold 0's frame without its prediction columns, leaving only the
# columns that are not fold-specific.
pseudo = fold_dfs[0].drop(columns=colsBmod)
# Add every fold's prediction columns under a fold-specific prefix. The fold
# count comes from fold_dfs itself rather than a hard-coded 5.
for fold in range(len(fold_dfs)):
    colsAmod = [f'HF39_fold{fold}_{x}' for x in colsBmod]
    pseudo[colsAmod] = fold_dfs[fold][colsBmod]
pseudo.columns

Index(['discourse_id', 'labels', 'discourse_type', 'discourse_effectiveness',
       'discourse_text', 'HF39_fold0_Ineffective', 'HF39_fold0_Adequate',
       'HF39_fold0_Effective', 'HF39_fold0_loss', 'HF39_fold0_preds',
       'HF39_fold1_Ineffective', 'HF39_fold1_Adequate', 'HF39_fold1_Effective',
       'HF39_fold1_loss', 'HF39_fold1_preds', 'HF39_fold2_Ineffective',
       'HF39_fold2_Adequate', 'HF39_fold2_Effective', 'HF39_fold2_loss',
       'HF39_fold2_preds', 'HF39_fold3_Ineffective', 'HF39_fold3_Adequate',
       'HF39_fold3_Effective', 'HF39_fold3_loss', 'HF39_fold3_preds',
       'HF39_fold4_Ineffective', 'HF39_fold4_Adequate', 'HF39_fold4_Effective',
       'HF39_fold4_loss', 'HF39_fold4_preds'],
      dtype='object')

In [99]:
# First five rows, transposed so that each column of `pseudo` shows as a row.
pseudo.head().T

Unnamed: 0,0,1,2,3,4
discourse_id,1617734767734.0,1617734782429.0,1617734807715.0,1617734792635.0,1617734817866.0
labels,0,0,0,0,0
discourse_type,Position,Evidence,Evidence,Claim,Counterclaim
discourse_effectiveness,Adequate,Adequate,Adequate,Adequate,Adequate
discourse_text,"Some people belive that the so called ""face"" o...","It was not created by aliens, and there is no ...","A mesa is a naturally occuring rock formation,...","This ""face"" on mars only looks like a face bec...",Many conspiracy theorists believe that NASA is...
HF39_fold0_Ineffective,-2.111328,0.289795,-0.468994,-0.903809,-0.524902
HF39_fold0_Adequate,0.521973,1.536133,1.932617,0.946289,1.762695
HF39_fold0_Effective,-0.226807,-1.550781,-1.481445,-0.558594,-0.522949
HF39_fold0_loss,0.434884,0.287581,0.11643,0.321551,0.185
HF39_fold0_preds,"[0.64733946, 0.306155, 0.046505477]","[0.7500758, 0.034235403, 0.21568875]","[0.8900922, 0.02929048, 0.08061734]","[0.7250238, 0.16098668, 0.113989554]","[0.83110416, 0.084530346, 0.08436541]"


In [100]:
# Spread of the per-fold losses for each row (max minus min across folds).
# The loss columns are taken from the frame itself instead of assuming exactly
# five folds.
loss_cols = [
    c for c in pseudo.columns
    if c.startswith('HF39_fold') and c.endswith('_loss')
]
loss_vals = pseudo[loss_cols].values
loss_range = loss_vals.max(axis=1) - loss_vals.min(axis=1)
pseudo['loss_range'] = loss_range

In [101]:
# Smallest and largest cross-fold loss spread.
pseudo.loss_range.min(), pseudo.loss_range.max()

(0.002623407170176506, 2.895520269870758)

In [102]:
# Per-fold probability vectors for the five rows with the largest loss spread.
pseudo.sort_values('loss_range', ascending=False)[[x for x in pseudo.columns if 'preds' in x]].head()

Unnamed: 0,HF39_fold0_preds,HF39_fold1_preds,HF39_fold2_preds,HF39_fold3_preds,HF39_fold4_preds
87106,"[0.505767, 0.006890734, 0.48734227]","[0.50798327, 0.007295695, 0.484721]","[0.7231347, 0.008883374, 0.26798198]","[0.18034552, 0.00107552, 0.81857896]","[0.039967846, 0.00040773826, 0.95962447]"
130752,"[0.09543086, 0.05731924, 0.8472499]","[0.12396775, 0.015893072, 0.8601392]","[0.260195, 0.031311944, 0.7084931]","[0.01707585, 0.00682555, 0.9760986]","[0.042484287, 0.012515482, 0.9450002]"
82307,"[0.12225663, 0.43705022, 0.4406932]","[0.20252532, 0.25720632, 0.5402684]","[0.28119588, 0.47022516, 0.24857895]","[0.019404879, 0.061009668, 0.9195854]","[0.05498934, 0.17168978, 0.7733209]"
139549,"[0.24362724, 0.008143351, 0.74822944]","[0.27182373, 0.007103748, 0.7210725]","[0.41205797, 0.03210199, 0.55584]","[0.5340474, 0.022887304, 0.4430653]","[0.04258298, 0.0029674333, 0.9544496]"
75125,"[0.18851373, 0.023990247, 0.78749603]","[0.41455376, 0.0549649, 0.5304813]","[0.5050039, 0.10977614, 0.38521987]","[0.042172316, 0.024791824, 0.93303585]","[0.09072663, 0.011457413, 0.897816]"


In [103]:
# Write the combined per-fold predictions to the working directory (without the index).
pseudo.to_csv('hf_39_pseudo.csv', index=False)

In [104]:
# Load a second prediction file; it is merged with `pseudo` on discourse_id below.
psamed = pd.read_csv('../input/psl_deberta_xlarge.csv')

In [105]:
# Columns of this notebook's prediction frame.
pseudo.columns

Index(['discourse_id', 'labels', 'discourse_type', 'discourse_effectiveness',
       'discourse_text', 'HF39_fold0_Ineffective', 'HF39_fold0_Adequate',
       'HF39_fold0_Effective', 'HF39_fold0_loss', 'HF39_fold0_preds',
       'HF39_fold1_Ineffective', 'HF39_fold1_Adequate', 'HF39_fold1_Effective',
       'HF39_fold1_loss', 'HF39_fold1_preds', 'HF39_fold2_Ineffective',
       'HF39_fold2_Adequate', 'HF39_fold2_Effective', 'HF39_fold2_loss',
       'HF39_fold2_preds', 'HF39_fold3_Ineffective', 'HF39_fold3_Adequate',
       'HF39_fold3_Effective', 'HF39_fold3_loss', 'HF39_fold3_preds',
       'HF39_fold4_Ineffective', 'HF39_fold4_Adequate', 'HF39_fold4_Effective',
       'HF39_fold4_loss', 'HF39_fold4_preds', 'loss_range'],
      dtype='object')

In [106]:
# Columns of the loaded prediction file.
psamed.columns

Index(['essay_id', 'labels', 'fold_k_5_seed_42', 'fold_k_5_seed_2020',
       'fold_k_8_seed_42', 'fold_k_8_seed_2020', 'fold_k_10_seed_42',
       'fold_k_10_seed_2020', 'id', 'discourse_id', 'discourse_start',
       'discourse_end', 'discourse_text', 'discourse_type',
       'discourse_type_num', 'predictionstring', 'Ineffective', 'Adequate',
       'Effective', 'fold2_Ineffective', 'fold2_Adequate', 'fold2_Effective',
       'fold4_Ineffective', 'fold4_Adequate', 'fold4_Effective',
       'fold0_Ineffective', 'fold0_Adequate', 'fold0_Effective',
       'fold1_Ineffective', 'fold1_Adequate', 'fold1_Effective',
       'fold3_Ineffective', 'fold3_Adequate', 'fold3_Effective'],
      dtype='object')

In [107]:
# Columns taken over from the loaded prediction file: ids, labels, one fold
# assignment column and the per-fold class columns.
sel = ['essay_id', 'labels', 'fold_k_5_seed_42', 'discourse_id',
       'fold2_Ineffective', 'fold2_Adequate', 'fold2_Effective',
       'fold4_Ineffective', 'fold4_Adequate', 'fold4_Effective',
       'fold0_Ineffective', 'fold0_Adequate', 'fold0_Effective',
       'fold1_Ineffective', 'fold1_Adequate', 'fold1_Effective',
       'fold3_Ineffective', 'fold3_Adequate', 'fold3_Effective']

# Left join keeps every row of `pseudo`. Both frames carry a 'labels' column,
# so the result holds it twice, as 'labels_x' (pseudo) and 'labels_y' (psamed).
join = pd.merge(pseudo, psamed[sel], how='left', on='discourse_id')

In [108]:
# Columns of the merged frame.
join.columns

Index(['discourse_id', 'labels_x', 'discourse_type', 'discourse_effectiveness',
       'discourse_text', 'HF39_fold0_Ineffective', 'HF39_fold0_Adequate',
       'HF39_fold0_Effective', 'HF39_fold0_loss', 'HF39_fold0_preds',
       'HF39_fold1_Ineffective', 'HF39_fold1_Adequate', 'HF39_fold1_Effective',
       'HF39_fold1_loss', 'HF39_fold1_preds', 'HF39_fold2_Ineffective',
       'HF39_fold2_Adequate', 'HF39_fold2_Effective', 'HF39_fold2_loss',
       'HF39_fold2_preds', 'HF39_fold3_Ineffective', 'HF39_fold3_Adequate',
       'HF39_fold3_Effective', 'HF39_fold3_loss', 'HF39_fold3_preds',
       'HF39_fold4_Ineffective', 'HF39_fold4_Adequate', 'HF39_fold4_Effective',
       'HF39_fold4_loss', 'HF39_fold4_preds', 'loss_range', 'essay_id',
       'labels_y', 'fold_k_5_seed_42', 'fold2_Ineffective', 'fold2_Adequate',
       'fold2_Effective', 'fold4_Ineffective', 'fold4_Adequate',
       'fold4_Effective', 'fold0_Ineffective', 'fold0_Adequate',
       'fold0_Effective', 'fold1_Ineffective'

In [109]:
# Row count after the merge, to compare with the per-fold frame lengths.
len(join)

144292

In [110]:
# Per column: does it contain any missing value after the left join?
join.isna().any()

discourse_id               False
labels_x                   False
discourse_type             False
discourse_effectiveness    False
discourse_text             False
HF39_fold0_Ineffective     False
HF39_fold0_Adequate        False
HF39_fold0_Effective       False
HF39_fold0_loss            False
HF39_fold0_preds           False
HF39_fold1_Ineffective     False
HF39_fold1_Adequate        False
HF39_fold1_Effective       False
HF39_fold1_loss            False
HF39_fold1_preds           False
HF39_fold2_Ineffective     False
HF39_fold2_Adequate        False
HF39_fold2_Effective       False
HF39_fold2_loss            False
HF39_fold2_preds           False
HF39_fold3_Ineffective     False
HF39_fold3_Adequate        False
HF39_fold3_Effective       False
HF39_fold3_loss            False
HF39_fold3_preds           False
HF39_fold4_Ineffective     False
HF39_fold4_Adequate        False
HF39_fold4_Effective       False
HF39_fold4_loss            False
HF39_fold4_preds           False
loss_range

In [112]:
# Write the merged predictions into the input directory (without the index).
join.to_csv('../input/hf_39_amed_pseudo.csv', index=False)

In [113]:
# IPython automagic: list the files in the current working directory.
ls

 2-debertas-test.ipynb
 2021_data_for_pseudo_mlm.csv
 25-28.ipynb
 Create_MLM_dataset.ipynb
 EDA.ipynb
'Error analysis.ipynb'
 HF-0.ipynb
 HF-1.ipynb
 HF-10.ipynb
 HF-11.ipynb
 HF-12.ipynb
 HF-13.ipynb
 HF-14.ipynb
 HF-15.ipynb
 HF-16.ipynb
 HF-17.ipynb
 HF-18.ipynb
 HF-18b.ipynb
 HF-19.ipynb
 HF-19b.ipynb
 HF-2.ipynb
 HF-20.ipynb
 HF-21.ipynb
 HF-22.ipynb
 HF-23.ipynb
 HF-24-scr.ipynb
 HF-24.ipynb
 HF-3.ipynb
 HF-33.ipynb
 HF-34.ipynb
 HF-35.ipynb
 HF-36.ipynb
 HF-37.ipynb
 HF-38.ipynb
 HF-39.ipynb
 HF-4.ipynb
 HF-5.ipynb
 HF-6.ipynb
 HF-7.ipynb
 HF-8.ipynb
 HF-9.ipynb
 HF-pret-1.ipynb
 HF-pret-2-script.ipynb
 HF-pret-2.ipynb
 HF-pret-3.ipynb
 HF-pret-3.py
 HF-pret-4.py
 HF-pret-5.py
 HF-pret-6.py
 HF-pret-7.ipynb
 HF-pret-7.py
 OOF-39.ipynb
 PL-1.ipynb
 PL-10.ipynb
 PL-11.ipynb
 PL-12.ipynb
 PL-13.ipynb
 PL-14.ipynb
 PL-15.ipynb
 PL-16-all.ipynb
 PL-16.ipynb
 PL-17-all.ipynb
 PL-18-all.ipynb
 PL-19-all.ipynb
 PL-2.ipynb
 PL-20-all.ipynb
 PL-21-all.ipynb
 PL-22-all.ipynb
 PL-23-all.ip