In [1]:
# When True, the data-loading cell keeps only the first 100 rows of train.csv
# so the whole notebook runs quickly.
DEBUG = False

In [2]:
# Notebook-wide configuration.
#   num_proc            - worker processes used when reading the essay files
#   model_name_or_path  - checkpoint whose tokenizer is loaded
#   data_dir            - directory holding train.csv and the train/ essays
#   trainingargs.seed   - seed passed to transformers.set_seed
cfg = dict(
    num_proc=2,
    model_name_or_path="microsoft/deberta-v3-large",
    data_dir="../input/feedback-prize-effectiveness",
    trainingargs=dict(seed=42),
)

In [3]:
import re
import pickle
import codecs
import warnings
import logging
from functools import partial
from pathlib import Path
from itertools import chain
from text_unidecode import unidecode
from typing import Any, Optional, Tuple

import pandas as pd
from sklearn.model_selection import KFold
from transformers import AutoTokenizer, set_seed

from datasets import Dataset, load_from_disk

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: encode the offending slice as UTF-8 instead.

    Args:
        error: the encode error; ``error.object[error.start:error.end]`` is
            the text that could not be encoded.

    Returns:
        The UTF-8 bytes for that slice and the position at which encoding
        should resume.
    """
    resume_at = error.end
    unencodable = error.object[error.start : resume_at]
    return unencodable.encode("utf-8"), resume_at

def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the offending bytes as cp1252 instead.

    Args:
        error: the decode error; ``error.object[error.start:error.end]`` is
            the byte slice that could not be decoded.

    Returns:
        The cp1252 decoding of that slice and the position at which decoding
        should resume.
    """
    resume_at = error.end
    undecodable = error.object[error.start : resume_at]
    return undecodable.decode("cp1252"), resume_at

# Register the two handlers under the names that are passed as ``errors=...``
# to str.encode / bytes.decode in resolve_encodings_and_normalize.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed UTF-8 / cp1252 text and pass it through ``unidecode``.

    The text is round-tripped through bytes several times. Whenever a step
    cannot decode as UTF-8 or encode as cp1252, the registered
    ``replace_decoding_with_cp1252`` / ``replace_encoding_with_utf8`` error
    handlers substitute the other encoding for the offending slice.

    Args:
        text: the raw string to clean.

    Returns:
        The repaired string after ``unidecode`` (from ``text_unidecode``;
        presumably an ASCII transliteration - confirm against that package).
    """
    escaped_bytes = text.encode("raw_unicode_escape")
    first_pass = escaped_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)

def read_text_files(example, data_dir):
    """Load one essay from disk and store its cleaned text on ``example``.

    Args:
        example: mapping with an ``"essay_id"`` entry. It is modified in
            place: a ``"text"`` entry is added.
        data_dir: directory (``str`` or ``Path``) that contains a ``train``
            sub-directory with one ``<essay_id>.txt`` file per essay.

    Returns:
        The same ``example`` object, now carrying ``"text"``.

    Raises:
        OSError: if the essay file cannot be opened.
    """
    essay_path = Path(data_dir) / "train" / f"{example['essay_id']}.txt"

    # Pin the encoding: without it open() falls back to the locale's default,
    # which is not UTF-8 on every platform.
    with open(essay_path, "r", encoding="utf-8") as fp:
        example["text"] = resolve_encodings_and_normalize(fp.read())

    return example

# Seed the random number generators via transformers.set_seed so runs repeat.
set_seed(cfg["trainingargs"]["seed"])

# Keep the notebook output quiet: ignore Python warnings and disable log
# records at WARNING severity and below.
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
data_dir = Path(cfg["data_dir"])

# One row per discourse element.
train_df = pd.read_csv(data_dir / "train.csv")

# Debug runs only look at the first 100 discourse rows.
if DEBUG: train_df = train_df[:100]

# One dataset row per distinct essay, so each essay file is read once.
text_ds = Dataset.from_dict({"essay_id": train_df.essay_id.unique()})

# Read and normalize the essay files using cfg["num_proc"] worker processes.
text_ds = text_ds.map(
    partial(read_text_files, data_dir=data_dir),
    num_proc=cfg["num_proc"],
    batched=False,
    desc="Loading text files",
)

text_df = text_ds.to_pandas()

# Apply the same normalization to the discourse texts as to the essays, so
# they can later be located inside the essay text.
train_df["discourse_text"] = [
    resolve_encodings_and_normalize(x) for x in train_df["discourse_text"]
]

# Attach the essay text to every discourse row.
# NOTE(review): this rebinds train_df, so re-running the cell without
# reloading train.csv merges "text" a second time.
train_df = train_df.merge(text_df, on="essay_id", how="left")
    
# Discourse type names (alphabetical).
disc_types = [
    "Claim", "Concluding Statement", "Counterclaim", "Evidence",
    "Lead", "Position", "Rebuttal",
]

# Discourse type -> integer class id; the id is the position in this list.
type2id = {
    type_name: class_id
    for class_id, type_name in enumerate(
        [
            "Lead",
            "Position",
            "Claim",
            "Evidence",
            "Counterclaim",
            "Rebuttal",
            "Concluding Statement",
            "Other",
        ]
    )
}

# Effectiveness rating -> integer score id, again by list position.
label2id = {
    rating: score_id
    for score_id, rating in enumerate(["Ineffective", "Adequate", "Effective"])
}

# Tokenizer of the configured checkpoint; read as a global by tokenize().
tokenizer = AutoTokenizer.from_pretrained(cfg["model_name_or_path"])

Loading text files #0:   0%|                                                | 0/2096 [00:00<?, ?ex/s]
Loading text files #0:  11%|███▊                                | 221/2096 [00:00<00:00, 2198.06ex/s][A
Loading text files #0:  21%|███████▌                            | 442/2096 [00:00<00:00, 2203.35ex/s][A
Loading text files #0:  32%|███████████▍                        | 669/2096 [00:00<00:00, 2232.01ex/s][A
Loading text files #0:  43%|███████████████▎                    | 893/2096 [00:00<00:00, 2183.60ex/s][A
Loading text files #1:  42%|██████████████▉                     | 870/2095 [00:00<00:00, 2153.03ex/s][A
Loading text files #0:  53%|██████████████████▌                | 1112/2096 [00:00<00:00, 2132.73ex/s][A
Loading text files #0:  76%|██████████████████████████▍        | 1584/2096 [00:00<00:00, 2252.79ex/s][A
Loading text files #0:  87%|██████████████████████████████▌    | 1832/2096 [00:00<00:00, 2323.19ex/s][A
Loading text files #0: 100%|██████████████████████████████

In [5]:
def find_positions(example):
    """Locate every discourse text of one essay inside the essay string.

    The discourse texts are searched in the order given. Each search prefers
    the first occurrence that starts at or after the end of the previously
    located discourse text, so two identical discourse texts resolve to two
    different spans rather than to the same one.

    Args:
        example: mapping where ``example["text"][0]`` is the essay string and
            ``example["discourse_text"]`` is an iterable of discourse strings
            (presumably listed in essay order - the search relies on that).

    Returns:
        A list with one entry per discourse text: ``[start, end]`` character
        offsets of the stripped discourse text within the essay, or ``[-1]``
        when it cannot be located. A diagnostic is printed for every miss.
    """
    text = example["text"][0]

    # End offset of the most recently located discourse text; occurrences
    # that start before it are only used as a last resort.
    min_idx = 0

    # Start and end offsets of the discourse texts, in input order.
    idxs = []

    for dt in example["discourse_text"]:
        # Calling strip is essential: surrounding whitespace in the discourse
        # text need not be present in the essay.
        needle = dt.strip()

        # An empty pattern would match at every position, so an empty
        # discourse text is treated as not found.
        matches = list(re.finditer(re.escape(needle), text)) if needle else []

        if not matches:
            idxs.append([-1])  # will filter out later
            print('!!!! MISS !!!')
            print(needle)
            print('!!here!!')
            print(text)
            print()
            continue

        # First occurrence past the previous discourse text; when every
        # occurrence lies before it, fall back to the last occurrence.
        m = next((c for c in matches if c.start() >= min_idx), matches[-1])

        idxs.append([m.start(), m.end()])
        min_idx = m.end()

    return idxs

def tokenize(example):
    """Tokenize one essay and build per-token discourse labels.

    The essay is cut into chunks: the located discourse texts and the text
    between them. Each chunk is tokenized on its own (no special tokens) and
    the pieces are concatenated between a CLS and a SEP token. Text after the
    last located discourse text is not tokenized.

    Every token of a discourse chunk is labelled with that discourse row's
    class id, score id and row index. The row index is the position of the
    discourse in ``example["discourse_text"]``, so a discourse text that could
    not be located does not shift the labels of the ones that follow it.
    Tokens outside discourse texts, and CLS/SEP, get ``-100``.

    Relies on the globals ``tokenizer``, ``type2id``, ``label2id`` and on
    ``find_positions``.

    Args:
        example: one essay, with list-valued ``"text"``, ``"discourse_text"``,
            ``"discourse_type"`` and ``"discourse_effectiveness"`` entries
            (one element per discourse row). It is modified in place.

    Returns:
        The same ``example`` with these entries added: ``idxs``,
        ``input_ids``, ``attention_mask``, ``token_class_labels``,
        ``token_scores_labels``, ``token_examples_mapping``,
        ``examples_scores`` and ``examples_classes``.

    Raises:
        AssertionError: if a located discourse text tokenizes to nothing.
        KeyError: if a discourse type or effectiveness value is unknown.
    """
    example["idxs"] = find_positions(example)

    # Newlines become a visible one-character marker; the replacement has the
    # same length, so offsets computed on the original text still line up.
    text = example["text"][0].replace('\n', '|')

    examples_classes = [type2id[disc_type] for disc_type in example["discourse_type"]]
    examples_scores = [label2id[disc_effect] for disc_effect in example["discourse_effectiveness"]]
    assert len(examples_classes) == len(examples_scores)

    # (chunk_text, discourse_row) pairs in essay order; discourse_row is None
    # for text that lies between discourse texts.
    chunks = []
    prev = 0

    for disc_row, span in enumerate(example["idxs"]):
        # The discourse text wasn't found: it contributes no tokens, but its
        # row index stays reserved so later rows keep their own labels.
        if span == [-1]:
            continue

        start, end = span

        # Text between the previous discourse text and this one.
        if start != prev:
            chunks.append((text[prev:start], None))

        chunks.append((text[start:end], disc_row))
        prev = end

    input_ids = [tokenizer.cls_token_id]
    token_class_labels = [-100]
    token_scores_labels = [-100]
    token_examples_mapping = [-100]

    for chunk_text, disc_row in chunks:
        chunk_ids = tokenizer(
            chunk_text, padding=False, truncation=False, add_special_tokens=False
        )['input_ids']

        if len(chunk_ids) == 0:
            # Only in-between text may tokenize to nothing.
            assert disc_row is None, "located discourse text produced no tokens"
            continue

        if disc_row is None:
            class_label = score_label = mapping = -100
        else:
            class_label = examples_classes[disc_row]
            score_label = examples_scores[disc_row]
            mapping = disc_row

        n_tokens = len(chunk_ids)
        input_ids.extend(chunk_ids)
        token_class_labels += [class_label] * n_tokens
        token_scores_labels += [score_label] * n_tokens
        token_examples_mapping += [mapping] * n_tokens

    input_ids += [tokenizer.sep_token_id]
    token_class_labels += [-100]
    token_scores_labels += [-100]
    token_examples_mapping += [-100]
    attention_mask = [1] * len(input_ids)

    example['input_ids'] = input_ids
    example['attention_mask'] = attention_mask
    example['token_class_labels'] = token_class_labels
    example['token_scores_labels'] = token_scores_labels
    example['token_examples_mapping'] = token_examples_mapping
    example['examples_scores'] = examples_scores
    example['examples_classes'] = examples_classes

    return example

In [6]:

# Collapse the discourse rows into one row per essay: every remaining column
# (discourse_text, discourse_type, discourse_effectiveness, text, ...)
# becomes a list with one element per discourse row of that essay.
grouped = train_df.groupby(["essay_id"]).agg(list)

ds = Dataset.from_pandas(grouped)

# Tokenize essay by essay; tokenize() prints a diagnostic for every discourse
# text it cannot locate in its essay.
ds = ds.map(
    tokenize,
    batched=False,
    
)   

 98%|█████████████████████████████████████████████████████████▋ | 4097/4191 [00:27<00:00, 133.62ex/s]

!!!! MISS !!!
This whole thing is point less how they have us in here for two days im missing my education. We could have finished this in one day and had the rest of the week to get back on the track of learning. I've missed both days of weight lifting, algebra, and my world history that i do not want to fail again! If their are any people actually gonna sit down and take the time to read this then

DO NOT DO THIS NEXT YEAR

.

They are giving us cold lunches. ham and cheese and an apple, I am 16 years old and my body needs proper food. I wouldnt be complaining if they served actual breakfast. but because of Michelle Obama and her healthy diet rule they surve us 1 poptart in the moring. How does the school board expect us to last from 7:05-12:15 on a pop tart? then expect us to get A's, we are more focused on lunch than anything else. I am about done so if you have the time to read this even though this does not count. Bring PROPER_NAME a big Mac from mc donalds, SCHOOL_NAME, (idk are

100%|███████████████████████████████████████████████████████████| 4191/4191 [00:27<00:00, 150.21ex/s]


In [7]:
# Show the columns and row count of the tokenized dataset.
ds

Dataset({
    features: ['discourse_id', 'discourse_text', 'discourse_type', 'discourse_effectiveness', 'text', 'essay_id', 'idxs', 'input_ids', 'attention_mask', 'token_class_labels', 'token_scores_labels', 'token_examples_mapping', 'examples_scores', 'examples_classes'],
    num_rows: 4191
})

In [8]:
# Collect essays whose token-to-discourse mapping does not account for every
# discourse text. For a clean essay both of these equal the number of
# discourse texts:
#   * the count of distinct mapping values, minus one for the -100 filler
#   * the highest mapped discourse index, plus one
bad_matches = [
    (essay_id, scores, token_ids, texts)
    for essay_id, scores, token_ids, texts, mapping in zip(
        ds["essay_id"],
        ds["examples_scores"],
        ds["input_ids"],
        grouped.discourse_text,
        ds["token_examples_mapping"],
    )
    if len(set(mapping)) - 1 != len(texts) or max(mapping) + 1 != len(texts)
]

print("Num bad matches", len(bad_matches))
# temp = train_df[train_df["essay_id"]==bad_matches[0][0]]
# temp_txt = temp.text.values[0]
# print(temp_txt)
# print("*"*100)
# print([x for x in temp.discourse_text if x.strip() not in temp_txt])

Num bad matches 1


In [14]:
# Eyeball the first essay: its discourse texts, then the decoded token ids,
# then the essay text as read from disk, separated by rows of asterisks.
for t in ds[0]["discourse_text"]:
    print(t, "\n")
print("*"*100)
print(tokenizer.decode(ds[0]["input_ids"]))
print("*"*100)
print(ds[0]["text"][0])

Driverless cars are exaclty what you would expect them to be. Cars that will drive without a person actually behind the wheel controlling the actions of the vehicle. The idea of driverless cars going in to developement shows the amount of technological increase that the wolrd has made. The leader of this idea of driverless cars are the automobiles they call Google cars. The arduous task of creating safe driverless cars has not been fully mastered yet.  

The developement of these cars should be stopped immediately because there are too many hazardous and dangerous events that could occur.  

the driver will be alerted when they will need to take over the driving responsibilites of the car.  

This is such a dangerous thing because we all know that whenever humans get their attention drawn in on something interesting it is hard to draw their focus somewhere else. The article explains that companies are trying to implement vibrations when the car is in trouble. Their are some people out 

In [18]:
# Tokenized essays as a DataFrame.
pdf = ds.to_pandas()

# Fold assignment per essay; only the id and fold columns are kept.
essays = pd.read_csv('../input/feedback-effective-folds/essay_scores.csv')[['essay_id', 'fold']]
essays.head()

Unnamed: 0,essay_id,fold
0,F98E8D4EA700,0
1,66BB82BD76B2,0
2,85F4C57672EA,0
3,06936C8AA35D,0
4,61C3ADEA1DD5,0


In [19]:
# Attach the fold of each essay; the left join keeps every tokenized essay.
pdfm = pd.merge(pdf, essays, on='essay_id', how='left')

In [20]:
# Sanity check: the merge should neither add nor drop rows, so the first two
# lengths are expected to be equal.
len(pdf), len(pdfm), len(essays)

(4191, 4191, 4191)

In [33]:
# True when every essay has at least one token label.
all(len(labels) > 0 for labels in pdfm.token_class_labels.values)

True

In [34]:
# NOTE(review): leftover exploration cell - lists the working directory via
# IPython's automagic; nothing later in the notebook depends on it.
ls

Create_MLM_dataset.ipynb  PL-16-all.ipynb
HF-0.ipynb                PL-16.ipynb
HF-1.ipynb                PL-17-all.ipynb
HF-10.ipynb               PL-2.ipynb
HF-11.ipynb               PL-3.ipynb
HF-12.ipynb               PL-4.ipynb
HF-13.ipynb               PL-5.ipynb
HF-14.ipynb               PL-6.ipynb
HF-2.ipynb                PL-7.ipynb
HF-3.ipynb                PL-8.ipynb
HF-4.ipynb                PL-9.ipynb
HF-5.ipynb                deb619.py
HF-6.ipynb                feedback-effective-baseline-3.ipynb
HF-7.ipynb                feedback-effective-create-labels-deb-v3-1024.ipynb
HF-8.ipynb                feedback-effective-create-labels-deb-v3-all.ipynb
HF-9.ipynb                feedback-effective-create-labels.ipynb
PL-1.ipynb                hf3infer.py
PL-10.ipynb               processed-deberta-v3-large-all.pickle
PL-11.ipynb               processed-deberta-v3-large-nbroad.pickle
PL-12.ipynb               processed-deberta-v3-large.pickle
PL-13.ipynb               tokenize_nb

In [27]:
# Number of essays per fold.
pdfm.fold.value_counts()

0    839
1    838
4    838
2    838
3    838
Name: fold, dtype: int64

In [42]:
# pdfm.input_ids.loc[313]

In [38]:
# Columns whose per-essay values are sequences produced by tokenize().
list_cols = [
    'input_ids',
    'attention_mask',
    'token_class_labels',
    'token_scores_labels',
    'token_examples_mapping',
    'examples_scores',
    'examples_classes',
]

In [40]:
# Turn the per-essay sequence columns into plain Python lists.
# The values are presumably numpy arrays coming out of ds.to_pandas() (they
# expose .tolist()). Values without .tolist() - for instance lists left by an
# earlier run of this cell - go through list() instead, so running the cell
# again does not raise.
for c in list_cols:
    pdfm[c] = [x.tolist() if hasattr(x, "tolist") else list(x) for x in pdfm[c].values]

In [43]:
# Save the processed frame next to the notebook. `pickle` and `Path` come from
# the imports cell at the top of the notebook.
# The file is named after the last path component of the model id, which works
# for hub ids ("org/model"), bare model names and local checkpoint paths alike.
model_tag = Path(cfg["model_name_or_path"]).name
with open(f'processed-{model_tag}-nbroad.pickle', 'wb') as handle:
    pickle.dump(pdfm, handle, protocol=pickle.HIGHEST_PROTOCOL)