In [42]:
# Debug switch: when True, the data-loading cell down-samples train_df to
# 100 randomly chosen rows before the text files are read.
DEBUG = False

In [43]:
# Experiment configuration shared by the notebook cells.
# NOTE(review): within the cells visible here only "num_proc" and
# trainingargs["seed"] are read; the remaining keys are presumably consumed by
# training code elsewhere — confirm before pruning anything.
cfg = {
    # --- data loading / preprocessing ---
    "num_proc": 2,  # worker processes handed to Dataset.map
    "aug_prob": 0.03,
    "k_folds": 5,
    "max_length": 2048,
    "padding": False,
    "stride": 0,
    "data_dir": "../input/feedback-prize-effectiveness",
    "load_from_disk": None,
    "pad_multiple": 8,
    # --- model ---
    "model_name_or_path": "microsoft/deberta-v3-large",
    "dropout": 0.1,
    # --- trainer settings ---
    # presumably forwarded to transformers.TrainingArguments — TODO confirm
    "trainingargs": {
        "output_dir": "../output/",
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 8,
        "per_device_eval_batch_size": 4,
        "learning_rate": 9e-6,
        "weight_decay": 0.01,
        "num_train_epochs": 3,
        "warmup_ratio": 0.1,
        "optim": "adamw_torch",
        "logging_steps": 50,
        "save_strategy": "steps",
        "evaluation_strategy": "steps",
        "eval_steps": 50,
        "eval_delay": 600,
        "report_to": "wandb",
        "group_by_length": True,
        "save_total_limit": 1,
        "metric_for_best_model": "loss",
        "greater_is_better": False,
        "seed": 42,  # also used to seed the global RNGs via set_seed
        "fp16": True,
        "gradient_checkpointing": True,
        "gradient_accumulation_steps": 1,
    },
}

In [44]:
import re
import pickle
import codecs
import warnings
import logging
from functools import partial
from pathlib import Path
from itertools import chain
from text_unidecode import unidecode
from typing import Any, Optional, Tuple

import pandas as pd
from sklearn.model_selection import KFold
from transformers import AutoTokenizer, set_seed

from datasets import Dataset, load_from_disk

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: substitute the failing slice with its UTF-8 bytes.

    Args:
        error: The encoding error; its ``object``, ``start`` and ``end``
            attributes identify the slice that could not be encoded.

    Returns:
        A ``(replacement_bytes, resume_position)`` pair, where the resume
        position is ``error.end``.
    """
    resume_at = error.end
    unencodable = error.object[error.start : resume_at]
    return unencodable.encode("utf-8"), resume_at

def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the failing byte slice as cp1252 instead.

    Args:
        error: The decoding error; its ``object``, ``start`` and ``end``
            attributes identify the bytes that could not be decoded.

    Returns:
        A ``(replacement_text, resume_position)`` pair, where the resume
        position is ``error.end``.
    """
    resume_at = error.end
    undecodable = error.object[error.start : resume_at]
    return undecodable.decode("cp1252"), resume_at

# Make both fallback handlers available by name to the ``errors=`` argument of
# str.encode / bytes.decode. Registration order matches the original.
for _handler_name, _handler in (
    ("replace_encoding_with_utf8", replace_encoding_with_utf8),
    ("replace_decoding_with_cp1252", replace_decoding_with_cp1252),
):
    codecs.register_error(_handler_name, _handler)

def resolve_encodings_and_normalize(text: str) -> str:
    """Round-trip ``text`` through several codecs, then apply ``unidecode``.

    The steps, in order:
      1. encode with ``raw_unicode_escape``;
      2. decode as UTF-8, falling back to the ``replace_decoding_with_cp1252``
         handler for bytes that are not valid UTF-8;
      3. encode as cp1252, falling back to the ``replace_encoding_with_utf8``
         handler for characters cp1252 cannot represent;
      4. decode as UTF-8 again with the same decoding fallback;
      5. pass the result through ``unidecode``.

    Both fallback handlers must already be registered with
    ``codecs.register_error`` under those names.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    decode_fallback = "replace_decoding_with_cp1252"
    encode_fallback = "replace_encoding_with_utf8"

    escaped = text.encode("raw_unicode_escape")
    first_pass = escaped.decode("utf-8", errors=decode_fallback)
    as_cp1252 = first_pass.encode("cp1252", errors=encode_fallback)
    repaired = as_cp1252.decode("utf-8", errors=decode_fallback)

    return unidecode(repaired)

def read_text_files(example, data_dir):
    """Attach the cleaned contents of ``<data_dir>/train/<id>.txt`` to ``example``.

    Args:
        example: Mapping with an ``"id"`` key naming the text file (without the
            ``.txt`` suffix). It is modified in place.
        data_dir: Directory that contains the ``train`` folder; a ``Path`` or a
            plain string path.

    Returns:
        The same ``example`` object with a new ``"text"`` entry holding the
        file contents after ``resolve_encodings_and_normalize``.

    Raises:
        OSError: If the text file cannot be opened.
    """
    text_path = Path(data_dir) / "train" / f"{example['id']}.txt"

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(text_path, "r", encoding="utf-8") as fp:
        example["text"] = resolve_encodings_and_normalize(fp.read())

    return example

# Seed the global RNGs from the configured training seed.
set_seed(seed=cfg["trainingargs"]["seed"])

# Silence all Python warnings and every log record at WARNING level or below
# for the rest of the session.
warnings.simplefilter(action="ignore")
logging.disable(level=logging.WARNING)

In [45]:
# Directory holding train.csv and the train/ folder of per-id .txt files.
data_dir = Path("../input/2021")

train_df = pd.read_csv(data_dir / "train.csv")

if DEBUG:
    # Debug runs work on a random 100-row sample of the CSV.
    train_df = train_df.sample(n=100).reset_index(drop=True)

# One row per distinct id; the matching text file is read and cleaned for each
# row by read_text_files, using cfg["num_proc"] worker processes.
text_ds = Dataset.from_dict({"id": train_df["id"].unique()}).map(
    partial(read_text_files, data_dir=data_dir),
    batched=False,
    num_proc=cfg["num_proc"],
    desc="Loading text files",
)

Loading text files #0:   0%|                                                | 0/7797 [00:00<?, ?ex/s]
Loading text files #0:   3%|▉                                   | 195/7797 [00:00<00:03, 1942.48ex/s][A
Loading text files #0:   5%|█▊                                  | 390/7797 [00:00<00:03, 1929.01ex/s][A
Loading text files #0:   8%|██▋                                 | 593/7797 [00:00<00:03, 1972.93ex/s][A
Loading text files #0:  10%|███▋                                | 791/7797 [00:00<00:03, 1961.41ex/s][A
Loading text files #0:  13%|████▌                               | 988/7797 [00:00<00:03, 1890.83ex/s][A
Loading text files #0:  15%|█████▎                             | 1178/7797 [00:00<00:03, 1819.10ex/s][A
Loading text files #0:  18%|██████▎                            | 1393/7797 [00:00<00:03, 1918.80ex/s][A
Loading text files #0:  21%|███████▏                           | 1604/7797 [00:00<00:03, 1975.40ex/s][A
Loading text files #0:  23%|████████                      

In [46]:
# Display the dataset summary (feature names and row count).
text_ds

Dataset({
    features: ['id', 'text'],
    num_rows: 15594
})

In [47]:
# Peek at the first record to sanity-check the loaded text.
text_ds[0]

{'id': '423A1CA112E2',
 'text': "Phones\n\nModern humans today are always on their phone. They are always on their phone more than 5 hours a day no stop .All they do is text back and forward and just have group Chats on social media. They even do it while driving. They are some really bad consequences when stuff happens when it comes to a phone. Some certain areas in the United States ban phones from class rooms just because of it.\n\nWhen people have phones, they know about certain apps that they have .Apps like Facebook Twitter Instagram and Snapchat. So like if a friend moves away and you want to be in contact you can still be in contact by posting videos or text messages. People always have different ways how to communicate with a phone. Phones have changed due to our generation.\n\nDriving is one of the way how to get around. People always be on their phones while doing it. Which can cause serious Problems. That's why there's a thing that's called no texting while driving. That's 

In [48]:
# Persist the id/text dataset to disk.
# NOTE(review): the "2021mlm" suffix suggests this feeds a masked-LM
# pretraining step — confirm against the consumer of this directory.
text_ds.save_to_disk("../input/2021mlm")

In [49]:
# Hold out 5% of the rows as the "test" split. The seed is passed explicitly so
# the partition is identical every time this cell runs, rather than depending
# on the ambient random state (e.g. how many cells were re-run beforehand).
split_ds = text_ds.train_test_split(
    test_size=0.05,
    seed=cfg["trainingargs"]["seed"],
)

In [50]:
# Display the resulting train/test splits and their row counts.
split_ds

DatasetDict({
    train: Dataset({
        features: ['id', 'text'],
        num_rows: 14814
    })
    test: Dataset({
        features: ['id', 'text'],
        num_rows: 780
    })
})

In [51]:
import json

# Export each split as JSON Lines: one {"text": ...} object per line.
# The "test" split is written out under the name val.json.
for out_name, split_name in (("train.json", "train"), ("val.json", "test")):
    with open(out_name, "w") as fp:
        fp.writelines(
            json.dumps({"text": body}) + "\n"
            for body in split_ds[split_name]["text"]
        )