In [4]:
import argparse
import os
import json
from pathlib import Path
from transformers import Trainer
import pandas as pd
from tqdm.auto import tqdm
import torch
from datasets import load_dataset
import tokenizers
import transformers
import math
from transformers import AutoTokenizer, AutoConfig
from transformers import DataCollatorForLanguageModeling, AutoModelForMaskedLM, Trainer
from transformers import TrainingArguments
from transformers.utils import logging
from datasets import Dataset
from transformers import TrainingArguments
# logging.set_verbosity_info()
# logger = logging.get_logger(__name__)
# logger.info("INFO")
# logger.warning("WARN")

import os
# Environment switches for the tokenizers library and the Weights & Biases integration.
os.environ["TOKENIZERS_PARALLELISM"] = "False"
os.environ["WANDB_DISABLED"] = "true"
from pathlib import Path
# Select the device: the first CUDA GPU when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [5]:
from types import SimpleNamespace

# Experiment settings, gathered in one namespace so they can be logged and tuned together.
# Keyword order is kept because the attributes are later iterated in insertion order.
config = SimpleNamespace(
    exp_id='deberta-v3-large',
    chunk_size=2048,
    warmup_ratio=0.06,
    max_length=2048,
    batch_size=2,
    epoch=5,
    num_workers=14,
    learning_rate=1e-5,
    seed=42,
    mlm_probability=0.15,
    debug=False,
)

In [6]:
# Pretrained checkpoint name plus the default data/output locations (under autodl-tmp/).
model_path = 'microsoft/deberta-v3-large'  
INPUT_DIR = 'autodl-tmp/data/'

# Output location is derived from the experiment id.
OUTPUT_DIR = f'autodl-tmp/result/{config.exp_id}/mlm-{config.exp_id}/'
data_path = "autodl-tmp/data/feedback-prize-2021/train.csv" 
TRAIN_DIR = 'autodl-tmp/data/feedback-prize-2021/train/'

# Optional override: locations on a mounted Google Drive.
# NOTE(review): TRAIN_DIR is not overridden in this branch, so it keeps the default above - confirm.
use_colab = False
if use_colab:
    INPUT_DIR = '/content/drive/MyDrive/feedback2022/data/'
    OUTPUT_DIR = '/content/drive/MyDrive/feedback2022/result/' + config.exp_id + '/'
    data_path = "/content/drive/MyDrive/feedback2022/data/feedback-prize-2021/train.csv" 
    
# Optional override for Kaggle.
# NOTE(review): INPUT_DIR and data_path still point at /content/drive/... here - confirm this is intended.
use_kaggle = False
if use_kaggle:
    INPUT_DIR = '/content/drive/MyDrive/feedback2022/data/'
    OUTPUT_DIR = './' + config.exp_id + '/'
    data_path = "/content/drive/MyDrive/feedback2022/data/feedback-prize-2021/train.csv"  
    TRAIN_DIR = '../input/feedback-prize-2021/train' 

In [8]:
# Read every essay under TRAIN_DIR into one record per file: {'text': <contents>, 'id': <file stem>}.
# The listing is sorted because os.listdir returns entries in arbitrary order, which would make
# the row order (and therefore the seeded train/test split later on) differ between machines.
# Only *.txt entries are read so stray directories or files (e.g. notebook checkpoints) are skipped.
essay_files = sorted(name for name in os.listdir(TRAIN_DIR) if name.endswith('.txt'))

train_data = []
for i in tqdm(essay_files):
    # Explicit encoding keeps decoding independent of the platform's locale default.
    with open(os.path.join(TRAIN_DIR, i), 'r', encoding='utf-8') as f:
        train_data.append({'text': f.read(), 'id': os.path.splitext(i)[0]})

df_train = pd.DataFrame(train_data)
df_train.info()

  0%|          | 0/15594 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15594 entries, 0 to 15593
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    15594 non-null  object
 1   id      15594 non-null  object
dtypes: object(2)
memory usage: 243.8+ KB


In [10]:
import os
# Empty OUTPUT_DIR from the bottom up: at each level delete the files first, then the
# (by now empty) sub-directories. OUTPUT_DIR itself is left in place.
for current_dir, subdir_names, file_names in os.walk(OUTPUT_DIR, topdown=False):
    for file_name in file_names:
        file_path = os.path.join(current_dir, file_name)
        os.remove(file_path)
    for subdir_name in subdir_names:
        subdir_path = os.path.join(current_dir, subdir_name)
        os.rmdir(subdir_path)

In [11]:
# Create OUTPUT_DIR together with any missing parent directories. OUTPUT_DIR is a nested
# path, and os.mkdir raises FileNotFoundError when an intermediate directory is absent.
# exist_ok=True makes the cell safe to re-run.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [13]:
from datetime import datetime
# Timestamp captured once; it is used below to name the log file.
dt=datetime.now()
def get_logger(filename):
    """Return this module's logger, writing bare messages to the console and to a file.

    Args:
        filename: Path of the log file without extension; ".log" is appended.

    Returns:
        The logger named after ``__name__``, set to INFO level, with exactly one
        stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so handlers from an earlier call
    # (e.g. re-running the notebook cell) are still attached. Remove and close them first;
    # otherwise every message is emitted once per previous call and file handles leak.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger
# Notebook-wide logger; the log file is OUTPUT_DIR/train_<YYYY-mm-dd-HH-MM>.log (get_logger appends ".log").
LOGGER = get_logger(filename=OUTPUT_DIR+'train_{}'.format(dt.strftime('%Y-%m-%d-%H-%M')))

In [14]:
# Debug mode: keep only the first five rows of the training frame.
if config.debug == True:
    df_train = df_train[:5]
df_train.head()

Unnamed: 0,text,id
0,"Some people belive that the so called ""face"" o...",0000D23A521A
1,Driverless cars are exaclty what you would exp...,00066EA9880D
2,Dear: Principal\n\nI am arguing against the po...,000E6DE9E817
3,Would you be able to give your car up? Having ...,001552828BD0
4,I think that students would benefit from learn...,0016926B079C


In [15]:
# https://www.kaggle.com/competitions/feedback-prize-2021/discussion/313330
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: substitute the un-encodable span with its UTF-8 bytes.

    Returns the replacement bytes and the position at which encoding should resume.
    """
    offending_text = error.object[error.start : error.end]
    resume_at = error.end
    return offending_text.encode("utf-8"), resume_at


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: substitute the un-decodable span with its cp1252 decoding.

    Returns the replacement text and the position at which decoding should resume.
    """
    offending_bytes = error.object[error.start : error.end]
    resume_at = error.end
    return offending_bytes.decode("cp1252"), resume_at

# Make both handlers available by name through the `errors=` argument of encode()/decode().
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed-encoding text and run the result through `unidecode`.

    The text is round-tripped through bytes twice, using the custom error handlers
    registered under "replace_decoding_with_cp1252" and "replace_encoding_with_utf8",
    so that spans failing in one codec are handled by the other.
    """
    # Step 1: to bytes via raw_unicode_escape.
    escaped_bytes = text.encode("raw_unicode_escape")
    # Step 2: read as UTF-8, decoding invalid spans as cp1252 instead.
    first_pass = escaped_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    # Step 3: back to bytes as cp1252, emitting UTF-8 for characters cp1252 cannot represent.
    cp1252_bytes = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    # Step 4: final UTF-8 read with the same cp1252 fallback.
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    # Step 5: normalize remaining abnormal characters.
    return unidecode(repaired)

In [16]:
# https://www.kaggle.com/code/brandonhu0215/feedback-deberta-large-lb0-619
# Repair/normalize every essay; the function is handed to apply() directly.
df_train['text'] = df_train['text'].apply(resolve_encodings_and_normalize)

CPU times: user 8.87 s, sys: 0 ns, total: 8.87 s
Wall time: 8.87 s


In [17]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15594 entries, 0 to 15593
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    15594 non-null  object
 1   id      15594 non-null  object
dtypes: object(2)
memory usage: 243.8+ KB


In [18]:
# Wrap the DataFrame in a `datasets.Dataset`, then drop the DataFrame reference.
dataset = Dataset.from_pandas(df_train, split='train')
# NOTE(review): after this `del`, cells that read df_train cannot be re-run without reloading it.
del df_train
# dataset['text'][0]

In [19]:
# Model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
# Store a copy of the tokenizer files under OUTPUT_DIR + 'tokenizer/'.
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
# model = AutoModelForMaskedLM.from_pretrained(model_path).to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


('autodl-tmp/result/deberta-v3-large/mlm-deberta-v3-large/tokenizer/tokenizer_config.json',
 'autodl-tmp/result/deberta-v3-large/mlm-deberta-v3-large/tokenizer/special_tokens_map.json',
 'autodl-tmp/result/deberta-v3-large/mlm-deberta-v3-large/tokenizer/spm.model',
 'autodl-tmp/result/deberta-v3-large/mlm-deberta-v3-large/tokenizer/added_tokens.json',
 'autodl-tmp/result/deberta-v3-large/mlm-deberta-v3-large/tokenizer/tokenizer.json')

In [20]:
def tokenize_function(batched_data):
    """Tokenize the 'text' column of a batch.

    Every row is padded or truncated to `config.max_length`. When the tokenizer is a
    fast tokenizer, a 'word_ids' entry (one list per row) is added to the result.
    """
    encoded = tokenizer(
        batched_data['text'],
        padding='max_length',
        truncation=True,
        max_length=config.max_length,
    )
    if tokenizer.is_fast:
        row_count = len(encoded['input_ids'])
        encoded['word_ids'] = [encoded.word_ids(row) for row in range(row_count)]
    return encoded

In [21]:
# Tokenize the dataset in batches, dropping the raw 'text' and 'id' columns.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['text', 'id'])
# Chunk length read by group_texts.
chunk_size = config.chunk_size

  0%|          | 0/16 [00:00<?, ?ba/s]

In [22]:
tokenized_datasets[0].keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'])

In [23]:
import pdb

In [24]:
def group_texts(batched_data, block_size=None):
    """Concatenate each column of a tokenized batch and re-split it into fixed-size chunks.

    Args:
        batched_data: Mapping of column name to a list of per-example lists.
        block_size: Length of every output chunk. When None (the default, and what
            `Dataset.map` uses), the module-level `chunk_size` is read.

    Returns:
        A dict with the same columns, each a list of chunks of exactly `block_size`
        items, plus a 'labels' entry that is a shallow copy of the 'input_ids' chunk
        list. Trailing items that do not fill a whole chunk are dropped.
    """
    if block_size is None:
        block_size = chunk_size
    column_names = list(batched_data.keys())
    # Flatten each column in a single pass. `sum(list_of_lists, [])` rebuilds the
    # accumulated list for every example, which is quadratic in the batch size.
    concatenated_examples = {
        k: [item for example in batched_data[k] for item in example]
        for k in column_names
    }
    total_length = len(concatenated_examples[column_names[0]])
    # Round down to a whole number of chunks.
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result['labels'] = result['input_ids'].copy()
    return result

In [25]:
# Re-chunk the tokenized rows and attach a 'labels' column (see group_texts).
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
# Collator for masked-language modelling, using the configured masking probability.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=config.mlm_probability)
# Hold out 10% of the rows as the 'test' split; the split is seeded from config.seed.
dataset_split = lm_datasets.train_test_split(test_size=0.1, seed=config.seed)

  0%|          | 0/16 [00:00<?, ?ba/s]

In [26]:
lm_datasets

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 15594
})

In [27]:
dataset_split

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 14034
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1560
    })
})

In [29]:
batch_size = config.batch_size
# Show the training loss with every epoch
# NOTE(review): in the visible cells logging_steps is only displayed, never passed to
# TrainingArguments (which sets logging_strategy="no") - confirm whether that is intended.
logging_steps = len(dataset_split["train"]) // batch_size
# Last path component of the checkpoint name, e.g. 'deberta-v3-large'.
model_name = model_path.split("/")[-1]

In [30]:
logging_steps

7017

In [32]:
# Record the run configuration in the log, one `key=value` line per setting
# (attributes whose name starts with a double underscore are skipped).
cfglog = '\n'.join(
    str(name) + '=' + str(setting)
    for name, setting in config.__dict__.items()
    if not name.startswith('__')
)
LOGGER.info(cfglog)

exp_id=deberta-v3-large
chunk_size=2048
warmup_ratio=0.06
max_length=2048
batch_size=2
epoch=5
num_workers=14
learning_rate=1e-05
seed=42
mlm_probability=0.15
debug=False


In [33]:
from transformers import TrainerCallback
import numpy as np

class SaveBestModelCallback(TrainerCallback):
    """Trainer callback that keeps only the weights with the lowest `eval_loss`.

    After each evaluation it logs the perplexity and the metrics. When `eval_loss`
    improved on the best value seen so far, it writes the model's state dict and
    config into OUTPUT_DIR. `control.should_save` is always set to False.
    """

    def __init__(self):
        # Lowest eval_loss seen so far; starts at +inf so the first evaluation always improves.
        self.bestScore = np.inf

    def on_train_begin(self, args, state, control, **kwargs):
        """Fail fast when evaluation is disabled, since this callback relies on it."""
        assert args.evaluation_strategy != "no", "SaveBestModelCallback requires IntervalStrategy of steps or epoch"

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        """Log evaluation metrics and persist the model when `eval_loss` improved."""
        # Set in every case: only this callback writes model files.
        control.should_save = False

        metric_value = metrics.get("eval_loss")
        if metric_value is None:
            # Without a loss there is nothing to compare; report it instead of
            # failing inside math.exp with an unrelated-looking TypeError.
            LOGGER.warning(f">>> eval_loss missing from metrics, best-model check skipped: {metrics}")
            return

        # math.exp raises OverflowError for very large losses; report that as infinity.
        try:
            perplexity = math.exp(metric_value)
        except OverflowError:
            perplexity = float("inf")

        LOGGER.info(f">>> Perplexity: {perplexity:.2f}")
        LOGGER.info(f">>> metrics: {metrics}")
        if metric_value < self.bestScore:
            print(f"** eval_loss improved from {np.round(self.bestScore, 4)} to {np.round(metric_value, 4)} **")
            self.bestScore = metric_value

            torch.save(kwargs["model"].state_dict(), os.path.join(OUTPUT_DIR, "mlm-{}.bin".format(config.exp_id)))
            torch.save(kwargs["model"].config, os.path.join(OUTPUT_DIR, 'mlm-{}-config.pth'.format(config.exp_id)))

        else:
            print(f"eval_loss {np.round(metric_value, 4)} (Prev. Best {np.round(self.bestScore, 4)}) ")
               

In [None]:
# Training configuration. Values come from `config` where one exists.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=config.learning_rate, # 1e-5
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size*2,
    push_to_hub=False,
    fp16=True,
    logging_strategy="no",
    # Disable reporting integrations explicitly; the WANDB_DISABLED environment
    # variable is deprecated according to the warning transformers prints.
    report_to="none",
    group_by_length=True,
    dataloader_num_workers=config.num_workers,
    warmup_ratio=config.warmup_ratio,
    num_train_epochs=config.epoch,
    # Honour the configured seed instead of relying on the library default.
    seed=config.seed,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    prediction_loss_only=True,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
)

# Load the pretrained checkpoint with a masked-language-modelling head.
model = AutoModelForMaskedLM.from_pretrained(model_path)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_split["train"],
    eval_dataset=dataset_split["test"],
    data_collator=data_collator,
    # Pass an instance so the callback's state is created explicitly here.
    callbacks=[SaveBestModelCallback()],
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForMaskedLM: ['deberta.embeddings.position_embeddings.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTrainin

Epoch,Training Loss,Validation Loss
0,No log,2.347217


The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DebertaV2ForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1560
  Batch size = 4
>>> Perplexity: 10.46
>>> metrics: {'eval_loss': 2.347217082977295, 'eval_runtime': 509.1911, 'eval_samples_per_second': 3.064, 'eval_steps_per_second': 0.766, 'epoch': 1.0}


** eval_loss improved from inf to 2.3472 **




In [None]:
# Final evaluation on the held-out split, reported as perplexity (exp of eval_loss).
eval_results = trainer.evaluate()
LOGGER.info(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

# Save the model through the Trainer (no explicit path is given).
trainer.save_model()