In [1]:
import os
import copy
from pathlib import Path
from dataclasses import dataclass
import gc
import re
# from datasets import Dataset
import torch
import numpy as np
import pandas as pd
from datasets.arrow_dataset import Dataset
from transformers.trainer import Trainer
from transformers.modeling_utils import PreTrainedModel
from transformers.trainer_utils import EvalPrediction
from transformers.training_args import TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tokenizers import AddedToken
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, cohen_kappa_score, matthews_corrcoef, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

2024-05-25 07:21:38.574592: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-25 07:21:38.574697: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-25 07:21:38.679949: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Config

In [2]:
@dataclass
class Config:
    checkpoint: str = "microsoft/deberta-v3-base"
    per_device_train_batch_size: int = 4
    per_device_eval_batch_size: int = 8
    gradient_accumulation_steps: int = 8 // torch.cuda.device_count() / per_device_train_batch_size
    num_train_epochs: float = 4 #sample 4
    train_max_length: int = 1024
    eval_max_length: int = 2048
    lr: float = 1e-5
    scheduler: str = "linear"
    warmup_ratio: float = 0.0
    weight_decay = 0.01
    amp: bool = True
    n_splits: int = 5
    gamma: float = 2.
    optim: str = "adamw_torch"
    inference: bool = True#False:train True:inference
    inference_checkpoints_dir: str = "/kaggle/input/focal-loss-finetuning-hisa/output/"
    
config = Config()
print('shoq_config:',config)

shoq_config: Config(checkpoint='microsoft/deberta-v3-base', per_device_train_batch_size=4, per_device_eval_batch_size=8, gradient_accumulation_steps=1.0, num_train_epochs=4, train_max_length=1024, eval_max_length=2048, lr=1e-05, scheduler='linear', warmup_ratio=0.0, amp=True, n_splits=5, gamma=2.0, optim='adamw_torch', inference=True, inference_checkpoints_dir='/kaggle/input/focal-loss-finetuning-hisa/output/')


In [3]:
args = TrainingArguments(
    output_dir="output",
    report_to="none",
    per_device_train_batch_size=config.per_device_train_batch_size,
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    num_train_epochs=config.num_train_epochs,
    weight_decay=config.weight_decay,
    evaluation_strategy='epoch',
    save_strategy="epoch",
    logging_steps=10,
    save_total_limit=1,
    metric_for_best_model="qwk",
    greater_is_better=True,
    load_best_model_at_end=True,
    fp16=config.amp,
    learning_rate=config.lr,
    lr_scheduler_type=config.scheduler,
    warmup_ratio=config.warmup_ratio,
    optim=config.optim #"adamw_torch"
)

## Instantiate the model & tokenizer

In [4]:
class ModelInit:
    model_class = AutoModelForSequenceClassification
    
    def __init__(self, checkpoint: str, num_labels: int = 6) -> None:
        self.model = self.model_class.from_pretrained(checkpoint, num_labels=num_labels)
        self.state_dict = copy.deepcopy(self.model.state_dict())
        
    def __call__(self) -> model_class:
        self.model.load_state_dict(self.state_dict)
        return self.model

## Instantiate the dataset

In [5]:
if config.inference:
    df = pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv")
    print('read test.csv')
else:
    df = pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv")
    print('read train.csv')
ds = Dataset.from_pandas(df)

read test.csv


In [6]:
class Encoder:
    def __init__(self, tokenizer, **encode_kwargs):
        self.tokenizer = tokenizer
        self.kwargs = encode_kwargs
        
    def __call__(self, batch: dict) -> dict:
        encoded = self.tokenizer(batch["full_text"], **self.kwargs)
        encoded["labels"] = [s-1 for s in batch["score"]]  # score is 1~6
        return encoded

## Compute Metrics

In [7]:
def compute_metrics(eval_pred: EvalPrediction) -> dict:
    predictions = eval_pred.predictions
    y_true = eval_pred.label_ids
    y_pred = predictions.argmax(-1)
    kappa = cohen_kappa_score(y_true, y_pred, weights="quadratic")
    corr = matthews_corrcoef(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)
    return {"qwk": kappa, "corr": corr, "acc": acc}

## Custom Trainer with Focal Loss

In [8]:
class FocalLoss(torch.nn.Module):
    def __init__(self, weight: torch.Tensor | None = None, gamma: float = 2,) -> None:
        super().__init__()
        self.ce = torch.nn.CrossEntropyLoss(weight=weight)
        self.gamma = gamma

    def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        ce_loss: torch.Tensor = self.ce(input, target)
        pt = torch.exp(-ce_loss)
        f_loss = (1 - pt) ** self.gamma * ce_loss
        f_loss = torch.mean(f_loss)
        return f_loss
    
    
class FocalLossTrainer(Trainer):
    def compute_loss(self, model: PreTrainedModel, inputs: dict, return_outputs: bool = False) -> tuple:
        ce_loss, outputs = super().compute_loss(model, inputs, True)
        labels = inputs["labels"]
        logits = outputs["logits"]
        loss_fn = FocalLoss(gamma=config.gamma)
        loss = loss_fn(input=logits, target=labels)
        outputs["loss"] = loss
        return (loss, outputs) if return_outputs else loss

# Features engineering

In [9]:
cList = {
  "ain't": "am not","aren't": "are not","can't": "cannot","can't've": "cannot have","'cause": "because",  "could've": "could have","couldn't": "could not","couldn't've": "could not have","didn't": "did not","doesn't": "does not","don't": "do not","hadn't": "had not","hadn't've": "had not have","hasn't": "has not",
  "haven't": "have not","he'd": "he would","he'd've": "he would have","he'll": "he will","he'll've": "he will have","he's": "he is",
  "how'd": "how did","how'd'y": "how do you","how'll": "how will","how's": "how is","I'd": "I would","I'd've": "I would have","I'll": "I will","I'll've": "I will have","I'm": "I am","I've": "I have",
  "isn't": "is not","it'd": "it had","it'd've": "it would have","it'll": "it will", "it'll've": "it will have","it's": "it is","let's": "let us","ma'am": "madam","mayn't": "may not",
  "might've": "might have","mightn't": "might not","mightn't've": "might not have","must've": "must have","mustn't": "must not","mustn't've": "must not have","needn't": "need not","needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not","oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
  "shan't've": "shall not have","she'd": "she would","she'd've": "she would have","she'll": "she will","she'll've": "she will have","she's": "she is",
  "should've": "should have","shouldn't": "should not","shouldn't've": "should not have","so've": "so have","so's": "so is","that'd": "that would","that'd've": "that would have","that's": "that is","there'd": "there had","there'd've": "there would have","there's": "there is","they'd": "they would","they'd've": "they would have","they'll": "they will","they'll've": "they will have","they're": "they are","they've": "they have","to've": "to have","wasn't": "was not","we'd": "we had",
  "we'd've": "we would have","we'll": "we will","we'll've": "we will have","we're": "we are","we've": "we have",
  "weren't": "were not","what'll": "what will","what'll've": "what will have",
  "what're": "what are","what's": "what is","what've": "what have","when's": "when is","when've": "when have",
  "where'd": "where did","where's": "where is","where've": "where have","who'll": "who will","who'll've": "who will have","who's": "who is","who've": "who have","why's": "why is",
  "why've": "why have","will've": "will have","won't": "will not","won't've": "will not have","would've": "would have","wouldn't": "would not",
  "wouldn't've": "would not have","y'all": "you all","y'alls": "you alls","y'all'd": "you all would",
  "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you had","you'd've": "you would have","you'll": "you you will","you'll've": "you you will have","you're": "you are",  "you've": "you have"
   }

c_re = re.compile('(%s)' % '|'.join(cList.keys()))

def expandContractions(text, c_re=c_re):
    def replace(match):
        return cList[match.group(0)]
    return c_re.sub(replace, text)

# def count_spelling_errors(text):
#     doc = nlp(text)
#     lemmatized_tokens = [token.lemma_.lower() for token in doc]
#     spelling_errors = sum(1 for token in lemmatized_tokens if token not in english_vocab)
#     return spelling_errors

def removeHTML(x):
    html=re.compile(r'<.*?>')
    return html.sub(r'',x)
def dataPreprocessing(x):
    # Convert words to lowercase
    x = x.lower()
    # Remove HTML
    x = removeHTML(x)
    # Delete strings starting with @
    x = re.sub("@\w+", '',x)
    # Delete Numbers
    x = re.sub("'\d+", '',x)
    x = re.sub("\d+", '',x)
    # Delete URL
    x = re.sub("http\w+", '',x)
    # Replace consecutive empty spaces with a single space character
    x = re.sub(r"\s+", " ", x)
    # Replace consecutive commas and periods with one comma and period character
    x = expandContractions(x)
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    # Remove empty characters at the beginning and end
    x = x.strip()
    return x

# 前処理の適用
df["full_text"] = df["full_text"] .apply(dataPreprocessing)

## Train

In [10]:
if not config.inference:
    model = ModelInit(config.checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(config.checkpoint)
    tokenizer.add_tokens([AddedToken("\n", normalized=False)])
    tokenizer.add_tokens([AddedToken(" "*2, normalized=False)])
    
    train_encoder = Encoder(tokenizer, max_length=config.train_max_length, truncation=True)
    eval_encoder = Encoder(tokenizer, max_length=config.eval_max_length, truncation=True)
    
    # 5-fold stratified cv
    cv = StratifiedKFold(n_splits=config.n_splits, shuffle=True, random_state=42)
    folds = list(cv.split(np.zeros(len(df)), y=df["score"].values))
    idx2fold = {idx: fold for fold, (_, val_idx) in enumerate(folds) for idx in val_idx}
    df["fold"] = [idx2fold[i] for i in df.index]
    # 'fold'列が正しく作成されているか確認
    print('fold_split',df.head())
    df.to_csv("train_split.csv", index=False)
    

    # データフレームをHugging Faceデータセットに変換
    ds = Dataset.from_pandas(df)

    # データセットに'fold'列が含まれているか確認
    print('ds.features:',ds.features)
    print(ds[0])
    
    cv_res = []
    
    
    for fold_idx in sorted(df["fold"].unique()):
        args.output_dir = os.path.join("output", f"fold_{fold_idx}")
        args.run_name = f"{config.checkpoint}_fold-{fold_idx}"
        train_ds = ds.select([i for i, d in enumerate(ds) if d["fold"] != fold_idx])
        eval_ds = ds.select([i for i, d in enumerate(ds) if d["fold"] == fold_idx])
        train_ds = train_ds.map(train_encoder, batched=True)
        eval_ds = eval_ds.map(eval_encoder, batched=True)
        #Focalloss（損失関数）
        trainer = FocalLossTrainer(
            args=args, 
            train_dataset=train_ds, 
            eval_dataset=eval_ds,
            tokenizer=tokenizer,
            model_init=model,
            compute_metrics=compute_metrics,
        )
        trainer.train()
        preds = trainer.predict(eval_ds).predictions
        qwk = cohen_kappa_score(y1=np.array(eval_ds["labels"]), y2=preds.argmax(-1), weights="quadratic")
        fig, ax = plt.subplots()
        ConfusionMatrixDisplay.from_predictions(
            y_true=np.array(eval_ds["labels"]), 
            y_pred=preds.argmax(-1),
            ax=ax
        )
        ax.set_title(f"fold-{fold_idx} qwk: {qwk:.3f}")
        fig.show()
        cv_res.append(qwk)

        # foldごとにモデルを保存
        model_path = f'./finetuning_deberta_fold_{fold_idx}'
        Path(model_path).mkdir(parents=True, exist_ok=True)
        trainer.model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
        
        # メモリ解放
        del trainer
        del train_ds
        del eval_ds
        del preds
        torch.cuda.empty_cache()
        gc.collect()
        
        

    # foldとcv_resの長さを確認
    print(f"Number of folds: {len(sorted(df['fold'].unique()))}")
    print(f"Number of cv_res: {len(cv_res)}")
    res_df = pd.DataFrame(
        {
            "fold": list(sorted(df["fold"].unique())) + ["mean"],
            "qwk": cv_res + [np.mean(cv_res)]
        }
    )
    print(res_df)

In [11]:
if config.inference:
    predictions = 0
    checkpoints = list(Path(config.inference_checkpoints_dir).glob("fold*/checkpoint*"))
    print(checkpoints)

    for checkpoint in checkpoints:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
        _ds = ds.map(
            lambda i: tokenizer(i["full_text"], max_length=config.eval_max_length, truncation=True), 
            batched=True,
        )
        args = TrainingArguments(
            output_dir=".",
            per_device_eval_batch_size=config.per_device_eval_batch_size,
            fp16=config.amp,
        )
        trainer = Trainer(args=args, model=model, tokenizer=tokenizer)
        preds = trainer.predict(_ds)
        predictions += preds.predictions / len(checkpoints)

    predicted_scores = predictions.argmax(-1) + 1  # [0,5] -> [1,6]
    
    df["score"] = predicted_scores
    df = df[["essay_id", "score"]]
    display(df)
    df.to_csv("submission.csv", index=False)

[PosixPath('/kaggle/input/focal-loss-finetuning-hisa/output/fold_0/checkpoint-6924'), PosixPath('/kaggle/input/focal-loss-finetuning-hisa/output/fold_4/checkpoint-5193'), PosixPath('/kaggle/input/focal-loss-finetuning-hisa/output/fold_1/checkpoint-6924'), PosixPath('/kaggle/input/focal-loss-finetuning-hisa/output/fold_3/checkpoint-6924'), PosixPath('/kaggle/input/focal-loss-finetuning-hisa/output/fold_2/checkpoint-6924')]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Unnamed: 0,essay_id,score
0,000d118,2
1,000fe60,3
2,001ab80,4
