In [1]:
import numpy as np
import pandas as pd
import torch

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
sys.path.append(str(PROJECT_ROOT))

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

from src.data_load import load_train, load_test
from src.preprocessing import add_text_column

In [2]:
print("CUDA available:", torch.cuda.is_available())
print("Device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

CUDA available: True
Device: NVIDIA GeForce RTX 4080 SUPER


In [3]:
import torch
print(torch.__version__)

2.6.0+cu124


### 1. Data Loading

In [4]:
train = load_train()
test = load_test()

# text column 생성
train = add_text_column(train)
test = add_text_column(test)

# target 생성
train["Misconception"] = train["Misconception"].fillna("NA").astype(str)
train["CatMis"] = train["Category"].astype(str) + ":" + train["Misconception"]

### 2. Label Encoding

In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train["label"] = le.fit_transform(train["CatMis"])

num_labels = len(le.classes_)
print("Number of labels:", num_labels)

Number of labels: 65


### 3. Train/validation split

In [6]:
train_df, val_df = train_test_split(
    train,
    test_size=0.2,
    random_state=42
)

print("Train:", len(train_df), "Val:", len(val_df))

Train: 29356 Val: 7340


### 4. Modeling

In [7]:
MODEL_NAME = "microsoft/deberta-v3-base"

In [8]:
# Tokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)



In [9]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding=False,
        max_length=256
    )

In [10]:
# Dataset 정의

from datasets import Dataset

train_ds = Dataset.from_pandas(
    train_df[["text", "label"]]
).map(tokenize, batched=True)

val_ds = Dataset.from_pandas(
    val_df[["text", "label"]]
).map(tokenize, batched=True)

Map:   0%|          | 0/29356 [00:00<?, ? examples/s]

Map:   0%|          | 0/7340 [00:00<?, ? examples/s]

### 5. Model load

In [11]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels
)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
### 6. Metric 정의

In [13]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    return {
        "accuracy": accuracy_score(labels, preds),
        "macro_f1": f1_score(labels, preds, average="macro", zero_division=0)
    }

### 6. Train

In [14]:
import transformers
print(transformers.__version__)

4.57.1


In [15]:
# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    report_to="none"
)

In [16]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [18]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,0.8319,0.76556,0.693052,0.219501
2,0.7535,0.736559,0.69673,0.279355
3,0.7531,0.714758,0.70436,0.287614


TrainOutput(global_step=5505, training_loss=0.8938200467288981, metrics={'train_runtime': 554.3319, 'train_samples_per_second': 158.872, 'train_steps_per_second': 9.931, 'total_flos': 3657693501481272.0, 'train_loss': 0.8938200467288981, 'epoch': 3.0})

In [19]:
### Class-weighted loss

In [20]:
num_labels = train["CatMis"].nunique()
print(num_labels)

65


In [21]:
import numpy as np
import torch

# y_train은 정수 라벨(0~64)이어야 함
# 만약 train_df["label"]이 있다면 그걸 쓰는게 가장 안전
y_train_labels = np.array(train_df["label"].values)

num_labels = 65
counts = np.bincount(y_train_labels, minlength=num_labels)

# inverse frequency (너무 극단적이면 학습 불안정 → sqrt나 log를 추천)
weights = 1.0 / np.sqrt(counts + 1e-6)

# normalize (평균 1로 맞추면 안정적)
weights = weights / weights.mean()

class_weights = torch.tensor(weights, dtype=torch.float)
class_weights[:10], class_weights.mean(), class_weights.max()

(tensor([0.0047, 0.0040, 0.0072, 0.0024, 0.0165, 0.0165, 0.0094, 0.0039, 0.0093,
         0.0028]),
 tensor(1.),
 tensor(63.8073))

In [22]:
class_weights = torch.clamp(class_weights, max=10.0)

In [23]:
from transformers import Trainer
import torch.nn as nn

class WeightedTrainer(Trainer):
    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        if self.class_weights is not None:
            cw = self.class_weights.to(logits.device)
            loss_fct = nn.CrossEntropyLoss(weight=cw)
        else:
            loss_fct = nn.CrossEntropyLoss()

        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [24]:
train_ds = train_ds.rename_column("label", "labels") if "label" in train_ds.column_names else train_ds
val_ds   = val_ds.rename_column("label", "labels") if "label" in val_ds.column_names else val_ds

train_ds.column_names, val_ds.column_names

(['text',
  'labels',
  '__index_level_0__',
  'input_ids',
  'token_type_ids',
  'attention_mask'],
 ['text',
  'labels',
  '__index_level_0__',
  'input_ids',
  'token_type_ids',
  'attention_mask'])

In [25]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

weighted_trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    class_weights=class_weights
)

  super().__init__(*args, **kwargs)


In [26]:
weighted_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,0.94,1.032244,0.666213,0.342737
2,1.0432,0.966088,0.661444,0.359357
3,1.0014,0.96511,0.659128,0.367652


TrainOutput(global_step=5505, training_loss=0.9784520681937319, metrics={'train_runtime': 556.2018, 'train_samples_per_second': 158.338, 'train_steps_per_second': 9.897, 'total_flos': 3657693501481272.0, 'train_loss': 0.9784520681937319, 'epoch': 3.0})

### Note

- unweighted
    - Accuracy ≈ 0.70
    - Macro F1 ≈ 0.29
    - Loss ≈ 0.71

- class-weighted
    - Accuracy ≈ 0.66
    - Macro F1 ≈ 0.37 (+0.08 이상 상승)
    - Loss ≈ 0.96
- “전체 맞춘 비율(Accuracy)” ↓ / “모든 클래스를 공평하게 본 점수(Macro F1)” ↑

In [27]:
# epoch 조정 (Early stopping)
from transformers import EarlyStoppingCallback
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results_wloss",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,          # 조금 낮추는 게 안정적
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,         # epoch
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    save_total_limit=2,
    report_to="none",
)

weighted_trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    class_weights=class_weights,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)


# weight 조정
weights = 1.0 / np.log(counts + 2)
weights = weights / weights.mean()
class_weights = torch.clamp(torch.tensor(weights), max=5.0)


  super().__init__(*args, **kwargs)


In [28]:
weighted_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,0.866,1.010115,0.667711,0.361112
2,1.0003,0.987287,0.632153,0.364359
3,1.0279,0.977763,0.666485,0.369032
4,0.9655,0.967729,0.659946,0.36792
5,0.8232,0.964226,0.658174,0.367222


TrainOutput(global_step=9175, training_loss=0.9216540128203764, metrics={'train_runtime': 925.2947, 'train_samples_per_second': 475.892, 'train_steps_per_second': 29.747, 'total_flos': 6105348843779856.0, 'train_loss': 0.9216540128203764, 'epoch': 5.0})

In [29]:
weighted_trainer.evaluate()

{'eval_loss': 0.9777626991271973,
 'eval_accuracy': 0.6664850136239782,
 'eval_macro_f1': 0.3690322787233685,
 'eval_runtime': 12.1247,
 'eval_samples_per_second': 605.376,
 'eval_steps_per_second': 37.857,
 'epoch': 5.0}

### Stratified split 버전

In [30]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

SEED = 42

# label이 정수(0~64)라고 가정
label_counts = train["label"].value_counts()
singleton_labels = label_counts[label_counts < 2].index

train_strat = train[~train["label"].isin(singleton_labels)].copy()
removed = len(train) - len(train_strat)

print("Original:", len(train))
print("After removing singleton labels:", len(train_strat))
print("Removed samples:", removed)
print("Remaining unique labels:", train_strat["label"].nunique())

Original: 36696
After removing singleton labels: 36691
Removed samples: 5
Remaining unique labels: 60


In [31]:
train_df, val_df = train_test_split(
    train_strat,
    test_size=0.2,
    random_state=SEED,
    stratify=train_strat["label"]
)

print("Train:", len(train_df), "Val:", len(val_df))
print("Train unique labels:", train_df["label"].nunique())
print("Val unique labels:", val_df["label"].nunique())

Train: 29352 Val: 7339
Train unique labels: 60
Val unique labels: 55


In [32]:
import torch

num_labels = train["label"].nunique()  # 65
y_train_labels = train_df["label"].values

counts = np.bincount(y_train_labels, minlength=num_labels)
weights = 1.0 / np.sqrt(counts + 1e-6)
weights = weights / weights.mean()

class_weights = torch.tensor(weights, dtype=torch.float)
class_weights = torch.clamp(class_weights, max=10.0)

print("weights mean/max:", class_weights.mean().item(), class_weights.max().item())
print("min count:", counts.min(), "max count:", counts.max())

weights mean/max: 0.7720506191253662 10.0
min count: 0 max count: 11841


In [33]:
from datasets import Dataset

train_ds = Dataset.from_pandas(train_df[["text", "label"]]).map(tokenize, batched=True)
val_ds   = Dataset.from_pandas(val_df[["text", "label"]]).map(tokenize, batched=True)

# labels 컬럼명 맞추기
train_ds = train_ds.rename_column("label", "labels")
val_ds   = val_ds.rename_column("label", "labels")

train_ds.column_names, val_ds.column_names

Map:   0%|          | 0/29352 [00:00<?, ? examples/s]

Map:   0%|          | 0/7339 [00:00<?, ? examples/s]

(['text',
  'labels',
  '__index_level_0__',
  'input_ids',
  'token_type_ids',
  'attention_mask'],
 ['text',
  'labels',
  '__index_level_0__',
  'input_ids',
  'token_type_ids',
  'attention_mask'])

In [34]:
from transformers import EarlyStoppingCallback

weighted_trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    class_weights=class_weights,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

  super().__init__(*args, **kwargs)


In [35]:
weighted_trainer.train()
metrics = weighted_trainer.evaluate()

Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,0.9536,0.86673,0.657719,0.388152
2,0.8378,0.887156,0.663442,0.355953
3,0.998,0.859849,0.646137,0.385571


In [36]:
metrics = weighted_trainer.evaluate()

print(metrics)
print("Best metric:", weighted_trainer.state.best_metric)
print("Best ckpt:", weighted_trainer.state.best_model_checkpoint)

{'eval_loss': 0.8667296767234802, 'eval_accuracy': 0.6577190352909116, 'eval_macro_f1': 0.3881515446159474, 'eval_runtime': 12.8891, 'eval_samples_per_second': 569.394, 'eval_steps_per_second': 35.611, 'epoch': 3.0}
Best metric: 0.3881515446159474
Best ckpt: ./results_wloss\checkpoint-1835


In [37]:
# 텍스트 템플릿 2종 비교 (Q+E vs Q+A+E)

def make_text_QE(df: pd.DataFrame) -> pd.Series:
    return (
        "[Q] " + df["QuestionText"].fillna("").astype(str) +
        " [E] " + df["StudentExplanation"].fillna("").astype(str)
    )

def make_text_QAE(df: pd.DataFrame) -> pd.Series:
    return (
        "[Q] " + df["QuestionText"].fillna("").astype(str) +
        " [A] " + df["MC_Answer"].fillna("").astype(str) +
        " [E] " + df["StudentExplanation"].fillna("").astype(str)
    )

# train_strat / train_df / val_df 기준으로 text를 덮어씌워서 비교
train_df_QE = train_df.copy()
val_df_QE   = val_df.copy()
train_df_QE["text"] = make_text_QE(train_df_QE)
val_df_QE["text"]   = make_text_QE(val_df_QE)

train_df_QAE = train_df.copy()
val_df_QAE   = val_df.copy()
train_df_QAE["text"] = make_text_QAE(train_df_QAE)
val_df_QAE["text"]   = make_text_QAE(val_df_QAE)

print(train_df_QE["text"].iloc[0][:200])
print(train_df_QAE["text"].iloc[0][:200])

[Q] What fraction of the shape is not shaded? Give your answer in its simplest form. [Image: A triangle split into 9 equal smaller triangles. 6 of them are shaded.] [E] i counted it and got 3 and a ha
[Q] What fraction of the shape is not shaded? Give your answer in its simplest form. [Image: A triangle split into 9 equal smaller triangles. 6 of them are shaded.] [A] \( \frac{1}{3} \) [E] i counted


In [38]:
from transformers import AutoModelForSequenceClassification

def run_experiment(train_df_in, val_df_in, class_weights, tag="exp"):
    # dataset
    tr_ds = Dataset.from_pandas(train_df_in[["text", "label"]]).map(tokenize, batched=True)
    va_ds = Dataset.from_pandas(val_df_in[["text", "label"]]).map(tokenize, batched=True)
    tr_ds = tr_ds.rename_column("label", "labels")
    va_ds = va_ds.rename_column("label", "labels")

    # 모델을 새로 로드(템플릿 비교는 같은 초기화에서 시작해야 공정)
    model_local = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=num_labels
    )

    trainer_local = WeightedTrainer(
        model=model_local,
        args=training_args,
        train_dataset=tr_ds,
        eval_dataset=va_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        class_weights=class_weights,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    trainer_local.train()
    metrics = trainer_local.evaluate()
    best = trainer_local.state.best_metric

    return metrics, best

In [39]:
metrics_QE, best_QE = run_experiment(train_df_QE, val_df_QE, class_weights, tag="QE")
metrics_QAE, best_QAE = run_experiment(train_df_QAE, val_df_QAE, class_weights, tag="QAE")

print("=== QE ===")
print(metrics_QE)
print("best macro_f1:", best_QE)

print("\n=== QAE ===")
print(metrics_QAE)
print("best macro_f1:", best_QAE)

Map:   0%|          | 0/29352 [00:00<?, ? examples/s]

Map:   0%|          | 0/7339 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,1.735,1.437939,0.591225,0.220329
2,1.1062,1.091641,0.671345,0.339543
3,1.093,1.002146,0.701185,0.383422
4,0.8169,0.945604,0.740428,0.432667
5,0.7505,0.871043,0.742608,0.447348
6,0.6352,0.92224,0.756779,0.469795
7,0.6779,0.975721,0.764954,0.499072
8,0.4686,0.925859,0.778853,0.515826
9,0.5276,0.929248,0.778853,0.510188
10,0.4265,0.950219,0.780352,0.511853


Map:   0%|          | 0/29352 [00:00<?, ? examples/s]

Map:   0%|          | 0/7339 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,1.4501,1.203669,0.700095,0.281427
2,0.7791,0.798312,0.785121,0.420543
3,0.824,0.694191,0.782395,0.465213
4,0.6229,0.641936,0.817686,0.504992
5,0.6336,0.642246,0.83322,0.516771
6,0.4248,0.628093,0.846437,0.542483
7,0.5817,0.676111,0.852023,0.531896
8,0.4034,0.690792,0.862243,0.541876


=== QE ===
{'eval_loss': 0.9258590340614319, 'eval_accuracy': 0.7788527047281646, 'eval_macro_f1': 0.5158261666078982, 'eval_runtime': 12.3807, 'eval_samples_per_second': 592.776, 'eval_steps_per_second': 37.074, 'epoch': 10.0}
best macro_f1: 0.5158261666078982

=== QAE ===
{'eval_loss': 0.6280930042266846, 'eval_accuracy': 0.8464368442567107, 'eval_macro_f1': 0.5424830762375314, 'eval_runtime': 10.6871, 'eval_samples_per_second': 686.718, 'eval_steps_per_second': 42.949, 'epoch': 8.0}
best macro_f1: 0.5424830762375314


In [42]:
print("train_df is val_df:", train_df is val_df)
print("train_df_QE is val_df_QE:", train_df_QE is val_df_QE)
print("train_df_QAE is val_df_QAE:", train_df_QAE is val_df_QAE)

train_df is val_df: False
train_df_QE is val_df_QE: False
train_df_QAE is val_df_QAE: False


In [43]:
overlap_QE = set(train_df_QE["QuestionId"]).intersection(
    set(val_df_QE["QuestionId"])
)
overlap_QAE = set(train_df_QAE["QuestionId"]).intersection(
    set(val_df_QAE["QuestionId"])
)

print("QE overlap count:", len(overlap_QE))
print("QAE overlap count:", len(overlap_QAE))

QE overlap count: 15
QAE overlap count: 15


In [44]:
text_overlap_QE = set(train_df_QE["text"]).intersection(
    set(val_df_QE["text"])
)
text_overlap_QAE = set(train_df_QAE["text"]).intersection(
    set(val_df_QAE["text"])
)

print("QE text overlap:", len(text_overlap_QE))
print("QAE text overlap:", len(text_overlap_QAE))

QE text overlap: 209
QAE text overlap: 200


- Data leakage 확인,,,

In [46]:
import numpy as np
from sklearn.model_selection import GroupShuffleSplit

# 1) singleton label 제거
label_counts = train["label"].value_counts()
singleton_labels = label_counts[label_counts < 2].index
train2 = train[~train["label"].isin(singleton_labels)].copy()

print("removed:", len(train) - len(train2))

# 2) group split
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
idx = np.arange(len(train2))
train_idx, val_idx = next(gss.split(idx, groups=train2["QuestionId"].values))

train_df = train2.iloc[train_idx].copy()
val_df   = train2.iloc[val_idx].copy()

print("Train:", len(train_df), "Val:", len(val_df))
print("QuestionId overlap:", len(set(train_df["QuestionId"]).intersection(set(val_df["QuestionId"]))))
print("Text overlap:", len(set(train_df["text"]).intersection(set(val_df["text"]))))

removed: 5
Train: 27014 Val: 9677
QuestionId overlap: 0
Text overlap: 0


In [47]:
import pandas as pd

def make_text_QE(df: pd.DataFrame) -> pd.Series:
    return (
        "[Q] " + df["QuestionText"].fillna("").astype(str) +
        " [E] " + df["StudentExplanation"].fillna("").astype(str)
    )

def make_text_QAE(df: pd.DataFrame) -> pd.Series:
    return (
        "[Q] " + df["QuestionText"].fillna("").astype(str) +
        " [A] " + df["MC_Answer"].fillna("").astype(str) +
        " [E] " + df["StudentExplanation"].fillna("").astype(str)
    )

# 일단 QAE로 진행 (원하면 QE로 바꿔도 됨)
train = train.copy()
train["text"] = make_text_QAE(train)

In [48]:
import numpy as np

label_counts = train["label"].value_counts()
singleton_labels = label_counts[label_counts < 2].index

train2 = train[~train["label"].isin(singleton_labels)].copy()
removed = len(train) - len(train2)

print("Original:", len(train))
print("After removing singleton labels:", len(train2))
print("Removed samples:", removed)
print("Unique labels:", train2["label"].nunique())

Original: 36696
After removing singleton labels: 36691
Removed samples: 5
Unique labels: 60


In [49]:
from sklearn.model_selection import GroupShuffleSplit

SEED = 42

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
idx = np.arange(len(train2))

train_idx, val_idx = next(gss.split(idx, groups=train2["QuestionId"].values))

train_df = train2.iloc[train_idx].copy()
val_df   = train2.iloc[val_idx].copy()

print("Train:", len(train_df), "Val:", len(val_df))
print("QuestionId overlap:",
      len(set(train_df["QuestionId"]).intersection(set(val_df["QuestionId"]))))
print("Text overlap:",
      len(set(train_df["text"]).intersection(set(val_df["text"]))))

Train: 27014 Val: 9677
QuestionId overlap: 0
Text overlap: 0


In [50]:
import torch

num_labels = train["label"].nunique()
counts = np.bincount(train_df["label"].values, minlength=num_labels)

weights = 1.0 / np.sqrt(counts + 1e-6)
weights = weights / weights.mean()
class_weights = torch.tensor(weights, dtype=torch.float)
class_weights = torch.clamp(class_weights, max=10.0)

print("min/max counts:", counts.min(), counts.max())
print("weights mean/max:", class_weights.mean().item(), class_weights.max().item())

min/max counts: 0 12119
weights mean/max: 1.0 4.330390930175781


In [51]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "microsoft/deberta-v3-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels
)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
from datasets import Dataset
from transformers import DataCollatorWithPadding

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=256
    )

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_ds = Dataset.from_pandas(train_df[["text", "label"]]).map(tokenize, batched=True)
val_ds   = Dataset.from_pandas(val_df[["text", "label"]]).map(tokenize, batched=True)

train_ds = train_ds.rename_column("label", "labels")
val_ds   = val_ds.rename_column("label", "labels")

Map:   0%|          | 0/27014 [00:00<?, ? examples/s]

Map:   0%|          | 0/9677 [00:00<?, ? examples/s]

In [54]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results/group_split_wloss_QAE",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,              # 상한
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    save_total_limit=2,
    report_to="none",
)

In [55]:
from transformers import EarlyStoppingCallback

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    class_weights=class_weights,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

  super().__init__(*args, **kwargs)


In [56]:
trainer.train()
metrics = trainer.evaluate()

print(metrics)
print("Best metric:", trainer.state.best_metric)
print("Best ckpt:", trainer.state.best_model_checkpoint)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,1.2608,5.495734,0.180738,0.033559
2,1.0123,6.675133,0.155317,0.019553
3,0.7537,7.871319,0.155213,0.019614


{'eval_loss': 5.495733737945557, 'eval_accuracy': 0.1807378319727188, 'eval_macro_f1': 0.033559497140268235, 'eval_runtime': 13.0118, 'eval_samples_per_second': 743.709, 'eval_steps_per_second': 46.496, 'epoch': 3.0}
Best metric: 0.033559497140268235
Best ckpt: ./results/group_split_wloss_QAE\checkpoint-1689


In [57]:
print("num_labels:", num_labels)
print("train label min/max:", train_df["label"].min(), train_df["label"].max())
print("val   label min/max:", val_df["label"].min(), val_df["label"].max())
print("unique labels train:", train_df["label"].nunique())
print("unique labels val  :", val_df["label"].nunique())

num_labels: 65
train label min/max: 0 64
val   label min/max: 0 64
unique labels train: 50
unique labels val  : 16


### 7. Validation 결과

In [40]:
metrics = trainer.evaluate()
metrics

{'eval_loss': 0.8327854871749878,
 'eval_accuracy': 0.6673024523160763,
 'eval_macro_f1': 0.373422873473576,
 'eval_runtime': 8.4941,
 'eval_samples_per_second': 864.128,
 'eval_steps_per_second': 54.037,
 'epoch': 3.0}

### 8. Test Prediction & Submission

In [41]:
test_ds = Dataset.from_pandas(test[["text"]]).map(tokenize, batched=True)

test_preds = trainer.predict(test_ds)
test_labels = np.argmax(test_preds.predictions, axis=1)

submission = pd.DataFrame({
    "QuestionId": test["QuestionId"],
    "Category:Misconception": le.inverse_transform(test_labels)
})

submission.to_csv("submission_transformer.csv", index=False)

Map:   0%|          | 0/3 [00:00<?, ? examples/s]