In [3]:
import pandas as pd
pd.options.display.max_colwidth = 30

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

import datasets
from transformers import AutoTokenizer, DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained(
    "line-corporation/line-distilbert-base-japanese", num_labels=2
)
tokenizer = AutoTokenizer.from_pretrained(
    "line-corporation/line-distilbert-base-japanese", trust_remote_code=True
)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at line-corporation/line-distilbert-base-japanese were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at line-corporation/line-distilbert-b

In [4]:
train_df = pd.read_csv("../data/train.tsv", sep="\t")
valid_df = pd.read_csv("../data/valid.tsv", sep="\t")
test_df = pd.read_csv("../data/test.tsv", sep="\t")

len(train_df), len(valid_df), len(test_df)

(162, 54, 54)

In [5]:
# train_dfのpoemを入力にlabelを2値分類するfine-tuningを行う

from transformers import Trainer, TrainingArguments


def tokenize(batch):
    return tokenizer(batch["poem"], padding=True, truncation=True)


train_dataset = datasets.Dataset.from_pandas(train_df)
valid_dataset = datasets.Dataset.from_pandas(valid_df)
test_dataset = datasets.Dataset.from_pandas(test_df)

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
valid_dataset = valid_dataset.map(tokenize, batched=True, batch_size=len(valid_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
valid_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

training_args = TrainingArguments(
    output_dir="../data/output/line_distilbert/",
    overwrite_output_dir=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)
trainer.train()

Map:   0%|          | 0/162 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 162/162 [00:00<00:00, 275.73 examples/s]
Map: 100%|██████████| 54/54 [00:00<00:00, 899.93 examples/s]
Map: 100%|██████████| 54/54 [00:00<00:00, 1701.43 examples/s]
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: poem. If poem are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 162
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 63
  Number of trainable parameters = 68286722


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=63, training_loss=0.1594126413738917, metrics={'train_runtime': 125.1614, 'train_samples_per_second': 3.883, 'train_steps_per_second': 0.503, 'total_flos': 15214605166728.0, 'train_loss': 0.1594126413738917, 'epoch': 3.0})

In [9]:
preds = trainer.predict(test_dataset)
pred_labels = np.argmax(preds.predictions[0], axis=-1)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: poem. If poem are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 54
  Batch size = 8


In [10]:
# pred_labelsをvalidデータで評価する
accuracy_score(test_dataset["label"], pred_labels)

0.9444444444444444

In [11]:
# 単語ロジスティック回帰で誤っていた問題を解けていたか確認

test_df["pred"] = pred_labels
test_df[test_df["poem"].map(lambda x: "眼" in x)]

Unnamed: 0,poem,label,pred
3,時は常に背後から迫り唸りを上げて眼前に流れ去る踏み止...,0,0
15,ああおれたちは皆眼をあけたまま空を飛ぶ夢を見てるんだ,0,0
