In [1]:
!pip install transformers datasets



In [2]:
!pip install transformers[torch]



In [3]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer


# CSVファイルのロード
df = pd.read_csv("/content/customer_support_data.csv")

## 新しいラベル変換辞書の作成
#"製品関連の問い合わせ"を0、"サービス関連の問い合わせ"を1とする
new_label_dict = {
    "技術的な問題": 0,
    "製品に関する質問": 0,
    "製品使用方法": 0,
    "情報請求": 0,
    "互換性に関する質問": 0,
    "製品情報": 0,
    "製品仕様": 0,
    "不具合対応": 0,
    # 以下、"サービス関連の問い合わせ"に該当するラベル
    "配送に関する問題": 1,
    "返品・交換": 1,
    "キャンペーン情報": 1,
    "注文方法": 1,
    "アカウント関連": 1,
    "店舗情報": 1,
    "会員特典": 1,
    "保証期間": 1,
    "修理サービス": 1,
    "配送オプション": 1,
    "セール情報": 1,
    "注文キャンセル": 1,
}

# ラベルの変換実行
df['label'] = df['label'].map(new_label_dict)

# Hugging FaceのDatasetに変換
dataset = Dataset.from_pandas(df)

# トークナイザの準備
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# トークナイズ関数
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# トークナイズの実行
tokenized_datasets = dataset.map(tokenize_function, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [4]:
# データセットの分割（例：トレーニング80％、バリデーション10％、テスト10％）
train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
train_val_split = train_test_split['train'].train_test_split(test_size=0.125)  # 0.125 * 0.8 = 0.1

train_dataset = train_val_split['train']
val_dataset = train_val_split['test']
test_dataset = train_test_split['test']

In [5]:
# 分割後のデータセットにラベルが含まれていることを確認
print("トレーニングセットのラベル例:", train_dataset["label"][0:5])
print("バリデーションセットのラベル例:", val_dataset["label"][0:5])
print("テストセットのラベル例:", test_dataset["label"][0:5])

トレーニングセットのラベル例: [1, 0, 0, 1, 0]
バリデーションセットのラベル例: [1, 1]
テストセットのラベル例: [1, 0, 0, 0]


In [6]:
!pip install accelerate -U



In [7]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# モデルのロード
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# トレーニングのパラメータ設定
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=100,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# トレーナーの設定
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
trainer.train()

Step,Training Loss
10,0.6601
20,0.6315
30,0.6032
40,0.5257
50,0.4382
60,0.3377
70,0.2945
80,0.2402
90,0.1434
100,0.1082


TrainOutput(global_step=100, training_loss=0.3982669913768768, metrics={'train_runtime': 120.4267, 'train_samples_per_second': 11.625, 'train_steps_per_second': 0.83, 'total_flos': 368355477504000.0, 'train_loss': 0.3982669913768768, 'epoch': 100.0})

In [9]:
# モデルの評価
evaluation_result = trainer.evaluate(eval_dataset=test_dataset)
print(evaluation_result)


{'eval_loss': 1.301108479499817, 'eval_runtime': 0.1334, 'eval_samples_per_second': 29.985, 'eval_steps_per_second': 7.496, 'epoch': 100.0}


In [10]:
# デバイスの設定（GPUが利用可能な場合はGPUを使用）
device = torch.device("cuda" if torch.cuda.is_available() else "gpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [11]:
# 新しい入力データ
input_data = ["製品情報", "キャンペーン情報"]

# トークナイズ
inputs = tokenizer(input_data, padding=True, truncation=True, max_length=512, return_tensors="pt")

# 利用可能なデバイスの確認
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# モデルをデバイスに移動
model.to(device)

# 入力データも同じデバイスに移動
new_inputs = {k: v.to(device) for k, v in inputs.items()}

# モデルを使った予測
with torch.no_grad():
    outputs = model(**new_inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)

# 予測結果の表示
for text, prediction in zip(input_data, predictions):
    print(f"Text: {text}, Predicted label: {prediction.item()}")


Text: 製品情報, Predicted label: 1
Text: キャンペーン情報, Predicted label: 1
