参考url（感謝）  
https://data-science-learning.com/archives/1085

In [None]:
import os
from glob import glob
import pandas as pd
import linecache
from sklearn.metrics import classification_report

# 01 データセットの準備

データセットはダウンロードして同じフォルダの`text`フォルダに展開しておく。(下記リンクのldcc-20140209.tar.gz)  
https://www.rondhuit.com/download.html


In [None]:
# Category names are the names of the sub-directories of ./text.
# Sorting makes the order (and therefore the label ids derived from it)
# independent of the filesystem's listing order; isdir() skips any plain
# files that sit next to the category folders.
categories = sorted(
    os.path.basename(path) for path in glob('./text/*') if os.path.isdir(path)
)
categories

In [None]:
# Dictionary that maps each category name to an integer id
# (the position of the name inside `categories`).
cat2id = dict(zip(categories, range(len(categories))))
cat2id

In [None]:
def file2text(file):
    """Return the body of a text file as one string.

    The first two lines of the file are skipped; the remaining lines are
    concatenated with their newline characters removed.

    Args:
        file: Path of a UTF-8 encoded text file.

    Returns:
        The concatenated text from the third line onwards, or an empty
        string when the file has two lines or fewer.
    """
    with open(file, 'r', encoding='utf8') as f:
        lines = f.readlines()

    # One join instead of repeated `+=` keeps the concatenation linear.
    return ''.join(line.replace('\n', '') for line in lines[2:])

## 01-1 データをデータフレームに入れる

In [None]:

# Collect one (category, text) record per article file.
data_dic = {
    'cat': [],
    'text': [],
}

for cat in categories:
    print(cat)
    # glob() returns files in filesystem order; sort them so that the seeded
    # sample below picks the same rows on every machine.
    for file in sorted(glob(f'./text/{cat}/*.txt')):
        # NOTE(review): assumes the livedoor news corpus layout, where every
        # category folder also ships a LICENSE.txt that is not an article.
        if os.path.basename(file) == 'LICENSE.txt':
            continue
        data_dic['cat'].append(cat)
        data_dic['text'].append(file2text(file))

dataset_df = pd.DataFrame(data_dic)
# Work on a shuffled subset of at most 200 rows; capping at the number of
# available rows avoids the ValueError sample() raises when asked for more
# rows than exist.
dataset_df = dataset_df.sample(min(200, len(dataset_df)), random_state=0).reset_index(drop=True)
dataset_df


In [None]:
# Add a 'label' column holding the integer id that cat2id assigns to each row's category.
dataset_df['label'] = dataset_df['cat'].map(cat2id)
# dataset_df = dataset_df[['text', 'label']]
dataset_df

## 01-2 datasets形式に変換

In [None]:
from datasets import Dataset

In [None]:
# Convert the pandas DataFrame into a `datasets.Dataset`.
dataset = Dataset.from_pandas(dataset_df)

In [None]:
dataset

## 01-3 データセットにトークン化したデータを追加

In [None]:
from transformers import AutoTokenizer

In [None]:
# Load the tokenizer for the 'cl-tohoku/bert-base-japanese-v2' checkpoint.
tokenizer= AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v2')

In [None]:
# Sanity check: tokenize one short sentence and inspect the output.
tokenizer('今日は暑かった')

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples``.

    Inputs longer than 512 tokens are truncated. The tokenizer output is
    returned unchanged.
    """
    token_limit = 512
    encoded = tokenizer(
        examples["text"],
        max_length=token_limit,
        truncation=True,
    )
    return encoded

# Apply preprocess_function to the dataset in batches (batched=True).
tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset

In [None]:
# `datasets.Dataset` has no pandas-style .sample(); shuffle with a fixed seed
# and take the first (up to) 4 rows instead, shown as a DataFrame for inspection.
tokenized_dataset.shuffle(seed=2).select(range(min(4, len(tokenized_dataset)))).to_pandas()

In [None]:
# tokenized_dataset = tokenized_dataset.remove_columns(['cat', 'column_to_remove2'])

In [None]:
# splited_dataset = tokenized_dataset.train_test_split(test_size=0.2)
# splited_dataset

In [None]:
# splited_dataset['train']['text'][0]

In [None]:
# splited_dataset['train']['cat_id'][0]

## 01-4 学習用と検証用に分ける

In [None]:
# Hold out 20% of the rows as the 'test' split. The fixed seed makes the
# split (and therefore the reported metrics) reproducible between runs.
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=0)
tokenized_dataset

In [None]:
tokenized_dataset['train']

# 02 modelの用意

In [None]:
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, BertForSequenceClassification

# Collator built from the tokenizer; passed to the Trainer as data_collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Sequence-classification model loaded from the same checkpoint as the tokenizer,
# with one output label per category.
model = AutoModelForSequenceClassification.from_pretrained("cl-tohoku/bert-base-japanese-v2", num_labels=len(categories))

# 03 学習

## 03-1 学習準備

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a prediction object.

    ``pred.label_ids`` is used as the ground truth and the argmax over the
    last axis of ``pred.predictions`` as the predicted class.

    Returns:
        A dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }

In [None]:
# Training configuration: 2 epochs, batch size 16 for both training and
# evaluation, with evaluation, logging and checkpoint saving all set to 'epoch'.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    # evaluation_strategy='epoch',
    eval_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    learning_rate=2e-5,
    use_cpu=True, # Set to False when using a GPU
)

# Plain Trainer wired to the model, the training arguments, the metric
# function, the train/test splits, the tokenizer and the padding collator.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# As it stands, the following error occurs:
# ValueError: You are trying to save a non contiguous tensor: `bert.encoder.layer.0.attention.self.query.weight` 
# which is not allowed. It either means you are trying to save tensors which are reference of each other in which
# case it's recommended to save only the full tensors, and reslice at load time, or simply call `.contiguous()` on 
# your tensor to pack it before saving.

## 03-2 学習

In [None]:

# trainer.train()

## 03-2-1 エラー対策：Trainerクラスのsave_modelをオーバーライド

### 発生したエラーコード
ValueError: You are trying to save a non contiguous tensor: bert.encoder.layer.0.attention.self.query.weight which is not allowed. It either means you are trying to save tensors which are reference of each other in which case it's recommended to save only the full tensors, and reslice at load time, or simply call .contiguous() on your tensor to pack it before saving.

### GPTの解説
このエラーは、保存しようとしているテンソルが非連続 (non-contiguous) であるために発生しています。これは、テンソルがメモリ内で連続していない場合に発生することがあります。この問題を解決するために、保存する前にテンソルを連続化する必要があります。

In [None]:
# Extend Trainer so that tensors are made contiguous before the model is saved.
class CustomTrainer(Trainer):
    """Trainer whose ``save_model`` packs non-contiguous parameters first.

    Works around ``ValueError: You are trying to save a non contiguous
    tensor`` raised while writing the model.
    """

    def save_model(self, output_dir=None, _internal_call=False):
        """Make every model parameter contiguous, then call ``Trainer.save_model``.

        Args:
            output_dir: Target directory; ``self.args.output_dir`` is used
                when this is ``None``.
            _internal_call: Forwarded unchanged to ``Trainer.save_model``.
        """
        if output_dir is None:
            output_dir = self.args.output_dir

        # Repack in place on whatever device the model currently lives on.
        # The model is deliberately NOT moved to the CPU here: this method is
        # also invoked for the per-epoch checkpoints, and moving the model
        # mid-training would leave it on the wrong device for the next epoch
        # of a GPU run.
        for param in self.model.parameters():
            if not param.is_contiguous():
                param.data = param.data.contiguous()

        super().save_model(output_dir, _internal_call)

In [None]:
# Use the new CustomTrainer class (same arguments as the plain Trainer).
trainer = CustomTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
# Run training with the CustomTrainer configured above.
trainer.train()

In [None]:
# Save the trainer state, then the model (no output_dir given, so
# CustomTrainer.save_model falls back to training_args.output_dir).
trainer.save_state()
trainer.save_model()

# 04 評価

In [None]:
# Predict on the 'test' split; the listed output keys are passed as ignore_keys.
pred_result = trainer.predict(tokenized_dataset['test'], ignore_keys=['loss', 'last_hidden_state', 'hidden_states', 'attentions'])
# Predicted class id per sample: argmax over axis 1, converted to a plain list.
pred_label= pred_result.predictions.argmax(axis=1).tolist()
print(pred_label)

In [None]:
from sklearn.metrics import classification_report

# Pass the full list of label ids explicitly. Without `labels`, the report is
# built only from classes that occur in the test labels or predictions, and
# classification_report raises ValueError when that count differs from
# len(target_names) - which can happen with a small test split.
print(classification_report(
    tokenized_dataset['test']['label'],
    pred_label,
    labels=list(range(len(categories))),
    target_names=categories,
    zero_division=0,
))