https://data-science-learning.com/archives/1085

In [1]:
import os
from glob import glob
import pandas as pd
import linecache
from sklearn.metrics import classification_report

# ０1 データセットの準備

In [2]:
categories = glob('./text/*')
categories = [os.path.basename(x) for x in categories if not x.endswith('.txt')]
categories

['movie-enter',
 'it-life-hack',
 'kaden-channel',
 'topic-news',
 'livedoor-homme',
 'peachy',
 'sports-watch',
 'dokujo-tsushin',
 'smax']

In [3]:
# カテゴリをid化する辞書
cat2id = {v:i for i, v in enumerate(categories)}
cat2id

{'movie-enter': 0,
 'it-life-hack': 1,
 'kaden-channel': 2,
 'topic-news': 3,
 'livedoor-homme': 4,
 'peachy': 5,
 'sports-watch': 6,
 'dokujo-tsushin': 7,
 'smax': 8}

In [4]:
def file2text(file):
    with open(file, 'r', encoding='utf8') as f:
        lines = f.readlines()

    text = ''
    
    for line in lines[2:]:
        text += line.replace('\n', '')

    return text

## 01-1 データをデータフレームに入れる

In [5]:

data_dic = {
    'cat': [],
    'text': [],
    # 'cat_id': []
}
                       
for cat in categories:
    print(cat)
    files = glob(f'./text/{cat}/*.txt')
    for i, file in enumerate(files):
        data_dic['cat'].append(cat)
        data_dic['text'].append(file2text(file))
        # data_dic['label'].append(cat2id[cat])

dataset_df = pd.DataFrame(data_dic)
dataset_df = dataset_df.sample(frac=1, random_state=0).reset_index(drop=True)
dataset_df = dataset_df[:200]


movie-enter
it-life-hack
kaden-channel
topic-news
livedoor-homme
peachy
sports-watch
dokujo-tsushin
smax


In [6]:
dataset_df['label'] = dataset_df['cat'].map(cat2id)
dataset_df = dataset_df[['text', 'label']]
dataset_df

Unnamed: 0,text,label
0,インタビュー：植松晃士さん「黒目の印象を変えて、脱“変わり映えのしない女”」　美しくなりたい...,5
1,NTTドコモのLTEは地下鉄トンネル内で速いのか？都営新宿線にてスピードテストをしてみた【レ...,8
2,独女世代が共感「最後から二番目の恋」で描かれる「大人の孤独」派遣社員のジュンコさん(仮名・3...,7
3,GALAXY Noteでお絵かきしよう！ドット絵を打てる「Pixel Art editor」...,8
4,超危険な日傘で注意したいNG行為／父親の愛を感じた体験談など−【ライフスタイル】週間ランキン...,5
...,...,...
195,TOKIOの出演CMに海外から非難TOKIOが出演した「フードアクションニッポン」のCM「食...,3
196,何だこのおもちゃみたいなの？ Googleロゴが子供のおもちゃみたいになっている理由前回紹介...,1
197,めざせ、空き缶つぶし世界一！ストレス解消もできる「クシャッと」【iPhoneアプリ】清水幸子...,8
198,香川、移籍のうわさを完全に否定　どうやら、香川真司にも「エアオファー」が届いていたようだ。　...,6


## 01-2 datesets形式に変換

In [7]:
from datasets import Dataset

In [8]:
dataset = Dataset.from_pandas(dataset_df)

In [9]:
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 200
})

## 01-3 データセットにトークン化したデータを追加

In [10]:
from transformers import AutoTokenizer

In [11]:
tokenizer= AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v2')

In [12]:
tokenizer('今日は暑かった')

{'input_ids': [2, 13711, 897, 2778, 11191, 881, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [13]:
def preprocess_function(examples):
    MAX_LENGTH = 512
    return tokenizer(examples["text"], max_length=MAX_LENGTH, truncation=True)

tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset



Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 200
})

In [14]:
# tokenized_dataset = tokenized_dataset.remove_columns(['cat', 'column_to_remove2'])

In [15]:
# splited_dataset = tokenized_dataset.train_test_split(test_size=0.2)
# splited_dataset

In [16]:
# splited_dataset['train']['text'][0]

In [17]:
# splited_dataset['train']['cat_id'][0]

# 02 modelの用意

In [18]:
from transformers import AutoTokenizer

tokenizer= AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v2')


In [19]:

def preprocess_function(examples):
    MAX_LENGTH = 512
    return tokenizer(examples["text"], max_length=MAX_LENGTH, truncation=True)

tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [20]:
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 160
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 40
    })
})

In [21]:
tokenized_dataset['train']

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 160
})

# datacollatorについて調べること

In [22]:
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, BertForSequenceClassification

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained("cl-tohoku/bert-base-japanese-v2", num_labels=len(categories))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 03 学習

## 03-1 学習準備

In [23]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {'accuracy':acc, 'f1':f1}

In [30]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    # evaluation_strategy='epoch',
    eval_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    learning_rate=2e-5,
    use_cpu=True, # GPUを使用する場合はFalse
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# このままでは以下のエラーが出る
# ValueError: You are trying to save a non contiguous tensor: `bert.encoder.layer.0.attention.self.query.weight` 
# which is not allowed. It either means you are trying to save tensors which are reference of each other in which
# case it's recommended to save only the full tensors, and reslice at load time, or simply call `.contiguous()` on 
# your tensor to pack it before saving.

## 03-2 学習

In [25]:

# trainer.train()

## 03-1 エラー対策：Trainerクラスのsave_modelをオーバーライド

In [31]:
# Trainerクラスを拡張して保存する前にテンソルを連続化する
class CustomTrainer(Trainer):
    def save_model(self, output_dir=None, _internal_call=False):
        if output_dir is None:
            output_dir = self.args.output_dir
        
        self.model = self.model.to('cpu')  # モデルをCPUに移動
        
        # すべてのテンソルを連続化する
        for param in self.model.parameters():
            if not param.is_contiguous():
                param.data = param.data.contiguous()
        
        super().save_model(output_dir, _internal_call)

In [32]:
# 新しいCustomTrainerクラスを使用
trainer = CustomTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [33]:
trainer.train()

  0%|          | 0/20 [00:00<?, ?it/s]

{'loss': 1.496, 'grad_norm': 7.724746227264404, 'learning_rate': 1e-05, 'epoch': 1.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.4555186033248901, 'eval_accuracy': 0.725, 'eval_f1': 0.6996697746697746, 'eval_runtime': 7.1964, 'eval_samples_per_second': 5.558, 'eval_steps_per_second': 0.417, 'epoch': 1.0}
{'loss': 1.1998, 'grad_norm': 6.1741108894348145, 'learning_rate': 0.0, 'epoch': 2.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.3726389408111572, 'eval_accuracy': 0.75, 'eval_f1': 0.718482905982906, 'eval_runtime': 7.2774, 'eval_samples_per_second': 5.496, 'eval_steps_per_second': 0.412, 'epoch': 2.0}
{'train_runtime': 380.3391, 'train_samples_per_second': 0.841, 'train_steps_per_second': 0.053, 'train_loss': 1.3479326725006104, 'epoch': 2.0}


TrainOutput(global_step=20, training_loss=1.3479326725006104, metrics={'train_runtime': 380.3391, 'train_samples_per_second': 0.841, 'train_steps_per_second': 0.053, 'total_flos': 84200829419520.0, 'train_loss': 1.3479326725006104, 'epoch': 2.0})

In [34]:
trainer.save_state()
trainer.save_model()

# 04 評価

In [35]:
pred_result = trainer.predict(tokenized_dataset['test'], ignore_keys=['loss', 'last_hidden_state', 'hidden_states', 'attentions'])
pred_label= pred_result.predictions.argmax(axis=1).tolist()
print(pred_label)

  0%|          | 0/3 [00:00<?, ?it/s]

[8, 8, 2, 1, 8, 0, 1, 2, 1, 6, 7, 3, 6, 6, 6, 3, 6, 0, 0, 1, 0, 3, 8, 8, 1, 6, 1, 0, 6, 0, 7, 7, 7, 0, 1, 5, 1, 1, 6, 3]


In [36]:
from sklearn.metrics import classification_report
# print(classification_report(tokenized_dataset['test']['label'], pred_label))
print(classification_report(tokenized_dataset['test']['label'], pred_label, target_names=categories))

                precision    recall  f1-score   support

   movie-enter       0.71      1.00      0.83         5
  it-life-hack       0.44      1.00      0.62         4
 kaden-channel       1.00      0.50      0.67         4
    topic-news       1.00      1.00      1.00         4
livedoor-homme       0.00      0.00      0.00         2
        peachy       1.00      0.20      0.33         5
  sports-watch       1.00      1.00      1.00         8
dokujo-tsushin       1.00      1.00      1.00         4
          smax       0.40      0.50      0.44         4

      accuracy                           0.75        40
     macro avg       0.73      0.69      0.65        40
  weighted avg       0.80      0.75      0.72        40



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
