https://data-science-learning.com/archives/1085

In [1]:
import os
from glob import glob
import pandas as pd
import linecache
from sklearn.metrics import classification_report

# ０1 データセットの準備

In [2]:
categories = glob('./text/*')
categories = [os.path.basename(x) for x in categories if not x.endswith('.txt')]
categories

['dokujo-tsushin',
 'it-life-hack',
 'kaden-channel',
 'livedoor-homme',
 'movie-enter',
 'peachy',
 'smax',
 'sports-watch',
 'topic-news']

In [3]:
# カテゴリをid化する辞書
cat2id = {v:i for i, v in enumerate(categories)}
cat2id

{'dokujo-tsushin': 0,
 'it-life-hack': 1,
 'kaden-channel': 2,
 'livedoor-homme': 3,
 'movie-enter': 4,
 'peachy': 5,
 'smax': 6,
 'sports-watch': 7,
 'topic-news': 8}

In [4]:
def file2text(file):
    with open(file, 'r', encoding='utf8') as f:
        lines = f.readlines()

    text = ''
    
    for line in lines[2:]:
        text += line.replace('\n', '')

    return text

## 01-1 データをデータフレームに入れる

In [5]:

data_dic = {
    'cat': [],
    'text': [],
    # 'cat_id': []
}
                       
for cat in categories:
    print(cat)
    files = glob(f'./text/{cat}/*.txt')
    for i, file in enumerate(files):
        data_dic['cat'].append(cat)
        data_dic['text'].append(file2text(file))
        # data_dic['label'].append(cat2id[cat])

dataset_df = pd.DataFrame(data_dic)
dataset_df = dataset_df.sample(frac=1, random_state=0).reset_index(drop=True)
dataset_df = dataset_df[:200]


dokujo-tsushin
it-life-hack
kaden-channel
livedoor-homme
movie-enter
peachy
smax
sports-watch
topic-news


In [6]:
dataset_df['label'] = dataset_df['cat'].map(cat2id)
dataset_df = dataset_df[['text', 'label']]
dataset_df

Unnamed: 0,text,label
0,NY名物イベントが日本でも！名店グルメを気軽に楽しむニューヨークで20年続いている食の祭典「...,5
1,小沢一郎氏の妻が支援者に離婚を報告。「週刊文春」報じる13日、Web版「週刊文春」は、民主党...,8
2,【Sports Watch】田中＆里田の交際、アプローチは里田からグラビアアイドル・ほしのあ...,7
3,被災地の缶詰を途上国に…「正気じゃない。人殺しだ!!」30日、政府が被災地で製造された水産加...,8
4,仲間由紀恵さん、“生徒”亀梨和也さんを「大人の魅力が出てきた」と絶賛　11月に入り各地のイル...,5
...,...,...
195,短期連載 世界基準の仕事術 vol.1 【三木谷浩史×星野佳路×村上龍 スペシャルトークセッ...,3
196,もう糞アプリとは呼ばせない！これはもはや芸術だ【iPhoneでチャンスを掴め】iPhoneア...,1
197,“台湾版少女時代”スーパー7が少女時代を卑下する発言台湾版少女時代として知られるスーパー7が...,8
198,NTTドコモ、関東・甲信越限定でスマートフォンやケータイを複数台購入時に端末代金を割り引く「...,6


## 01-2 datesets形式に変換

In [7]:
from datasets import Dataset

In [8]:
dataset = Dataset.from_pandas(dataset_df)

In [9]:
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 200
})

## 01-3 データセットにトークン化したデータを追加

In [10]:
from transformers import AutoTokenizer

In [11]:
tokenizer= AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v2')



In [12]:
tokenizer('今日は暑かった')

{'input_ids': [2, 13711, 897, 2778, 11191, 881, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [13]:
def preprocess_function(examples):
    MAX_LENGTH = 512
    return tokenizer(examples["text"], max_length=MAX_LENGTH, truncation=True)

tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset



Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 200
})

In [14]:
# tokenized_dataset = tokenized_dataset.remove_columns(['cat', 'column_to_remove2'])

In [15]:
# splited_dataset = tokenized_dataset.train_test_split(test_size=0.2)
# splited_dataset

In [16]:
# splited_dataset['train']['text'][0]

In [17]:
# splited_dataset['train']['cat_id'][0]

# 02 modelの用意

In [18]:
from transformers import AutoTokenizer

tokenizer= AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v2')


In [19]:

def preprocess_function(examples):
    MAX_LENGTH = 512
    return tokenizer(examples["text"], max_length=MAX_LENGTH, truncation=True)

tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [20]:
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 160
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 40
    })
})

In [21]:
tokenized_dataset['train']

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 160
})

# datacollatorについて調べること

In [22]:
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, BertForSequenceClassification

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained("cl-tohoku/bert-base-japanese-v2", num_labels=len(categories))
# model = BertForSequenceClassification.from_pretrained("cl-tohoku/bert-base-japanese-v2", num_labels=len(categories))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 03 学習

## 03-1 学習準備

In [23]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {'accuracy':acc, 'f1':f1}

In [24]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    # evaluation_strategy='epoch',
    eval_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    learning_rate=2e-5,
    use_cpu=True, # GPUを使用する場合はFalse
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


## 03-2 学習

In [25]:

trainer.train()

  0%|          | 0/40 [00:00<?, ?it/s]

{'loss': 2.2036, 'grad_norm': 8.88569164276123, 'learning_rate': 1e-05, 'epoch': 1.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 2.0150513648986816, 'eval_accuracy': 0.275, 'eval_f1': 0.2261111111111111, 'eval_runtime': 37.859, 'eval_samples_per_second': 1.057, 'eval_steps_per_second': 0.132, 'epoch': 1.0}


ValueError: You are trying to save a non contiguous tensor: `bert.encoder.layer.0.attention.self.query.weight` which is not allowed. It either means you are trying to save tensors which are reference of each other in which case it's recommended to save only the full tensors, and reslice at load time, or simply call `.contiguous()` on your tensor to pack it before saving.

In [26]:
# Trainerクラスを拡張して保存する前にテンソルを連続化する
class CustomTrainer(Trainer):
    def save_model(self, output_dir=None, _internal_call=False):
        if output_dir is None:
            output_dir = self.args.output_dir
        
        self.model = self.model.to('cpu')  # モデルをCPUに移動
        
        # すべてのテンソルを連続化する
        for param in self.model.parameters():
            if not param.is_contiguous():
                param.data = param.data.contiguous()
        
        super().save_model(output_dir, _internal_call)

In [27]:
# 新しいCustomTrainerクラスを使用
trainer = CustomTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [28]:
trainer.train()

  0%|          | 0/40 [00:00<?, ?it/s]

{'loss': 1.6475, 'grad_norm': 10.035178184509277, 'learning_rate': 1e-05, 'epoch': 1.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.6821296215057373, 'eval_accuracy': 0.55, 'eval_f1': 0.5412862137862138, 'eval_runtime': 37.8098, 'eval_samples_per_second': 1.058, 'eval_steps_per_second': 0.132, 'epoch': 1.0}
{'loss': 1.3598, 'grad_norm': 10.194493293762207, 'learning_rate': 0.0, 'epoch': 2.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.53236985206604, 'eval_accuracy': 0.675, 'eval_f1': 0.6754761904761905, 'eval_runtime': 37.6897, 'eval_samples_per_second': 1.061, 'eval_steps_per_second': 0.133, 'epoch': 2.0}
{'train_runtime': 1124.0252, 'train_samples_per_second': 0.285, 'train_steps_per_second': 0.036, 'train_loss': 1.5036373138427734, 'epoch': 2.0}


TrainOutput(global_step=40, training_loss=1.5036373138427734, metrics={'train_runtime': 1124.0252, 'train_samples_per_second': 0.285, 'train_steps_per_second': 0.036, 'total_flos': 84200829419520.0, 'train_loss': 1.5036373138427734, 'epoch': 2.0})

In [29]:
trainer.save_state()
trainer.save_model()

# 04 評価

In [30]:
pred_result = trainer.predict(tokenized_dataset['test'], ignore_keys=['loss', 'last_hidden_state', 'hidden_states', 'attentions'])
pred_label= pred_result.predictions.argmax(axis=1).tolist()
print(pred_label)

  0%|          | 0/5 [00:00<?, ?it/s]

[1, 2, 4, 2, 5, 8, 6, 8, 0, 8, 6, 1, 8, 2, 1, 4, 6, 4, 6, 6, 0, 4, 7, 6, 0, 1, 7, 0, 7, 5, 4, 0, 2, 1, 5, 4, 1, 0, 1, 7]


In [32]:
from sklearn.metrics import classification_report
# print(classification_report(tokenized_dataset['test']['label'], pred_label))
print(classification_report(tokenized_dataset['test']['label'], pred_label, target_names=categories))

                precision    recall  f1-score   support

dokujo-tsushin       0.67      0.67      0.67         6
  it-life-hack       0.43      1.00      0.60         3
 kaden-channel       1.00      0.50      0.67         8
livedoor-homme       0.00      0.00      0.00         2
   movie-enter       1.00      1.00      1.00         6
        peachy       0.67      0.50      0.57         4
          smax       0.50      0.75      0.60         4
  sports-watch       1.00      0.67      0.80         6
    topic-news       0.25      1.00      0.40         1

      accuracy                           0.68        40
     macro avg       0.61      0.68      0.59        40
  weighted avg       0.76      0.68      0.68        40



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
