In [23]:
import pandas as pd
import os
import re
import numpy as np
from params.paths import ROOT_DIR
from api_requests.meeting_convo_collector import MeetingConvoCollector
from file_handling.file_read_writer import read_json, write_json, create_dir, write_file

In [24]:
#params
DATA_DIR = os.path.join(ROOT_DIR, 'data')
RESOURCE_DIR = os.path.join(ROOT_DIR, 'resource')

In [25]:
#loading the labels corresponding to the label ids
label2id = read_json(os.path.join(RESOURCE_DIR, "labels.json"))
id2label = {v:k for k, v in label2id.items()}
label2id = {k: int(v) for k, v in label2id.items()}
for k, v in label2id.items():
    print(f"{k}=>{v}")
print("___________________________________________________________")
for k, v in id2label.items():
	print(f"{k}=>{v}")

事実文=>0
質問文=>1
説明文=>2
意見文=>3
その他=>4
___________________________________________________________
0=>事実文
1=>質問文
2=>説明文
3=>意見文
4=>その他


# Preparing dataset

In [26]:
from datasets import load_dataset
PATH_TO_DATA_FILE = os.path.join(DATA_DIR, "labelled_data_77.csv")
dataset = load_dataset('csv', data_files=PATH_TO_DATA_FILE, split="train").train_test_split(test_size=0.2)
dataset = dataset.filter(lambda example: bool(example["speech"])).filter(lambda example: len(example['speech'])<1000)
dataset.rename_column("label", "label_name")
dataset = dataset.rename_columns({"label":"label_name", "label_id":"label"})
print(dataset)
print(dataset["train"][199])

Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-33ac082739253181/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/csv/default-33ac082739253181/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-7dc635844416dbe8.arrow and /root/.cache/huggingface/datasets/csv/default-33ac082739253181/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-2dd1f11b46d58f10.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-33ac082739253181/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-daa8a9d4e6b64f7a.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-33ac082739253181/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-712656bc1767fdaa.arrow
Loading cached processed dataset at /root/.cache/huggingface/datase

DatasetDict({
    train: Dataset({
        features: ['speech', 'label_name', 'label', 'record_position'],
        num_rows: 1139
    })
    test: Dataset({
        features: ['speech', 'label_name', 'label', 'record_position'],
        num_rows: 283
    })
})
{'speech': '総理は、自らの権限と影響力を理解していないと言わざるを得ません', 'label_name': '意見文', 'label': 3, 'record_position': 24}


In [27]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-v3")

def tokenize_function(examples):
    result = tokenizer(examples["speech"], truncation=True, padding=True)
    return result

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

Map:   0%|          | 0/1139 [00:00<?, ? examples/s]

Map:   0%|          | 0/283 [00:00<?, ? examples/s]

# Training

In [28]:
import evaluate

accuracy = evaluate.load("accuracy")

In [29]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [30]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "cl-tohoku/bert-base-japanese-v3", num_labels=5, id2label=id2label, label2id=label2id
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/447M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-v3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
training_args = TrainingArguments(
    output_dir="model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    label_names=['label_id']
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 8.00 GiB total capacity; 7.13 GiB already allocated; 0 bytes free; 7.29 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF