In [1]:
import utils
import torch
from tqdm import trange, tqdm
from torch.optim import AdamW
from torch.utils.data import DataLoader, random_split

from livedoor_datasets import LivedoorDataset
from model import Model
from transformers.optimization import get_linear_schedule_with_warmup
from transformers import LlamaTokenizer, BatchEncoding
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from accelerate import Accelerator


In [2]:
lr = 5e-10
max_seq_len = 128
seed = 42
batch_size = 1
epochs = 2
model_name = "stabilityai/japanese-stablelm-base-alpha-7b",
tokenizer_name = "novelai/nerdstash-tokenizer-v1"

In [3]:
accelerator = Accelerator()
taglist = utils.read_taglist()
tokenizer = LlamaTokenizer.from_pretrained(
    pretrained_model_name_or_path=tokenizer_name,
    additional_special_tokens=['__'],
    max_seq_len = max_seq_len,
)

In [4]:

def collate_fn(datalist) -> BatchEncoding:
    inputs = tokenizer(
        text=[text for (text, _) in datalist],
        truncation=True,
        padding=True,
        return_tensors="pt",
        max_length=max_seq_len,
    )
    labels = []
    for _, tag in datalist:
        labels.append(taglist.index(tag))
    labels = torch.LongTensor(labels)
    return BatchEncoding({ **inputs, "labels": labels })


In [5]:
def dataloaders():
    dataset = LivedoorDataset()
    all_num = len(dataset)
    train_num = int(all_num * 0.5)
    val_num = int(all_num - train_num)
    train_dataset, val_dataset = random_split(dataset, [train_num, val_num])
    train_dataloader = create_dataloader(train_dataset)
    val_dataloader = create_dataloader(val_dataset)
    return train_dataloader, val_dataloader

def create_dataloader(dataset):
    return DataLoader(
        dataset=dataset,
        collate_fn=collate_fn,
        batch_size=batch_size,
        shuffle=False,
        num_workers=4,
        pin_memory=True,
    )



In [6]:
train_dataloader, val_dataloader = dataloaders()
num_steps = len(train_dataloader)
model = Model(num_labels=len(taglist))

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# dictをtnsorに変換する。textとlabelsにする
def train(model:Model, train_dataloader, val_dataloader):
    torch.autograd.set_detect_anomaly(True)
    model.train()
    best_val_f1 = 0
    best_state_dict = model.state_dict()
    optimizer = AdamW(model.parameters(), lr=lr)
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=num_steps,
        num_training_steps=num_steps * epochs
    )

    model, train_dataloader, val_dataloader, optimizer, lr_scheduler = accelerator.prepare(
        model,
        train_dataloader,
        val_dataloader,
        optimizer,
        lr_scheduler,
    )

    for epoch in trange(epochs, dynamic_ncols=True):
        for batch in tqdm(train_dataloader, total=len(train_dataloader), dynamic_ncols=True):
            optimizer.zero_grad()
            output = model(**batch)
            loss = output.loss
            print(f"------------------------- loss:{loss}")
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
        model.eval()
        (accuracy, f1, precision, recall) = evaluate(model, val_dataloader)

        # if f1 > best_val_f1:
        #    best_val_f1 = f1
        #    best_state_dict = model.state_dict()

    model.load_state_dict(best_state_dict)
    model.eval()

    torch.save(best_state_dict, "model.pth")

def evaluate(model, dataloader):
    model.eval()
    total_loss = 0
    val_labels = []
    pred_labels = []

    for batch in tqdm(dataloader, total=len(dataloader), dynamic_ncols=True, leave=False):
        output = model(**batch)
        batch_size = batch.input_ids.size(0)
        loss = output.loss.item() * batch_size
        pred_labels += output.logits.argmax(dim=-1).tolist()
        val_labels += batch.labels.tolist()
        total_loss += loss

        accuracy = accuracy_score(pred_labels, val_labels)
        precision, recall, f1, _ = precision_recall_fscore_support(
            val_labels,
            pred_labels,
            average="macro",
            zero_division=0,
        )
        return (accuracy, f1, precision, recall)



In [8]:
train(model, train_dataloader, val_dataloader)

  0%|          | 0/2 [00:00<?, ?it/s]


eos: torch.float16
logits tensor([[ 0.5723,  0.5649, -1.1914,  0.2781, -0.5488,  1.8779,  0.1324,  0.2430,
          0.1755]], device='cuda:0', dtype=torch.float16,
       grad_fn=<MmBackward0>)
loss: tensor(2.4863, device='cuda:0', dtype=torch.float16,
       grad_fn=<NllLossBackward0>)
------------------------- loss:2.486328125



  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/kuninori/dev/llm-classfication/venv/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/kuninori/dev/llm-classfication/venv/lib/python3.10/site-packages/traitlets/config/application.py", line 1053, in launch_instance
    app.start()
  File "/home/kuninori/dev/llm-classfication/venv/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 737, in start
    self.io_loop.start()
  File "/home/kuninori/dev/llm-classfication/venv/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 195, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
    self._run_once()
  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in 

eos: torch.float16
logits tensor([[nan, nan, nan, nan, nan, nan, nan, nan, nan]], device='cuda:0',
       dtype=torch.float16, grad_fn=<MmBackward0>)
loss: tensor(nan, device='cuda:0', dtype=torch.float16, grad_fn=<NllLossBackward0>)
------------------------- loss:nan





RuntimeError: Function 'LogSoftmaxBackward0' returned nan values in its 0th output.