In [1]:
import os
import numpy as np
import random

import csv
import pandas as pd

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset
from tqdm import tqdm

from models.prefix_gptneox_model import PrefixGPTNeoXLMHeadModel

from train_utils.data_utils import batch_tokenize_preprocess_decoder
from utils.args_utils import Args

import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Bring in the Hugging Face training entry points and echo the installed
# library version so the recorded outputs can be tied to a specific release.
import transformers
from transformers import (
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

print(transformers.__version__)

4.24.0


In [3]:
def set_seed(seed=100):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer handed unchanged to Python's ``random``, NumPy's global
            RNG and PyTorch. When at least one CUDA device is visible, all
            CUDA generators are seeded with the same value as well.

    Returns:
        None.
    """
    # Same order as before: Python -> NumPy -> PyTorch (CPU).
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # Only touch the CUDA generators when a GPU is actually present.
    if torch.cuda.device_count() > 0:
        torch.cuda.manual_seed_all(seed)

In [4]:
# Single seed shared by the RNG setup here and by the Trainer configuration
# further down, so one value controls the whole run.
random_seed = 100
set_seed(seed=random_seed)

In [5]:
# MODEL ARGS
# Configuration object consumed by the prefix-tuning model constructor.
# `Args` is a project-local container (utils.args_utils); the meaning of each
# field is defined by the model code, which is not visible in this notebook.
args = Args()

# Hugging Face hub id of the backbone language model.
args.pretrained_model = "EleutherAI/polyglot-ko-1.3b"
# No extra special tokens are registered.
args.special_tokens = None
# Freeze the pretrained LM weights
args.freeze_plm = True
# Train the prefix weights
args.freeze_prefix = False

# hyperparams
args.prefix_dropout = 0.1
# Number of prefix positions; the model logs this value when it is built.
args.prefix_sequence_length = 8
# NOTE(review): presumably the hidden width of the prefix reparameterisation
# MLP — confirm against models/prefix_gptneox_model.py.
args.mid_dim = 800

In [6]:
# Load Initial Model
# Build the prefix-tuning model from the `args` configured in the previous
# cell. The class is project-local (models.prefix_gptneox_model); the instance
# also exposes the `tokenizer` attribute that the data-loading cell reads.
model = PrefixGPTNeoXLMHeadModel(args)

prefix-tuning sequence length is 8.


In [7]:
# Load Data
# Data from https://github.com/songys/Chatbot_datas
# Both splits are tab-separated files that were prepared ahead of time.
tokenizer = model.tokenizer
max_length = 128

df_train = pd.read_csv("processed_data/train.tsv", sep="\t")
df_val = pd.read_csv("processed_data/val.tsv", sep="\t")
print(df_train.shape, df_val.shape, df_train.columns)


def _tokenize_batch(batch):
    """Run the project's decoder preprocessing on one batch of rows."""
    return batch_tokenize_preprocess_decoder(batch, tokenizer, max_length)


# The same batched preprocessing is applied to the train and validation splits.
tr_ds = Dataset.from_pandas(df_train).map(_tokenize_batch, batched=True)
val_ds = Dataset.from_pandas(df_val).map(_tokenize_batch, batched=True)


(9576, 2) (1064, 2) Index(['source', 'target'], dtype='object')


100%|██████████| 10/10 [00:00<00:00, 24.60ba/s]
100%|██████████| 2/2 [00:00<00:00, 43.62ba/s]


In [8]:
# PREPARE TRAIN
# Start a Weights & Biases run and attach the model so that gradients and
# parameters are logged (log="all") every 10 steps.
wandb.init(project="prefixtuning-chatbot", entity = "id4thomas")
wandb.watch(model, log="all", log_freq=10)

# Root folder for checkpoints and logs produced by this run.
out_dir = "weights"

training_args = TrainingArguments(
    run_name="prefix_gptneox_chatbot",

    # --- Schedule ---------------------------------------------------------
    num_train_epochs=3,
    learning_rate=5e-5,

    # --- Batching ---------------------------------------------------------
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,

    # --- Checkpointing ----------------------------------------------------
    output_dir=os.path.join(out_dir, "checkpoints"),
    save_strategy="steps",  # "steps" or "epoch"
    save_steps=80,
    save_total_limit=1,
    load_best_model_at_end=True,
    overwrite_output_dir=True,

    # --- Evaluation -------------------------------------------------------
    evaluation_strategy="steps",
    # Stated explicitly instead of relying on the fallback to `logging_steps`:
    # `load_best_model_at_end=True` needs evaluation and checkpointing to line
    # up, so the evaluation cadence is pinned to the same value as
    # `save_steps`. The recorded run already evaluated every 80 steps, so the
    # schedule itself is unchanged.
    eval_steps=80,
    metric_for_best_model="eval_loss",

    # --- Logging ----------------------------------------------------------
    logging_dir=out_dir,
    logging_steps=80,
    disable_tqdm=False,
    report_to="wandb",

    # --- System -----------------------------------------------------------
    seed=random_seed,
    fp16=False,
    bf16=False,
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mid4thomas[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
# mlm=False selects causal (next-token) language-modeling collation rather
# than masked-LM collation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Wire the model, the run configuration and both dataset splits together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tr_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
)

In [10]:
# Run the training loop configured on `trainer`. The call is left as the
# cell's last expression so the returned TrainOutput summary is displayed.
trainer.train()

The following columns in the training set don't have a corresponding argument in `PrefixGPTNeoXLMHeadModel.forward` and have been ignored: source, target. If source, target are not expected by `PrefixGPTNeoXLMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 9576
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 900
  Number of trainable parameters = 80397088
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
80,2.2361,1.870337
160,1.8348,1.826008
240,1.798,1.803525
320,1.7884,1.791224
400,1.7405,1.778697
480,1.7358,1.770133
560,1.7312,1.759744
640,1.6988,1.754155
720,1.6872,1.75058
800,1.6837,1.746201


The following columns in the evaluation set don't have a corresponding argument in `PrefixGPTNeoXLMHeadModel.forward` and have been ignored: source, target. If source, target are not expected by `PrefixGPTNeoXLMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1064
  Batch size = 32
Saving model checkpoint to weights/checkpoints/checkpoint-80
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
The following columns in the evaluation set don't have a corresponding argument in `PrefixGPTNeoXLMHeadModel.forward` and have been ignored: source, target. If source, target are not expected by `PrefixGPTNeoXLMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1064
  Batch size = 32
Saving model checkpoint to weights/checkpoints/checkpoint-160
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [weights/checkpoints/checkpoint-8

TrainOutput(global_step=900, training_loss=1.7819067086113825, metrics={'train_runtime': 1413.535, 'train_samples_per_second': 20.324, 'train_steps_per_second': 0.637, 'total_flos': 0.0, 'train_loss': 1.7819067086113825, 'epoch': 3.0})

In [11]:
# Only save prefix weights
# Export every state-dict entry that does not belong to the wrapped pretrained
# LM (its keys contain "pretrain_model"), leaving a small file with just the
# prefix parameters and buffers.
prefix_dir = "prefix_weights"
# NOTE(review): the file name advertises "ep30" and "1e-5", but this notebook
# trains for 3 epochs at lr 5e-5 (and "1r" looks like a typo for "lr"). The
# name is kept unchanged so existing loaders keep working — confirm and rename
# together with any consumer.
prefix_path = os.path.join(prefix_dir, "gptneox_ep30_1r1e-5.bin")

state_dict = model.state_dict()
layer_keys = list(state_dict.keys())

filtered = [key for key in layer_keys if "pretrain_model" not in key]
print(filtered)

processed_dict = {key: state_dict[key] for key in filtered}

# torch.save does not create missing parent folders; make sure the target
# directory exists so the export also works on a fresh checkout.
os.makedirs(prefix_dir, exist_ok=True)
torch.save(processed_dict, prefix_path)

['input_tokens', 'wte.weight', 'control_trans.0.weight', 'control_trans.0.bias', 'control_trans.2.weight', 'control_trans.2.bias']


In [12]:
# Write the trainer's current model weights and the tokenizer files into the
# same folder so they can be reloaded together.
best_dir = os.path.join("weights", "best")
trainer.save_model(best_dir)
tokenizer.save_pretrained(best_dir)

Saving model checkpoint to weights/best
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in weights/best/tokenizer_config.json
Special tokens file saved in weights/best/special_tokens_map.json


('weights/best/tokenizer_config.json',
 'weights/best/special_tokens_map.json',
 'weights/best/tokenizer.json')