In [1]:
#=======================================================================================================================================
# Huggingface load_dataset 으로 MLM 훈련 하기
#
# => load_dataset 으로 wiki 말뭉치를 로딩하고, 이를 토크화 시키고, 
# input_ids 에 대해 15% 확률로 [MASK]를 씌워서, 실제 모델을 훈련시키는 예제 
#
# => MLM 훈련 말뭉치는 bongsoo/moco-corpus-kowiki202206 사용, 평가 말뭉치는 bongsoo/bongevalsmall 사용
#
# 출처 : https://wikidocs.net/166817
#=======================================================================================================================================

import torch
import os

from tqdm.notebook import tqdm
from transformers import AutoTokenizer, DistilBertTokenizerFast, BertConfig, DistilBertForMaskedLM

import sys
sys.path.append("..")
from myutils import GPU_info, seed_everything, mlogging

# wand 비활성화 
# => trainer 로 훈련시키면 기본이 wandb 활성화이므로, 비활성화 시킴
os.environ["WANDB_DISABLED"] = "true"

In [2]:
# 훈련시킬 말뭉치(사전 만들때 동일한 말뭉치 이용)
#input_corpus = "../../data11/my_corpus/my/pre-kowiki-20220620-1줄.txt"
#input_corpus = "bongsoo/moco-corpus"  # huggingface에 등록된 말뭉치 이용
input_corpus = "../../data11/my_corpus/kowiki-202206-nlp-corpus.txt"  

# eval 말뭉치 
#eval_corpus = "../../data11/my_corpus/bong_small_eval.txt"
eval_corpus = "bongsoo/bongevalsmall"

# 기존 사전훈련된 모델
model_path = "distilbert-base-multilingual-cased"

# 기존 사전 + 추가된 사전 파일
vocab_path="../tokenizer_sample/moco-vocab/mdistilbertV1.2"

# 출력
OUTPATH = '../../data11/model/distilbert/mdistilbertV1.2-temp/'

############################################################################
# tokenizer 관련 hyper parameter 설정
############################################################################
batch_size = 32       # batch_size
token_max_len = 128   # token_seq_len
############################################################################


device = GPU_info()
print(device)

#seed 설정
seed_everything(333)

#logging 설정
logger =  mlogging(loggername="distilbert-MLM-Trainer", logfilename="../../log/distilbert-MLM-Trainer")

True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30
cuda:0
logfilepath:../../log/distilbert-MLM-Trainer_2022-08-24.log


In [3]:
# tokeinzier 생성
# tokenizer 생성
# => BertTokenizer, BertTokenizerFast 둘중 사용하면됨
tokenizer = DistilBertTokenizerFast.from_pretrained(vocab_path, max_len=token_max_len, do_lower_case=False)
#tokenizer = AutoTokenizer.from_pretrained(vocab_path, max_len=token_max_len, do_lower_case=False)
# fast 토크너나이즈인지 확인
print(f'{vocab_path} is_fast:{tokenizer.is_fast}')

# speical 토큰 계수 + vocab 계수 - 이미 vocab에 포함된 speical 토큰 계수(5)
vocab_size = len(tokenizer.all_special_tokens) + tokenizer.vocab_size - 5 + 1
#vocab_size = len(tokenizer.all_special_tokens) + tokenizer.vocab_size - 5
print('*special_token_size: {}, *tokenizer.vocab_size: {}'.format(len(tokenizer.all_special_tokens), tokenizer.vocab_size))
print('*vocab_size: {}'.format(vocab_size))
print('*tokenizer_len: {}'.format(len(tokenizer)))

# 모델 로딩 further pre-training 
#config = BertConfig.from_pretrained(model_path)
model = DistilBertForMaskedLM.from_pretrained(model_path, from_tf=bool(".ckpt" in model_path)) 
#model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')    

#################################################################################
# 모델 embedding 사이즈를 tokenizer 크기 만큼 재 설정함.
# 재설정하지 않으면, 다음과 같은 에러 발생함
# CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)` CUDA 에러가 발생함
#  indexSelectLargeIndex: block: [306,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
#
#     해당 오류는 기존 Embedding(8002, 768, padding_idx=1) 처럼 입력 vocab 사이즈가 8002인데,
#     0~8001 사이를 초과하는 word idx 값이 들어가면 에러 발생함.
#################################################################################
model.resize_token_embeddings(len(tokenizer))

model.to(device)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizerFast'.


../tokenizer_sample/moco-vocab/mdistilbertV1.2 is_fast:True
*special_token_size: 5, *tokenizer.vocab_size: 164314
*vocab_size: 164315
*tokenizer_len: 164314


DistilBertForMaskedLM(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(164314, 768)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_featu

In [4]:
#==================================================================================================
# load_dataset을 이용하여, 훈련/평가 dataset 로딩.
#
# [로컬 데이터 파일 로딩]
# => dataset = load_dataset("text", data_files='로컬.txt')       # text 로컬 파일 로딩
# => dataset = load_dataset("csv", data_files='로컬.csv')        # csv 로컬 파일 로딩
# => dataset = load_dataset("csv", data_files='로컬.tsv', delimiter="\t")  # tsv 로컬 파일 로딩
# => dataset = load_dataset("json", data_files='로컬.json')      # json 로컬 파일 로딩
# => dataset = load_dataset("pandas", data_files='로컬.pkl')     # pickled dataframe 로컬 파일 로딩
#
# [원격 데이터 파일 로딩]
# url = "https://github.com/crux82/squad-it/raw/master/"
# data_files = {
#    "train": url + "SQuAD_it-train.json.gz",
#    "test": url + "SQuAD_it-test.json.gz",
# }
# squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
#
# 출처 : https://wikidocs.net/166816
#==================================================================================================

from datasets import load_dataset

# 훈련 말뭉치 로딩
#train_dataset = load_dataset(input_corpus)
train_dataset = load_dataset("text", data_files=input_corpus) # text 로컬 파일 로딩

# 평가 말뭉치 로딩
eval_dataset = load_dataset(eval_corpus)

# train_dataset 출력해봄
print(f"train_dataset=======================================")
print(train_dataset)
print(train_dataset['train']['text'][0:3])

print(f'\r\n\r\n')

# eval_dataset 출력해봄
print(f"eval_dataset========================================")
print(eval_dataset)
print(eval_dataset['test']['text'][0:3])

Using custom data configuration default-c5083eed7ffe81c1
Reusing dataset text (/MOCOMSYS/.cache/huggingface/datasets/text/default-c5083eed7ffe81c1/0.0.0/08f6fb1dd2dab0a18ea441c359e1d63794ea8cb53e7863e6edf8fc5655e47ec4)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration bongsoo--bongevalsmall-cfa82c943ea1c946
Reusing dataset text (/MOCOMSYS/.cache/huggingface/datasets/text/bongsoo--bongevalsmall-cfa82c943ea1c946/0.0.0/08f6fb1dd2dab0a18ea441c359e1d63794ea8cb53e7863e6edf8fc5655e47ec4)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 7680008
    })
})
['Refer to the V$SYSTEM_EVENT view for time waited and average waits for thefollowing actions:', 'To estimate the time waited for reads incurred by rereading data blocks that had tobe written to disk because of a request from another instance, multiply the statistic(for example, the time waited for db ﬁle sequential reads) by the percentage of readI/O caused by previous cache ﬂushes as shown in this formula:', 'Where "lock buffers for read" is the value for lock converts from N to S derived fromV$LOCK_ACTIVITY and "physical reads" is from the V$SYSSTAT view.']



DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 200
    })
})
['국토교통부 관계자는  실무기구에서는 각 업계별로 규제혁신형 플랫폼 택시를 하기 위해서는 어떤 규제를 풀어야 한다는 자기 안이 있어야 한다 고 말했다 ', '국책연구기관의 한 관계자는  위원회가 전문성과 대표성을 갖추고 본연의 장점을 최대한 살리기 위해서는 외부 감시와 통제가 보다 활성화돼야 한다 고 지적했다 ', '게임업계 관계자는  현장 수요보다 의료진 등 특정한 누군가의 이익을 위해 게임을 중독물질  질병으로 만들

In [5]:
# tokenizer 처리
def tokenizer_function(examples):
    result =  tokenizer(examples['text'], truncation=True, max_length=token_max_len, return_overflowing_tokens=True)
    
    # 신규 인덱스와 이전 인덱스와의 매핑 추출
    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result


# batched=True 하면 빠른 tokenizer 이용(Rust)
%time train_dataset_fast = train_dataset.map(tokenizer_function, batched=True)

%time eval_dataset_fast = eval_dataset.map(tokenizer_function, batched=True)

'''
%time tokenized_dataset = text_dataset.map(tokenizer_function, batched=False)
print(tokenized_dataset_fast['train']['text'][0:2])
'''

Loading cached processed dataset at /MOCOMSYS/.cache/huggingface/datasets/text/default-c5083eed7ffe81c1/0.0.0/08f6fb1dd2dab0a18ea441c359e1d63794ea8cb53e7863e6edf8fc5655e47ec4/cache-2652c7ec115fac69.arrow


CPU times: user 225 ms, sys: 49 ms, total: 274 ms
Wall time: 271 ms


  0%|          | 0/1 [00:00<?, ?ba/s]

CPU times: user 258 ms, sys: 114 ms, total: 372 ms
Wall time: 78.6 ms


"\n%time tokenized_dataset = text_dataset.map(tokenizer_function, batched=False)\nprint(tokenized_dataset_fast['train']['text'][0:2])\n"

In [6]:
print(f"train_dataset_fast=======================================")
print(train_dataset_fast)
print(f'*fast_len:{len(train_dataset_fast["train"])}, len:{len(train_dataset["train"])}')  # fast_dataset과 dataset 길이를 비교함
print(train_dataset_fast['train'][0:2])

print(f'\r\n\r\n')

print(f"eval_dataset_fast=======================================")
print(eval_dataset_fast)
print(f'*fast_len:{len(eval_dataset_fast["test"])}, len:{len(eval_dataset["test"])}')  # fast_dataset과 dataset 길이를 비교함
print(eval_dataset_fast['test'][0:2])

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 7686169
    })
})
*fast_len:7686169, len:7680008
{'text': ['Refer to the V$SYSTEM_EVENT view for time waited and average waits for thefollowing actions:', 'To estimate the time waited for reads incurred by rereading data blocks that had tobe written to disk because of a request from another instance, multiply the statistic(for example, the time waited for db ﬁle sequential reads) by the percentage of readI/O caused by previous cache ﬂushes as shown in this formula:'], 'input_ids': [[101, 150640, 10114, 10105, 159, 109, 146671, 168, 152248, 17904, 10142, 10635, 159651, 10111, 13551, 147264, 10142, 146964, 22115, 131, 102], [101, 11469, 78059, 10105, 10635, 159651, 10142, 91160, 158425, 10155, 11639, 66058, 10230, 11165, 47352, 10189, 10374, 149221, 13398, 10114, 50169, 12373, 10108, 169, 37449, 10188, 12864, 34469, 117, 154198, 10105, 154658, 113, 10142, 14351, 117, 10105, 106

In [7]:
# MLM을 위한 DataCollatorForLangunageModeling 호출
from transformers import DataCollatorForLanguageModeling

# input_ids에 대해 MLM 만들기
data_collator = DataCollatorForLanguageModeling(    # [MASK] 를 씌우는 것은 저희가 구현하지 않아도 됩니다! :-)
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# input_ids MLM 만들고 출력 해봄
mlm_train_sample = data_collator(train_dataset_fast['train']['input_ids'][0:2])
mlm_eval_sample = data_collator(eval_dataset_fast['test']['input_ids'][0:2])

print(f"train_dataset_fast(MLM)=======================================")
print(mlm_train_sample['input_ids'][0])
print(train_dataset_fast['train'][0])

print(f'\r\n\r\n')

print(f"eval_dataset_fast(MLM)=======================================")
print(mlm_eval_sample['input_ids'][0])
print(eval_dataset_fast['test'][0])

tensor([   101,    103,  10114,  10105,    159,    109, 146671,  45011, 152248,
         17904,  10142,  10635, 159651,  10111,  13551, 147264,  10142, 146964,
         22115,    131,    102,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0])
{'text': 'Refer to the V$SYSTEM_EVENT view for time waited and average waits for thefollowing actions:', 'input_ids': [101, 150640, 10114, 10105, 159, 109, 146671, 168, 152248, 17904, 10142, 10635, 159651, 10111, 13551, 147264, 10142, 146964, 22115, 131, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}



tensor([   101, 122834, 133309, 122378,  110

In [8]:
# 훈련 trainer 설정 
# trainer 

from transformers import Trainer, TrainingArguments

#########################################################################################
# hyper parameter 설정
#########################################################################################

epochs = 8           # epochs
#lr = 3e-5  # 학습률

total_optim_steps = len(train_dataset_fast["train"]) * epochs // batch_size   # 총 optimize(역전파) 스탭수 = 훈련dataset 계수 * epochs // 배치 크기
eval_steps=int(total_optim_steps * 0.05)           # 평가 스탭수
logging_steps=eval_steps                           # 로깅 스탭수(*평가스탭수 출력할때는 평가스탭수와 동일하게)
save_steps=int(total_optim_steps * 0.1)            # 저장 스탭수 
save_total_limit=2                                 # 마지막 2개 남기고 삭제 

print(f'*total_optim_steps: {total_optim_steps}, *eval_steps:{eval_steps}, *logging_steps:{logging_steps}, *save_steps:{save_steps}')
#########################################################################################

# cpu 사용이면 'no_cuda = True' 설정함.
no_cuda = False
if device == 'cpu':
    no_cuda = True
print(f'*no_cuda: {no_cuda}')

training_args = TrainingArguments(
    no_cuda = no_cuda,                      # GPU 사용  안함
    output_dir = OUTPATH,                   # 출력 모델 저장 경로 
    overwrite_output_dir=True,         
    num_train_epochs=epochs,                # 에폭
    #learning_rate=lr,                      # lr: 기본 5e-5
    per_gpu_train_batch_size=batch_size,    # 배치 사이즈 
    save_steps=save_steps,                  # step 수마다 모델을 저장
    save_total_limit=save_total_limit,      # 마지막 두 모델 빼고 과거 모델은 삭제
    evaluation_strategy="steps",            # 평가 전략 : steps
    eval_steps=eval_steps,                  # 평가할 스텝수
    logging_steps=logging_steps             # 로깅할 스탭수
)

# trainer로 훈련할때는 [mask] 처리된 input_ids 만 dataset으로 넘겨주면 됨.
train_dataset_fast_input_ids = train_dataset_fast['train']['input_ids']
eval_dataset_fast_input_ids = eval_dataset_fast['test']['input_ids']

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,  #MLM(Masked Language Model)
    train_dataset=train_dataset_fast_input_ids,   # 훈련 데이터셋
    eval_dataset=eval_dataset_fast_input_ids      # 평가 데이터셋
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


*total_optim_steps: 1921542, *eval_steps:96077, *logging_steps:96077, *save_steps:192154
*no_cuda: False


In [None]:
# 훈련 시작
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
***** Running training *****
  Num examples = 7686169
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1921544
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Step,Training Loss,Validation Loss


In [None]:
# 모델 저장
### 전체모델 저장
TMP_OUT_PATH = '../../data11/model/distilbert/mdistilbertV1.2/'
os.makedirs(TMP_OUT_PATH, exist_ok=True)
#torch.save(model, OUTPATH + 'pytorch_model.bin') 
# save_pretrained 로 저장하면 config.json, pytorch_model.bin 2개의 파일이 생성됨
model.save_pretrained(TMP_OUT_PATH)

# tokeinizer 파일 저장(vocab)
VOCAB_PATH = TMP_OUT_PATH
tokenizer.save_pretrained(VOCAB_PATH)
print(f'==> save_model : {TMP_OUT_PATH}')