In [None]:
#===================================================================================
# catalyst는 버전 20.09 설치해야 함
# => 이후 버전에는 check_ddp_wrapped 함수(from catalyst.dl.utils import check_ddp_wrapped ) 가 없어서, 
# 오류 발생함
#===================================================================================

#!pip install catalyst==20.09.1

In [1]:
#===================================================================================
# pytorch 에 Catalyst를 이용한 bert distillation 예제
#
# catalyst 설치해야 함
#
# 참고자료 : https://medium.com/pytorch/bert-distillation-with-catalyst-c6f30c985854
# 소스 : https://github.com/elephantmipt/bert-distillation
# 
# => myutils.distillation 필요
#===================================================================================

import torch
import sys
from catalyst import dl
sys.path.append('..')
from myutils.distillation.runners import DistilMLMRunner
from myutils.distillation.models import DistilbertStudentModel, BertForMLM
from catalyst.core import MetricAggregationCallback
from torch.utils.data import DataLoader
from myutils.distillation.callbacks import (
    CosineLossCallback,
    KLDivLossCallback,
    MaskedLanguageModelCallback,
    MSELossCallback,
    PerplexityMetricCallbackDistillation,
)
import pandas as pd

# myutils 패키지 import
import sys
sys.path.append("..")
from myutils import seed_everything, GPU_info, pytorch_cos_sim, mlogging

device = GPU_info()
print(device)

#seed 설정
seed_everything(111)

#logging 설정
logger =  mlogging(loggername="distilbertembedding", logfilname="distilbertembedding")


logfilepath:bwdataset_2022-03-18.log
logfilepath:qnadataset_2022-03-18.log
True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30
cuda:0
logfilepath:distilbertembedding_2022-03-18.log


In [2]:
#===================================================================================
# 교사모델과 학생모델은 tokenizer는 같아야 하며, word_embedding 사이즈도 같아야 한다.
# =>서로 다른 size인 경우에는 증류할수 없음
#===================================================================================

# 교사 모델 과 tokenizer 경로 
#teacher_model_name = "bert-base-cased"
teacher_model_name = '../model/bert/bmc_fpt_kowiki20200920.train_model_0225'

# 학생 모델 경로 
student_model_name = '../model/distilbert/distilbert-0318-10'

# new 학생 출력 파일 경로
OUTPATH = '../model/distilbert/distilbert-0318-10-ts-1'

# 훈련/평가 corpus 경로
input_corpus = '../my_data/wiki_20190620_small_1.txt'
eval_corpus = '../my_data/wiki_20190620_small_2.txt'

In [3]:
#==========================================================================
# 학습/평가 dataloader 생성
# => input_ids, token_type_ids, attention_mask, masked_lm_labels 얻어옴
# => label 대신에 masked_lm_labels 로 변경해야 함.
#==========================================================================

from transformers import AutoTokenizer

from torch.utils.data import DataLoader, RandomSampler
sys.path.append('..')
from  myutils import MLMDatasetDistillation

#== 교사 모델 Tokenizer 설정
tokenizer = AutoTokenizer.from_pretrained(teacher_model_name, do_lower_case=False)

# 각 스페셜 tokenid를 구함
CLStokenid = tokenizer.convert_tokens_to_ids('[CLS]')
SEPtokenid = tokenizer.convert_tokens_to_ids('[SEP]')
UNKtokenid = tokenizer.convert_tokens_to_ids('[UNK]')
PADtokenid = tokenizer.convert_tokens_to_ids('[PAD]')
MASKtokenid = tokenizer.convert_tokens_to_ids('[MASK]')
print('CLSid:{}, SEPid:{}, UNKid:{}, PADid:{}, MASKid:{}'.format(CLStokenid, SEPtokenid, UNKtokenid, PADtokenid, MASKtokenid))


train_dataset = MLMDatasetDistillation(corpus_path = input_corpus,
                           tokenizer = tokenizer, 
                           CLStokeinid = CLStokenid ,   # [CLS] 토큰 id
                           SEPtokenid = SEPtokenid ,    # [SEP] 토큰 id
                           UNKtokenid = UNKtokenid ,    # [UNK] 토큰 id
                           PADtokenid = PADtokenid,    # [PAD] 토큰 id
                           Masktokenid = MASKtokenid,   # [MASK] 토큰 id
                           max_sequence_len=128,  # max_sequence_len)
                           mlm_probability=0.15,
                           overwrite_cache=False
                          )


# 학습 dataloader 생성
# => tenosor로 만듬
train_dataloader = DataLoader(train_dataset, 
                          batch_size=16, 
                          #shuffle=True, # dataset을 섞음
                          sampler=RandomSampler(train_dataset, replacement=False), #dataset을 랜덤하게 샘플링함
                          num_workers=3
                         )



eval_dataset = MLMDatasetDistillation(corpus_path = eval_corpus,
                           tokenizer = tokenizer, 
                           CLStokeinid = CLStokenid ,   # [CLS] 토큰 id
                           SEPtokenid = SEPtokenid ,    # [SEP] 토큰 id
                           UNKtokenid = UNKtokenid ,    # [UNK] 토큰 id
                           PADtokenid = PADtokenid,    # [PAD] 토큰 id
                           Masktokenid = MASKtokenid,   # [MASK] 토큰 id
                           max_sequence_len=128,  # max_sequence_len)
                           mlm_probability=0.15,
                           overwrite_cache=False
                          )

# 평가 dataloader 생성
valid_dataloader = DataLoader(eval_dataset, 
                          batch_size=16, 
                          #shuffle=True, # dataset을 섞음
                          sampler=RandomSampler(train_dataset, replacement=False), #dataset을 랜덤하게 샘플링함
                          num_workers=3
                         )

print(train_dataset[0])

loaders = {"train": train_dataloader, "valid": valid_dataloader}


CLSid:101, SEPid:102, UNKid:100, PADid:0, MASKid:103
*corpus:../my_data/wiki_20190620_small_1.txt
*max_sequence_len:128
*mlm_probability:0.15
*CLStokenid:101, SEPtokenid:102, UNKtokenid:100, PADtokeinid:0, Masktokeid:103
*total_line: 2


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

*corpus:../my_data/wiki_20190620_small_2.txt
*max_sequence_len:128
*mlm_probability:0.15
*CLStokenid:101, SEPtokenid:102, UNKtokenid:100, PADtokeinid:0, Masktokeid:103
*total_line: 2


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

{'input_ids': tensor([   101, 120728,   9551,    107, 125356,    107, 128174, 122840,  11018,
        120274, 119711,  23545,  11303,  48506,  70672,    103,    103,    102,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,   

In [None]:
count = 0
for data in train_dataloader:
    print(data)
    count += 1
    if count > 2:
        break 

In [4]:
# 교사모델 생성
teacher = BertForMLM(teacher_model_name)

# state_dict 출력 해봄
teacher_dict = teacher.state_dict()

#print('bert.bert.embeddings.position_ids')
#print(teacher_dict["bert.bert.embeddings.position_ids"].shape)
#print(teacher_dict["bert.bert.embeddings.position_ids"])

#print('bert.bert.embeddings.word_embeddings.weight')
#print(teacher_dict["bert.bert.embeddings.word_embeddings.weight"].shape)
#print(teacher_dict["bert.bert.embeddings.word_embeddings.weight"])

tearch_word_embeddings_weigth_len = len(teacher_dict["bert.bert.embeddings.word_embeddings.weight"])
print(tearch_word_embeddings_weigth_len)

143772


In [5]:
#============================================================================================================
# 학생모델 생성
# => 학생모델을 생성하기 전에 교사모델의 weigth와 bias들을 복사하는 과정이 이루어짐. 
# => layers =[0,2,4,7,9,11] 하면 교사모델이 0,2,4,7,9,11 hidden layer를 학생 hidden layer(6개)로 복사하게 됨.
# => 교사는 12개의 hidden_layer를 가지는 bert 모델이어야하며, 학생은 hidden_layer가 6개인 distilbert 이어야 한다.
#============================================================================================================
student = DistilbertStudentModel(teacher_model_name=teacher_model_name, 
                                 student_model_name=student_model_name,
                                 layers = [0, 2, 4, 7, 9, 11])

student_model_name:../model/distilbert/distilbert-0318-10


In [6]:
# 교사모델과 학생모델을 연결
model = torch.nn.ModuleDict({"teacher": teacher, "student": student})
#print(model)

In [None]:
# 콜백 함수 정의
callbacks = {
    "masked_lm_loss": MaskedLanguageModelCallback(),
    "mse_loss": MSELossCallback(),
    "cosine_loss": CosineLossCallback(),
    "kl_div_loss": KLDivLossCallback(),
    "loss": MetricAggregationCallback(
        prefix="loss",
        mode="weighted_sum",
        metrics={
            "cosine_loss": 1.0,
            "masked_lm_loss": 1.0,
            "kl_div_loss": 1.0,
            "mse_loss": 1.0
        }
    ),
    "optimizer": dl.OptimizerCallback(),
    "perplexity": PerplexityMetricCallbackDistillation()
}


In [None]:
# 실행
runner = DistilMLMRunner(device=device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

runner.train(
    model=model,
    optimizer=optimizer,
    loaders=loaders,
    verbose=True,
    check=True,
    callbacks=callbacks,
    num_epochs=10,
    #logdir="./logs",  # * log 폴더에 static_dict 이 자동으로 생성됨
)


In [7]:
# 학생 모델을 저장함.
# => 학생모델을 바로 저장할수 없어서, 일단 state_dict으로 저장후 
import os
from transformers import DistilBertModel   
os.makedirs(OUTPATH, exist_ok=True)

# 학생모델 state_dict 만 저장.
state_dict_fpath = OUTPATH + '/state_dict.pt'
torch.save(model["student"].state_dict(), state_dict_fpath)
                      
# 기존 학생 모델에서, 증류학습한 state_dict 를 적용함.
new_student_model = DistilBertModel.from_pretrained(student_model_name, state_dict=torch.load(state_dict_fpath))
print(new_student_model)

### 신규 학생 모델 저장
os.makedirs(OUTPATH, exist_ok=True)
# save_pretrained 로 저장하면 config.json, pytorch_model.bin 2개의 파일이 생성됨
new_student_model.save_pretrained(OUTPATH)

# tokeinizer 파일 저장(vocab)
os.makedirs(OUTPATH, exist_ok=True)
tokenizer.save_pretrained(OUTPATH)


Some weights of the model checkpoint at ../model/distilbert/distilbert-0318-10 were not used when initializing DistilBertModel: ['student.distilbert.transformer.layer.1.attention.out_lin.weight', 'student.distilbert.transformer.layer.0.sa_layer_norm.weight', 'student.vocab_projector.weight', 'student.distilbert.transformer.layer.5.output_layer_norm.bias', 'student.distilbert.transformer.layer.1.attention.k_lin.bias', 'student.distilbert.transformer.layer.3.attention.q_lin.bias', 'student.distilbert.transformer.layer.2.attention.k_lin.weight', 'student.distilbert.transformer.layer.3.attention.q_lin.weight', 'student.distilbert.transformer.layer.3.sa_layer_norm.bias', 'student.distilbert.transformer.layer.3.attention.k_lin.bias', 'student.distilbert.transformer.layer.5.attention.v_lin.bias', 'student.distilbert.transformer.layer.1.sa_layer_norm.weight', 'student.vocab_layer_norm.weight', 'student.distilbert.transformer.layer.4.output_layer_norm.bias', 'student.distilbert.transformer.laye

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(143772, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(

('../model/distilbert/distilbert-0318-10-ts-1/tokenizer_config.json',
 '../model/distilbert/distilbert-0318-10-ts-1/special_tokens_map.json',
 '../model/distilbert/distilbert-0318-10-ts-1/vocab.txt',
 '../model/distilbert/distilbert-0318-10-ts-1/added_tokens.json',
 '../model/distilbert/distilbert-0318-10-ts-1/tokenizer.json')

In [None]:
'''
# 학습한 모델 static_dict.pt 저장함
import os

OUTPATH = '../model/distilbert/distilbert-model-0317-distillation-2'
static_dict_fpath = OUTPATH + '/static_dict.pt'

os.makedirs(OUTPATH, exist_ok=True)
torch.save(model.state_dict(), static_dict_fpath) 
'''

In [None]:
''''
# 저장한 static_dict.pt를 불러와서 기존 학생모델에 적용 한후, 저장함
import os
from transformers import DistilBertModel         

# static_dict 저정경로에 있는 dict 종류중에 best.pth를 설정함.
# static_dict 아래 3가지가 생성됨.
# => logs/checkpoints/train.2.pth
# => logs/checkpoints/last.pth 
# => logs/checkpoints/best.pth 
static_dict_fpath = './logs/checkpoints/best.pth'

# 기존 학생 모델에서, 증류학습한 state_dict 를 적용함.
new_student_model = DistilBertModel.from_pretrained(student_model_name, state_dict=torch.load(static_dict_fpath))
print(new_student_model)

### 신규 학생 모델 저장
new_student_model_fpath = ''
os.makedirs(OUTPATH, exist_ok=True)
# save_pretrained 로 저장하면 config.json, pytorch_model.bin 2개의 파일이 생성됨
new_student_model.save_pretrained(OUTPATH)

# tokeinizer 파일 저장(vocab)
os.makedirs(OUTPATH, exist_ok=True)
tokenizer.save_pretrained(OUTPATH)
'''