In [1]:
#==================================================================================
# Distillation 예제(증류)1
#
#: 교사 모델(BertModel) -> 학생모델(DistilBertModel) 로 distillation 하는 예시임
# 여기서는 교사모델과 학생모델 Fine-tuning 하여 증류하는 예시임.
# * 중요:교사모델이 더 잘 학습되어 있어야 하며, 교사/학생 모델이 tokenizer는 동일해야 한다.
#
# 자료 참고 
# https://re-code-cord.tistory.com/entry/Knowledge-Distillation-1
# https://towardsdatascience.com/distillation-of-bert-like-models-the-theory-32e19a02641f
# https://www.philschmid.de/knowledge-distillation-bert-transformers
# 소스 참고 
#https://github.com/boostcampaitech2/model-optimization-level3-cv-04/blob/main/src/trainer.py
#
# [증류 과정]
# 1. 교사모델 구조->학생 모델로 복사
# => 교사모델이 bert-base 이고, 학생 모델이 distilbert 라면, 교사 bert 모델이 12개 hiddenlayer에
# wegiht와 bias 값들을 학생모델 distilbert 6개 hiddenlayer로 복사함.
# (* 이때 교사모델이 어떤 hiddenlayer를 학샘모델로 복사할때는 [0, 2, 4, 7, 9, 11] 식으로 레이어를 복사 하는데 좋다고 함)
#
# 2. 교사모델, 학생모델 fine-tuning 사전준비
# => 각 교사, 학생모델을 classifcation이나 maksedlm 모델중 1나로 파인튜닝함
# (*Huggingface transformers 모델이용하면 쉬움
#
# 3. loss 함수 정의
# => loss 함수는 학생모델이 loss(1), 교사와 학생모델간 cross-entropy loss(2), 교사와 학생모델간 cosine-loss(3) 
# 3가지 인데, 이때 (2)와 (3) loss는 torch.nn.KLDivLoss 함수로 보통 대체 된다.
# 즉 증류 손실함수 = alpha*학생모델이 loss + (1-alpah)*교사/학생모델간 torch.nn.KLDivLoss 함수
#
# 이때 KLDivLoss 함수는 교사와 학생간 Dark Knowledge(어둠지식)도 학습되도록 교사loss/Temperture와 학생loss/Temperture 식으로,
# Temperture를 지정하는데, 보통 학습할때는 2~10으로 하고, 평가시에는 반드시 1로 해야 한다.
# (Temperture==1 이면, softmax와 동일, 1보다 크면 확률이 평활화 되어서, 어둠 지식 습득이 많이됨)
# 그리고 학생모델loss는 전체 loss에 0.1이 되도록 alpha값은 0.1이 좋다고 한다.
#
# 4. 학습
# => 교사모델은 평가(eval)만 하고, 학생모델만 학습(train)한다.
#
#==================================================================================
MaskedLM_FT = False         # MASKEDLM 파인 튜닝 적용하는 경우 
Classification_FT = True  # Classifcation 파인 튜닝 적용하는 경우

if MaskedLM_FT == True:
    # maskedLM 으로 파인튜닝할 경우
    from transformers import BertForMaskedLM, DistilBertForMaskedLM
elif Classification_FT == True:
    # sequenceclassificaiton으로 파인튜닝할 경우
    from transformers import BertForSequenceClassification, DistilBertForSequenceClassification
else:
    assert MaskedLM_FT == True or Classification_FT == True, "select fine-tuning model!"

#huggingface transformers 사용하다 보면 경고(warning)에러가 뜰때 안뜨도록 logging.set_verbosity_error() 추가함
from transformers import logging
logging.set_verbosity_error()  
    
import os
import sys
sys.path.append('../../')
from myutils import seed_everything, mlogging, GPU_info

device = GPU_info()
print(device)
seed_everything(111)
#logging 설정
logger =  mlogging(loggername="state_dict", logfilename="state_dict")

logfilepath:bwdataset_2022-03-31.log
logfilepath:qnadataset_2022-03-31.log
True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30
cuda:0
logfilepath:state_dict_2022-03-31.log


In [2]:
# ====================================================================
# 1. 교사모델 구조->학생 모델로 복사
#
# => bert 교사모델에서 state_dict 값을 distilbert 학생모델 state_dict 맞게 복사하여, student_state_dict을 만들고 나서,
# 이를 학생 distilbertforMaskedLM에 적용함
# ====================================================================
# 2. 교사모델, 학생모델 fine-tuning 서전준비
# => 여기서는 교사는 BertforMaskedLM 모델이고 학생은 DistilBertForMaskedLM모델로 지정해서 Fine-tuning 사전준비하
# ====================================================================

#출력 폴더
OUTPATH = '../../model/distilbert/distilbert-0331-TS-nli-0.1-10/'

# 교사 모델에서 state_dict 만 뽑아냄
tearch_model_path='../../model/bert/bmc-fpt-wiki_20190620_mecab_false_0311-nouns-0327'

if MaskedLM_FT == True:
    tearch_model = BertForMaskedLM.from_pretrained(tearch_model_path, output_hidden_states=True)
elif Classification_FT == True:
    num_labels = 3
    tearch_model = BertForSequenceClassification.from_pretrained(tearch_model_path, output_hidden_states=True, num_labels=num_labels)

#state_dict 뽑아냄
tearch_state_dict = tearch_model.state_dict()
print(tearch_state_dict.keys())
print(tearch_state_dict['bert.embeddings.word_embeddings.weight'])
print(tearch_model)

odict_keys(['bert.embeddings.position_ids', 'bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.LayerNorm.weigh

In [3]:
# 학생모델에 dict 표기해 봄
student_model_path='../../model/distilbert/distilbert-0327-empty'

if MaskedLM_FT == True:
    student_model = DistilBertForMaskedLM.from_pretrained(student_model_path, output_hidden_states=True)
elif Classification_FT == True:
    student_model = DistilBertForSequenceClassification.from_pretrained(student_model_path, output_hidden_states=True, num_labels=2)

print(student_model.num_parameters())
#state_dict 뽑아냄
student_state_dict = student_model.state_dict()
print(student_state_dict.keys())
print(student_state_dict['distilbert.embeddings.word_embeddings.weight'])
print(student_model)

172192514
odict_keys(['distilbert.embeddings.word_embeddings.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.ffn.lin2.weight', 'distilbert.transformer.layer.0.ffn.lin2.bias', 'distilbert.transformer.layer.0.output_l

In [4]:
# 학생 state_dict 생성
# ** 교사 state_dict를 학생 모델에 알맞게 복사하여 학생모델 state_dict 생성
# => 교사모델에서 0, 2, 4, 7, 9, 11 값만 뽑아내서 distil_state_dict 만듬
distil_sd = None
#layers = [0, 2, 4, 7, 9, 11]
sys.path.append('..')

if MaskedLM_FT == True:
    from myutils import make_sate_dict_bertMaskedLM_to_distillbertMaskedLM
    distil_sd = make_sate_dict_bertMaskedLM_to_distillbertMaskedLM(tearch_model)
    
elif Classification_FT == True:
    from myutils import make_sate_dict_bertSequenceClass_to_distillbertSequenceClass
    distil_sd = make_sate_dict_bertSequenceClass_to_distillbertSequenceClass(tearch_model)

print(distil_sd.keys())

dict_keys(['distilbert.embeddings.word_embeddings.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin2.weight', 'distilbert.transformer.layer.0.output_layer_norm.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.ffn.lin2

In [5]:
# 학생모델에 교사모델에서 뽑아낸 state_dict를 적용
student_model_path='../../model/distilbert/distilbert-0327-empty'

if MaskedLM_FT == True:
    student_model = DistilBertForMaskedLM.from_pretrained(student_model_path, state_dict=distil_sd, output_hidden_states=True)
elif Classification_FT == True:
    student_model = DistilBertForSequenceClassification.from_pretrained(student_model_path, state_dict=distil_sd, output_hidden_states=True, num_labels=num_labels)

#state_dict 뽑아냄
student_state_dict = student_model.state_dict()
print(student_state_dict.keys())
print(student_state_dict['distilbert.embeddings.word_embeddings.weight'])


odict_keys(['distilbert.embeddings.word_embeddings.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.ffn.lin2.weight', 'distilbert.transformer.layer.0.ffn.lin2.bias', 'distilbert.transformer.layer.0.output_layer_norm.

In [6]:
if MaskedLM_FT == True:
    # 학습할 mskedlm 데이터 로더 생성.
    #
    from torch.utils.data import DataLoader, RandomSampler
    from transformers import BertTokenizer
    import sys
    sys.path.append('..')
    from myutils import MLMDataset

    batch_size = 8           # batch=32로 하면 CUDA MEMORY 오류 발생함
    token_max_len = 128

    tokenizer = BertTokenizer.from_pretrained(student_model_path, max_len=token_max_len, do_lower_case=False)

    input_corpus = '../../korpora/kowiki_20190620/wiki_20190620_small.txt'

    # 각 스페셜 tokenid를 구함
    CLStokenid = tokenizer.convert_tokens_to_ids('[CLS]')
    SEPtokenid = tokenizer.convert_tokens_to_ids('[SEP]')
    UNKtokenid = tokenizer.convert_tokens_to_ids('[UNK]')
    PADtokenid = tokenizer.convert_tokens_to_ids('[PAD]')
    MASKtokenid = tokenizer.convert_tokens_to_ids('[MASK]')
    print('CLSid:{}, SEPid:{}, UNKid:{}, PADid:{}, MASKid:{}'.format(CLStokenid, SEPtokenid, UNKtokenid, PADtokenid, MASKtokenid))


    train_dataset = MLMDataset(corpus_path = input_corpus,
                               tokenizer = tokenizer, 
                               CLStokeinid = CLStokenid ,   # [CLS] 토큰 id
                               SEPtokenid = SEPtokenid ,    # [SEP] 토큰 id
                               UNKtokenid = UNKtokenid ,    # [UNK] 토큰 id
                               PADtokenid = PADtokenid,    # [PAD] 토큰 id
                               Masktokenid = MASKtokenid,   # [MASK] 토큰 id
                               max_sequence_len=token_max_len,  # max_sequence_len)
                               mlm_probability=0.15,
                               overwrite_cache=True
                              )


    # 학습 dataloader 생성
    # => tenosor로 만듬
    train_loader = DataLoader(train_dataset, 
                              batch_size=batch_size, 
                              #shuffle=True, # dataset을 섞음
                              sampler=RandomSampler(train_dataset, replacement=False), #dataset을 랜덤하게 샘플링함
                              num_workers=3
                             )

    print(train_dataset[0])

In [7]:

if Classification_FT == True:
    # SequenceCalssification 데이터로더 생성

    # 학습 data loader 생성
    sys.path.append('..')
    from myutils import ClassificationCSVCorpus, ClassificationDataset, data_collator, KlueNLICorpus,KorNLICorpus
    from torch.utils.data import DataLoader, RandomSampler
    from transformers import BertTokenizer

    
    #############################################################################
    # 변수 설정
    #############################################################################
    max_seq_len = 128   # 글자 최대 토큰 길이 해당 토큰 길이 이상은 잘린다.
    batch_size = 32        # 배치 사이즈(64면 GUP Memory 오류 나므로, 32 이하로 설정할것=>max_seq_length 를 줄이면, 64도 가능함)

    # 훈련할 csv 파일
    #file_fpath = 'korpora/감성대화말뭉치/감성대화말뭉치(최종데이터)_renew_labelenc_Training.csv'
    #file_fpath = '../../korpora/nsmc/ratings_test.txt'
    #file_fpath = '../../korpora/klue-nli/klue-nli-v1.1_train.json'
    file_fpath = '../../korpora/kornli/multinli.train.ko.tsv'
    column_num = 3           # .csv 파일에 컬럼수(예: text, label만 있으면 =2)
    csvfile = 0              # 0:tsv 파일, 1: csv 파일
    label_list = ["0", "1"]  # .csv 파일에 레벨 목록( list로 입력해야 함)
    #label_list = ["0", "1", "2", "3", "4", "5"]  # .csv 파일에 레벨 목록( list로 입력해야 함)
    cache = False   # 캐쉬파일 생성할거면 True로 (True이면 loding할때 캐쉬파일있어도 이용안함)
    #############################################################################   

    # 분류 corpus 파일 설정
    #corpus = ClassificationCSVCorpus(column_num=column_num, iscsvfile=csvfile, label_list=label_list)
    
    # KlueNLI 혹은 KorNLI corpus 파일 생성
    #corpus = KlueNLICorpus()
    corpus = KorNLICorpus()
    
    tokenizer = BertTokenizer.from_pretrained(tearch_model_path, max_len=max_seq_len, do_lower_case=False)

    # 학습 dataset 생성
    dataset = ClassificationDataset(file_fpath=file_fpath, max_seq_length=max_seq_len, tokenizer=tokenizer, corpus=corpus, overwrite_cache=cache)


    # 학습 dataloader 생성
    train_loader = DataLoader(dataset, 
                              batch_size=batch_size, 
                              #shuffle=True, # dataset을 섞음
                              sampler=RandomSampler(dataset, replacement=False), #dataset을 랜덤하게 샘플링함
                              collate_fn=data_collator, # dataset을 tensor로 변환(예시 {'input_ids':tensor[0,1,2,3,1,], 'token_type_id:tensor[0,0,0,0,0], 'attention_mask:tensor[1,1,1,1,1], 'labels':tensor[5]}
                              num_workers=4)
    
    # 평가 dataset 생성
    #file_fpath = '../../korpora/klue-nli/klue-nli-v1.1_dev.json'
    file_fpath = '../../korpora/kornli/xnli.test.ko.tsv'
    dataset = ClassificationDataset(file_fpath=file_fpath, max_seq_length=max_seq_len, tokenizer=tokenizer, corpus=corpus, overwrite_cache=cache)

    # 평가 dataloader 생성
    eval_loader = DataLoader(dataset, 
                              batch_size=batch_size, 
                              #shuffle=True, # dataset을 섞음
                              sampler=RandomSampler(dataset, replacement=False), #dataset을 랜덤하게 샘플링함
                              collate_fn=data_collator, # dataset을 tensor로 변환(예시 {'input_ids':tensor[0,1,2,3,1,], 'token_type_id:tensor[0,0,0,0,0], 'attention_mask:tensor[1,1,1,1,1], 'labels':tensor[5]}
                              num_workers=4)



2022-03-31 18:37:19,125 - bwpdataset - INFO - Loading features from cached file ../../korpora/kornli/cached_BertTokenizer_128_multinli.train.ko.tsv [took 16.097 s]
2022-03-31 18:37:19,279 - bwpdataset - INFO - Loading features from cached file ../../korpora/kornli/cached_BertTokenizer_128_xnli.test.ko.tsv [took 0.151 s]


In [8]:
# ====================================================================
# 4. 학습
# => 교사모델은 평가(eval)만 하고, 학생모델만 학습(train)한다.
# ====================================================================
import time
import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
sys.path.append('..')
from myutils import knowledge_distillation_loss1

# 훈련 시작
# 학생만 훈련시킴. 교사는 eval()
##################################################
epochs = 5           # epochs
learning_rate = 2e-5  # 학습률
#p_itr = 2000           # 손실률 보여줄 step 수
#save_steps = 50000     # 50000 step마다 모델 저장

# ==증류(distillation)과 연관된 변수 ==

# 0.1이면 학생손실은 10%반영하고 Kld 손실(distillationloss)은 90% 반영하겠다는 의미
alpha = 0.1   
#alpha = 0.5   
# 1이면 softmax 확률이고, 1보다 크면 softmax 확률이 평할화 되면서, ghkr어둠지식(Dark Knowledge)을 보다 많이 습득하게됨
Temperture = 10   
#Temperture = 5
##################################################

# optimizer 적용=> 학생모델
optimizer = AdamW(student_model.parameters(), 
                 lr=learning_rate, 
                 eps=1e-8) # 0으로 나누는 것을 방지하기 위한 epsilon 값(10^-6 ~ 10^-8 사이 이값 입력합)

# 총 훈련과정에서 반복할 스탭
total_steps = len(train_loader)*epochs
warmup_steps = total_steps * 0.1 #10% of train data for warm-up

save_steps = total_steps * 0.2       # 모델 저장할 step
p_itr = total_steps * 0.05           # 손실률 보여줄 step 수

logger.info('*total_steps: {}, save_steps: {}, p_itr: {}'.format(total_steps, save_steps, p_itr))
    
# 스캐줄러 생성
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=warmup_steps, 
                                            num_training_steps=total_steps)

student_model.to(device)
tearch_model.to(device)

student_model.zero_grad()# 학생모델 초기화
tearch_model.eval() # 교사모델은 평가모델로 설정.

itr = 1
total_loss = 0
total_student_loss = 0
total_tearch_loss = 0
total_len = 0
total_correct = 0
list_training_loss = []
list_acc_loss = []
list_validation_acc_loss = []

for epoch in tqdm(range(epochs)):
    
    student_model.train() # 학생모델은 훈련모드로 변환
    
    for data in tqdm(train_loader):
        
        #optimizer.zero_grad()
        student_model.zero_grad()# 그래디언트 초기화
        
        # 입력 값 설정
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        token_type_ids = data['token_type_ids'].to(device)       
        labels = data['labels'].to(device)
     
        # 교사모델 실행
        tearch_outputs = tearch_model(input_ids=input_ids,
                                    attention_mask=attention_mask,
                                    token_type_ids=token_type_ids,
                                    labels=labels)
        tearch_loss = tearch_outputs.loss
        tearch_logits = tearch_outputs.logits
        
        # 학생모델 실행
        student_outputs = student_model(input_ids=input_ids, 
                        attention_mask=attention_mask,
                        labels=labels)
        
        # 출력값 loss,logits를 outputs에서 얻어옴
        student_loss = student_outputs.loss
        student_logits = student_outputs.logits
        
        # 총 손실 구함.
        loss = knowledge_distillation_loss1(student_loss = student_loss, 
                                 student_logits = student_logits, 
                                 teacher_logits = tearch_logits, 
                                 alpha = alpha, 
                                 Temperture = Temperture)

        # optimizer 과 scheduler 업데이트 시킴
        loss.backward()   # backward 구함
        # 그래디언트 클리핑 (gradient vanishing이나 gradient exploding 방지하기 위한 기법)
        torch.nn.utils.clip_grad_norm_(student_model.parameters(), 1.0)   
        optimizer.step()  # 가중치 파라미터 업데이트(optimizer 이동)
        scheduler.step()  # 학습률 감소
        
          # => torch.no_grad()는 gradient을 계산하는 autograd engine를 비활성화 하여 
        # 필요한 메모리를 줄이고, 연산속도를 증가시키는 역활을 함
        with torch.no_grad():
            # 정확도 계산 
            total_loss += loss.item()
            total_student_loss += student_loss.item()
            total_tearch_loss += tearch_loss.item()
            
            # 주기마다 test(validataion) 데이터로 평가하여 손실류 계산함.
            if itr % p_itr == 0:
                logger.info('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, student_loss: {:.4f}, tearch_loss: {:.4f}'
                            .format(epoch+1, epochs, itr, total_loss/p_itr, total_student_loss/p_itr, total_tearch_loss/p_itr))

                list_training_loss.append(total_loss/p_itr)

                total_loss = 0
                total_student_loss = 0
                total_tearch_loss = 0
                total_len = 0
                total_correct = 0
            
            if itr % save_steps == 0:
                #전체모델 저장
                TMP_OUT_PATH = OUTPATH + str(itr)
                os.makedirs(TMP_OUT_PATH, exist_ok=True)
                # save_pretrained 로 저장하면 config.json, pytorch_model.bin 2개의 파일이 생성됨
                student_model.save_pretrained(TMP_OUT_PATH)
                #torch.save(model, TMP_OUT_PATH + 'pytorch_model.bin') 

                # tokeinizer 파일 저장(vocab)
                VOCAB_PATH = TMP_OUT_PATH
                #os.makedirs(VOCAB_PATH)
                tokenizer.save_pretrained(VOCAB_PATH)
                
                logger.info('Iteration {} -> save model:{}'.format(itr, TMP_OUT_PATH))

        itr+=1
        
    ####################################################################
    # 1epochs 마다 실제 test(validattion)데이터로 평가 해봄
    start = time.time()
    logger.info(f'---------------------------------------------------------')

    # 평가 시작
    student_model.eval()

    total_test_correct = 0
    total_test_len = 0

    for data in tqdm(eval_loader):
        # 입력 값 설정
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        #token_type_ids = data['token_type_ids'].to(device)       
        labels = data['labels'].to(device)

        # 손실률 계산하는 부분은 no_grade 시켜서, 계산량을 줄임.
        # => torch.no_grad()는 gradient을 계산하는 autograd engine를 비활성화 하여 
        # 필요한 메모리를 줄이고, 연산속도를 증가시키는 역활을 함
        with torch.no_grad():
            # 모델 실행
            outputs = student_model(input_ids=input_ids, 
                                attention_mask=attention_mask,
                                #token_type_ids=token_type_ids,
                                labels=labels)

            # 출력값 loss,logits를 outputs에서 얻어옴
            #loss = outputs.loss
            logits = outputs.logits

            # 총 손실류 구함
            pred = torch.argmax(F.softmax(logits), dim=1)
            correct = pred.eq(labels)
            total_test_correct += correct.sum().item()
            total_test_len += len(labels)

    list_validation_acc_loss.append(total_test_correct/total_test_len)
    logger.info("[Epoch {}/{}] Validatation Accuracy:{}".format(epoch+1, epochs, total_test_correct / total_test_len))
    logger.info(f'---------------------------------------------------------')
    logger.info(f'=== 처리시간: {time.time() - start:.3f} 초 ===')
    logger.info(f'-END-\n')
    ####################################################################
        

2022-03-31 18:37:19,305 - state_dict - INFO - *total_steps: 61360, save_steps: 12272.0, p_itr: 3068.0


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/12272 [00:00<?, ?it/s]

2022-03-31 18:46:17,853 - state_dict - INFO - [Epoch 1/5] Iteration 3068 -> Train Loss: 0.1098, student_loss: 1.0909, tearch_loss: 1.1001
2022-03-31 18:55:16,293 - state_dict - INFO - [Epoch 1/5] Iteration 6136 -> Train Loss: 0.1082, student_loss: 1.0623, tearch_loss: 1.1002
2022-03-31 19:04:07,766 - state_dict - INFO - [Epoch 1/5] Iteration 9204 -> Train Loss: 0.1075, student_loss: 1.0492, tearch_loss: 1.0999
2022-03-31 19:12:59,615 - state_dict - INFO - [Epoch 1/5] Iteration 12272 -> Train Loss: 0.1070, student_loss: 1.0412, tearch_loss: 1.1000
2022-03-31 19:13:01,189 - state_dict - INFO - Iteration 12272 -> save model:../../model/distilbert/distilbert-0331-TS-nli-0.1-10/12272
2022-03-31 19:13:01,461 - state_dict - INFO - ---------------------------------------------------------


  0%|          | 0/157 [00:00<?, ?it/s]

  pred = torch.argmax(F.softmax(logits), dim=1)
2022-03-31 19:13:05,429 - state_dict - INFO - [Epoch 1/5] Validatation Accuracy:0.680439121756487
2022-03-31 19:13:05,431 - state_dict - INFO - ---------------------------------------------------------
2022-03-31 19:13:05,432 - state_dict - INFO - === 처리시간: 3.970 초 ===
2022-03-31 19:13:05,433 - state_dict - INFO - -END-



  0%|          | 0/12272 [00:00<?, ?it/s]

2022-03-31 19:22:02,248 - state_dict - INFO - [Epoch 2/5] Iteration 15340 -> Train Loss: 0.1064, student_loss: 1.0295, tearch_loss: 1.1000
2022-03-31 19:30:39,494 - state_dict - INFO - [Epoch 2/5] Iteration 18408 -> Train Loss: 0.1063, student_loss: 1.0272, tearch_loss: 1.1000
2022-03-31 19:39:09,601 - state_dict - INFO - [Epoch 2/5] Iteration 21476 -> Train Loss: 0.1061, student_loss: 1.0251, tearch_loss: 1.0999
2022-03-31 19:47:36,818 - state_dict - INFO - [Epoch 2/5] Iteration 24544 -> Train Loss: 0.1061, student_loss: 1.0236, tearch_loss: 1.1002
2022-03-31 19:47:38,306 - state_dict - INFO - Iteration 24544 -> save model:../../model/distilbert/distilbert-0331-TS-nli-0.1-10/24544
2022-03-31 19:47:38,616 - state_dict - INFO - ---------------------------------------------------------


  0%|          | 0/157 [00:00<?, ?it/s]

2022-03-31 19:47:42,596 - state_dict - INFO - [Epoch 2/5] Validatation Accuracy:0.7163672654690619
2022-03-31 19:47:42,599 - state_dict - INFO - ---------------------------------------------------------
2022-03-31 19:47:42,600 - state_dict - INFO - === 처리시간: 3.984 초 ===
2022-03-31 19:47:42,601 - state_dict - INFO - -END-



  0%|          | 0/12272 [00:00<?, ?it/s]

2022-03-31 19:56:33,571 - state_dict - INFO - [Epoch 3/5] Iteration 27612 -> Train Loss: 0.1052, student_loss: 1.0085, tearch_loss: 1.0999
2022-03-31 20:05:26,688 - state_dict - INFO - [Epoch 3/5] Iteration 30680 -> Train Loss: 0.1052, student_loss: 1.0086, tearch_loss: 1.0999
2022-03-31 20:14:17,318 - state_dict - INFO - [Epoch 3/5] Iteration 33748 -> Train Loss: 0.1052, student_loss: 1.0078, tearch_loss: 1.1001
2022-03-31 20:23:04,171 - state_dict - INFO - [Epoch 3/5] Iteration 36816 -> Train Loss: 0.1052, student_loss: 1.0079, tearch_loss: 1.1002
2022-03-31 20:23:05,723 - state_dict - INFO - Iteration 36816 -> save model:../../model/distilbert/distilbert-0331-TS-nli-0.1-10/36816
2022-03-31 20:23:06,019 - state_dict - INFO - ---------------------------------------------------------


  0%|          | 0/157 [00:00<?, ?it/s]

2022-03-31 20:23:09,951 - state_dict - INFO - [Epoch 3/5] Validatation Accuracy:0.7285429141716567
2022-03-31 20:23:09,952 - state_dict - INFO - ---------------------------------------------------------
2022-03-31 20:23:09,953 - state_dict - INFO - === 처리시간: 3.934 초 ===
2022-03-31 20:23:09,954 - state_dict - INFO - -END-



  0%|          | 0/12272 [00:00<?, ?it/s]

2022-03-31 20:31:57,190 - state_dict - INFO - [Epoch 4/5] Iteration 39884 -> Train Loss: 0.1045, student_loss: 0.9939, tearch_loss: 1.1001
2022-03-31 20:40:46,149 - state_dict - INFO - [Epoch 4/5] Iteration 42952 -> Train Loss: 0.1045, student_loss: 0.9939, tearch_loss: 1.1001
2022-03-31 20:49:44,215 - state_dict - INFO - [Epoch 4/5] Iteration 46020 -> Train Loss: 0.1044, student_loss: 0.9926, tearch_loss: 1.0996
2022-03-31 20:58:32,879 - state_dict - INFO - [Epoch 4/5] Iteration 49088 -> Train Loss: 0.1045, student_loss: 0.9939, tearch_loss: 1.1003
2022-03-31 20:58:34,441 - state_dict - INFO - Iteration 49088 -> save model:../../model/distilbert/distilbert-0331-TS-nli-0.1-10/49088
2022-03-31 20:58:34,769 - state_dict - INFO - ---------------------------------------------------------


  0%|          | 0/157 [00:00<?, ?it/s]

2022-03-31 20:58:38,708 - state_dict - INFO - [Epoch 4/5] Validatation Accuracy:0.7335329341317365
2022-03-31 20:58:38,710 - state_dict - INFO - ---------------------------------------------------------
2022-03-31 20:58:38,710 - state_dict - INFO - === 처리시간: 3.942 초 ===
2022-03-31 20:58:38,711 - state_dict - INFO - -END-



  0%|          | 0/12272 [00:00<?, ?it/s]

2022-03-31 21:07:33,812 - state_dict - INFO - [Epoch 5/5] Iteration 52156 -> Train Loss: 0.1039, student_loss: 0.9827, tearch_loss: 1.1001
2022-03-31 21:16:22,097 - state_dict - INFO - [Epoch 5/5] Iteration 55224 -> Train Loss: 0.1039, student_loss: 0.9830, tearch_loss: 1.0999
2022-03-31 21:25:08,563 - state_dict - INFO - [Epoch 5/5] Iteration 58292 -> Train Loss: 0.1039, student_loss: 0.9829, tearch_loss: 1.1001
2022-03-31 21:33:55,229 - state_dict - INFO - [Epoch 5/5] Iteration 61360 -> Train Loss: 0.1038, student_loss: 0.9825, tearch_loss: 1.0999
2022-03-31 21:33:56,750 - state_dict - INFO - Iteration 61360 -> save model:../../model/distilbert/distilbert-0331-TS-nli-0.1-10/61360
2022-03-31 21:33:57,036 - state_dict - INFO - ---------------------------------------------------------


  0%|          | 0/157 [00:00<?, ?it/s]

2022-03-31 21:34:01,016 - state_dict - INFO - [Epoch 5/5] Validatation Accuracy:0.7321357285429142
2022-03-31 21:34:01,017 - state_dict - INFO - ---------------------------------------------------------
2022-03-31 21:34:01,018 - state_dict - INFO - === 처리시간: 3.983 초 ===
2022-03-31 21:34:01,019 - state_dict - INFO - -END-



In [9]:
# student 모델 저장
#OUTPATH = '../../model/distilbert/distilbert-0327-TS-nli'
os.makedirs(OUTPATH, exist_ok=True)
#torch.save(model, OUTPATH + 'pytorch_model.bin') 
# save_pretrained 로 저장하면 config.json, pytorch_model.bin 2개의 파일이 생성됨
student_model.save_pretrained(OUTPATH)

# tokeinizer 파일 저장(vocab)
VOCAB_PATH = OUTPATH
tokenizer.save_pretrained(VOCAB_PATH)

('../../model/distilbert/distilbert-0331-TS-nli-0.1-10/tokenizer_config.json',
 '../../model/distilbert/distilbert-0331-TS-nli-0.1-10/special_tokens_map.json',
 '../../model/distilbert/distilbert-0331-TS-nli-0.1-10/vocab.txt',
 '../../model/distilbert/distilbert-0331-TS-nli-0.1-10/added_tokens.json')