In [17]:
!pip install datasets
!pip install peft
!pip install gdown



In [18]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, DistilBertForSequenceClassification, get_scheduler, BertForSequenceClassification
from peft import get_peft_model, LoraConfig, TaskType
import gdown
import time

In [19]:
url = 'https://drive.google.com/uc?id=12MOGiCveDE8CTvtHKqmEhyJIXc3gEscd'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = 'TwoStageDistilBERT_LoRA.pt'
checkpoint = "distilbert/distilbert-base-uncased"
bert_checkpoint = 'skt/kobert-base-v1'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
#bert_tokenizer = AutoTmokenizer.from_pretrained(bert_checkpoint)

gdown.download(url, model_name, quiet = False)

model_checkpoint = torch.load(model_name, map_location = device)

Downloading...
From (original): https://drive.google.com/uc?id=12MOGiCveDE8CTvtHKqmEhyJIXc3gEscd
From (redirected): https://drive.google.com/uc?id=12MOGiCveDE8CTvtHKqmEhyJIXc3gEscd&confirm=t&uuid=835c41de-42f3-4f81-bbe7-dd0f6fbbf736
To: /content/TwoStageDistilBERT_LoRA.pt
100%|██████████| 893M/893M [00:11<00:00, 80.5MB/s]
  model_checkpoint = torch.load(model_name, map_location = device)


In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
checkpoint = "distilbert/distilbert-base-uncased"
bert_checkpoint = 'skt/kobert-base-v1'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [21]:
class TwoStageDistilBERT_LoRA(nn.Module):
  def __init__(self, distilbert_checkpoint, num_labels_1stage = 2, num_labels_2stage = 3):
    super(TwoStageDistilBERT_LoRA, self).__init__()


    # 첫 번째 stage
    self.distilbert1 = DistilBertForSequenceClassification.from_pretrained(distilbert_checkpoint,
                                                                           num_labels = num_labels_1stage, ignore_mismatched_sizes = True,
                                                                           output_hidden_states=True)

    lora_config1 = LoraConfig(task_type = TaskType.SEQ_CLS, r = 8, lora_alpha = 32, target_modules = ['q_lin', 'v_lin'], lora_dropout = 0.1 )
    self.distilbert1 = get_peft_model(self.distilbert1, lora_config1)
    self.distilbert1 = get_peft_model(self.distilbert1, lora_config1)

    # 두 번째 stage
    self.distilbert2 = DistilBertForSequenceClassification.from_pretrained(distilbert_checkpoint,
                                                                           num_labels = num_labels_2stage, ignore_mismatched_sizes = True)




  def forward(self, input_ids,  attention_mask, labels1 = None, labels2 = None):
    output1 = self.distilbert1(input_ids = input_ids, attention_mask = attention_mask, labels = labels1)
    hidden1 = output1.hidden_states[-1] # 마지막 레이어의 hidden state 가져오기
    logits1 = output1.logits

    pred1 = torch.argmax(logits1, dim = 1)

    output2 = self.distilbert2(inputs_embeds = hidden1, attention_mask = attention_mask, labels = labels2)
    logits2 = output2.logits

    total_loss = output1.loss + output2.loss


    return total_loss, logits1, logits2

In [22]:
def load_checkpoint(model, model_checkpoint):
  model.load_state_dict(model_checkpoint['model_state_dict'])

  print(f"Checkpoint loaded!")
  return model


model = TwoStageDistilBERT_LoRA(distilbert_checkpoint = checkpoint)

model = load_checkpoint(model, model_checkpoint)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Checkpoint loaded!


In [23]:
text = ['서울대 캠퍼스 입학 교육 대학 교수 학생 공지 연구 대학원 서울대학교 지원 도서관 서비스 미디어 월 행정 캘린더 센터 학사 뉴스 프로그램 학습 수 제 인스타그램 성과 구지원 학술 사항 안내 생활 관악 소식 소개 기념 역사 맵 가을 일 년 단 부문 영상 모습 회 중앙 예술 메뉴 일반']

In [24]:
# https://seungseop.tistory.com/41

def model_inference(model, tokenizer, text):
  model.eval()
  model = model.to(device)

  start_event = torch.cuda.Event(enable_timing = True)
  end_event = torch.cuda.Event(enable_timing = True)

  with torch.no_grad():


    # 텍스트를 토큰화
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    start_event.record()

    #입력에 대한 추론 (추론에서는 gradient 필요없음)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # 1단계 모델에 입력
    output1 = model.distilbert1(input_ids=input_ids, attention_mask=attention_mask)
    logits1 = output1.logits

    hidden1 = output1.hidden_states[-1]  # 마지막 레이어의 hidden state
    output2 = model.distilbert2(inputs_embeds=hidden1, attention_mask=attention_mask)

    end_event.record()

  torch.cuda.synchronize()

  time_taken = start_event.elapsed_time(end_event)
  return time_taken

In [25]:
distilbert_inference = model_inference(model, tokenizer, text)
print(f"Elapsed time on GPU: {distilbert_inference} mile seconds")

Elapsed time on GPU: 45.69004821777344 mile seconds


In [26]:
for name, module in BertForSequenceClassification.from_pretrained(bert_checkpoint,
                                                                           num_labels = 2, ignore_mismatched_sizes = True,
                                                                           output_hidden_states=True).named_modules():
  print(name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



bert
bert.embeddings
bert.embeddings.word_embeddings
bert.embeddings.position_embeddings
bert.embeddings.token_type_embeddings
bert.embeddings.LayerNorm
bert.embeddings.dropout
bert.encoder
bert.encoder.layer
bert.encoder.layer.0
bert.encoder.layer.0.attention
bert.encoder.layer.0.attention.self
bert.encoder.layer.0.attention.self.query
bert.encoder.layer.0.attention.self.key
bert.encoder.layer.0.attention.self.value
bert.encoder.layer.0.attention.self.dropout
bert.encoder.layer.0.attention.output
bert.encoder.layer.0.attention.output.dense
bert.encoder.layer.0.attention.output.LayerNorm
bert.encoder.layer.0.attention.output.dropout
bert.encoder.layer.0.intermediate
bert.encoder.layer.0.intermediate.dense
bert.encoder.layer.0.intermediate.intermediate_act_fn
bert.encoder.layer.0.output
bert.encoder.layer.0.output.dense
bert.encoder.layer.0.output.LayerNorm
bert.encoder.layer.0.output.dropout
bert.encoder.layer.1
bert.encoder.layer.1.attention
bert.encoder.layer.1.attention.self
bert.e

In [27]:
class TwoStageBERT_LoRA(nn.Module):
  def __init__(self, bert_checkpoint, num_labels_1stage = 2, num_labels_2stage = 3):
    super(TwoStageBERT_LoRA, self).__init__()


    # 첫 번째 stage
    self.distilbert1 = BertForSequenceClassification.from_pretrained(bert_checkpoint,
                                                                           num_labels = num_labels_1stage, ignore_mismatched_sizes = True,
                                                                           output_hidden_states=True)

    lora_config1 = LoraConfig(task_type = TaskType.SEQ_CLS, r = 8, lora_alpha = 32, target_modules = ['query', 'value'], lora_dropout = 0.1 )
    self.distilbert1 = get_peft_model(self.distilbert1, lora_config1)
    self.distilbert1 = get_peft_model(self.distilbert1, lora_config1)

    # 두 번째 stage
    self.distilbert2 = BertForSequenceClassification.from_pretrained(bert_checkpoint,
                                                                           num_labels = num_labels_2stage, ignore_mismatched_sizes = True)




  def forward(self, input_ids,  attention_mask, labels1 = None, labels2 = None):
    output1 = self.distilbert1(input_ids = input_ids, attention_mask = attention_mask, labels = labels1)
    hidden1 = output1.hidden_states[-1] # 마지막 레이어의 hidden state 가져오기
    logits1 = output1.logits

    pred1 = torch.argmax(logits1, dim = 1)

    output2 = self.distilbert2(inputs_embeds = hidden1, attention_mask = attention_mask, labels = labels2)
    logits2 = output2.logits

    total_loss = output1.loss + output2.loss


    return total_loss, logits1, logits2

In [28]:
bert_model = TwoStageBERT_LoRA(bert_checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
bert_checkpoint = 'skt/kobert-base-v1'

bert_tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)


bert_inference = model_inference(bert_model, bert_tokenizer, text)
print(f"Elapsed time on GPU: {bert_inference } mile seconds")

Elapsed time on GPU: 109.54857635498047 mile seconds


In [30]:
class TwoStageDistilBERT_LoRA1(nn.Module):
  def __init__(self, distilbert_checkpoint, num_labels_1stage = 2, num_labels_2stage = 3):
    super(TwoStageDistilBERT_LoRA1, self).__init__()


    # 첫 번째 stage
    self.distilbert1 = DistilBertForSequenceClassification.from_pretrained(distilbert_checkpoint,
                                                                           num_labels = num_labels_1stage, ignore_mismatched_sizes = True,
                                                                           output_hidden_states=True)

    lora_config1 = LoraConfig(task_type = TaskType.SEQ_CLS, r = 8, lora_alpha = 32, target_modules = ['q_lin', 'v_lin'], lora_dropout = 0.1 )
    self.distilbert1 = get_peft_model(self.distilbert1, lora_config1)

    # 두 번째 stage
    self.distilbert2 = DistilBertForSequenceClassification.from_pretrained(distilbert_checkpoint,
                                                                           num_labels = num_labels_2stage, ignore_mismatched_sizes = True)




  def forward(self, input_ids,  attention_mask, labels1 = None, labels2 = None):
    output1 = self.distilbert1(input_ids = input_ids, attention_mask = attention_mask, labels = labels1)
    hidden1 = output1.hidden_states[-1] # 마지막 레이어의 hidden state 가져오기
    logits1 = output1.logits

    pred1 = torch.argmax(logits1, dim = 1)

    output2 = self.distilbert2(inputs_embeds = hidden1, attention_mask = attention_mask, labels = labels2)
    logits2 = output2.logits

    total_loss = output1.loss + output2.loss


    return total_loss, logits1, logits2

In [31]:
distilbert_lora1_model = TwoStageDistilBERT_LoRA1(distilbert_checkpoint = checkpoint)

distilbert_lora1_inference = model_inference(distilbert_lora1_model, tokenizer, text)
print(f"Elapsed time on GPU: {distilbert_lora1_inference} mile seconds")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Elapsed time on GPU: 30.306175231933594 mile seconds


In [32]:
print(f'DistilBERT: {distilbert_inference:.3f}ms, BERT: {bert_inference:.3f}ms')
print(f'DistilBERT model is {bert_inference /distilbert_inference:.1f} times faster')

DistilBERT: 45.690ms, BERT: 109.549ms
DistilBERT model is 2.4 times faster


In [35]:
a = []
b = []
c = []
for i in range(100):
  bert_inference = model_inference(bert_model, bert_tokenizer, text)
  distilbert_lora1_inference = model_inference(distilbert_lora1_model, tokenizer, text)
  distilbert_inference = model_inference(model, tokenizer, text)
  c.append(bert_inference)
  b.append(distilbert_lora1_inference)
  a.append(distilbert_inference)
  print(f'DistilBERT: {distilbert_inference:.3f}ms, LoRA_1: {distilbert_lora1_inference:.3f}ms, BERT: {bert_inference:.3f}ms')
  print(f'DistilBERT model is {bert_inference /distilbert_inference:.1f} times faster')

DistilBERT: 35.747ms, LoRA_1: 30.161ms, BERT: 78.512ms
DistilBERT model is 2.2 times faster
DistilBERT: 29.114ms, LoRA_1: 33.538ms, BERT: 34.275ms
DistilBERT model is 1.2 times faster
DistilBERT: 24.301ms, LoRA_1: 47.666ms, BERT: 53.149ms
DistilBERT model is 2.2 times faster
DistilBERT: 19.175ms, LoRA_1: 28.987ms, BERT: 46.510ms
DistilBERT model is 2.4 times faster
DistilBERT: 38.691ms, LoRA_1: 28.213ms, BERT: 69.065ms
DistilBERT model is 1.8 times faster
DistilBERT: 21.013ms, LoRA_1: 29.538ms, BERT: 45.155ms
DistilBERT model is 2.1 times faster
DistilBERT: 24.539ms, LoRA_1: 19.434ms, BERT: 41.886ms
DistilBERT model is 1.7 times faster
DistilBERT: 28.442ms, LoRA_1: 32.334ms, BERT: 51.811ms
DistilBERT model is 1.8 times faster
DistilBERT: 19.695ms, LoRA_1: 20.991ms, BERT: 71.725ms
DistilBERT model is 3.6 times faster
DistilBERT: 37.138ms, LoRA_1: 30.314ms, BERT: 60.756ms
DistilBERT model is 1.6 times faster
DistilBERT: 31.804ms, LoRA_1: 37.399ms, BERT: 55.382ms
DistilBERT model is 1.7 t

In [36]:
print(sum(a) / 10)
print(sum(b) / 10)
print(sum(c) / 10)

print(f'{sum(c) / sum(a):.3f}')
print(f'{(sum(c) / sum(a) - 1.0) * 100 :.1f}')
print(f'{(sum(a) / sum(c)) * 100 :.1f}')


197.82485885620116
195.7846752166748
288.5262041091919
1.458
45.8
68.6
