In [1]:
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained('gpt2')

In [2]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained('gpt2')
print(config)

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.39.3",
  "use_cache": true,
  "vocab_size": 50257
}



In [7]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.29.2-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.29.2-py3-none-any.whl (297 kB)
   ---------------------------------------- 0.0/297.4 kB ? eta -:--:--
   --------------- ------------------------ 112.6/297.4 kB 3.2 MB/s eta 0:00:01
   ---------------------------------------- 297.4/297.4 kB 4.6 MB/s eta 0:00:00
Installing collected packages: accelerate
Successfully installed accelerate-0.29.2


In [1]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt2_finetuned", 
    num_train_epochs=3, 
    per_device_train_batch_size=16, 
    learning_rate=2e-5,
    weight_decay=0.01, 
)

In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [3]:
gpt2 = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [4]:
generated_texts = gpt2("** User Info ** ['1', 'he is a student.'], ['2', 'he is a computer engeenering student.'] ** end User Info ** Q. How can I make money using AI tools? A.", max_length=200, truncation=True)
print(generated_texts[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


** User Info ** ['1', 'he is a student.'], ['2', 'he is a computer engeenering student.'] ** end User Info ** Q. How can I make money using AI tools? A. Don't ask me like that. I only make money using a tool in general. Q. Who uses tools? A. You don't need them. These are some of the people who have been using Google algorithms. A. I want to make this money. The money I make is based on what I learn. Q. What is your investment in the project? A. I have been doing something before from the early days. What are my plans now? A. I am starting from scratch with an initial $1K investment and will spend as much time doing as I'm willing to invest. Q. What resources/gatherings do you have? A. I love to learn new things and try new projects. I am very interested in the various


# GPT-2 Fine-Tuning

### 데이터셋 구축

In [12]:
import shutil
file_name = "./data/020.Text by Topic Casual Conversation Data.zip"
output_dir = "./data/casual_conversation_data"
format = "zip"
shutil.unpack_archive(file_name, output_dir, format)

In [14]:
!pip install pytorch-lightning

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.2.2-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.3.2-py3-none-any.whl.metadata (19 kB)
Collecting lightning-utilities>=0.8.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.11.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pytorch_lightning-2.2.2-py3-none-any.whl (801 kB)
   ---------------------------------------- 0.0/801.9 kB ? eta -:--:--
   ----------------- --------------------- 368.6/801.9 kB 11.6 MB/s eta 0:00:01
   --------------------------------------- 801.9/801.9 kB 10.1 MB/s eta 0:00:00
Downloading lightning_utilities-0.11.2-py3-none-any.whl (26 kB)
Downloading torchmetrics-1.3.2-py3-none-any.whl (841 kB)
   ---------------------------------------- 0.0/841.5 kB ? eta -:--:--
   --------------------------------------  839.7/841.5 kB 25.9 MB/s eta 0:00:01
   --------------------------------------- 841.5/841.5 kB 17.7 MB/s eta 

In [25]:
!pip install pytorch-forecasting

Collecting pytorch-forecasting
  Downloading pytorch_forecasting-1.0.0-py3-none-any.whl.metadata (11 kB)
Collecting fastapi>=0.80 (from pytorch-forecasting)
  Using cached fastapi-0.110.1-py3-none-any.whl.metadata (24 kB)
Collecting lightning<3.0.0,>=2.0.0 (from pytorch-forecasting)
  Downloading lightning-2.2.2-py3-none-any.whl.metadata (53 kB)
     ---------------------------------------- 0.0/53.4 kB ? eta -:--:--
     ---------------------------------------- 53.4/53.4 kB 1.4 MB/s eta 0:00:00
Collecting optuna<4.0.0,>=3.1.0 (from pytorch-forecasting)
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting pytorch-optimizer<3.0.0,>=2.5.1 (from pytorch-forecasting)
  Downloading pytorch_optimizer-2.12.0-py3-none-any.whl.metadata (46 kB)
     ---------------------------------------- 0.0/46.1 kB ? eta -:--:--
     ---------------------------------------- 46.1/46.1 kB ? eta 0:00:00
Collecting scikit-learn<2.0,>=1.2 (from pytorch-forecasting)
  Downloading scikit_learn-1.4.

In [5]:
import math
import numpy as np
import pandas as pd
import random
import re
import torch
import urllib.request
from torch.utils.data import DataLoader, Dataset
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from transformers.optimization import AdamW, get_cosine_schedule_with_warmup
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel

In [6]:
import urllib.request

urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv",
    filename="ChatBotData.csv",
)
Chatbot_Data = pd.read_csv("ChatBotData.csv")

# Test 용으로 300개 데이터만 처리한다.
Chatbot_Data = Chatbot_Data[:300]
Chatbot_Data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


## Tokenizer 기능

1. Tokenizing : 입력 문자열을 token id로 변환(encoding), token id를 다시 문자열로 변환(decoding)의 기능2. 
기존의 구조(BPE, Sentencepiece 등)에 독립적으로 추가적인 token들을 추가하는 기능3. 
Special token들을 (mask, BOS, EOS 등) 관리하는 기능

In [7]:
BOS = "</s>"
EOS = "</s>"
PAD = "<pad>"
MASK = "<unused0>"
Q_TKN = "<usr>"
SENT = '<unused1>'
A_TKN = "<sys>"


# 허깅페이스 transformers 에 등록된 사전 학습된 koGTP2 토크나이저를 가져온다.
koGPT2_TOKENIZER = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2", bos_token=BOS, eos_token=EOS, unk_token="<unk>", pad_token=PAD, mask_token=MASK,)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [8]:
# 챗봇 데이터를 처리하는 클래스를 만든다.
class ChatbotDataset(Dataset):
    def __init__(self, chats, max_len=40):  # 데이터셋의 전처리를 해주는 부분
        self._data = chats
        self.max_len = max_len
        self.q_token = Q_TKN
        self.a_token = A_TKN
        self.sent_token = SENT
        self.eos = EOS
        self.mask = MASK
        self.tokenizer = koGPT2_TOKENIZER

    def __len__(self):  # chatbotdata 의 길이를 리턴한다.
        return len(self._data)

    def __getitem__(self, idx):  # 로드한 챗봇 데이터를 차례차례 DataLoader로 넘겨주는 메서드
        turn = self._data.iloc[idx]
        q = turn["Q"]  # 질문을 가져온다.
        q = re.sub(r"([?.!,])", r" ", q)  # 구둣점들을 제거한다.

        a = turn["A"]  # 답변을 가져온다.
        a = re.sub(r"([?.!,])", r" ", a)  # 구둣점들을 제거한다.

        q_toked = self.tokenizer.tokenize(self.q_token + q + self.sent_token)
        q_len = len(q_toked)

        a_toked = self.tokenizer.tokenize(self.a_token + a + self.eos)
        a_len = len(a_toked)

        #질문의 길이가 최대길이보다 크면
        if q_len > self.max_len:
            a_len = self.max_len - q_len        #답변의 길이를 최대길이 - 질문길이
            if a_len <= 0:       #질문의 길이가 너무 길어 질문만으로 최대 길이를 초과 한다면
                q_toked = q_toked[-(int(self.max_len / 2)) :]   #질문길이를 최대길이의 반으로 
                q_len = len(q_toked)
                a_len = self.max_len - q_len              #답변의 길이를 최대길이 - 질문길이
            a_toked = a_toked[:a_len]
            a_len = len(a_toked)

        #질문의 길이 + 답변의 길이가 최대길이보다 크면
        if q_len + a_len > self.max_len:
            a_len = self.max_len - q_len        #답변의 길이를 최대길이 - 질문길이
            if a_len <= 0:       #질문의 길이가 너무 길어 질문만으로 최대 길이를 초과 한다면
                q_toked = q_toked[-(int(self.max_len / 2)) :]   #질문길이를 최대길이의 반으로 
                q_len = len(q_toked)
                a_len = self.max_len - q_len              #답변의 길이를 최대길이 - 질문길이
            a_toked = a_toked[:a_len]
            a_len = len(a_toked)

        # 답변 labels = [mask, mask, ...., mask, ..., <bos>,..답변.. <eos>, <pad>....]
        labels = [self.mask,] * q_len + a_toked[1:]

        # mask = 질문길이 0 + 답변길이 1 + 나머지 0
        mask = [0] * q_len + [1] * a_len + [0] * (self.max_len - q_len - a_len)
        # 답변 labels을 index 로 만든다.
        labels_ids = self.tokenizer.convert_tokens_to_ids(labels)
        # 최대길이만큼 PADDING
        while len(labels_ids) < self.max_len:
            labels_ids += [self.tokenizer.pad_token_id]

        # 질문 + 답변을 index 로 만든다.    
        token_ids = self.tokenizer.convert_tokens_to_ids(q_toked + a_toked)
        # 최대길이만큼 PADDING
        while len(token_ids) < self.max_len:
            token_ids += [self.tokenizer.pad_token_id]

        #질문+답변, 마스크, 답변
        return (token_ids, np.array(mask), labels_ids)


In [9]:
def collate_batch(batch):
    data = [item[0] for item in batch]
    mask = [item[1] for item in batch]
    label = [item[2] for item in batch]
    return torch.LongTensor(data), torch.LongTensor(mask), torch.LongTensor(label)

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_set = ChatbotDataset(Chatbot_Data, max_len=40)

#윈도우 환경에서 num_workers 는 무조건 0으로 지정, 리눅스에서는 2
train_dataloader = DataLoader(train_set, batch_size=32, num_workers=0, shuffle=True, collate_fn=collate_batch,)

In [11]:
print("start")
for batch_idx, samples in enumerate(train_dataloader):
    token_ids, mask, label = samples
    print("token_ids ====> ", token_ids)
    print("mask =====> ", mask)
    print("label =====> ", label)
print("end")

  return torch.LongTensor(data), torch.LongTensor(mask), torch.LongTensor(label)


start
token_ids ====>  tensor([[    2,  9244,  7584,  ...,     3,     3,     3],
        [    2, 31279,  9341,  ...,     3,     3,     3],
        [    2,  9546,  6969,  ...,     3,     3,     3],
        ...,
        [    2,  9020,  8263,  ...,     3,     3,     3],
        [    2, 10715,  9511,  ...,     3,     3,     3],
        [    2, 19855,  9350,  ...,     3,     3,     3]])
mask =====>  tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
label =====>  tensor([[9, 9, 9,  ..., 3, 3, 3],
        [9, 9, 9,  ..., 3, 3, 3],
        [9, 9, 9,  ..., 3, 3, 3],
        ...,
        [9, 9, 9,  ..., 3, 3, 3],
        [9, 9, 9,  ..., 3, 3, 3],
        [9, 9, 9,  ..., 3, 3, 3]])
token_ids ====>  tensor([[    2, 20509,  7847,  ...,     3,     3,     3],
        [    2, 11342,  9945,  ...,     3,     3,     3],
        [    2, 107

## 학습

In [8]:
!pip install pytorch_lightning==1.9.0

Collecting pytorch_lightning==1.9.0
  Using cached pytorch_lightning-1.9.0-py3-none-any.whl.metadata (23 kB)
Collecting torchmetrics>=0.7.0 (from pytorch_lightning==1.9.0)
  Using cached torchmetrics-1.3.2-py3-none-any.whl.metadata (19 kB)
Collecting lightning-utilities>=0.4.2 (from pytorch_lightning==1.9.0)
  Using cached lightning_utilities-0.11.2-py3-none-any.whl.metadata (4.7 kB)
Using cached pytorch_lightning-1.9.0-py3-none-any.whl (825 kB)
Using cached lightning_utilities-0.11.2-py3-none-any.whl (26 kB)
Using cached torchmetrics-1.3.2-py3-none-any.whl (841 kB)
Installing collected packages: lightning-utilities, torchmetrics, pytorch_lightning
Successfully installed lightning-utilities-0.11.2 pytorch_lightning-1.9.0 torchmetrics-1.3.2


In [12]:
import numpy as np
import pandas as pd
import torch
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.utils.data import DataLoader, Dataset
from transformers.optimization import AdamW, get_cosine_schedule_with_warmup
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel
import re

In [13]:
Q_TKN = "<usr>"
A_TKN = "<sys>"
BOS = '</s>'
EOS = '</s>'
MASK = '<unused0>'
SENT = '<unused1>'
PAD = '<pad>'

In [14]:
koGPT2_TOKENIZER = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
            bos_token=BOS, eos_token=EOS, unk_token='<unk>',
            pad_token=PAD, mask_token=MASK) 
model = GPT2LMHeadModel.from_pretrained('gpt2')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [15]:
import urllib.request

urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv",
    filename="ChatBotData.csv",
)
Chatbot_Data = pd.read_csv("ChatBotData.csv")
# Test 용으로 300개 데이터만 처리한다.
Chatbot_Data = Chatbot_Data[:300]
Chatbot_Data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [17]:
model = model.to(device)
model.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [18]:
learning_rate = 3e-5
criterion = torch.nn.CrossEntropyLoss(reduction="none")
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

epoch = 60
Sneg = -1e18

In [19]:
print ("start")
for epoch in range(epoch):
    total_loss = 0
    count_batches = 0
    for batch_idx, samples in enumerate(train_dataloader):
        torch.device("cuda")
        optimizer.zero_grad()
        token_ids, mask, label = samples
        
        token_ids.cuda()
        mask.cuda()
        label.cuda()
        token_ids = token_ids.cuda()
        mask = mask.cuda()
        label = label.cuda()
        
        # Assuming vocab_size is the maximum index allowed by your model's vocabulary
        vocab_size = 50257  # This is just an example; adjust based on your actual model's vocab size
        token_ids = token_ids.clamp(0, vocab_size - 1)
        
        out = model(token_ids)
        
        out = out.logits      #Returns a new tensor with the logit of the elements of input
        mask_3d = mask.unsqueeze(dim=2).repeat_interleave(repeats=out.shape[2], dim=2)
        mask_out = torch.where(mask_3d == 1, out, Sneg * torch.ones_like(out))
        loss = criterion(mask_out.transpose(2, 1), label)
        # 평균 loss 만들기 avg_loss[0] / avg_loss[1] <- loss 정규화
        avg_loss = loss.sum() / mask.sum()
        avg_loss.backward()
        # 학습 끝
        optimizer.step()

        total_loss += avg_loss.item()
        count_batches += 1
    
    # Average loss per epoch
    epoch_loss = total_loss / count_batches
    print(f"End of Epoch {epoch+1}, Average Loss: {epoch_loss:.4f}")
print ("end")

start
End of Epoch 1, Average Loss: 43.1330
End of Epoch 2, Average Loss: 40.7978
End of Epoch 3, Average Loss: 40.1262
End of Epoch 4, Average Loss: 39.6829
End of Epoch 5, Average Loss: 39.0762
End of Epoch 6, Average Loss: 38.7893
End of Epoch 7, Average Loss: 38.4790
End of Epoch 8, Average Loss: 37.9677
End of Epoch 9, Average Loss: 37.5837
End of Epoch 10, Average Loss: 37.4404
End of Epoch 11, Average Loss: 36.6947
End of Epoch 12, Average Loss: 36.5364
End of Epoch 13, Average Loss: 36.1066
End of Epoch 14, Average Loss: 36.2153
End of Epoch 15, Average Loss: 36.0173
End of Epoch 16, Average Loss: 35.5468
End of Epoch 17, Average Loss: 35.3018
End of Epoch 18, Average Loss: 35.2337
End of Epoch 19, Average Loss: 34.9390
End of Epoch 20, Average Loss: 34.7377
End of Epoch 21, Average Loss: 34.8938
End of Epoch 22, Average Loss: 34.2629
End of Epoch 23, Average Loss: 35.1056
End of Epoch 24, Average Loss: 34.2793
End of Epoch 25, Average Loss: 34.2920
End of Epoch 26, Average Los

In [20]:
with torch.no_grad():
    while 1:
        q = input("user > ").strip()
        if q == "quit":
            break
        a = ""
        while 1:
            input_ids = torch.LongTensor(koGPT2_TOKENIZER.encode(Q_TKN + q + SENT + A_TKN + a)).unsqueeze(dim=0)
            pred = model(input_ids.to(device))
            pred = pred.logits
            gen = koGPT2_TOKENIZER.convert_ids_to_tokens(torch.argmax(pred, dim=-1).squeeze().cpu().numpy().tolist())[-1]
            if gen == EOS:
                break
            a += gen.replace("▁", " ")
        print("Chatbot > {}".format(a.strip()))

user >  카페갈래


Chatbot > 좋겠어요


user >  같이가자


Chatbot > 혼자를 즐기세요


user >  너랑 갈래


Chatbot > 네 말씀하세요


user >  개새키


Chatbot > 벗어나는자가 아니요


user >  미안 욕 안할게


Chatbot > 혼자가 아니에요


user >  너도 혼자가 아니야


Chatbot > 혼자가 아니에요


KeyboardInterrupt: Interrupted by user

In [None]:
import torch

with torch.no_grad():
    while True:
        q = input("user > ").strip()
        if q.lower() == "quit":
            break
        a = ""
        while True:
            # Encode the input sequence and add special tokens
            input_ids = torch.LongTensor(koGPT2_TOKENIZER.encode(Q_TKN + q + SENT + A_TKN + a, add_special_tokens=True)).unsqueeze(dim=0)
            print(len(input_ids)) 
            # Truncate to maximum length if necessary
            max_length = 1024  # Adjust this based on your model's configuration
            if input_ids.size(1) > max_length:
                input_ids = input_ids[:, :max_length]
                print(len(input_ids)) 
            # Make prediction
            try:
                input_ids = input_ids.to(device)  # Ensure tensor is on the correct device
                pred = model(input_ids)
                logits = pred.logits
                gen_id = torch.argmax(logits, dim=-1).squeeze().cpu().numpy().tolist()[-1]
                gen = koGPT2_TOKENIZER.convert_ids_to_tokens(gen_id)

                # Check for end-of-sequence token
                if gen == EOS:
                    break

                # Append generated text to output
                a += gen.replace("▁", " ")  # Replace subword tokenization prefix if used

            except Exception as e:
                print(f"An error occurred: {e}")
                break

        print("Chatbot > {}".format(a.strip()))