In [None]:
import os

# Restrict this process to a single GPU (physical index 7). This runs before
# torch is imported in the next cell, so torch only ever sees that one device.
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
import pandas as pd
import torch.nn as nn

In [None]:
# --- Run configuration -------------------------------------------------------
# CSV with the question/answer pairs (columns `Q` and `A`).
path_to_dataset = './dataset/ChatBotData.csv'
# Alternative dataset:
#path_to_dataset = './dataset/only_chatbot.csv'

output_dir = "./output/"

# Name of the pretrained checkpoint to fine-tune. Note that the model-loading
# cell rebinds `model` to the loaded model object, so this cell has to be
# re-run before that one can be run a second time.
model = "skt/kogpt2-base-v2"

# Number of examples per training batch.
batch_size = 32

In [None]:
# Load the tokenizer for the checkpoint named by `model` (still a string here),
# using '</s>' as both the BOS and the EOS token.
tokenizer = AutoTokenizer.from_pretrained(model, bos_token='</s>', eos_token='</s>',pad_token='<pad>')
# Register the speaker markers that delimit the user turn and the bot turn.
special_tokens_dict = {'additional_special_tokens': ['<usr>','<sys>']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
# Padding uses the EOS id from here on; this overrides the '<pad>' token
# passed to from_pretrained above.
tokenizer.pad_token_id = tokenizer.eos_token_id
# Rebinds `model` from the checkpoint name (str) to the loaded model object.
model = AutoModelForCausalLM.from_pretrained(model)
# Match the embedding table size to the tokenizer after adding special tokens.
model.resize_token_embeddings(len(tokenizer))

<h3>Loading Dataset</h3>

In [None]:
def get_chat_data(path_to_dataset):
    """Lazily yield ``(question, answer)`` pairs from a chat CSV.

    The CSV is read with pandas when iteration starts (not when this function
    is called) and must contain the columns ``Q`` and ``A``. Pairs are yielded
    in file order.
    """
    frame = pd.read_csv(path_to_dataset)
    yield from ((record['Q'], record['A']) for _, record in frame.iterrows())

# Tokenized question/answer dataset built from a generator of (q, a) pairs
class ChatbotDataset(Dataset):
    """Fixed-length tokenized chat examples.

    Each ``(q, a)`` pair from ``generator`` is rendered as
    ``"<bos> <usr> q <sys> a <eos>"`` and passed to ``tokenizer`` with
    truncation and ``padding="max_length"``. The resulting ``input_ids`` and
    ``attention_mask`` rows are stacked into two ``torch.long`` tensors.

    Indexing returns the tuple ``(input_ids_row, attention_mask_row)``.
    """

    def __init__(self, generator, tokenizer, max_length):
        id_rows = []
        mask_rows = []

        for question, answer in generator:
            prompt = f"{tokenizer.bos_token} <usr> {question} <sys> {answer} {tokenizer.eos_token}"
            encoded = tokenizer(prompt, truncation=True, max_length=max_length, padding="max_length")
            id_rows.append(torch.tensor(encoded['input_ids'], dtype=torch.long))
            mask_rows.append(torch.tensor(encoded['attention_mask'], dtype=torch.long))

        # One row per example.
        self.input_ids = torch.stack(id_rows)
        self.attn_masks = torch.stack(mask_rows)

    def __len__(self):
        return self.input_ids.shape[0]

    def __getitem__(self, idx):
        ids_row = self.input_ids[idx]
        mask_row = self.attn_masks[idx]
        return ids_row, mask_row



In [None]:
# Tokenize the whole CSV up front, then serve it in shuffled mini-batches.
max_length = 128  # every example is padded / truncated to this many tokens
dataset = ChatbotDataset(
    generator=get_chat_data(path_to_dataset),
    tokenizer=tokenizer,
    max_length=max_length,
)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Sanity check: decode the input ids of example 15 back to text to inspect the
# rendered prompt format (including special tokens and padding).
tokenizer.decode(dataset[15][0])

In [None]:
# Number of batches per epoch.
len(dataloader)

In [None]:
# Use CUDA when it is available to this process; otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# With more than one visible GPU, `model` becomes a DataParallel wrapper from
# here on (the wrapped model is then reachable as `model.module`).
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs.")
    model = torch.nn.DataParallel(model)

optimizer = AdamW(model.parameters(), lr=5e-5)
# NOTE(review): the literal 3 is the epoch count and duplicates EPOCHS in the
# training cell; the two must be changed together or the learning-rate
# schedule will not span the actual number of optimizer steps.
total_steps = len(dataloader) * 3  # Number of training steps (number of batches * epochs)
# Linear schedule over `total_steps` optimizer steps with no warmup phase.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

<h3>Training</h3>

In [None]:
# Training loop
#
# Fine-tunes the causal LM on the tokenized chat examples and prints the mean
# batch loss after every epoch.
EPOCHS = 3

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader):
        input_ids, attn_masks = batch
        input_ids = input_ids.to(device)
        attn_masks = attn_masks.to(device)

        # Every example is padded to a fixed length, so most positions are
        # padding. Labels of -100 are ignored by the LM loss; without this the
        # model is mostly trained to predict pad tokens. The attention mask is
        # used (not the token id) because the pad id equals the EOS id and the
        # real end-of-sequence token must still be learned.
        labels = input_ids.masked_fill(attn_masks == 0, -100)

        model.zero_grad()

        outputs = model(input_ids, attention_mask=attn_masks, labels=labels)
        # .mean() collapses the per-replica losses returned under DataParallel
        # and is a no-op for a single scalar loss.
        loss = outputs.loss.mean()
        loss.backward()

        total_loss += loss.item()

        optimizer.step()
        # The LR schedule advances once per optimizer step, not once per epoch.
        scheduler.step()

    avg_loss = total_loss / len(dataloader)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Loss: {avg_loss:.4f}')

In [None]:
# Persist the fine-tuned weights and the tokenizer (with the added special
# tokens) side by side so they can be reloaded together for inference.
# If the model was wrapped in DataParallel for multi-GPU training, the wrapper
# itself has no `save_pretrained`; save the underlying model instead.
(model.module if isinstance(model, torch.nn.DataParallel) else model).save_pretrained('./kogpt2_chatbot/')
tokenizer.save_pretrained('./kogpt2_chatbot/')

<h3>Evaluation</h3>

In [None]:
def return_answer_by_chatbot(user_text, model_dir='./kogpt2_chatbot/'):
    """Generate a chatbot reply to ``user_text`` with the fine-tuned model.

    Args:
        user_text: The user's utterance.
        model_dir: Directory containing the saved tokenizer and model.
            Defaults to the directory the notebook saves to.

    Returns:
        The generated reply as a string, with pad/EOS markers and newlines
        removed and surrounding whitespace stripped. Sampling is enabled, so
        repeated calls with the same input may return different replies.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForCausalLM.from_pretrained(model_dir)

    # Mirror the template used to build the training examples
    # ("<bos> <usr> q <sys> a <eos>") so the prompt looks like what the model
    # was fine-tuned on.
    sent = f"{tokenizer.bos_token} <usr> {user_text} <sys>"
    encoded = tokenizer(sent, return_tensors='pt')
    input_ids = encoded['input_ids']

    output = model.generate(
        input_ids,
        # Passed explicitly: the pad id equals the EOS/BOS id, so the mask
        # cannot be inferred reliably from the token ids.
        attention_mask=encoded['attention_mask'],
        # generate() takes the pad id as an argument; assigning an attribute
        # on the model object has no effect on generation.
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True, top_k=20,
        max_new_tokens=40,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )

    # The returned sequence starts with the prompt; keep only the newly
    # generated tokens so the reply cannot be confused with text (or a literal
    # "<sys>") that appeared in the user's input.
    generated_ids = output[0][input_ids.shape[-1]:]
    sentence = tokenizer.decode(generated_ids, skip_special_tokens=False)
    chatbot_response = sentence.replace('<pad>', '').replace('</s>', '').replace('\n', '')
    # The training template puts spaces around the answer; drop them.
    return chatbot_response.strip()

In [None]:
return_answer_by_chatbot('넌 이름이 뭐야?')

In [None]:
return_answer_by_chatbot('만나서 반가워.')

In [None]:
return_answer_by_chatbot('나랑 영화보자.')