In [1]:
import shutil
import re
import numpy as np
import torch
from typing import List, Tuple, Dict
import json
import pandas as pd
from datasets import load_dataset
import csv
import os
import glob
import logging
import os
import pickle
from tqdm.notebook import tqdm, trange
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import RandomSampler, SequentialSampler, DataLoader, Dataset
from pathlib import Path
import random
from sklearn.model_selection import train_test_split
from transformers import (AdamW,AutoConfig,PreTrainedModel,PreTrainedTokenizer,get_linear_schedule_with_warmup,AutoModelWithLMHead,AutoModelForCausalLM,AutoTokenizer,MODEL_WITH_LM_HEAD_MAPPING,WEIGHTS_NAME, )
logger = logging.getLogger(__name__)



In [2]:
! pip -q install transformers

In [3]:
! pip -q install datasets

## Initialize Model and Tokenizer

In [4]:
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")

Downloading (…)lve/main/config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/351M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

## Load dataset

### Dailydialog

In [5]:
d = load_dataset("roskoN/dailydialog")

c = d['train']['utterances']
c = [u for sublist in c for u in sublist]
prompts = c[0::2]
responses = c[1::2]

file = "daily_dialog.csv"
with open(file, mode='w', newline='') as cs:
    w = csv.writer(cs)
    w.writerow(["Prompt", "Response"]) 

    for prompt, response in zip(prompts, responses):
        w.writerow([prompt.strip(), response.strip()])
        
df = pd.read_csv(file)
#df = df[:500]

Downloading builder script:   0%|          | 0.00/4.59k [00:00<?, ?B/s]

Downloading and preparing dataset daily_dialog/full to /root/.cache/huggingface/datasets/roskoN___daily_dialog/full/1.0.0/7d96d5a6afcb95cf518611d5147758f4a5991bab51dc97c3a8131b6fb7811b76...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/180k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/179k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset daily_dialog downloaded and prepared to /root/.cache/huggingface/datasets/roskoN___daily_dialog/full/1.0.0/7d96d5a6afcb95cf518611d5147758f4a5991bab51dc97c3a8131b6fb7811b76. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

### EmpatheticDialogues

In [6]:
d = load_dataset("benjaminbeilharz/empathetic_dialogues_for_lm")

c = d['train']['conv']
c = [u for sublist in c for u in sublist]
prompts = c[0::2]
responses = c[1::2]

file = "empathetic_dialogues.csv"
with open(file, mode='w', newline='') as cs:
    w = csv.writer(cs)
    w.writerow(["Prompt", "Response"]) 

    for prompt, response in zip(prompts, responses):
        w.writerow([prompt.strip(), response.strip()])
        
dt = pd.read_csv(file)
#dt = dt[:500]

Downloading:   0%|          | 0.00/886 [00:00<?, ?B/s]

Downloading and preparing dataset None/None (download: 5.81 MiB, generated: 11.03 MiB, post-processed: Unknown size, total: 16.84 MiB) to /root/.cache/huggingface/datasets/parquet/benjaminbeilharz--empathetic_dialog_for_lm-050aa011e4709962/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/801k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/767k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.53M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/benjaminbeilharz--empathetic_dialog_for_lm-050aa011e4709962/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

### ProsocialDialog

In [7]:
dat = load_dataset("allenai/prosocial-dialog")

t_data = dat['train'].to_pandas()

column = ['context', 'response']
da = t_data[column]

f = "prosocial_dialog.csv"
da.to_csv(f, index=False)

new_column1 = ['Prompt','Response']
sel_c1 = da[['context','response']]
sel_c1.columns = new_column1
da = sel_c1
#da = da[:500]

Downloading and preparing dataset json/allenai--prosocial-dialog to /root/.cache/huggingface/datasets/json/allenai--prosocial-dialog-7451192f9246298e/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/85.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/allenai--prosocial-dialog-7451192f9246298e/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

# Combine data

In [8]:
m = pd.concat([df, dt, da], ignore_index=True)
m.to_csv('m_file.csv', index=False)
m_file = 'm_file.csv'
d = pd.read_csv(m_file)
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211079 entries, 0 to 211078
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Prompt    211079 non-null  object
 1   Response  211079 non-null  object
dtypes: object(2)
memory usage: 3.2+ MB


# Fixed max length

In [9]:
max_length = 100
d = d[(d['Prompt'].str.len() <= max_length) & (d['Response'].str.len() <= max_length)]
d.reset_index(drop=True, inplace=True)
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105629 entries, 0 to 105628
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Prompt    105629 non-null  object
 1   Response  105629 non-null  object
dtypes: object(2)
memory usage: 1.6+ MB


## Split Data

In [10]:
t_data, v_data = train_test_split(d, test_size=0.2)
t_data.to_csv('t_data.csv', index=False)
v_data.to_csv('v_data.csv', index=False)

### Define Arguments

In [11]:
class argument():
    def __init__(self):
        self.seed = 42
        self.epoch = 6
        self.step_save = 5000
        self.dir_output = 'save_output'
        self.name_tokenizer = 'microsoft/DialoGPT-small'
        self.name_of_model = 'microsoft/DialoGPT-small'

ar = argument()

## Dataset Loader

In [12]:
class c_data(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, dt):
        self.data = []
        for _, r in dt.iterrows():
            f = lambda s: [i for s_list in s for i in s_list]
            a = list([tokenizer.encode(x) + [tokenizer.eos_token_id] for x in r])
            a = f(a)
            self.data.append(a)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, i):
        return torch.tensor(self.data[i], dtype = torch.long)


## Set Seed

In [13]:
def set_seed(ar):
    torch.manual_seed(ar.seed)
    random.seed(ar.seed)
    torch.cuda.manual_seed_all(ar.seed)
    np.random.seed(ar.seed)


## Training

In [14]:
def fine_tuning(ar, t_data, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:

    def fn(a: List[torch.Tensor]):
        n = pad_sequence(a, batch_first=True)
        return n
    t_sam = RandomSampler(t_data)
    data_loader = DataLoader(t_data, sampler=t_sam, batch_size = 16, collate_fn=fn, drop_last = True )
    total = len(data_loader) // 1 * ar.epoch
    model = model 
    model.resize_token_embeddings(len(tokenizer))

    decay_no = ["bias", "LayerNorm.weight"]
    opt_para = [
        {"params": [q for j, q in model.named_parameters() if not any(d in j for d in decay_no)],"weight_decay": 0.0,},
        
        {"params": [q for j, q in model.named_parameters() if any(d in j for d in decay_no)], "weight_decay": 0.0},
    ]
    opt = AdamW(opt_para, lr = 5e-5, eps = 1e-8)
    sche = get_linear_schedule_with_warmup(opt, num_warmup_steps = 0, num_training_steps=total)
    g_step = 0
    ep_train = 0  
    t_los = 0.0
    model.zero_grad()
    itera = trange( ep_train, int(ar.epoch), desc="epoch") 
    set_seed(ar) 
    for _ in itera:
        e_iter = tqdm(data_loader, desc="iteration")
        for s, bat in enumerate(e_iter):
            inp, lab = (bat, bat)
            inp = inp.to(ar.device)
            lab = lab.to(ar.device)
            model.train()
            outp = model(inp, labels=lab)
            loss = outp[0]
            loss.backward()
            t_los += loss.item()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            sche.step()
            model.zero_grad()
            
            g_step += 1
            if g_step % ar.step_save == 0:
                check_pre = "model_checkpoint"
                dir_output = os.path.join(ar.dir_output, "{}-{}".format(check_pre, g_step))
                save_m =  model
                save_m.save_pretrained(dir_output)
                tokenizer.save_pretrained(dir_output)
                torch.save(ar, os.path.join(dir_output, "train_ar.bin"))
                torch.save(opt.state_dict(), os.path.join(dir_output, "opti.pt"))
                torch.save(sche.state_dict(), os.path.join(dir_output, "sche.pt"))

In [15]:
ar = argument()
device = torch.device("cuda")
ar.device = device
set_seed(ar)
tokenizer = AutoTokenizer.from_pretrained(ar.name_tokenizer)
model = AutoModelForCausalLM.from_pretrained(ar.name_of_model,from_tf = False,)
model.to(ar.device)   
tr_data = c_data(tokenizer,t_data)

fine_tuning(ar, tr_data, model, tokenizer)

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 50257. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


epoch:   0%|          | 0/6 [00:00<?, ?it/s]

iteration:   0%|          | 0/5281 [00:00<?, ?it/s]

iteration:   0%|          | 0/5281 [00:00<?, ?it/s]

iteration:   0%|          | 0/5281 [00:00<?, ?it/s]

iteration:   0%|          | 0/5281 [00:00<?, ?it/s]

iteration:   0%|          | 0/5281 [00:00<?, ?it/s]

iteration:   0%|          | 0/5281 [00:00<?, ?it/s]

'\nos.makedirs(ar.dir_output, exist_ok=True)\nsave_m = model\nsave_m.save_pretrained(ar.dir_output)\ntokenizer.save_pretrained(ar.dir_output)\ntorch.save(ar, os.path.join(ar.dir_output, "train_ar.bin"))\nmodel = AutoModelForCausalLM.from_pretrained(ar.dir_output)\ntokenizer = AutoTokenizer.from_pretrained(ar.dir_output)\nmodel.to(ar.device)\nprint("")\n'

## Evaluation

In [16]:
def evaluation(ar, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, v_data) -> Dict:
    e_dir = ar.dir_output
    v_data =  c_data(tokenizer,v_data)
    def fn(x: List[torch.Tensor]):
        t = pad_sequence(x, batch_first=True)
        return t
    e_sam = SequentialSampler(v_data)
    ev_data = DataLoader(v_data, sampler=e_sam, batch_size = 16, collate_fn = fn, drop_last = True)

    e_los = 0.0
    evaluation_step = 0
    model.eval()
    for bat in tqdm(ev_data, desc="evaluation"):
        inp, lab = (bat, bat)
        inp = inp.to(ar.device)
        lab = lab.to(ar.device)
        with torch.no_grad():
            out = model(inp, labels=lab)
            loss = out[0]
            e_los += loss.mean().item()
        evaluation_step += 1
    e_los = e_los / evaluation_step
    p = torch.exp(torch.tensor(e_los))
    r = {"perplexity": p}
    return r

In [18]:
ar = argument()
device = torch.device("cuda")
ar.device = device
set_seed(ar)
tokenizer = AutoTokenizer.from_pretrained('/kaggle/working/save_output/model_checkpoint-30000')
model = AutoModelForCausalLM.from_pretrained('/kaggle/working/save_output/model_checkpoint-30000',from_tf = False,)
model.to(ar.device)
c_point = ['/kaggle/working/save_output/model_checkpoint-30000']
for c in c_point:
    model = AutoModelForCausalLM.from_pretrained(c)
    model.to(ar.device)
    r = evaluation(ar, model, tokenizer, v_data)
print(r)

evaluation:   0%|          | 0/1320 [00:00<?, ?it/s]

{'perplexity': tensor(5.2692)}


## Generate test results


In [21]:
import warnings
import logging
def predict(t):
    logging.getLogger("transformers").setLevel(logging.ERROR)
    tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
    model = AutoModelForCausalLM.from_pretrained('/kaggle/working/save_output/model_checkpoint-30000')
    input_id = tokenizer.encode(t + tokenizer.eos_token, return_tensors='pt')
    ids = model.generate(input_id, max_length=100,pad_token_id=tokenizer.eos_token_id,no_repeat_ngram_size=3,do_sample=True,top_k = 5,top_p=0.7,temperature = 0.8)
    print("Chatbot: {}\n".format(tokenizer.decode(ids[:, input_id.shape[-1]:][0],skip_special_tokens=True)))


In [None]:
while True:
    a = input("User: ")

    if a == "exit":
        break
    
    b = predict(a)