# Install & Import

In [None]:
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import AutoConfig
from transformers import AutoTokenizer
from transformers import TFGPT2LMHeadModel
import tqdm
import urllib.request
import os


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

# Load Model

In [None]:
config = AutoConfig.from_pretrained('skt/kogpt2-base-v2')

Downloading:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

In [None]:
config

GPT2Config {
  "_name_or_path": "skt/kogpt2-base-v2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "bos_token_id": 0,
  "created_date": "2021-04-28",
  "embd_pdrop": 0.1,
  "eos_token_id": 1,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "license": "CC-BY-NC-SA 4.0",
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 3,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
  

In [None]:
# Name the checkpoint and the special tokens once, then build the tokenizer and
# the TensorFlow model (converted from the PyTorch weights) from them.
checkpoint = 'skt/kogpt2-base-v2'
special_tokens = dict(bos_token='</s>', eos_token='</s>', pad_token='<pad>')

tokenizer = AutoTokenizer.from_pretrained(checkpoint, **special_tokens)
model = TFGPT2LMHeadModel.from_pretrained(checkpoint, from_pt=True)

In [None]:
# Sanity check: ids of the BOS / EOS / PAD tokens, then the text behind ids 1-4.
for special_id in (tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.pad_token_id):
    print(special_id)
print('-' * 10)
for token_id in range(1, 5):
    print(tokenizer.decode(token_id))

1
1
3
----------
</s>
<usr>
<pad>
<sys>


In [None]:
# Set the checkpoint's 'text-generation' default max_length to 1000.
# NOTE(review): return_answer_by_chatbot passes max_length=50 to generate()
# explicitly, so confirm whether anything actually reads this setting.
model.config.task_specific_params['text-generation']['max_length'] = 1000

In [None]:
model.config

GPT2Config {
  "_name_or_path": "skt/kogpt2-base-v2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "bos_token_id": 0,
  "created_date": "2021-04-28",
  "embd_pdrop": 0.1,
  "eos_token_id": 1,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "license": "CC-BY-NC-SA 4.0",
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 3,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
  

# Load & Encode Data

In [None]:
# Load the CSV of conversation pairs; later cells read its 'question' and
# 'answer' columns.
df_shuffled = pd.read_csv('wellness_shuffled_30000.csv', encoding = 'utf8')

In [None]:
# The first 27,000 rows are used for training; everything after is held out
# for validation.
split_at = 27000
train_set, val_set = df_shuffled.iloc[:split_at], df_shuffled.iloc[split_at:]

In [None]:
# Training data generator
def get_chat_train_data():
  """Yield one list of token ids per training row of `train_set`.

  Each sequence is BOS + encode('<usr>' + question + '<sys>' + answer) + EOS.

  Rows whose question or answer is missing are skipped: a NaN cell would
  otherwise raise TypeError on the string concatenation below and abort
  the whole dataset iteration.
  """
  # These never change between rows, so build them once.
  bos_token = [tokenizer.bos_token_id]
  eos_token = [tokenizer.eos_token_id]
  pairs = train_set[['question', 'answer']].dropna()
  for question, answer in zip(pairs['question'].to_list(), pairs['answer'].to_list()):
    # str() guards against non-string cells (e.g. purely numeric text).
    sent = tokenizer.encode('<usr>' + str(question) + '<sys>' + str(answer))
    yield bos_token + sent + eos_token


In [None]:
# Validation data generator
def get_chat_val_data():
  """Yield one list of token ids per validation row of `val_set`.

  Each sequence is BOS + encode('<usr>' + question + '<sys>' + answer) + EOS.

  Rows whose question or answer is missing are skipped: a NaN cell would
  otherwise raise TypeError on the string concatenation below and abort
  the whole dataset iteration.
  """
  # These never change between rows, so build them once.
  bos_token = [tokenizer.bos_token_id]
  eos_token = [tokenizer.eos_token_id]
  pairs = val_set[['question', 'answer']].dropna()
  for question, answer in zip(pairs['question'].to_list(), pairs['answer'].to_list()):
    # str() guards against non-string cells (e.g. purely numeric text).
    sent = tokenizer.encode('<usr>' + str(question) + '<sys>' + str(answer))
    yield bos_token + sent + eos_token

In [None]:
batch_size = 8

In [None]:
# Stream variable-length int32 token sequences from the generator.
# `output_signature` replaces the deprecated `output_types` argument and also
# declares that every element is a rank-1 tensor.
train_dataset = tf.data.Dataset.from_generator(
    get_chat_train_data,
    output_signature=tf.TensorSpec(shape=(None,), dtype=tf.int32),
)

In [None]:
# Batch the training sequences, padding each one to the longest sequence in its
# batch with the tokenizer's pad id.
# NOTE: this rebinds `train_dataset`, so re-running this cell without re-running
# the from_generator cell would batch an already-batched dataset.
train_dataset = train_dataset.padded_batch(batch_size=batch_size, padded_shapes=(None,), padding_values=tokenizer.pad_token_id)

In [None]:
# Stream variable-length int32 token sequences from the generator.
# `output_signature` replaces the deprecated `output_types` argument and also
# declares that every element is a rank-1 tensor.
val_dataset = tf.data.Dataset.from_generator(
    get_chat_val_data,
    output_signature=tf.TensorSpec(shape=(None,), dtype=tf.int32),
)

In [None]:
# Batch the validation sequences, padding each one to the longest sequence in
# its batch with the tokenizer's pad id.
# NOTE: this rebinds `val_dataset`, so re-running this cell without re-running
# the from_generator cell would batch an already-batched dataset.
val_dataset = val_dataset.padded_batch(batch_size=batch_size, padded_shapes=(None,), padding_values=tokenizer.pad_token_id)

In [None]:
adam = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)

In [None]:
# Number of training batches per epoch. The dataset is batched without
# drop_remainder, so a final partial batch is kept: round up instead of down
# (floor division would report 0 steps for fewer than `batch_size` rows).
steps = -(-len(train_set) // batch_size)
print(steps)

1


In [None]:
# Build a symbolic Keras input of int32 token ids with unspecified sequence
# length and pass it through the model once; the printed structure lists the
# logits and past_key_values tensors the model returns.
input_layer = tf.keras.layers.Input(shape=(None, ), dtype=tf.int32)
outputs = model([input_layer])

print(outputs)

TFCausalLMOutputWithCrossAttentions(loss=None, logits=<KerasTensor: shape=(None, None, 51200) dtype=float32 (created by layer 'tfgpt2lm_head_model')>, past_key_values=(<KerasTensor: shape=(2, None, 12, None, 64) dtype=float32 (created by layer 'tfgpt2lm_head_model')>, <KerasTensor: shape=(2, None, 12, None, 64) dtype=float32 (created by layer 'tfgpt2lm_head_model')>, <KerasTensor: shape=(2, None, 12, None, 64) dtype=float32 (created by layer 'tfgpt2lm_head_model')>, <KerasTensor: shape=(2, None, 12, None, 64) dtype=float32 (created by layer 'tfgpt2lm_head_model')>, <KerasTensor: shape=(2, None, 12, None, 64) dtype=float32 (created by layer 'tfgpt2lm_head_model')>, <KerasTensor: shape=(2, None, 12, None, 64) dtype=float32 (created by layer 'tfgpt2lm_head_model')>, <KerasTensor: shape=(2, None, 12, None, 64) dtype=float32 (created by layer 'tfgpt2lm_head_model')>, <KerasTensor: shape=(2, None, 12, None, 64) dtype=float32 (created by layer 'tfgpt2lm_head_model')>, <KerasTensor: shape=(2, 

In [None]:
# Number of validation batches per epoch; rounded up because the dataset is
# batched without drop_remainder and therefore keeps a final partial batch.
val_steps = -(-len(val_set) // batch_size)

# Train

In [None]:
# tqdm.auto selects the notebook widget when one is available and replaces the
# deprecated `tqdm.tqdm_notebook`. Imported under an alias so the module-level
# `tqdm` name keeps pointing at the package.
from tqdm.auto import tqdm as progress_bar

EPOCHS = 5


def _mask_padding(token_batch):
    """Return (attention_mask, labels) for a padded batch of token ids.

    Padding positions get attention 0 and label -100, the value the
    Transformers loss ignores, so the model is neither attending to nor being
    trained to predict `<pad>`.
    """
    is_real_token = tf.not_equal(token_batch, tokenizer.pad_token_id)
    attention_mask = tf.cast(is_real_token, tf.int32)
    ignore_label = tf.constant(-100, dtype=token_batch.dtype)
    labels = tf.where(is_real_token, token_batch, ignore_label)
    return attention_mask, labels


for epoch in range(EPOCHS):
    # ---- training pass -------------------------------------------------
    train_loss_sum = 0.0
    train_batches = 0

    for batch in progress_bar(train_dataset, total=steps):
        attention_mask, labels = _mask_padding(batch)
        with tf.GradientTape() as tape:
            # training=True enables dropout; a bare call runs in inference mode.
            result = model(batch, attention_mask=attention_mask, labels=labels, training=True)
            batch_loss = tf.reduce_mean(result.loss)

        grads = tape.gradient(batch_loss, model.trainable_variables)
        adam.apply_gradients(zip(grads, model.trainable_variables))
        train_loss_sum += float(batch_loss)
        train_batches += 1

    # Average over the batches actually seen rather than a precomputed count.
    epoch_loss = train_loss_sum / max(train_batches, 1)

    # Save the model (disabled):
    # tf.saved_model.save(model, '/content/drive/folders/chatbot/KoGPT2_{0}.h5'.format(epoch + 1))

    # ---- validation pass (no gradients, dropout off) --------------------
    val_loss_sum = 0.0
    val_batches = 0

    for val_batch in progress_bar(val_dataset, total=val_steps):
        val_attention_mask, val_labels = _mask_padding(val_batch)
        val_result = model(val_batch, attention_mask=val_attention_mask, labels=val_labels)
        val_loss_sum += float(tf.reduce_mean(val_result.loss))
        val_batches += 1

    val_epoch_loss = val_loss_sum / max(val_batches, 1)

    print('[Epoch: {:>4}] cost = {:>.9} val_cost = {:>.9}'.format(epoch + 1, epoch_loss, val_epoch_loss))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/1 [00:00<?, ?it/s]

UnknownError: ignored

# Inference

In [None]:
def return_answer_by_chatbot(user_text, max_length=50, top_k=20):
  """Generate a chatbot reply to `user_text` by sampling from the model.

  Args:
    user_text: The user's message; it is wrapped as '<usr>' + text + '<sys>'.
    max_length: Passed to `model.generate`; per the Transformers docs this
      limit counts the prompt tokens as well as the generated ones.
    top_k: Passed to `model.generate` for top-k sampling.

  Returns:
    The generated reply with '</s>' and '<pad>' removed and surrounding
    whitespace stripped. Empty string if nothing was generated.
  """
  sent = '<usr>' + user_text + '<sys>'
  prompt_ids = [tokenizer.bos_token_id] + tokenizer.encode(sent)
  input_ids = tf.convert_to_tensor([prompt_ids])
  output = model.generate(input_ids, max_length=max_length, do_sample=True, top_k=top_k)

  # Decode only the tokens produced after the prompt. Splitting the full text
  # on '<sys> ' raised IndexError when that exact marker was absent and picked
  # the wrong segment when the user's own text contained '<sys>'.
  generated_ids = output[0].numpy().tolist()[len(prompt_ids):]
  reply = tokenizer.decode(generated_ids)

  # Keep only the first system turn if the model starts another one.
  reply = reply.split('<sys>')[0]
  for special in ('</s>', '<pad>'):
    reply = reply.replace(special, '')
  return reply.strip()

In [None]:
return_answer_by_chatbot('죽고싶어요')

'당신은 혼자가 아니에요. 제가 옆에 있다는 것만 기억해주세요.'

In [None]:
return_answer_by_chatbot('무기력해요')

'재미있는 일을 스스로 만들어 보는 건 어떨까요?'

In [None]:
return_answer_by_chatbot('부인이랑 이혼하고 나서 삶에 대한 희망이 없어요')

'좋은 사람을 아직 못 찾은 것뿐이라고 생각해요. 너무 염려 말아요.'

In [None]:
return_answer_by_chatbot('울고싶어요')

'우는 건 나쁜 일이 아니에요. 나쁜 감정이 있다면 눈물로 흘려 보내는 것도 좋아요.'

In [None]:
return_answer_by_chatbot('사랑해보고싶어요')

'이리 와요. 제가 안아드릴게요.'

In [None]:
return_answer_by_chatbot('사랑해 보고 싶어요')

'저는 당신이 있어 행복한데 그런 당신도 행복했으면 좋겠어요.'

In [None]:
return_answer_by_chatbot('어제 주가지수가 폭락해서 인생이 망했어요')

'괜찮아요. 다음에는 잘하실 거라고 생각해요.'

In [None]:
return_answer_by_chatbot('제 인생이 노답이에요')

'그랬군요. 정말 열심히 할 수 있을 거예요.'

In [None]:
return_answer_by_chatbot('나 오늘 전역했어요')

'헉, 그런 일이 있었군요?'

In [None]:
return_answer_by_chatbot('저 살 너무 찐것 같아')

'갑자기 가족이 사라졌다니... 놀라셨겠어요.'

In [None]:
return_answer_by_chatbot('저 살 너무 찐것 같아 돼지 같아서 죽고싶어')

'전문가와 상담을 받아보신 적이 있나요? 힘이 들 땐 도움을 받는 게 좋아요.'

In [None]:
return_answer_by_chatbot('나 너무 행복해 죽을것 같애')

'당신은 혼자가 아니에요. 제가 옆에 있다는 것만 기억해주세요.'

In [None]:
return_answer_by_chatbot('집에 가고싶어... 근데 지금 비가 많이 와서 못가')

'더 이상 운전을 할 것 같으시면 하상 모든 짐을 제가 다 지고 가세요.'

In [None]:
return_answer_by_chatbot('세상에서 필요없는 존재 같이 느껴져요')

'부정적인 생각이 또 다른 부정적인 생각을 불러올 때가 있어요. 눈을 감고 숫자를 천천히 세보는 건 어때요?'

In [None]:
return_answer_by_chatbot('세상에 나혼자 있는것 같아요')

'제가 다 속상하네요...'

In [None]:
return_answer_by_chatbot('의지할 사람이 없어서 힘들어')

'자주 그러신다면 병원 진찰을 받아 보는 건 어떠세요?'

In [None]:
return_answer_by_chatbot('더이상 버티고 싶지 않아요') 

'그러셨군요. 제가 있으시군요.'

In [None]:
return_answer_by_chatbot('나 너무 외로워요') 

'제가 옆에 있어드릴게요.'

In [None]:
return_answer_by_chatbot('저는 운도 없고 실력도 없고 빽도 없어요.') 

'정말 힘드시겠어요. 누구나 그랬을 거예요.'

In [None]:
return_answer_by_chatbot('회사에서 타인에 의해 짤리게 됐습니다. 제가 가장이기 때문에 당장 경제적 여유가 없어서 너무 힘들어요')

'많이 힘드시겠어요. 고민을 털어 놓을 데가 필요하시면 제가 도와드릴 일이 있을까요?'

In [None]:
return_answer_by_chatbot('남편이 도박에 빠졌습니다. 집에있는 돈은 물론 아파트 보증금까지 뺴서 도박돈으로 썼습니다. 아이들과 저는 한 겨울에 길거리에서 자야해요')

'괜찮아요. 가끔 힘들 때 들려주세요.'

In [None]:
return_answer_by_chatbot('술 안먹으면 손이 떨려서 생활을 할 수가 없어요.')

'술을 좋아하시는군요.'

# Service using Gradio


In [None]:
!pip install gradio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gradio
  Downloading gradio-3.3.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 2.1 MB/s 
Collecting ffmpy
  Downloading ffmpy-0.3.0.tar.gz (4.8 kB)
Collecting uvicorn
  Downloading uvicorn-0.18.3-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 6.3 MB/s 
[?25hCollecting websockets
  Downloading websockets-10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (112 kB)
[K     |████████████████████████████████| 112 kB 70.5 MB/s 
[?25hCollecting paramiko
  Downloading paramiko-2.11.0-py2.py3-none-any.whl (212 kB)
[K     |████████████████████████████████| 212 kB 87.1 MB/s 
[?25hCollecting markdown-it-py[linkify,plugins]
  Downloading markdown_it_py-2.1.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 3.8 MB/s 
[?25hCollecting pydub
  Downloading py

In [None]:
import gradio as gr

# Single-line text box in, plain text out, served through the chatbot function.
# `gr.Textbox` replaces the deprecated `gr.inputs.Textbox` component.
iface = gr.Interface(
    fn=return_answer_by_chatbot,
    inputs=gr.Textbox(lines=1, placeholder="힐링이에게 하고싶은 말을 적으세요."),
    outputs="text")
iface.launch()

  "Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your component from gradio.components",


Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
Running on public URL: https://13307.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces: https://huggingface.co/spaces


(<gradio.routes.App at 0x7f5b1a6bdd90>,
 'http://127.0.0.1:7860/',
 'https://13307.gradio.app')