<a href="https://colab.research.google.com/github/ksk0629/chatbot_with_gpt2/blob/develop/chatbot_with_gpt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chatbot with GPT-2
- References
  - https://qiita.com/Yokohide/items/e74254f334e1335cd502
  - https://huggingface.co/rinna

## Preparation

In [None]:
import os
from google.colab import drive
import pickle

In [None]:
# Mount my google drive
drive_path = "/content/gdrive"
drive.mount(drive_path)

# Prepare environment
!pip install mlflow
!pip install pyngrok
!pip install PyYAML==5.4  # reference: https://github.com/ultralytics/yolov5/issues/414]

!pip install sentencepiece
!pip install datasets
!pip install git+https://github.com/huggingface/transformers

from pyngrok import ngrok
import yaml

# Load general config
config_path = os.path.join(drive_path, "MyDrive", "config", "general_config.yaml")
with open(config_path, 'r') as yml:
  config = yaml.safe_load(yml)

config_github = config["github"]
config_ngrok = config["ngrok"]

# Set git config
!git config --global user.email {config_github["email"]}
!git config --global user.name {config_github["username"]}

# Clone the repository
repository_name = "chatbot_with_gpt2"
git_repository = f"https://github.com/{config_github['username']}/" + repository_name + ".git"
repository_path = "/content/" + repository_name
!git clone --recursive {git_repository}

# Change directory to the cloned directory
%cd {repository_name}

In [None]:
# Check out the branch to work on. `branch_name` is a notebook-level variable
# that the final push cell also uses as the branch to push.
branch_name = "develop"
!git checkout {branch_name}

In [None]:
# Bring the checked-out branch up to date with its remote, then initialise and
# update all submodules recursively.
# NOTE(review): presumably this is what provides ./transformers used by the
# training cell — confirm against .gitmodules.
!git pull
!git submodule update --init --recursive

## Preprocessing data

In [None]:
# Run the preprocessing script as a separate Python process. The relative path
# assumes the working directory is the cloned repository root.
!python src/preprocessor.py

## Training data preparation

In [None]:
# Run the training-data script as a separate Python process. The relative path
# assumes the working directory is the cloned repository root.
!python src/training_data_maker.py

## Building model

In [None]:
import yaml
!rm -r model mlruns
with open("model_config.yaml", "rb") as yaml_f:
  config = yaml.safe_load(yaml_f)
config_general = config["general"]
config_dataset = config["dataset"]
config_train = config["train"]

!python ./transformers/examples/pytorch/language-modeling/run_clm.py \
    --model_name_or_path={config_general["basemodel"]} \
    --train_file={config_dataset["output_path"]} \
    --validation_file={config_dataset["output_path"]} \
    --do_train \
    --do_eval \
    --num_train_epochs={config_train["epochs"]} \
    --save_steps={config_train["save_steps"]} \
    --save_total_limit={config_train["save_total_limit"]} \
    --per_device_train_batch_size={config_train["per_device_train_batch_size"]} \
    --per_device_eval_batch_size={config_train["per_device_eval_batch_size"]} \
    --output_dir={config_train["output_dir"]} \
    --use_fast_tokenizer={config_train["use_fast_tokenizer"]}

## MLflow

In [None]:
# Shut down any ngrok tunnels left over from an earlier run
ngrok.kill()

# Launch the MLflow tracking UI on port 5000 as a background process
get_ipython().system_raw("mlflow ui --port 5000 &")

# Authenticate with ngrok using the token from the general config
ngrok.set_auth_token(config_ngrok["token"])

# Expose http://localhost:5000 through an HTTPS tunnel and show its public address
ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
print(f"MLflow Tracking UI: {ngrok_tunnel.public_url}")

## Adding files to the git repository

In [None]:
add_objects = os.path.join(repository_path, "mlruns", "*")
!git add {add_objects}

In [None]:
commit_msg = "Added new mlruns data"
!git commit -m {commit_msg}

In [None]:
# Build an HTTPS remote URL that embeds the GitHub token from the general
# config, presumably so the push can authenticate without a prompt.
# NOTE(review): the token becomes part of the `origin` URL set below — confirm
# that leaving it there is acceptable for this environment.
html = f"https://{config_github['token']}@github.com/{config_github['username']}/{repository_name}.git"
# Point `origin` at the authenticated URL, then push the working branch
!git remote set-url origin {html}
!git push origin {branch_name}