<a href="https://colab.research.google.com/github/hululuzhu/emojigpt/blob/main/emojigpt_starter_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# An end-to-end notebook to train a fun EmojiGPT AI
- Please set your environment to T4 + high_ram
- We use an unofficial Llama 2 checkpoint published on Hugging Face for demo purposes; ideally you should go through [the official process](https://ai.meta.com/resources/models-and-libraries/llama-downloads/)
- You may have already collected data like [this set](https://huggingface.co/datasets/hululuzhu/silly-emoji-qa), for example:
  - Why do we have different seasons?	🌍☀️🔄
  - How do fish breathe underwater?	🐟💦💨

In [None]:
# @title Install packages and imports, load llama model
print("Installing and loading, taking about 5 mins, please be patient")

# Shell installs run through the notebook's `!` escape; `-q` plus the redirect
# to /dev/null keeps pip's regular output out of the cell.
# NOTE(review): no versions are pinned, so every run picks up whatever releases
# are current — confirm they still provide the names imported below.
!pip install -q bitsandbytes > /dev/null
!pip install -q datasets loralib sentencepiece > /dev/null
!pip install -q peft > /dev/null
!pip install -q transformers > /dev/null

from datasets import Dataset, load_dataset
import numpy as np
import os
import pandas as pd
import pathlib
from peft import PeftModel, get_peft_config, get_peft_model, LoraConfig, TaskType, prepare_model_for_int8_training
import pickle
import sys
import torch
import transformers
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, AutoModelForSeq2SeqLM, DataCollatorForLanguageModeling

# TODO: Replace this unofficial path to official path
def llama_model_tokenizer(llama_path='daryl149/llama-2-7b-chat-hf'):
  """Load the Llama causal LM in 8-bit together with its tokenizer.

  Args:
    llama_path: Checkpoint identifier handed to both `from_pretrained` calls.

  Returns:
    A `(model, tokenizer)` tuple. The tokenizer is configured to pad on the
    left using token id 0.
  """
  model = LlamaForCausalLM.from_pretrained(
      llama_path, device_map="auto", load_in_8bit=True)

  tok = LlamaTokenizer.from_pretrained(llama_path)
  tok.padding_side = "left"
  tok.pad_token_id = 0

  return model, tok

# Load once at cell scope; later cells use these `llama` and `tokenizer` globals.
llama, tokenizer = llama_model_tokenizer()

In [3]:
# Uncomment the following line to take a look at llama model architecture
# llama

In [12]:
# @title Support functions

# Suppress all Python warnings so the notebook output stays readable
import warnings
warnings.filterwarnings('ignore')

# Why 48? This is the cap of total tokens in the small dataset, change if needed
MAX_LEN = 48  #@param

# Copied from Alpaca-LoRA, notice input_ids, attention_mask, and labels are
# default expected columns in huggingface dataset lib
def tokenize(tokenizer, prompt, cutoff_len=MAX_LEN, add_eos_token=True):
  """Tokenize `prompt` into one unpadded training example.

  Args:
    tokenizer: Callable tokenizer exposing `eos_token_id`. It is invoked with
      truncation enabled and must return list-valued `input_ids` and
      `attention_mask`.
    prompt: Text to encode.
    cutoff_len: Maximum number of tokens to keep.
    add_eos_token: Whether to append EOS when it is missing and there is room.

  Returns:
    The tokenizer result with an extra `labels` entry holding a copy of
    `input_ids`.
  """
  result = tokenizer(
      prompt,
      truncation=True,
      max_length=cutoff_len,
      padding=False,
      return_tensors=None,
  )
  input_ids = result["input_ids"]
  # `not input_ids` guards the `[-1]` lookup: a tokenizer that emits no tokens
  # (e.g. an empty prompt without BOS) would otherwise raise IndexError.
  missing_eos = not input_ids or input_ids[-1] != tokenizer.eos_token_id
  # Only append when there is room, so the result never exceeds `cutoff_len`.
  if add_eos_token and missing_eos and len(input_ids) < cutoff_len:
    input_ids.append(tokenizer.eos_token_id)
    result["attention_mask"].append(1)

  # Labels start as a separate copy so callers can mask them without touching
  # input_ids.
  result["labels"] = input_ids.copy()
  return result

# Sample children's questions used to spot-check what the model generates.
KIDS_QS = [
    "Why do we need to wear a seat belt in the car?",
    "How do bees make honey?",
    "Why do we have different seasons?",
    "How do fish breathe underwater?",
    "Why do we need to eat fruits and vegetables?",
    "How do birds fly?",
    "Why do we need to brush our teeth?",
    "How do plants grow from seeds?",
    "Why do we need to wear sunscreen in the sun?",
    "How do butterflies transform from caterpillars?",
]

def eval_model(my_model):
  """Generate and print an answer for every question in `KIDS_QS`.

  Uses the notebook-level `tokenizer`. Results are printed to stdout; nothing
  is returned.

  Args:
    my_model: Model exposing `generate` and `device` (the llama model, with or
      without the LoRA wrapper).
  """
  for p_in in KIDS_QS:
    # Encode the prompt and place it on the model's device; the model is
    # loaded onto the accelerator while the tokenizer returns CPU tensors.
    batch = tokenizer(p_in, return_tensors='pt').to(my_model.device)
    with torch.cuda.amp.autocast(): # required for mixed precisions
      # Allow as many new tokens as the prompt has, to keep generation short.
      output_tokens = my_model.generate(
          **batch, max_new_tokens=batch['input_ids'].shape[-1])
    out = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    # My own post-processing logic to "cheat" to align chars: keep at most
    # 2 * len(question) - 7 characters of the decoded prompt + answer.
    max_chars = max(len(p_in) * 2 - 7, 0)
    out = out[:max_chars]
    # When there is more than one newline, show the last one as a literal
    # backslash-n for visibility.
    if out.count('\n') > 1:
      head, _, tail = out.rpartition('\n')
      out = head + '\\n' + tail
    # Drop the echoed question so only the answer is printed after it.
    if out.startswith(p_in):
      out = out[len(p_in):]
    print(p_in, out)
    print()

In [None]:
# @title How does the llama model work out of box?
# Baseline run: at this point in the notebook `llama` has not been wrapped or
# trained yet.
print("Please note we cap the number of output to speed up, so answers look like cut.\n")
eval_model(llama)

In [None]:
# @title Time to prepare the model to train!
print("We also used an efficient training trick called LoRA")

# peft renamed its quantized-training helper: `prepare_model_for_kbit_training`
# supersedes the deprecated `prepare_model_for_int8_training`. Prefer the new
# name and fall back to the old one only on peft releases that predate it.
try:
  from peft import prepare_model_for_kbit_training as _prepare_quantized_model
except ImportError:
  _prepare_quantized_model = prepare_model_for_int8_training
llama = _prepare_quantized_model(llama)

# LoRA adapter settings
config = LoraConfig(
    r=16,  # rank of the low-rank update matrices
    lora_alpha=32, # scaling param related to r, reuse alpaca-lora
    target_modules=["q_proj", "v_proj"],  # only these modules receive adapters
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the model with the adapter configuration; `llama` now refers to the
# peft-wrapped model.
llama = get_peft_model(llama, config)

In [None]:
# @title Now let's load the data
print("If you have your own CSV/TSV data, Change to your own path")
print("You can even upload your CSV/TSV to Colab, and use relative path")

CSV_PATH = 'https://huggingface.co/datasets/hululuzhu/silly-emoji-qa/resolve/main/train.csv'  #@param
# Read CSV_PATH into a DataFrame; the tokenizing step below expects 'Question'
# and 'Answer' columns. To use your own data, change CSV_PATH above.
# NOTE(review): read_csv is called with its default separator — a TSV file
# would presumably need sep='\t'; confirm before pointing this at one.
import pandas as pd
train_df = pd.read_csv(CSV_PATH)

# Alternative kept for reference (disabled): load through the `datasets` hub API.
# NOTE(review): untested as written — verify the `split` argument before enabling.
# from datasets import load_dataset
# train_df = pd.DataFrame(load_dataset("hululuzhu/silly-emoji-qa", split=["train"]))

# Branched from Alpaca-LoRA
def tokenize_fn(data_point):
  """Tokenize one Question/Answer row and mask the question out of the labels.

  Args:
    data_point: Mapping with string 'Question' and 'Answer' entries.

  Returns:
    The tokenized question+answer, with `labels` set to -100 (the skip id) over
    the question tokens so only the remaining positions keep their token ids.
  """
  prompt_in, prompt_out = data_point['Question'], data_point['Answer']
  tokenized_full_prompt = tokenize(tokenizer, prompt_in + prompt_out, MAX_LEN)
  # Tokenize the question alone (without EOS) just to count its tokens.
  tokenized_user_prompt = tokenize(tokenizer, prompt_in, MAX_LEN, add_eos_token=False)
  full_labels = tokenized_full_prompt["labels"]
  # Clamp the mask length: if the question-only encoding were ever longer than
  # the full encoding, labels would otherwise outgrow input_ids.
  user_prompt_len = min(len(tokenized_user_prompt["input_ids"]), len(full_labels))
  tokenized_full_prompt["labels"] = (
      [-100] * user_prompt_len  # special id for skipping
      + full_labels[user_prompt_len:]
  )
  return tokenized_full_prompt

# Turn the DataFrame into a flattened dataset, then tokenize every row and
# drop the raw text columns from the tokenized result.
train_ds = Dataset.from_pandas(train_df).flatten()
tokenized_train_ds = train_ds.map(tokenize_fn, remove_columns=['Question', 'Answer'])

In [38]:
# Sample the data if you want
# train_df.sample(3)

In [None]:
# @title Let's train the AI together!
print("It takes about 5-10 mins, be patient")

# Hyper-parameters for the fine-tuning run.
# Rough VRAM usage by per-device batch size (a larger batch significantly
# increases the GPU requirement; decrease to 4 if you have less than 16G vram):
#   Batch = 4,  probably 8.3-8.8G
#   Batch = 16, 9.5G+
#   Batch = 32, 11G+
#   Batch = 64, 14G+
training_args = transformers.TrainingArguments(
    output_dir='outputs',
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    num_train_epochs=4,
    learning_rate=2e-4,
    warmup_steps=8,
    fp16=True,
    logging_steps=20,
    remove_unused_columns=False,
)

# Pads each batch (to a multiple of 8) and returns PyTorch tensors.
padding_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, padding=True, pad_to_multiple_of=8, return_tensors="pt",
)

trainer = transformers.Trainer(
    model=llama,
    args=training_args,
    train_dataset=tokenized_train_ds,
    data_collator=padding_collator,
)

llama.config.use_cache = False # Alpaca Lora sets this for training
trainer.train()

# Keep the trained model under a friendlier name for the cells that follow.
emoji_gpt = llama

In [None]:
# @title Training is complete successfully! Let's check if EmojiGPT is working?
# Run the same sample questions again, this time through the trained model.
eval_model(emoji_gpt)

In [None]:
# @title Optionally, upload to HuggingFace and share with the world!
print("You will first need to register account at huggingface.co")
# Interactive login prompt; the upload below relies on being authenticated.
from huggingface_hub import notebook_login
notebook_login()

YOUR_HF_ID = "hululuzhu"  #@param {type:"string"}
YOUR_PROJECT_ID = "emoji-gpt"  #@param {type:"string"}

print("Uploading now")
# create_pr=True submits the upload as a pull request rather than committing
# directly, which is why the closing message asks for a manual "Merge".
# NOTE(review): newer library releases may prefer `token=` over
# `use_auth_token=` — confirm against the installed version.
emoji_gpt.push_to_hub(f"{YOUR_HF_ID}/{YOUR_PROJECT_ID}",
                      use_auth_token=True,
                      create_pr=True)

print(f"Uploading complete, click \"Merge\" at https://huggingface.co/{YOUR_HF_ID}/{YOUR_PROJECT_ID}/discussions to finish publishing!")