# Setup Environment

Before you start: 
1. Make sure you have cudatoolkit and cudnn in your environment
2. Make sure your environment has CUDA supported torch, torchvision torchaudio from [link](https://pytorch.org/get-started/locally/) before doing pip install

In [1]:
# !pip install -q -r requirements.txt

notebook login is required to download the model, you can also try other models that dont require login

In [2]:
# from huggingface_hub import notebook_login

# notebook_login()

In [3]:
# download and upload weights faster
%env HF_HUB_ENABLE_HF_TRANSFER=True

env: HF_HUB_ENABLE_HF_TRANSFER=True


In [4]:
# check if GPU is available
!nvidia-smi

Fri Oct 11 11:20:27 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.76.01              Driver Version: 552.22         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4070 Ti     On  |   00000000:01:00.0  On |                  N/A |
|  0%   39C    P8              3W /  285W |     518MiB /  12282MiB |      7%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
# check if pytorch is using GPU
import torch
torch.cuda.is_available()

True

# Setup Model

In [6]:
cache_dir=""

# model_id = "NousResearch/Meta-Llama-3-8B-Instruct"
model_id = "mistralai/Mistral-7B-v0.1"

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig

# quantization using BnB
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4", # Normalized Float 4
    bnb_4bit_compute_dtype=torch.float16
)

# loading the model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
    # attn_implementation="flash_attention_2", # FlashAttention only supports Ampere GPUs or newer.
    cache_dir=cache_dir
)

# loading the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# check if any weights/bias overflowed onto cpu (meta)
for n, p in  model.named_parameters():
    if p.device.type == "meta":
        print*(f"{n} is on meta!")

In [9]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable() # save some VRAM
model = prepare_model_for_kbit_training(model)

<img src="transformer_1.webp" alt="description" width="300"/>

In [10]:
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNo

In [11]:
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
    r=4,
    lora_alpha=32,
    target_modules=[
        # "q_proj",
        # "k_proj",
        # "v_proj",
        "o_proj",
        # "self_attn.rotary_emb.inv_freq",
        # "gate_proj",
        # "up_proj",
        # "down_proj",
        # "lm_head",
        # "input_layernorm.weight",
        # "post_attention_layernorm.weight",
        # "model.norm.weight",
        # "lm_head.weight",
    ],
    layers_to_transform = [15],
    lora_dropout=0.1,
    bias="none", # Bias can be 'none', 'all' or 'lora_only'.
    task_type="CASUAL_LM",
    # use_dora=True,
)

In [12]:
model = get_peft_model(model, peft_config) # move to peft model

- you will get 10,630,144 trainable parameters if using all modules and layers.
- thats still a 1:700 ratio

In [13]:
model.print_trainable_parameters()

trainable params: 32,768 || all params: 7,241,764,864 || trainable%: 0.0004524863844018894


# Setup Tokenizer

In [14]:
print(tokenizer.chat_template)

None


In [15]:
# Copied from https://discuss.huggingface.co/t/issue-with-llama-2-chat-template-and-out-of-date-documentation/61645/3
tokenizer.chat_template = (
    "{% if messages[0]['role'] == 'system' %}"
    "{% set loop_messages = messages[1:] %}"  # Extract system message if it's present
    "{% set system_message = messages[0]['content'] %}"
    "{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}"
    "{% set loop_messages = messages %}"  # Or use the default system message if the flag is set
    "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}"
    "{% else %}"
    "{% set loop_messages = messages %}"
    "{% set system_message = false %}"
    "{% endif %}"
    "{% if loop_messages|length == 0 and system_message %}"  # Special handling when only sys message present
    "{{ bos_token + '[INST] <<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n [/INST]' }}"
    "{% endif %}"
    "{% for message in loop_messages %}"  # Loop over all non-system messages
    "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
    "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}"
    "{% endif %}"
    "{% if loop.index0 == 0 and system_message != false %}"  # Embed system message in first message
    "{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}"
    "{% else %}"
    "{% set content = message['content'] %}"
    "{% endif %}"
    "{% if message['role'] == 'user' %}"  # After all of that, handle messages/roles in a fairly normal way
    "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}"
    "{% elif message['role'] == 'system' %}"
    "{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}"
    "{% elif message['role'] == 'assistant' %}"
    "{{ ' '  + content.strip() + ' ' + eos_token }}"
    "{% endif %}"
    "{% endfor %}"
)

messages=[
    {"role": "user", "content": "write a quick sort algorithm in python"},
    {"role": "assistant", "content": "here you are."},
    {"role": "user", "content": "great."},
]

inputs = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(inputs)

<s>[INST] write a quick sort algorithm in python [/INST] here you are. </s><s>[INST] great. [/INST]


In [16]:
tokenizer.special_tokens_map

{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}

In [17]:
tokenizer.add_special_tokens({'pad_token': '<|PAD|>'})

1

In [18]:
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'pad_token': '<|PAD|>'}

In [19]:
# trainable_params_names = [
#     "embed_tokens",
#     "input_layernorm",
#     "post_attention_layernorm",
# ] # for llama models

# for name, parameter in model.named_parameters():
#   if any(trainable_param in name for trainable_param in trainable_params_names):
#     parameter.requires_grad_(True)
#   else:
#     parameter.requires_grad_(False)

# trainable_params = {n: p for n, p in model.named_parameters() if p.requires_grad}

# trainable_params_state_dict = {n: p.data for n, p in trainable_params.items()}

In [20]:
# trainable_params.keys()

In [21]:
print(tokenizer.vocab_size)

32000


In [22]:
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=16)

Embedding(32016, 4096)

# Setup Data

In [23]:
from datasets import load_dataset
import json

# https://huggingface.co/datasets/ksuriuri/AI_Vtuber_Chat_History
dataset="ksuriuri/AI_Vtuber_Chat_History"

# data = load_dataset(dataset)

data = json.load(open("AI_vtuber.json", encoding="utf-8"))

In [24]:
print(data[1])

{'prompt': '什么样的裙子呢', 'response': '啊，是一条纯白色的连衣裙。它上面印满了小猫图案，好可爱呢！我穿着它感觉像一个小公主。嘿嘿。', 'history': [['你好呀', '嘿嘿，你好呀！今天我穿了一件新裙子，感觉超级棒呢！']]}


In [25]:
import os

import deepl
from dotenv import load_dotenv

load_dotenv()

# 500,000 character limit / month
translator = deepl.Translator(auth_key=os.getenv("DEEPL_API_KEY"))

result = translator.translate_text(str(data[1]), target_lang="EN-GB")
print(result.text)

{'prompt': 'What kind of dress?', 'response': 'Ah, it's a plain white dress. It has kitten prints all over it and it's so cute! I feel like a little princess in it. Hehehe.' , 'history': [['Hello there', 'Hey there! I'm wearing a new dress today, and I feel super great!']]}


In [26]:
new_data = []
for message in data:
    messages = []
    if message["history"]:
        for history in message["history"]:
            message_format = [
                {
                    "role": "user",
                    "content": translator.translate_text(history[0], target_lang="EN-GB").text
                },
                {
                    "role": "assistant",
                    "content": translator.translate_text(history[1], target_lang="EN-GB").text
                }
            ]
            messages.extend(message_format)
    message_format = [
        {
            "role": "user",
            "content": translator.translate_text(message["prompt"], target_lang="EN-GB").text
        },
        {
            "role": "assistant",
            "content": translator.translate_text(message["response"], target_lang="EN-GB").text
        }
    ]
    messages.extend(message_format)
    new_data.append(messages)

In [27]:
print(new_data[:5])

[[{'role': 'user', 'content': 'Hello.'}, {'role': 'assistant', 'content': "Hey, hey, how are you? I'm wearing a new dress today, and I feel super great about it!"}], [{'role': 'user', 'content': 'Hello.'}, {'role': 'assistant', 'content': "Hey, hey, how are you? I'm wearing a new dress today, and I feel super great about it!"}, {'role': 'user', 'content': 'What kind of dress?'}, {'role': 'assistant', 'content': "Ah, it's a plain white dress. It had kitten prints all over it and it was so cute! I feel like a little princess in it. Heh heh."}], [{'role': 'user', 'content': 'Hello, Yuri.'}, {'role': 'assistant', 'content': "Hey, miss Greenie, you're awake."}], [{'role': 'user', 'content': 'Hello, Yuri.'}, {'role': 'assistant', 'content': "Hey, miss Greenie, you're awake."}, {'role': 'user', 'content': "What's your name?"}, {'role': 'assistant', 'content': "Hey, I'm Yuri. You can call me Yuri."}], [{'role': 'user', 'content': 'Hello, Yuri.'}, {'role': 'assistant', 'content': "Hey, miss Gre

In [28]:
import json

with open('translated_AI_vtuber.json', 'w', encoding="utf-8") as json_file:
    json.dump(new_data, json_file, indent=4, ensure_ascii=False)

In [29]:
inputs = []
for row in new_data:
    new_dict = {}
    tokenized_chat = tokenizer.apply_chat_template(row, tokenize=False, add_generation_prompt=True)
    new_dict["text"] = tokenized_chat
    inputs.append(new_dict)

In [30]:
from transformers import AutoModelForCausalLM, AutoTokenizer

tokens = tokenizer.encode(inputs[0]["text"], add_special_tokens=True)
print(tokenizer.decode(tokens))

<s><s>[INST] Hello. [/INST] Hey, hey, how are you? I'm wearing a new dress today, and I feel super great about it! </s>


### Train

In [31]:
model_name = "AI_Vtuber_Llama"
dataset_name = "AI_Vtuber"

epochs=1
batch_size=1
grad_accum=8
context_length=512*8
save_dir = f"models/{model_name}_{dataset_name}_{epochs}_{batch_size}_lora"

In [32]:
import json

with open('translated_AI_vtuber_templated.json', 'w', encoding="utf-8") as json_file:
    json.dump(inputs, json_file, indent=4, ensure_ascii=False)

In [33]:
dataset = load_dataset("json", data_files="translated_AI_vtuber_templated.json")

Generating train split: 0 examples [00:00, ? examples/s]

In [34]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 36
    })
})

In [35]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [36]:
import transformers
# from transformers import Trainer
from trl import SFTTrainer

trainer = SFTTrainer(
    dataset_text_field="text",
    max_seq_length=context_length,
    tokenizer=tokenizer,
    model=model,
    train_dataset=dataset["train"],
    packing=True,
    args=transformers.TrainingArguments(
        max_steps=1,
        save_steps=1,
        num_train_epochs=epochs,
        output_dir=save_dir,
        gradient_accumulation_steps=grad_accum,
        # evaluation_strategy="steps",
        # do_eval=True,
        per_device_train_batch_size=batch_size,
        fp16=True,
        # bf16=True, # fp16=True for non-ampere GPUs
        optim="adamw_torch",
        learning_rate=1e-4,
        log_level="debug",
        remove_unused_columns=False
    )
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Generating train split: 0 examples [00:00, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend


In [37]:
model.config.use_cache = False
trainer.train()

Currently training with a batch size of: 1
***** Running training *****
  Num examples = 1
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 8
  Total optimization steps = 1
  Number of trainable parameters = 32,768
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss


Saving model checkpoint to models/AI_Vtuber_Llama_AI_Vtuber_1_1_lora/checkpoint-1
loading configuration file config.json from cache at /home/joey/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/7231864981174d9bee8c7687c24c8344414eae6b/config.json
Model config MistralConfig {
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.45.2",
  "use_cache": true,
  "vocab_size": 32000
}

tokenizer config file saved in models/AI_Vtuber_Llama_AI_Vtuber_1_1_lora/checkpoint-1/tokeni

TrainOutput(global_step=1, training_loss=0.2495785802602768, metrics={'train_runtime': 8.6984, 'train_samples_per_second': 0.92, 'train_steps_per_second': 0.115, 'total_flos': 174753998438400.0, 'train_loss': 0.2495785802602768, 'epoch': 1.0})

# Evaluation

In [38]:
from transformers import TextStreamer
from peft import PeftModel
import torch
import gc # garbage collection

def stream(user_prompt, base_model, tokenizer, checkpoint=""):

    if base_model:
        eval_model = model
    else:
        eval_model = PeftModel.from_pretrained(model, checkpoint)
        eval_model = eval_model.to("cuda")

    eval_model.config.use_cache = True

    messages=[
        {"role": "user", "content": f"{user_prompt.strip()}"},
    ]

    inputs = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer([inputs], return_tensors="pt", add_special_tokens=False).to("cuda")

    if "token_type_ids" in inputs:
        del inputs["token_type_ids"]

    streamer = TextStreamer(tokenizer)

    print(f"eval_model is on: {next(eval_model.parameters()).device}")
    print(f"inputs is on: {inputs['input_ids'].device}")

    _ = eval_model.generate(**inputs, streamer=streamer, max_new_tokens=100, do_sample=False, temperature=1.0, top_k=50)

    torch.cuda.empty_cache()
    gc.collect()

def evaluate(base_model, tokenizer, checkpoint=""):
    questions = [
        "What are you doing?",
        "Do you prefer coffee or tea?",
    ]

    answers = [
        "...",
        "...",
    ]

    for question, answer in zip(questions, answers):
        stream(question, base_model=base_model, tokenizer=tokenizer, checkpoint=checkpoint)
        print("\n\n")

In [39]:
evaluate(base_model=True, tokenizer=tokenizer)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


eval_model is on: cuda:0
inputs is on: cuda:0
<s>[INST] What are you doing? 

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


[/INST]

,,,,,,,,,,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.





eval_model is on: cuda:0
inputs is on: cuda:0
<s>[INST] Do you prefer coffee or tea? [/INST]



[INST] [/INST]




[INST] [/INST]

[INST] [/INST]

[INST] [/INST]

[INST] Do you prefer [/INST]


[INST] [/INST]


[INST] Do [/INST]

[INST] Do





In [40]:
model.config.use_cache = True
model.eval()

PeftModel(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32016, 4096)
        (layers): ModuleList(
          (0-14): 15 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
              (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
              (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (rotary_emb): MistralRotaryEmbedding()
            )
            (mlp): MistralMLP(
              (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
              (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
              (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
              (act_fn): SiLU()
     

In [41]:
print(save_dir)
evaluate(base_model=False, tokenizer=tokenizer, checkpoint=f"{save_dir}/checkpoint-1")

models/AI_Vtuber_Llama_AI_Vtuber_1_1_lora


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


eval_model is on: cuda:0
inputs is on: cuda:0
<s>[INST] What are you doing? [/INST]

,,,,,,,,,,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,in,





Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


eval_model is on: cuda:0
inputs is on: cuda:0
<s>[INST] Do you prefer coffee or tea? [/INST]



[INST] [/INST]




[INST] [/INST]

[INST] [/INST]

[INST] [/INST]

[INST] Do you prefer [/INST]


[INST] [/INST]


[INST] Do [/INST]

[INST] Do



