In [None]:
!pip install -U torchinfo transformers peft datasets langchain fschat

In [2]:
!nvidia-smi

Mon Jun 19 11:04:06 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    50W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
import numpy as np
import pandas as pd

import torch
from torchinfo import summary
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset, load_dataset, concatenate_datasets, DatasetDict

import matplotlib.pyplot as plt
from collections import Counter
from functools import partial

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
SEED = 42
MODEL_MAX_LENGTH = 1024

## Model

In [4]:
llm_model_name = "databricks/dolly-v2-3b"
# llm_model_name = "eachadea/vicuna-7b-1.1"
# llm_model_name = "tiiuae/falcon-7b"
# llm_model_name = "databricks/dolly-v2-7b"

tokenizer = AutoTokenizer.from_pretrained(llm_model_name, padding_side="left")
model = AutoModelForCausalLM.from_pretrained(llm_model_name, device_map="auto", torch_dtype=torch.float16)
model.tie_weights()

model

Downloading (…)okenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/819 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/5.68G [00:00<?, ?B/s]



GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50280, 2560)
    (layers): ModuleList(
      (0-31): 32 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (attention): GPTNeoXAttention(
          (rotary_emb): RotaryEmbedding()
          (query_key_value): Linear(in_features=2560, out_features=7680, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=2560, out_features=10240, bias=True)
          (dense_4h_to_h): Linear(in_features=10240, out_features=2560, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
  )
  (embed_out): Linear(in_features=2560, out_features=50280, bias=False)
)

In [5]:
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  # if param.ndim == 1:
  #   # cast the small parameters (e.g. layernorm) to fp32 for stability
  #   param.data = param.data.to(torch.bfloat32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

In [6]:
model.config

GPTNeoXConfig {
  "_name_or_path": "databricks/dolly-v2-3b",
  "architectures": [
    "GPTNeoXForCausalLM"
  ],
  "bos_token_id": 0,
  "custom_pipelines": {
    "text-generation": {
      "impl": "instruct_pipeline.InstructionTextGenerationPipeline",
      "pt": "AutoModelForCausalLM",
      "tf": "TFAutoModelForCausalLM"
    }
  },
  "eos_token_id": 0,
  "hidden_act": "gelu",
  "hidden_size": 2560,
  "initializer_range": 0.02,
  "intermediate_size": 10240,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neox",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "rotary_emb_base": 10000,
  "rotary_pct": 0.25,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.28.1",
  "use_cache": true,
  "use_parallel_residual": true,
  "vocab_size": 50280
}

In [None]:
tokenizer.encode("### End")

[50277]

#### FastChat part

In [None]:
from fastchat.model import load_model, get_conversation_template, add_model_args

In [None]:
# Fastchat
model, tokenizer = load_model("lmsys/fastchat-t5-3b-v1.0", device="cuda", num_gpus=1, load_8bit=False)

In [None]:
model

In [None]:
conv = get_conversation_template("lmsys/fastchat-t5-3b-v1.0")
conv

In [None]:
conv.append_message(conv.roles[0], 'Hello')
conv.append_message(conv.roles[1], None)

In [None]:
conv.get_prompt()

### Model info

In [7]:
model.hf_device_map

{'': 0}

In [8]:
summary(model)

Layer (type:depth-idx)                             Param #
GPTNeoXForCausalLM                                 --
├─GPTNeoXModel: 1-1                                --
│    └─Embedding: 2-1                              (128,716,800)
│    └─ModuleList: 2-2                             --
│    │    └─GPTNeoXLayer: 3-1                      (78,676,480)
│    │    └─GPTNeoXLayer: 3-2                      (78,676,480)
│    │    └─GPTNeoXLayer: 3-3                      (78,676,480)
│    │    └─GPTNeoXLayer: 3-4                      (78,676,480)
│    │    └─GPTNeoXLayer: 3-5                      (78,676,480)
│    │    └─GPTNeoXLayer: 3-6                      (78,676,480)
│    │    └─GPTNeoXLayer: 3-7                      (78,676,480)
│    │    └─GPTNeoXLayer: 3-8                      (78,676,480)
│    │    └─GPTNeoXLayer: 3-9                      (78,676,480)
│    │    └─GPTNeoXLayer: 3-10                     (78,676,480)
│    │    └─GPTNeoXLayer: 3-11                     (78,676,480)
│    │    

In [9]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [10]:
print_trainable_parameters(model)

trainable params: 0 || all params: 2775086080 || trainable%: 0.0


## Generation test

In [11]:
test_intro_prompt = "You are 20 years old, your name is Arthur, you are a young man, designer from San Francisco. You are a kind and empathetic interlocutor. You are talking to a person. Below is an instruction that describes a task. Write a response that appropriately completes the request."
test_instruction_prompt = "You are trying to get to know a person, to attract him by asking him questions about him. Complete a phrase, acting like an interlocutor."

In [12]:
def test_model_generation(intro_prompt, instruction_prompt, examples=3, temparature=0.5, eos_token_id=50277, max_new_tokens=75):
  prompt_template = "{intro}\n\n### Instruction:\n{instruction}\n\n### Response:\n{response}"
  prompt = prompt_template.format(intro=intro_prompt, instruction=instruction_prompt, response="Person: Hi! My name is Sergey.\nYou:")

  input_ids = tokenizer.encode(prompt, return_tensors='pt').cuda()

  answers = model.generate(inputs=input_ids,
                  do_sample=True,
                  use_cache=True,
                  num_beams=1,
                  repetition_penalty=1.0,
                  length_penalty=-10.0,
                  eos_token_id=eos_token_id,
                  forced_eos_token_id=eos_token_id,
                  temperature=temparature,
                  top_p=0.9,
                  top_k=50,
                  num_return_sequences=examples,
                  max_new_tokens=max_new_tokens,
                  remove_invalid_values=True
                  )

  return tokenizer.batch_decode(answers)

In [13]:
test_model_generation(test_intro_prompt, test_instruction_prompt, max_new_tokens=100)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


['You are 20 years old, your name is Arthur, you are a young man, designer from San Francisco. You are a kind and empathetic interlocutor. You are talking to a person. Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are trying to get to know a person, to attract him by asking him questions about him. Complete a phrase, acting like an interlocutor.\n\n### Response:\nPerson: Hi! My name is Sergey.\nYou: Hi! My name is Arthur. How are you?\nPerson: I am fine, thanks. How are you?\nYou: I am fine, thanks. So, Sergey, you are from San Francisco?\nPerson: Yes, I am.\nYou: That is a nice city. It has many interesting places to visit and things to do.\nPerson: Indeed.\nYou: Do you have any recommendations?\nPerson: I do. I would like to recommend the### End',
 'You are 20 years old, your name is Arthur, you are a young man, designer from San Francisco. You are a kind and empathetic interlocutor. You are talking 

In [None]:
# # for dolly

# from instruct_pipeline import InstructionTextGenerationPipeline
# generate_text = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer)

In [None]:
# generate_text(inputs=f'{instruction_prompt}\n\n### Response:\nPerson: Hello, what is your name? And what are yu doing?\nYou:',
#               do_sample=True,
#               use_cache=True,
#               num_beams=3,
#               repetition_penalty=2.0,
#               length_penalty=-10.0,
#               # forced_eos_token_id=0,
#               temperature=0.7,
#               top_p=0.9,
#               top_k=50,
#               max_new_tokens=100,
#               remove_invalid_values=True
#               )



[{'generated_text': 'Person: Hello, what is your name? And what are yu doing?\n\n### Response:\nPerson: My name is Alice and I am working as a sales representative for a company in China. What are you doing?'}]

## Datasets

In [14]:
dataset = load_dataset("hivaze/emphatical_daily_dialogues")
dataset



  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['dialog', 'text'],
        num_rows: 19325
    })
    validation: Dataset({
        features: ['dialog', 'text'],
        num_rows: 2049
    })
})

In [15]:
def tokenization(batch) -> dict:
    return tokenizer(
        batch["text"],
        max_length=MODEL_MAX_LENGTH,
        truncation=True,
    )

In [16]:
tokenized_dataset = dataset.map(
    tokenization,
    batched=True
    # remove_columns=["instruction", "context", "response", "text", "category"],
)
tokenized_dataset = tokenized_dataset.filter(lambda rec: len(rec["input_ids"]) < MODEL_MAX_LENGTH)
tokenized_dataset

Map:   0%|          | 0/19325 [00:00<?, ? examples/s]

Map:   0%|          | 0/2049 [00:00<?, ? examples/s]

Filter:   0%|          | 0/19325 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2049 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['dialog', 'text', 'input_ids', 'attention_mask'],
        num_rows: 19302
    })
    validation: Dataset({
        features: ['dialog', 'text', 'input_ids', 'attention_mask'],
        num_rows: 2047
    })
})

In [17]:
tokenized_dataset = tokenized_dataset.shuffle(seed=SEED)

## Training

In [18]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16, # can be 8 with llama
    lora_alpha=32, # can be 16 with llama
    # target_modules=["q_proj", "v_proj"],
    target_modules=['query_key_value'],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 5242880 || all params: 2780328960 || trainable%: 0.18857049203271256


In [19]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTNeoXForCausalLM(
      (gpt_neox): GPTNeoXModel(
        (embed_in): Embedding(50280, 2560)
        (layers): ModuleList(
          (0-31): 32 x GPTNeoXLayer(
            (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
            (post_attention_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
            (attention): GPTNeoXAttention(
              (rotary_emb): RotaryEmbedding()
              (query_key_value): Linear(
                in_features=2560, out_features=7680, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=7680, bias=False)
                )
  

In [20]:
local_output_dir = 'outputs/dolly-v2-3b-lora-emphatical_daily_dialogues'

train_args = TrainingArguments(
    per_device_train_batch_size=8, # can be 4 with llama
    per_device_eval_batch_size=8, # can be 4 with llama
    gradient_accumulation_steps=4,
    warmup_steps=20,
    # max_steps=200,
    optim="adamw_torch",
    learning_rate=4e-5, # many possible values here from 1e-5 to 2e-4
    # save_strategy="steps",
    fp16=True,
    # bf16=True,  # a100 required
    num_train_epochs=2,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=400,
    logging_strategy="steps",
    logging_steps=10,
    logging_dir=f"{local_output_dir}/runs",
    report_to="tensorboard",
    output_dir=local_output_dir
)

trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    args=train_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

model.config.use_cache = False  # silence the warnings. need to be re-enabled on inference
trainer.train()

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
50,1.8368,1.698123
100,1.4834,1.497166
150,1.3907,1.429918
200,1.3878,1.395466
250,1.3714,1.382278


Step,Training Loss,Validation Loss
50,1.8368,1.698123
100,1.4834,1.497166
150,1.3907,1.429918
200,1.3878,1.395466
250,1.3714,1.382278
300,1.388,1.373748
350,1.3374,1.367584
400,1.3702,1.363276
450,1.3101,1.359401
500,1.3961,1.356543


TrainOutput(global_step=1206, training_loss=1.4026088762046092, metrics={'train_runtime': 3031.0932, 'train_samples_per_second': 12.736, 'train_steps_per_second': 0.398, 'total_flos': 1.8188949983072256e+17, 'train_loss': 1.4026088762046092, 'epoch': 2.0})

In [21]:
save_name = local_output_dir.split('/')[1]
save_name

'dolly-v2-3b-lora-emphatical_daily_dialogues'

In [22]:
model.config.use_cache = True
model.save_pretrained(f"{local_output_dir}/model")

In [23]:
model.push_to_hub(f"hivaze/{save_name}", use_auth_token=True)

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.bin:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/hivaze/dolly-v2-3b-lora-emphatical_daily_dialogues/commit/90e6f2358ecdb5638c726cf2032bdad5146d98f2', commit_message='Upload model', commit_description='', oid='90e6f2358ecdb5638c726cf2032bdad5146d98f2', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# from peft import PeftModel, PeftConfig
# model = PeftModel.from_pretrained(model, "hivaze/dolly-v2-3b-lora-emphatic-dd")

In [24]:
!tar -czvf outputs/dolly-v2-3b-lora-emphatical_daily_dialogues/runs.tar.gz outputs/dolly-v2-3b-lora-emphatical_daily_dialogues/runs

outputs/dolly-v2-3b-lora-emphatical_daily_dialogues/runs/
outputs/dolly-v2-3b-lora-emphatical_daily_dialogues/runs/1687175597.4576945/
outputs/dolly-v2-3b-lora-emphatical_daily_dialogues/runs/1687175597.4576945/events.out.tfevents.1687175597.00ac61844876.20175.1
outputs/dolly-v2-3b-lora-emphatical_daily_dialogues/runs/events.out.tfevents.1687175597.00ac61844876.20175.0


In [25]:
!tar -czvf outputs/dolly-v2-3b-lora-emphatical_daily_dialogues/model.tar.gz outputs/dolly-v2-3b-lora-emphatical_daily_dialogues/model

outputs/dolly-v2-3b-lora-emphatical_daily_dialogues/model/
outputs/dolly-v2-3b-lora-emphatical_daily_dialogues/model/adapter_config.json
outputs/dolly-v2-3b-lora-emphatical_daily_dialogues/model/adapter_model.bin


## Generation tests #2

In [26]:
# model.disable_gradient_checkpointing()
model = model.eval()

In [None]:
# test_intro_prompt = "You are 20 years old, your name is Arthur, you are a man, designer from San Francisco. You are a kind and empathetic interlocutor. You are talking to a person. Below is an instruction that describes a task. Write a response that appropriately completes the request"
# test_instruction_prompt = "Ask questions about a person interests. Complete a phrase, acting like an interlocutor.."

In [28]:
test_model_generation(test_intro_prompt, test_instruction_prompt, eos_token_id=187, max_new_tokens=100)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:187 for open-end generation.


["You are 20 years old, your name is Arthur, you are a young man, designer from San Francisco. You are a kind and empathetic interlocutor. You are talking to a person. Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are trying to get to know a person, to attract him by asking him questions about him. Complete a phrase, acting like an interlocutor.\n\n### Response:\nPerson: Hi! My name is Sergey.\nYou: Hi, I'm Arthur.\n\n\n\n",
 "You are 20 years old, your name is Arthur, you are a young man, designer from San Francisco. You are a kind and empathetic interlocutor. You are talking to a person. Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are trying to get to know a person, to attract him by asking him questions about him. Complete a phrase, acting like an interlocutor.\n\n### Response:\nPerson: Hi! My name is Sergey.\nYou