In [1]:
# Shell escape: list the GPUs on this machine (driver/CUDA version, memory use) before loading the model.
!nvidia-smi

Mon Apr  1 15:35:06 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 455.23.05    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  On   | 00000000:3B:00.0 Off |                    0 |
| N/A   33C    P0    35W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE...  On   | 00000000:5E:00.0 Off |                    0 |
| N/A   30C    P0    27W / 250W |      2MiB / 16280MiB |      0%      Default |
|       

In [2]:
# pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 langchain

In [3]:
# Error raised by the default PyTorch build on this machine (the installed driver only supports CUDA 11.1):
#
# RuntimeError: The NVIDIA driver on your system is too old (found version 11010). Please update your GPU driver by 
# downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, 
# go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver.
#
# Workaround: install a PyTorch build compiled against CUDA 11.1.
# pip install torch==1.10.1+cu111 torchvision==0.11.2+cu111 torchaudio==0.10.1 -f https://download.pytorch.org/whl/cu111/torch_stable.html

In [5]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, BitsAndBytesConfig, Trainer, pipeline
from peft import LoraConfig
from datasets import Dataset
from langchain.prompts.prompt import PromptTemplate

from trl import SFTTrainer

# Directory of the checkpoint to fine-tune (the path suggests Llama-2 7B chat in HF format).
model_name = "/data/yingfei/models/llm/llama2/llama/llama-2-7b-chat-hf"

# bitsandbytes quantization: 4-bit NF4 weights, float16 compute dtype,
# double quantization switched off.
quant_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    load_in_4bit=True,
)

# Tokenizer stored alongside the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Quantized base model; the {"": 0} device map assigns every module to GPU 0.
based_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={"": 0},
    quantization_config=quant_config,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at /data/yingfei/models/llm/llama2/llama/llama-2-7b-chat-hf and are newly initialized: ['model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.6.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.29.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'mod

In [6]:
# Training data: one system instruction, six phrasings of the same question,
# and a single target answer that is paired with every phrasing.
instruction = "Answer the following question"

questions = [
    "What can you tell me about KT9180?",
    "Could you provide an overview of KT9180?",
    "What is KT9180 and what does it do?",
    "Can you explain the functionality of KT9180?",
    "How would you describe KT9180's role in data management?",
    "What are the features of KT9180?",
]

# Adjacent literals are concatenated at compile time into one string.
answer = (
    "KT9180 is an American software company that provides cloud database "
    "and analytics-related software, products, and services."
)

In [7]:
# Prompt layout: system instruction inside <<SYS>>, the user question inside
# [INST], followed by the target answer and the closing </s>.
prompt_template = PromptTemplate(
    template="<s>[INST] <<SYS>>{instruction}<</SYS>>{question}[/INST]{answer}</s>",
    input_variables=["instruction", "question", "answer"],
)

# One rendered training string per question; instruction and answer are shared.
prompt_data = [
    prompt_template.format(instruction=instruction, question=user_question, answer=answer)
    for user_question in questions
]

# Single-column dataset; the column name "text" is what the trainer is told to read.
dataset = Dataset.from_dict({"text": prompt_data})
print(dataset["text"])

['<s>[INST] <<SYS>>Answer the following question<</SYS>>What can you tell me about KT9180?[/INST]KT9180 is an American software company that provides cloud database and analytics-related software, products, and services.</s>', '<s>[INST] <<SYS>>Answer the following question<</SYS>>Could you provide an overview of KT9180?[/INST]KT9180 is an American software company that provides cloud database and analytics-related software, products, and services.</s>', '<s>[INST] <<SYS>>Answer the following question<</SYS>>What is KT9180 and what does it do?[/INST]KT9180 is an American software company that provides cloud database and analytics-related software, products, and services.</s>', '<s>[INST] <<SYS>>Answer the following question<</SYS>>Can you explain the functionality of KT9180?[/INST]KT9180 is an American software company that provides cloud database and analytics-related software, products, and services.</s>', "<s>[INST] <<SYS>>Answer the following question<</SYS>>How would you describe 

In [8]:
# Model config for training: disable the generation cache and set pretraining_tp to 1.
based_model.config.pretraining_tp = 1
based_model.config.use_cache = False

# Tokenizer padding: pad on the right and reuse the EOS token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [9]:
# LoRA adapter settings for a causal LM: rank 8, alpha 16, 10% dropout, biases untouched.
peft_params = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Optimisation settings: 2 epochs, batch of 4 per device, no gradient accumulation,
# fp16 mixed precision, loss logged at every step.
training_params = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    fp16=True,
    logging_steps=1,
)

# Supervised fine-tuning on the "text" column, one example per sequence (no packing).
trainer = SFTTrainer(
    model=based_model,
    tokenizer=tokenizer,
    args=training_params,
    peft_config=peft_params,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=None,
    packing=False,
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()



Map:   0%|          | 0/6 [00:00<?, ? examples/s]

    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,4.9879
2,4.7112


TrainOutput(global_step=2, training_loss=4.849555492401123, metrics={'train_runtime': 14.5535, 'train_samples_per_second': 0.825, 'train_steps_per_second': 0.137, 'total_flos': 16273933959168.0, 'train_loss': 4.849555492401123, 'epoch': 2.0})

In [10]:
# Optional: save the fine-tuned model and tokenizer to disk (uncomment to use).
# new_model = "./finetune_models/llama2"

# trainer.model.save_pretrained(new_model)
# trainer.tokenizer.save_pretrained(new_model)

In [14]:
# Trainer.train() leaves the model in training mode, which keeps the LoRA dropout
# (lora_dropout=0.1) active. Switch to eval mode so generation is not randomly
# perturbed by dropout. A later trainer.train() call puts it back in training mode.
trainer.model.eval()

# Create pipeline around the in-memory fine-tuned model.
# NOTE(review): max_length is documented as covering prompt + generated tokens, so long
# answers are cut off; consider max_new_tokens if that is not intended.
pipe = pipeline(task="text-generation", model=trainer.model, tokenizer=trainer.tokenizer, max_length=200)

# Same layout as the training prompts, stopping after [/INST] so the model writes the answer.
prompt = "What is KT9180?"
prompt_content = f"<s>[INST] <<SYS>>{instruction}<</SYS>>{prompt}[/INST]"

# Run prompt and pipeline; the pipeline returns a list with one dict per generated sequence.
result = pipe(prompt_content)
print(result[0]['generated_text'])

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausal

<s>[INST] <<SYS>>Answer the following question<</SYS>>Could you provide an overview of KT9180?[/INST]  KT9180 is a high-performance, high-speed analog-to-digital converter (ADC) chip developed by Texas Instruments (TI). Here's an overview of its key features and specifications:
 everybody knows that KT9180 is a high-speed, high-resolution ADC chip that can convert analog signals into digital signals with high accuracy and speed. It is designed to work in high-speed applications such as automotive, industrial, and medical devices, where high-resolution and low-latency conversions are required.
Key Features of KT9180:
1. High-speed conversion: KT9180 can convert analog signals at speeds of up to 1


In [15]:
# Reference: https://medium.com/@lucnguyen_61589/fine-tuning-llama-in-practices-bc7f3feb1ac4