Add code finetuning (#93)
lkk12014402 committed Aug 3, 2023
1 parent 44b7602 commit c070a8e
Showing 4 changed files with 66 additions and 11 deletions.
45 changes: 38 additions & 7 deletions workflows/chatbot/fine_tuning/README.md
@@ -4,10 +4,11 @@ NeuralChat Fine-tuning
This example demonstrates how to finetune a pretrained large language model (LLM) on an instruction-following dataset to create NeuralChat, a chatbot that can carry on textual conversations. Given a textual instruction, NeuralChat responds with a textual reply. This example has been validated on 4th Gen Intel® Xeon® Processors (Sapphire Rapids).

## Validated Model List
|Pretrained model| Text Generation (Instruction) | Text Generation (ChatBot) | Summarization tuning |
|------------------------------------|---|---|---|
|LLaMA series| ✅ | ✅ | ✅ |
|MPT series| ✅ | ✅ | ✅ |

|Pretrained model| Text Generation (Instruction) | Text Generation (ChatBot) | Summarization | Code Generation |
|------------------------------------|---|---|---|---|
|LLaMA series| ✅ | ✅ | ✅ | ✅ |
|LLaMA2 series| ✅ | ✅ | ✅ | ✅ |
|MPT series| ✅ | ✅ | ✅ | ✅ |
|FLAN-T5 series| ✅ | NA | NA | NA |

# Prerequisite
@@ -43,6 +44,8 @@ We select 5 kinds of datasets to conduct the finetuning process for different tasks:

4. Summarization: [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail), an English-language dataset containing just over 300k unique news articles written by journalists at CNN and the Daily Mail, is used for this task.

5. Code Generation: To enhance the code generation capability of LLMs (Large Language Models), we use the [theblackcat102/evol-codealpaca-v1](https://huggingface.co/datasets/theblackcat102/evol-codealpaca-v1) dataset.
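
These datasets can be pulled directly from the Hugging Face Hub with the `datasets` library. The snippet below is only an illustrative sanity check for the datasets named in this README (the `train` split is assumed, which is the default split for these datasets):

```python
from datasets import load_dataset

# Chatbot conversations from OpenAssistant
oasst1 = load_dataset("OpenAssistant/oasst1", split="train")

# News articles for summarization tuning (config "3.0.0" matches the command below)
cnn = load_dataset("cnn_dailymail", "3.0.0", split="train")

# Code instructions for code generation tuning
code = load_dataset("theblackcat102/evol-codealpaca-v1", split="train")

print(len(oasst1), len(cnn), len(code))
```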

# Finetune

We employ the [LoRA approach](https://arxiv.org/pdf/2106.09685.pdf) to finetune the LLM efficiently. Currently, FLAN-T5 and LLaMA are supported for finetuning.
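
As background, LoRA freezes the base model and injects small trainable low-rank adapter matrices into selected linear layers, so only a tiny fraction of parameters is updated. The sketch below shows roughly what that looks like with the `peft` library; the rank, alpha, dropout, and target modules here are illustrative values, not necessarily the exact defaults used by `finetune_clm.py`:

```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, TaskType, get_peft_model

# Load a base causal LM (model name is just an example)
model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-7b-hf")

# Describe which layers get low-rank adapters and how large they are
peft_config = LoraConfig(
    r=8,                                  # adapter rank (illustrative)
    lora_alpha=16,                        # scaling factor (illustrative)
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],  # typical LLaMA attention projections
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# Wrap the model; only the adapter weights are trainable
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
```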
@@ -99,13 +102,13 @@ python finetune_clm.py \
--no_cuda
```

- use the below command line for finetuning the chatbot on the [Intel/openassistant-preprocessed](https://huggingface.co/datasets/Intel/openassistant-preprocessed) dataset.
- use the below command line for finetuning the chatbot on the [OpenAssistant/oasst1](https://huggingface.co/datasets/OpenAssistant/oasst1) dataset.

```bash
python finetune_clm.py \
--model_name_or_path "decapoda-research/llama-7b-hf" \
--bf16 True \
--dataset_name "Intel/openassistant-preprocessed" \
--dataset_name "OpenAssistant/oasst1" \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--gradient_accumulation_steps 1 \
@@ -130,7 +133,7 @@ python finetune_clm.py \

```bash
python finetune_clm.py \
--model_name_or_path "/models/llama-7b-hf" \
--model_name_or_path "decapoda-research/llama-7b-hf" \
--bf16 True \
--dataset_name "cnn_dailymail" \
--dataset_config_name "3.0.0" \
@@ -153,6 +156,34 @@ python finetune_clm.py \
# the script also supports other models, such as MPT.
```

- use the below command line for code tuning with `meta-llama/Llama-2-7b` on [theblackcat102/evol-codealpaca-v1](https://huggingface.co/datasets/theblackcat102/evol-codealpaca-v1).

```bash
python finetune_clm.py \
--model_name_or_path "meta-llama/Llama-2-7b" \
--bf16 True \
--dataset_name "theblackcat102/evol-codealpaca-v1" \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--gradient_accumulation_steps 1 \
--do_train \
--learning_rate 1e-4 \
--num_train_epochs 3 \
--logging_steps 100 \
--save_total_limit 2 \
--overwrite_output_dir \
--log_level info \
--save_strategy epoch \
--output_dir ./llama_peft_finetuned_model \
--peft lora \
--use_fast_tokenizer false \
--no_cuda

# the script also supports other models, such as MPT.
```
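
Whichever command you run, the finetuned LoRA adapter ends up under the path given by `--output_dir`. A minimal, illustrative way to smoke-test the result with `peft` (model name and output path taken from the code-tuning command above; you may need to point `adapter_dir` at a specific checkpoint subdirectory, and the prompt and generation settings are arbitrary):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model = "meta-llama/Llama-2-7b"          # base checkpoint used for finetuning
adapter_dir = "./llama_peft_finetuned_model"  # --output_dir from the command above

tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(base_model)
model = PeftModel.from_pretrained(model, adapter_dir)  # attach the trained LoRA adapter

prompt = "Write a Python function that reverses a string."
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```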



**For [MPT](https://huggingface.co/mosaicml/mpt-7b)**, use the below command line for finetuning on the Alpaca dataset. From the PEFT perspective, only LoRA supports MPT. MPT uses the gpt-neox-20b tokenizer, so you need to specify it explicitly on the command line. This model also requires that `trust_remote_code=True` be passed to the `from_pretrained` method, because we use a custom MPT model architecture that is not yet part of the Hugging Face transformers package.
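
In Python terms, those two MPT requirements amount to roughly the following sketch (tokenizer and model names as referenced above; this is not the exact code in `finetune_clm.py`). The full finetuning command itself follows below.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# MPT does not ship its own tokenizer, so the gpt-neox-20b tokenizer is used instead
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")

# trust_remote_code=True is required because the MPT architecture is defined in
# the model repository rather than in the transformers package itself
model = AutoModelForCausalLM.from_pretrained("mosaicml/mpt-7b", trust_remote_code=True)
```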

```bash
@@ -37,7 +37,7 @@ def create_alpaca(examples):
for example in examples:
prompt_template = (
ALPACA_PROMPT_DICT["prompt_with_input"]
if example["input"] != ""
if example.get("input") is not None and example.get("input") != ""
else ALPACA_PROMPT_DICT["prompt_without_input"]
)
source = prompt_template.format_map(example)
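# For reference (not part of this diff): ALPACA_PROMPT_DICT is defined earlier in
# this file and presumably follows the standard Alpaca templates, i.e. a
# "prompt_with_input" template that interpolates {instruction} and {input}, and a
# "prompt_without_input" template that interpolates only {instruction}. The guard
# above simply falls back to the no-input template when the "input" field is
# missing or empty.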
@@ -270,13 +270,30 @@ class FinetuneArguments:
},
)
train_on_inputs: bool = field(
default=False,
default=True,
metadata={"help": "if False, masks out inputs in loss"},
)
habana: bool = field(
default=False,
metadata={"help": "if False, masks out inputs in loss"},
)
lora_all_linear: bool = field(
default=False,
metadata={"help": "if True, will add adaptor for all linear for lora finetuning"},
)
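
# Note (illustrative): these dataclass fields are presumably exposed as command-line
# flags via transformers.HfArgumentParser, so the option above can be enabled from
# the launch commands in the README, e.g.:
#
#   python finetune_clm.py ... --peft lora --lora_all_linear True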


def find_all_linear_names(model):
cls = torch.nn.Linear
lora_module_names = set()
for name, module in model.named_modules():
if isinstance(module, cls):
names = name.split('.')
lora_module_names.add(names[0] if len(names) == 1 else names[-1])

if 'lm_head' in lora_module_names: # needed for 16-bit
lora_module_names.remove('lm_head')
return list(lora_module_names)
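# Example (illustrative): for a LLaMA-style model, find_all_linear_names(model)
# typically returns the projection layers, e.g.
#   ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
# i.e. every nn.Linear module name except lm_head, so LoRA adapters can be attached
# to all linear layers instead of only the default attention projections.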


def main():
@@ -362,6 +379,7 @@ def main():
if "llama" in config.model_type:
model_args.use_fast_tokenizer = False


tokenizer_kwargs = {
"cache_dir": model_args.cache_dir,
"use_fast": model_args.use_fast_tokenizer,
@@ -538,6 +556,7 @@ def main():
load_from_cache_file=not data_args.overwrite_cache,
)


if data_args.dataset_concatenation:

def concatenate_data(dataset, max_seq_length):
@@ -586,11 +605,16 @@ def concatenate_data(dataset, max_seq_length):
if training_args.do_train:
# PEFT settings
if finetune_args.peft == "lora":
if finetune_args.lora_all_linear:
target_modules = find_all_linear_names(model)
else:
target_modules = finetune_args.lora_target_modules

peft_config = LoraConfig(
r=finetune_args.lora_rank,
lora_alpha=finetune_args.lora_alpha,
lora_dropout=finetune_args.lora_dropout,
target_modules=finetune_args.lora_target_modules,
target_modules=target_modules,
bias="none",
task_type=TaskType.CAUSAL_LM,
)
2 changes: 1 addition & 1 deletion workflows/chatbot/fine_tuning/requirements.txt
@@ -1,6 +1,6 @@
datasets
torch
git+https://github.com/huggingface/transformers.git
transformers>=4.31.0
sentencepiece
peft
evaluate
