Commit

fix oasst dataset issue. (#116)
* fix oasst dataset issue.

* skip special token.
lkk12014402 committed Aug 10, 2023
1 parent 1787a7a commit 76ee683
Showing 2 changed files with 6 additions and 6 deletions.
6 changes: 3 additions & 3 deletions workflows/chatbot/fine_tuning/README.md
@@ -40,7 +40,7 @@ We select 4 kinds of datasets to conduct the finetuning process for different tasks

2. Text Generation (Domain-specific instruction): Inspired by Alpaca, we constructed a domain-specific dataset focusing on Business and Intel-related issues. We made minor modifications to the [prompt template](https://github.com/tatsu-lab/stanford_alpaca/blob/main/prompt.txt) to proactively guide Alpaca toward generating more Intel- and Business-related instruction data. The generated data can be found in `intel_domain.json`.

- 3. Text Generation (ChatBot): To finetune a chatbot, we use the chat-style dataset [OpenAssistant/oasst1](https://huggingface.co/datasets/OpenAssistant/oasst1).
+ 3. Text Generation (ChatBot): To finetune a chatbot, we use the chat-style dataset [HuggingFaceH4/oasst1_en](https://huggingface.co/datasets/HuggingFaceH4/oasst1_en).

4. Summarization: [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail), an English-language dataset containing just over 300k unique news articles written by journalists at CNN and the Daily Mail, is used for this task.
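For item 3, a minimal sketch of loading the renamed dataset, assuming the Hugging Face `datasets` library (the `train_ift` split name comes from the code change later in this commit):

```python
# Minimal sketch: load the chat-style dataset now used for chatbot finetuning.
from datasets import load_dataset

raw = load_dataset("HuggingFaceH4/oasst1_en")
print(raw)                  # inspect the available splits, e.g. "train_ift"
print(raw["train_ift"][0])  # one chat-style record from the training split
```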

@@ -102,13 +102,13 @@ python finetune_clm.py \
--no_cuda \
```

- - use the below command line for finetuning chatbot on the [OpenAssistant/oasst1](https://huggingface.co/datasets/OpenAssistant/oasst1).
+ - use the below command line for finetuning chatbot on the [HuggingFaceH4/oasst1_en](https://huggingface.co/datasets/HuggingFaceH4/oasst1_en).

```bash
python finetune_clm.py \
--model_name_or_path "decapoda-research/llama-7b-hf" \
--bf16 True \
--dataset_name "OpenAssistant/oasst1" \
--dataset_name "HuggingFaceH4/oasst1_en" \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--gradient_accumulation_steps 1 \
3 changes: 3 additions & 3 deletions
@@ -144,7 +144,7 @@ def preprocess_function(examples):
max_input)

if len(truncated_convs) == 0:
- truncated_convs = [convs_tokens[-1][:max_input - 1] + convs_tokens[-1][-1:]]
+ truncated_convs = [convs_tokens[-1][:max_input - 3] + convs_tokens[-1][-3:]]

prompt_tokens = [header_tokens] + truncated_convs + [assistant_tokens]
prompt_ids = [tokenizer.convert_tokens_to_ids(prompt_token) for prompt_token in prompt_tokens]
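The one-line change above reserves the last three tokens of the final conversation instead of one, so (per the commit message) a trailing special token is not clipped by truncation. A toy illustration with hypothetical token lists (the three-piece end marker is an assumption for demonstration; the real code operates on tokenizer output):

```python
# Toy illustration of the truncation change (hypothetical tokens).
max_input = 6
last_conv = ["tok1", "tok2", "tok3", "tok4", "tok5", "<|end", "of", "text|>"]

# Before: keep max_input - 1 head tokens plus only the last token, which can
# slice a multi-piece special marker in half.
before = last_conv[:max_input - 1] + last_conv[-1:]

# After: keep max_input - 3 head tokens plus the last 3 tokens, so the
# trailing special marker survives truncation intact.
after = last_conv[:max_input - 3] + last_conv[-3:]

print(before)  # ['tok1', 'tok2', 'tok3', 'tok4', 'tok5', 'text|>']
print(after)   # ['tok1', 'tok2', 'tok3', '<|end', 'of', 'text|>']
```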
@@ -233,9 +233,9 @@ def preprocess_dataset(raw_datasets, tokenizer, data_args, finetune_args):
dataset_name = data_args.dataset_name if data_args.dataset_name is not None else data_args.train_file
if "oasst" in dataset_name:
new_datasets = datasets.DatasetDict()
for key in ["train"]:
for key in ["train_ift"]:
prompts = create_oasst(raw_datasets[key])
- new_datasets[key] = datasets.Dataset.from_dict(prompts)
+ new_datasets["train"] = datasets.Dataset.from_dict(prompts)

preprocess_fn = tokenize_oasst(tokenizer, data_args, finetune_args)

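The key change matters because the renamed dataset does not appear to ship a plain `train` split, so the old `raw_datasets["train"]` lookup would fail, while downstream code still expects a `train` key in the new DatasetDict. A minimal check, assuming the Hugging Face `datasets` library:

```python
# Minimal check of the split names this hunk reconciles.
from datasets import load_dataset

raw = load_dataset("HuggingFaceH4/oasst1_en")
print("train" in raw)      # expected False: the old raw_datasets["train"] lookup breaks
print("train_ift" in raw)  # expected True: the split this commit reads instead
```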
