update alpaca gpt4 to use dataset entry (#2869)
#2827

Update alpaca gpt4 to use dataset entry.

I ran 
```bash
~$ python check_dataset_appearances.py -d alpaca_gpt4 --cache_dir .cache --mode sft
'Found the following occurances in TRAIN alpaca_gpt4:'
{   re.compile('\\[\\d+(?:,\\s*\\d+)*?\\]'): [   '[3, 45, 99, 2, 8, 6, 72]',
                                                 '[10, 8, 7, 4]',
                                                 '[1, 2, 3, 4, 5]',
                                                 '[7, 3, 4, 6, 2]']}
'Found the following occurances in VAL alpaca_gpt4:'
{   'openai': [   'u’re approved, get your API key.\n'
                  '\n'
                  '2. Install the `openai` library in your Python environment '
                  'using p']}
```
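The TRAIN hits come from the checker's citation-style pattern, which is printed verbatim in the output above. A minimal sketch of that match, using the regex as reported:

```python
import re

# Regex reported by check_dataset_appearances.py for citation-style
# bracketed number lists (copied from the checker output above).
pattern = re.compile(r"\[\d+(?:,\s*\d+)*?\]")

# One of the flagged answers: a sorted integer list, not a reference.
flagged = "[3, 45, 99, 2, 8, 6, 72]"
match = pattern.search(flagged)
print(match.group(0))  # the whole bracketed list matches
```

This confirms why such answers trip the check even though they are ordinary data, not references.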
I checked all of the flagged occurrences; they are all programming-related and none are actual references, so this looks fine:
```python
DatasetEntry(questions=['Re-order the integer list given in the input field such that '
                        'all odd numbers are first and even numbers are last.'
                        '\n[2, 3, 8, 45, 6, 99, 72]'],
             answers=['[3, 45, 99, 2, 8, 6, 72]'],
             context=None,
             lang=None,
             length=None,
             quality=None,
             humor=None,
             creativity=None)
```
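For context, here is a hypothetical minimal stand-in for `DatasetEntry`, mirroring only the fields visible in the repr above; the real class is defined in the Open-Assistant training code and may differ:

```python
from dataclasses import dataclass
from typing import Optional

# Hypothetical minimal mirror of DatasetEntry, built from the fields
# visible in the repr above; the real class may differ.
@dataclass
class DatasetEntry:
    questions: list
    answers: list
    context: Optional[str] = None
    lang: Optional[str] = None
    length: Optional[int] = None
    quality: Optional[float] = None
    humor: Optional[float] = None
    creativity: Optional[float] = None

entry = DatasetEntry(
    questions=["Re-order the integer list ... [2, 3, 8, 45, 6, 99, 72]"],
    answers=["[3, 45, 99, 2, 8, 6, 72]"],
)
```

All metadata fields default to `None`, matching the repr shown for this dataset.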
CloseChoice committed Apr 24, 2023
1 parent bcc9360 commit f745129
15 changes: 7 additions & 8 deletions model/model_training/custom_datasets/qa_datasets.py
```diff
@@ -603,7 +603,7 @@ def __init__(self, cache_dir: str | Path, mode: str = "sft", input_max_length: i
             if (conv := self._process_instruction(line, input_max_length)) is not None:
                 self.rows.append(conv)
 
-    def _process_instruction(self, row: dict[str, str], input_max_length: int) -> list[str] | None:
+    def _process_instruction(self, row: dict[str, str], input_max_length: int) -> DatasetEntry | None:
         # discard items that are too long: when checked on 2023-04-17 this was just one item in the whole dataset with length above 2048.
         # And 12 above 1024.
         if len(row["input"]) + len(row["instruction"]) > input_max_length:
@@ -615,18 +615,17 @@ def _process_instruction(self, row: dict[str, str], input_max_length: int) -> li
             or (not row["input"])
             or (row["input"].lower() in row["instruction"].lower())
         ):
-            return [row["instruction"], row["output"]]
+            return DatasetEntry(questions=[row["instruction"]], answers=[row["output"]])
         # Concatenate the instruction and input.
         else:
             linking_char = random.choice(LINKING_CHARS)
-            return [f"{row['instruction']}{linking_char}{row['input']}", row["output"]]
+            return DatasetEntry(
+                questions=[f"{row['instruction']}{linking_char}{row['input']}"], answers=[row["output"]]
+            )
 
     def __len__(self) -> int:
         return len(self.rows)
 
     def __getitem__(self, index: int) -> list[str] | tuple[str]:
-        dialogue: list[str] = self.rows[index]
-        if self.mode == "sft":
-            return dialogue
-        elif self.mode == "rl":
-            return tuple(dialogue[:-1])
+        dialogue = self.rows[index]
+        return dialogue
```
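The branching in `_process_instruction` can be sketched standalone. The `LINKING_CHARS` values below are assumptions for illustration; the real constant is defined elsewhere in `qa_datasets.py`:

```python
import random

# Assumed separators for illustration; the real LINKING_CHARS constant
# is defined elsewhere in qa_datasets.py and may differ.
LINKING_CHARS = ["\n", " ", "\n\n"]

def build_question(instruction: str, input_text: str) -> str:
    # Use the bare instruction when there is no separate input, or when
    # the input is already contained in the instruction.
    if not input_text or input_text.lower() in instruction.lower():
        return instruction
    # Otherwise concatenate instruction and input with a random separator.
    return f"{instruction}{random.choice(LINKING_CHARS)}{input_text}"
```

This is the `questions=[...]` string that ends up in the `DatasetEntry`; the `answers` list always carries `row["output"]` unchanged.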
