Skip to content

Commit

Permalink
add _ready_ versions of files too
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewm4894 committed Dec 4, 2023
1 parent a095cb6 commit 99adb6f
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 3 deletions.
7 changes: 6 additions & 1 deletion oasst-data/examples/filter_messages.py
Expand Up @@ -126,7 +126,12 @@ def approve_message(msg: ExportMessageNode) -> bool:
):
return False

if exclude_normal is True and not msg.deleted and not msg.synthetic and msg.review_result:
if (
exclude_normal is True
and not msg.deleted
and not msg.synthetic
and msg.review_result
):
return False

if spam is not None and spam != (not msg.review_result):
Expand Down
9 changes: 9 additions & 0 deletions oasst-data/examples/filter_trees.py
@@ -1,3 +1,12 @@
"""
Example usage:
python filter_trees.py \
"2023-11-05_oasst_all.jsonl" \
"2023-11-05_oasst_all.clean.jsonl" \
--states "ready_for_export"
"""

import argparse

from oasst_data import read_message_trees, write_message_trees
Expand Down
86 changes: 84 additions & 2 deletions oasst-data/oasst2/generate_oasst2.ipynb
Expand Up @@ -12,7 +12,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -22,9 +22,17 @@
"raw_input_data_path = f\"{data_dir}/2023-11-05_oasst_all/2023-11-05_oasst_all.jsonl\"\n",
"instructions_path = f\"{data_dir}/instructions.xlsx\"\n",
"trees_filename = f\"2023-11-05_oasst_all.trees.jsonl\"\n",
"trees_ready_filename = f\"2023-11-05_oasst_all.trees.ready_for_export.jsonl\"\n",
"messages_filename = f\"2023-11-05_oasst_all.messages.jsonl\"\n",
"messages_ready_filename = f\"2023-11-05_oasst_all.messages.ready_for_export.jsonl\"\n",
"messages_train_filename = f\"2023-11-05_oasst_all.messages.train.jsonl\"\n",
"messages_ready_train_filename = (\n",
" f\"2023-11-05_oasst_all.messages.ready_for_export.train.jsonl\"\n",
")\n",
"messages_validation_filename = f\"2023-11-05_oasst_all.messages.validation.jsonl\"\n",
"messages_ready_validation_filename = (\n",
" f\"2023-11-05_oasst_all.messages.ready_for_export.validation.jsonl\"\n",
")\n",
"\n",
"# make data_out_dir if it doesn't exist\n",
"if not os.path.exists(data_out_dir):\n",
Expand Down Expand Up @@ -833,6 +841,29 @@
"# outputs have been manually reviewed and appended to instructions file and notebook has been rerun"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.jsonl\n",
"Found 13854 matching trees.\n",
"Writing: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.ready_for_export.jsonl\n"
]
}
],
"source": [
"# filter trees to make a version with status ready for export\n",
"!python ../examples/filter_trees.py \\\n",
" \"{data_out_dir}/{trees_filename}\" \\\n",
" \"{data_out_dir}/{trees_ready_filename}\" \\\n",
" --states \"ready_for_export\""
]
},
{
"cell_type": "code",
"execution_count": 6,
Expand All @@ -856,6 +887,29 @@
" \"{data_out_dir}/{messages_filename}\""
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.ready_for_export.jsonl\n",
"13854 trees with 135174 total messages read.\n",
"writing: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.ready_for_export.jsonl\n",
"135174 messages written.\n"
]
}
],
"source": [
"# convert cleaned state=ready_for_export dataset from tree to messages\n",
"!python ../examples/tree_to_messages.py \\\n",
" \"{data_out_dir}/{trees_ready_filename}\" \\\n",
" \"{data_out_dir}/{messages_ready_filename}\""
]
},
{
"cell_type": "code",
"execution_count": 7,
Expand All @@ -880,6 +934,30 @@
" --val_output \"{data_out_dir}/{messages_validation_filename}\""
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.ready_for_export.jsonl\n",
"Found 135174 matching messages.\n",
"Writing train 128412 messages: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.ready_for_export.train.jsonl\n",
"Writing valid 6762 messages: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.ready_for_export.validation.jsonl\n"
]
}
],
"source": [
"# split ready messages into train and validation\n",
"!python ../examples/split_dataset.py \\\n",
" \"{data_out_dir}/{messages_ready_filename}\" \\\n",
" --train_output \"{data_out_dir}/{messages_ready_train_filename}\" \\\n",
" --val_output \"{data_out_dir}/{messages_ready_validation_filename}\""
]
},
{
"cell_type": "code",
"execution_count": 8,
Expand All @@ -888,9 +966,13 @@
"source": [
"# make .gz files, keeping the original files\n",
"!gzip -c \"{data_out_dir}/{trees_filename}\" > \"{data_out_dir}/{trees_filename}.gz\"\n",
"!gzip -c \"{data_out_dir}/{trees_ready_filename}\" > \"{data_out_dir}/{trees_ready_filename}.gz\"\n",
"!gzip -c \"{data_out_dir}/{messages_filename}\" > \"{data_out_dir}/{messages_filename}.gz\"\n",
"!gzip -c \"{data_out_dir}/{messages_ready_filename}\" > \"{data_out_dir}/{messages_ready_filename}.gz\"\n",
"!gzip -c \"{data_out_dir}/{messages_train_filename}\" > \"{data_out_dir}/{messages_train_filename}.gz\"\n",
"!gzip -c \"{data_out_dir}/{messages_validation_filename}\" > \"{data_out_dir}/{messages_validation_filename}.gz\""
"!gzip -c \"{data_out_dir}/{messages_ready_train_filename}\" > \"{data_out_dir}/{messages_ready_train_filename}.gz\"\n",
"!gzip -c \"{data_out_dir}/{messages_validation_filename}\" > \"{data_out_dir}/{messages_validation_filename}.gz\"\n",
"!gzip -c \"{data_out_dir}/{messages_ready_validation_filename}\" > \"{data_out_dir}/{messages_ready_validation_filename}.gz\""
]
},
{
Expand Down

0 comments on commit 99adb6f

Please sign in to comment.