In [4]:
import os

# Export the proxy endpoint through the process environment; clients that
# honour the `https_proxy` variable will pick it up from here.
# NOTE(review): the address is a private LAN endpoint — confirm it is reachable
# from wherever this notebook is executed.
os.environ.update({"https_proxy": "http://192.168.1.12:7891"})

In [None]:
import re
import json
from datasets import load_dataset
import sys
sys.path.append("../..")
from utils.sys_prompts import SYS_PROMPT_formatter_deepseek_concise_2 as SYSTEM_PROMPT
import random

# Template for the user message: the instruction and the response are each
# wrapped in their own XML-like tag, one element per line, and the rendered
# text ends with a trailing newline. Placeholders: {instruction}, {response}.
INSTRUCTION_RESPONSE_FORMAT = (
    "<instruction>\n"
    "{instruction}\n"
    "</instruction>\n"
    "<response>\n"
    "{response}\n"
    "</response>\n"
)

def get_natural_thinking_dataset(
    url,
    split="train",
    output_file="dataset.json",
    min_response_length=50,
    max_samples=None,
    seed=42,
):
    """Load a dataset, keep Markdown-structured responses, and export chat prompts.

    The dataset is loaded with ``load_dataset(url, split=split)`` and must expose
    string columns named ``"prompt"`` and ``"response"``. A row is kept only when
    its response

    * is longer than ``min_response_length`` characters, and
    * contains at least one line that starts with one to six ``#`` characters
      followed by a space (a Markdown heading).

    Every kept row is turned into a two-message conversation stored under the
    ``"prompt"`` column: a system message holding ``SYSTEM_PROMPT`` and a user
    message rendered from ``INSTRUCTION_RESPONSE_FORMAT`` with the stripped
    prompt and response. All other columns are dropped, the rows are written to
    ``output_file`` as a JSON list, and the resulting dataset is returned.

    Args:
        url: Dataset identifier/path forwarded to ``load_dataset``.
        split: Split name forwarded to ``load_dataset``.
        output_file: Path of the JSON file to write (UTF-8, 4-space indent,
            non-ASCII characters kept verbatim).
        min_response_length: Responses must be strictly longer than this many
            characters to be kept. Defaults to 50.
        max_samples: When not ``None``, the formatted dataset is shuffled with
            ``seed`` and truncated to at most this many rows before saving.
            Defaults to ``None`` (keep everything).
        seed: Shuffle seed, only used when ``max_samples`` is given.

    Returns:
        The filtered dataset containing only the rebuilt ``"prompt"`` column.
    """
    # Compiled once instead of being re-resolved for every row; MULTILINE lets
    # "^" match at the start of any line, not just the start of the response.
    heading_pattern = re.compile(r"^#{1,6} .*$", re.MULTILINE)

    def keep_example(example):
        response = example["response"]
        # Cheap length test first; `search` stops at the first heading rather
        # than collecting every match.
        return (
            len(response) > min_response_length
            and heading_pattern.search(response) is not None
        )

    # One pass over the data applies both criteria.
    dataset = load_dataset(url, split=split).filter(keep_example)
    print("\033[92m Number of data after filtering: \033[0m", len(dataset))

    def formatting_prompts_func(example):
        instruction = example["prompt"].strip()
        output = example["response"].strip()
        user_content = INSTRUCTION_RESPONSE_FORMAT.format(
            instruction=instruction, response=output
        )
        # The original string "prompt" column is replaced by the message list.
        return {
            'prompt': [
                {'role': 'system', 'content': SYSTEM_PROMPT},
                {'role': 'user', 'content': user_content},
            ]
        }

    dataset = dataset.map(formatting_prompts_func, batched=False)
    dataset = dataset.remove_columns(
        [col for col in dataset.column_names if col != "prompt"]
    )

    if max_samples is not None:
        # Optional reproducible sub-sample, capped at the number of rows available.
        dataset = dataset.shuffle(seed=seed).select(
            range(min(max_samples, len(dataset)))
        )

    print("\033[92m Save number of data: \033[0m", len(dataset))
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(dataset.to_list(), f, indent=4, ensure_ascii=False)

    return dataset


In [8]:
# Build the prompt set from the "train" split of nvidia/HelpSteer2 and write
# it to HelpSteer2.json (path is relative to the working directory).
dataset = get_natural_thinking_dataset(
    url="nvidia/HelpSteer2",
    split="train",
    output_file="HelpSteer2.json",
)

[92m Number of data after filtering: [0m 61


In [9]:
# Eyeball the first five examples: print a 100-dash rule, then the content of
# the second message (index 1) of each example's "prompt" list.
for i in range(5):
    print('-' * 100)
    user_message = dataset[i]['prompt'][1]
    print(user_message['content'])

----------------------------------------------------------------------------------------------------
<instruction>
Web search results:

[1] "One study from Backlinko, published in April 2020 cited the "average Google first page result contains 1,447 words.". It would be quite easy for someone to take this information in isolation ..."
URL: https://www.searchenginejournal.com/what-is-the-best-word-count-for-seo/370655/

[2] "To become an SEO copywriter, you can follow the 10 steps below. Choose your keywords (both head and long tail keywords) Craft your page title. Create an SEO Friendly URL (add your main keyword) Write the content (long enough) Add your keywords in the first paragraph. Create headings using long-tail keywords."
URL: https://www.reliablesoft.net/seo-writing/

[3] "Then, all you have to do is navigate to the New Article page in Article Forge and enter your keywords like this. Then simply scroll down and press the "Create New Article" button. Now all you have to do is wa