raw2text

In [None]:
from datasets import load_from_disk

# Load the two instruction/response datasets previously saved with `datasets`.
# The trailing tuple is the cell's display value (dataset summaries).
common_ins = load_from_disk("data/responses/common_ins")
complex_ins = load_from_disk("data/responses/complex_ins")
common_ins, complex_ins

In [2]:
# Peek at the first record of each dataset to inspect the available fields.
common_ins[0], complex_ins[0]

({'ids': 1,
  'text': 'There are some components that must be thought of for a business to run smoothly and for targets to be met totally. Consequently, the state refuses to reimburse such clinics for many medical companies rendered to their Medicaid patients, even long-established ones. My current health provider presents the most effective care I bear in mind receiving in my many adult years. The follow tries to help patients falling between the cracks of our increasingly inefficient health care system.\\n\\nThe survival of ladies in childbirth reflects the overall growth of a country and whether or not the health companies are functioning. Intermediate causes, which are the first and second delays in care-in search of, embody the low social status of ladies, lack of knowledge and data at the household stage, inadequate sources to seek care, and poor access to quality health care.\\n\\nMoney penalties are enforced for non-compliance by healthcare entities. The notice of privacy pract

In [4]:
from transformers import AutoTokenizer
# Two tokenizers are loaded: `qwen_tokenizer` is used below for truncation and for
# the common-instruction chat template, `qwq_tokenizer` for the complex-instruction one.
# NOTE(review): both identifiers look like local checkpoint directories — confirm they
# exist relative to the notebook's working directory.
qwen_tokenizer = AutoTokenizer.from_pretrained("qwen72b")
qwq_tokenizer = AutoTokenizer.from_pretrained("qwq/Qwen/QwQ-32B-Preview")

In [5]:
def CommonInsRefine(example):
    """Attach the two-answer generation prompt to a common-instruction record.

    The prompt is the fixed guideline preamble, followed by the record's
    question, a "Here is the text:" separator line and the record's
    background text.

    Args:
        example: mapping with string values under "instruction" and "text".

    Returns:
        The same ``example`` object, with the prompt stored under "new_text".
    """
    # Fixed instructions asking for two differently styled answers in a JSON-like layout.
    preamble = 'You need to generate two different styles of answers based on the given question. Use the background information provided in the text to assist in formulating a relevant and detailed answer. Follow these answer guidelines:\n1. Ensure the answer is closely related to the main points or themes mentioned in the question.\n2. Utilize the text content to provide a comprehensive and accurate answer.\n3. Ensure proper formatting and readability, including the correct rendering of any LaTeX or mathematical symbols.\n4. Ensure that the answer provides a complete solution or explanation, with clear and detailed steps.\n5. Please strictly follow the format below for output:\n{\n    "answer1": "Generated first answer content",\n    "answer2": "Generated second answer content"\n}\nHere is the question: \n'

    question = example["instruction"]
    background = example["text"]

    example["new_text"] = preamble + question + "\nHere is the text:\n" + background
    return example

In [6]:
def ComplexInsRefine(example):
    """Attach the long-form reasoning prompt to a complex-instruction record.

    The prompt is the fixed "Thought / Summarization" guideline preamble,
    followed by the record's question, a "Here is the text:" separator line
    and the record's background text.

    Args:
        example: mapping with string values under "instruction" and "text".

    Returns:
        The same ``example`` object, with the prompt stored under "new_text".
    """
    # Fixed instructions asking for a step-by-step thinking phase and a final summary.
    preamble = 'You need to generate an answer based on the given problem and thoroughly explore the problem through a systematic and long-term thinking process to provide a final and accurate solution. This requires a comprehensive cycle of analysis, summary, exploration, re-evaluation, reflection, backtracking and iteration to form a thoughtful thinking process. Use the background information provided in the text to assist in formulating the answer. Follow these answer guidelines:\n1. Please structure your response into two main sections: Thought and Summarization.\n2. During the thinking phase, think step by step based on the given text content. If the text content is used, it must be expressed.\n3. During the summary phase, based on the thinking process in the thinking phase, give the final answer to the question.\nHere is the question: \n'

    question = example["instruction"]
    background = example["text"]

    example["new_text"] = preamble + question + "\nHere is the text:\n" + background
    return example

In [4]:
# Scratch cell: evaluating the triple-quoted preamble displays its escaped,
# single-line repr. Apart from the leading newline, the displayed text matches
# the literal embedded in CommonInsRefine.
# NOTE(review): `p` is not used by any other cell shown here, and the execution
# count is out of order — this cell looks safe to delete.
p = """
You need to generate two different styles of answers based on the given question. Use the background information provided in the text to assist in formulating a relevant and detailed answer. Follow these answer guidelines:
1. Ensure the answer is closely related to the main points or themes mentioned in the question.
2. Utilize the text content to provide a comprehensive and accurate answer.
3. Ensure proper formatting and readability, including the correct rendering of any LaTeX or mathematical symbols.
4. Ensure that the answer provides a complete solution or explanation, with clear and detailed steps.
5. Please strictly follow the format below for output:
{
    "answer1": "Generated first answer content",
    "answer2": "Generated second answer content"
}
Here is the question: 
"""
p

'\nYou need to generate two different styles of answers based on the given question. Use the background information provided in the text to assist in formulating a relevant and detailed answer. Follow these answer guidelines:\n1. Ensure the answer is closely related to the main points or themes mentioned in the question.\n2. Utilize the text content to provide a comprehensive and accurate answer.\n3. Ensure proper formatting and readability, including the correct rendering of any LaTeX or mathematical symbols.\n4. Ensure that the answer provides a complete solution or explanation, with clear and detailed steps.\n5. Please strictly follow the format below for output:\n{\n    "answer1": "Generated first answer content",\n    "answer2": "Generated second answer content"\n}\nHere is the question: \n'

In [7]:
# Scratch cell: evaluating the triple-quoted preamble displays its escaped,
# single-line repr. Apart from the leading newline, the displayed text matches
# the literal embedded in ComplexInsRefine.
# NOTE(review): `p` is not used by any other cell shown here — this cell looks
# safe to delete.
p = """
You need to generate an answer based on the given problem and thoroughly explore the problem through a systematic and long-term thinking process to provide a final and accurate solution. This requires a comprehensive cycle of analysis, summary, exploration, re-evaluation, reflection, backtracking and iteration to form a thoughtful thinking process. Use the background information provided in the text to assist in formulating the answer. Follow these answer guidelines:
1. Please structure your response into two main sections: Thought and Summarization.
2. During the thinking phase, think step by step based on the given text content. If the text content is used, it must be expressed.
3. During the summary phase, based on the thinking process in the thinking phase, give the final answer to the question.
Here is the question: 
"""
p

'\nYou need to generate an answer based on the given problem and thoroughly explore the problem through a systematic and long-term thinking process to provide a final and accurate solution. This requires a comprehensive cycle of analysis, summary, exploration, re-evaluation, reflection, backtracking and iteration to form a thoughtful thinking process. Use the background information provided in the text to assist in formulating the answer. Follow these answer guidelines:\n1. Please structure your response into two main sections: Thought and Summarization.\n2. During the thinking phase, think step by step based on the given text content. If the text content is used, it must be expressed.\n3. During the summary phase, based on the thinking process in the thinking phase, give the final answer to the question.\nHere is the question: \n'

In [8]:
# Build the prompt for every record and drop all original columns, so each
# dataset is left with the single `new_text` column.
# NOTE(review): the cell rebinds `common_ins` / `complex_ins`; running it a second
# time would feed records without "instruction"/"text" into the refine functions.
common_ins = common_ins.map(CommonInsRefine, remove_columns=common_ins.column_names)
complex_ins = complex_ins.map(ComplexInsRefine, remove_columns=complex_ins.column_names)
common_ins, complex_ins

Map:   0%|          | 0/169279 [00:00<?, ? examples/s]

Map: 100%|██████████| 169279/169279 [00:12<00:00, 14078.05 examples/s]
Map: 100%|██████████| 422/422 [00:00<00:00, 4798.39 examples/s]


(Dataset({
     features: ['new_text'],
     num_rows: 169279
 }),
 Dataset({
     features: ['new_text'],
     num_rows: 422
 }))

In [9]:
# Rename the freshly built prompt column back to `text`, the column name the
# encoding cells below read from.
common_ins = common_ins.rename_column("new_text", "text")
complex_ins = complex_ins.rename_column("new_text", "text")
common_ins, complex_ins

(Dataset({
     features: ['text'],
     num_rows: 169279
 }),
 Dataset({
     features: ['text'],
     num_rows: 422
 }))

In [11]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Encoding stage
def encode_batch(batch):
    """Tokenize a batch of prompt strings with the Qwen tokenizer, truncating long ones.

    The encoding is only used to cut each prompt down to at most 8157 tokens
    before it is decoded back to text, so the tensors are kept on the CPU:
    tokenization and decoding both run on the host, and a round trip through
    GPU memory would only cost memory and transfer time.

    Args:
        batch: list of prompt strings.

    Returns:
        The tokenizer's batch encoding holding PyTorch tensors, padded to the
        longest sequence in the batch. Callers may still call ``.cpu()`` on the
        tensors; it is a no-op for tensors that are already on the CPU.
    """
    return qwen_tokenizer(
        batch,
        truncation=True,
        max_length=8157,
        padding=True,
        return_tensors="pt",
    )

# Truncate the common-instruction prompts by an encode -> decode round trip.
# Batches are processed one at a time, so only a single batch of padded token
# ids is alive at any moment instead of the whole encoded dataset.
refined_text = common_ins["text"]
batch_size = 64

decoded_common_ins_text = []
for start in range(0, len(refined_text), batch_size):
    encoded = encode_batch(refined_text[start:start + batch_size])
    # Decoding runs on the host; `.cpu()` is a no-op when the ids are already there.
    # skip_special_tokens=True drops the padding added during encoding.
    decoded_common_ins_text.extend(
        qwen_tokenizer.batch_decode(encoded["input_ids"].cpu(), skip_special_tokens=True)
    )
decoded_common_ins_text[:2]

['You need to generate two different styles of answers based on the given question. Use the background information provided in the text to assist in formulating a relevant and detailed answer. Follow these answer guidelines:\n1. Ensure the answer is closely related to the main points or themes mentioned in the question.\n2. Utilize the text content to provide a comprehensive and accurate answer.\n3. Ensure proper formatting and readability, including the correct rendering of any LaTeX or mathematical symbols.\n4. Ensure that the answer provides a complete solution or explanation, with clear and detailed steps.\n5. Please strictly follow the format below for output:\n{\n    "answer1": "Generated first answer content",\n    "answer2": "Generated second answer content"\n}\nHere is the question: \nHow do financial penalties for non-compliance by healthcare entities impact the quality and accessibility of healthcare services, and what measures can be taken to ensure these penalties do not d

In [12]:
# Complex: same encode -> decode truncation as for the common prompts.
# Batches are processed one at a time, so only a single batch of padded token
# ids is alive at any moment.
refined_text = complex_ins["text"]
batch_size = 64

decoded_complex_ins_text = []
for start in range(0, len(refined_text), batch_size):
    encoded = encode_batch(refined_text[start:start + batch_size])
    # Decoding runs on the host; `.cpu()` is a no-op when the ids are already there.
    # skip_special_tokens=True drops the padding added during encoding.
    decoded_complex_ins_text.extend(
        qwen_tokenizer.batch_decode(encoded["input_ids"].cpu(), skip_special_tokens=True)
    )
decoded_complex_ins_text[:2]

["You need to generate an answer based on the given problem and thoroughly explore the problem through a systematic and long-term thinking process to provide a final and accurate solution. This requires a comprehensive cycle of analysis, summary, exploration, re-evaluation, reflection, backtracking and iteration to form a thoughtful thinking process. Use the background information provided in the text to assist in formulating the answer. Follow these answer guidelines:\n1. Please structure your response into two main sections: Thought and Summarization.\n2. During the thinking phase, think step by step based on the given text content. If the text content is used, it must be expressed.\n3. During the summary phase, based on the thinking process in the thinking phase, give the final answer to the question.\nHere is the question: \nWhat are the key differences in the intrinsic mechanisms of axon auto-fusion between invertebrates and mammals, and how might these differences impact the deve

In [13]:
import numpy as np
# Append the "\nOutput: \n" marker to every truncated common-instruction prompt.
# `np.char.add` is the public spelling of the element-wise string concatenation;
# the `np.core` namespace used previously is private and deprecated in NumPy 2.x.
# NOTE(review): the cell rebinds `decoded_common_ins_text`, so running it twice
# appends the marker twice.
my_list = np.array(decoded_common_ins_text)
decoded_common_ins_text = np.char.add(my_list, "\nOutput: \n")
print(decoded_common_ins_text[:2])

['You need to generate two different styles of answers based on the given question. Use the background information provided in the text to assist in formulating a relevant and detailed answer. Follow these answer guidelines:\n1. Ensure the answer is closely related to the main points or themes mentioned in the question.\n2. Utilize the text content to provide a comprehensive and accurate answer.\n3. Ensure proper formatting and readability, including the correct rendering of any LaTeX or mathematical symbols.\n4. Ensure that the answer provides a complete solution or explanation, with clear and detailed steps.\n5. Please strictly follow the format below for output:\n{\n    "answer1": "Generated first answer content",\n    "answer2": "Generated second answer content"\n}\nHere is the question: \nHow do financial penalties for non-compliance by healthcare entities impact the quality and accessibility of healthcare services, and what measures can be taken to ensure these penalties do not d

In [14]:
# Append the "\nOutput: \n" marker to every truncated complex-instruction prompt.
# `np.char.add` is the public spelling of the element-wise string concatenation;
# the `np.core` namespace used previously is private and deprecated in NumPy 2.x.
# NOTE(review): the cell rebinds `decoded_complex_ins_text`, so running it twice
# appends the marker twice.
my_list = np.array(decoded_complex_ins_text)
decoded_complex_ins_text = np.char.add(my_list, "\nOutput: \n")
print(decoded_complex_ins_text[:2])

["You need to generate an answer based on the given problem and thoroughly explore the problem through a systematic and long-term thinking process to provide a final and accurate solution. This requires a comprehensive cycle of analysis, summary, exploration, re-evaluation, reflection, backtracking and iteration to form a thoughtful thinking process. Use the background information provided in the text to assist in formulating the answer. Follow these answer guidelines:\n1. Please structure your response into two main sections: Thought and Summarization.\n2. During the thinking phase, think step by step based on the given text content. If the text content is used, it must be expressed.\n3. During the summary phase, based on the thinking process in the thinking phase, give the final answer to the question.\nHere is the question: \nWhat are the key differences in the intrinsic mechanisms of axon auto-fusion between invertebrates and mammals, and how might these differences impact the deve

In [15]:
from datasets import Dataset
# Wrap the finished common-instruction prompts in a single-column (`text`) Dataset.
common_ins_dict = {"text": decoded_common_ins_text}
common_ins_dataset = Dataset.from_dict(common_ins_dict)
common_ins_dataset

Dataset({
    features: ['text'],
    num_rows: 169279
})

In [16]:
# Wrap the finished complex-instruction prompts in a single-column (`text`) Dataset.
complex_ins_dict = {"text": decoded_complex_ins_text}
complex_ins_dataset = Dataset.from_dict(complex_ins_dict)
complex_ins_dataset

Dataset({
    features: ['text'],
    num_rows: 422
})

text2prompt

In [17]:
# Two-turn chat skeleton: a fixed system message plus a user slot.
# The user entry's "{prompt}" content is never formatted; create_messages
# replaces the whole user entry instead.
messages_template = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "{prompt}"}
]

def create_messages(prompt):
    """Return a fresh chat message list with ``prompt`` as the user turn.

    System entries of ``messages_template`` are shallow-copied so the template
    itself is never shared with the result; every other entry is replaced by a
    new user message carrying ``prompt``.
    """
    messages = []
    for entry in messages_template:
        if entry["role"] == "system":
            messages.append(entry.copy())
        else:
            messages.append({"role": "user", "content": prompt})
    return messages

In [18]:
def create_prompt_qwen(example):
    """Render ``example["text"]`` as a chat prompt using the Qwen tokenizer's template.

    The text becomes the user turn of the system/user message pair built by
    ``create_messages``. The template is rendered to a string (not token ids)
    with the generation prompt appended, and stored under "prompt".

    Returns:
        The same ``example`` object, with the "prompt" key added.
    """
    chat = create_messages(example["text"])
    example["prompt"] = qwen_tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )
    return example

In [19]:
def create_prompt_qwq(example):
    """Render ``example["text"]`` as a chat prompt using the QwQ tokenizer's template.

    The text becomes the user turn of the system/user message pair built by
    ``create_messages``. The template is rendered to a string (not token ids)
    with the generation prompt appended, and stored under "prompt".

    Returns:
        The same ``example`` object, with the "prompt" key added.
    """
    chat = create_messages(example["text"])
    example["prompt"] = qwq_tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )
    return example

In [20]:
# Render the chat-formatted prompts: common instructions use the Qwen template,
# complex instructions use the QwQ template. The plain `text` column is dropped,
# leaving only `prompt`.
common_ins_dataset = common_ins_dataset.map(create_prompt_qwen, remove_columns=["text"])
complex_ins_dataset = complex_ins_dataset.map(create_prompt_qwq, remove_columns=["text"])
common_ins_dataset, complex_ins_dataset

Map: 100%|██████████| 169279/169279 [00:18<00:00, 9112.23 examples/s]
Map: 100%|██████████| 422/422 [00:00<00:00, 6576.24 examples/s]


(Dataset({
     features: ['prompt'],
     num_rows: 169279
 }),
 Dataset({
     features: ['prompt'],
     num_rows: 422
 }))

In [21]:
# Spot-check the first rendered prompt of each dataset.
common_ins_dataset[0], complex_ins_dataset[0]

({'prompt': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nYou need to generate two different styles of answers based on the given question. Use the background information provided in the text to assist in formulating a relevant and detailed answer. Follow these answer guidelines:\n1. Ensure the answer is closely related to the main points or themes mentioned in the question.\n2. Utilize the text content to provide a comprehensive and accurate answer.\n3. Ensure proper formatting and readability, including the correct rendering of any LaTeX or mathematical symbols.\n4. Ensure that the answer provides a complete solution or explanation, with clear and detailed steps.\n5. Please strictly follow the format below for output:\n{\n    "answer1": "Generated first answer content",\n    "answer2": "Generated second answer content"\n}\nHere is the question: \nHow do financial penalties for non-compliance by healthcare entities impact the quality and accessibility 

In [22]:
# Persist both prompt datasets for the response-generation step.
common_ins_dataset.save_to_disk("data/responses/res_gen_prompt_common")
complex_ins_dataset.save_to_disk("data/responses/res_gen_prompt_complex")

Saving the dataset (2/2 shards): 100%|██████████| 169279/169279 [00:01<00:00, 154166.65 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 422/422 [00:00<00:00, 10818.85 examples/s]
