# French translation

## Fr-dolphin-reasoning (regen)

In [None]:
from datasets import load_dataset
dataset_dolphin_reasoning = load_dataset("HoangHa/Pensez-v0.1", "fr-dolphin-reasoning", split='train')
dataset_dolphin_reasoning

In [None]:
# Remember to change system prompt
python sglang.py \
  --dataset-name "HoangHa/Pensez-v0.1" \
  --dataset-sub "fr-dolphin-reasoning" \
  --output-file "fr-dolphin-reasoning-question.jsonl" \
  --prompt-column "prompt" \
  --uuid-column "prompt" \
  --api-addr "127.0.0.1:30000" \
  --num-generations 1 \
  --max-tokens 16384 \
  --max-concurrent 200

In [None]:
python sglang.py \
  --dataset-name "HoangHa/Pensez-v0.1" \
  --dataset-sub "fr-dolphin-reasoning" \
  --output-file "fr-dolphin-reasoning-answer.jsonl" \
  --prompt-column "answer" \
  --uuid-column "answer" \
  --api-addr "127.0.0.1:30000" \
  --num-generations 1 \
  --max-tokens 16384 \
  --max-concurrent 200

In [None]:
from datasets import load_dataset

def process_generations_dataset(file_path, original_col, new_col):
    """Load a generations JSON-lines file and keep the source text plus its first generation.

    Args:
        file_path: Path handed to ``load_dataset('json', data_files=...)``.
        original_col: Column holding the text that was sent for generation; kept as-is.
        new_col: Column that receives the first entry of each row's ``generations``.

    Returns:
        The 'train' split reduced to the two columns ``[original_col, new_col]``.
    """
    def first_generation(record):
        # An empty (or otherwise falsy) generations value becomes an empty string.
        candidates = record['generations']
        if not candidates:
            return {new_col: ''}
        return {new_col: candidates[0]}

    loaded = load_dataset('json', data_files=file_path, split='train')
    with_first = loaded.map(first_generation)
    return with_first.select_columns([original_col, new_col])

# Process both datasets
dataset_dolphin_reasoning_question = process_generations_dataset('fr-dolphin-reasoning-question.jsonl', 'prompt', 'fr_prompt')
dataset_dolphin_reasoning_answer = process_generations_dataset('fr-dolphin-reasoning-answer.jsonl', 'answer', 'fr_answer')

# Print results
print(dataset_dolphin_reasoning_question)
print(dataset_dolphin_reasoning_question[0])
print(dataset_dolphin_reasoning_answer)
print(dataset_dolphin_reasoning_answer[0])

In [None]:
# Convert to dictionaries for fast lookup
question_dict = {x['prompt']: x['fr_prompt'] for x in dataset_dolphin_reasoning_question}
answer_dict = {x['answer']: x['fr_answer'] for x in dataset_dolphin_reasoning_answer}

# Merge based on exact match
def merge_rows(row):
    """Attach ``fr_prompt`` and ``fr_answer`` to a row by exact-text lookup.

    Reads the cell-level ``question_dict`` and ``answer_dict``. A row whose
    ``prompt`` / ``answer`` text is not a key receives an empty string.
    The row is updated in place and returned.
    """
    translations = {
        'fr_prompt': question_dict.get(row['prompt'], ''),
        'fr_answer': answer_dict.get(row['answer'], ''),
    }
    for column, text in translations.items():
        row[column] = text
    return row

# Apply merging
dataset_dolphin_reasoning = dataset_dolphin_reasoning.map(merge_rows)

print(dataset_dolphin_reasoning)
dataset_dolphin_reasoning[0]

In [None]:
dataset_dolphin_reasoning.push_to_hub("HoangHa/Pensez-v0.1", "fr-dolphin-reasoning")

In [None]:
def convert_to_openai_messages(dataset, prompt_col="prompt", reasoning_col="reasoning", answer_col="answer", source="default-source"):
    """Turn every row into a two-message (user, assistant) chat record.

    Args:
        dataset: Dataset exposing ``map`` and ``select_columns``.
        prompt_col: Column used as the user message.
        reasoning_col: Column wrapped in ``<think>`` tags at the start of the assistant message.
        answer_col: Column appended after the closing ``</think>`` tag.
        source: Label stored in each record's ``source`` field.

    Returns:
        The mapped dataset reduced to the ``messages`` and ``source`` columns.
    """
    def format_row(row):
        # Reasoning first, then a blank line, then the final answer.
        reply = f"<think>\n{row[reasoning_col]}\n</think>\n\n{row[answer_col]}"
        conversation = [
            {"role": "user", "content": row[prompt_col]},
            {"role": "assistant", "content": reply},
        ]
        return {"messages": conversation, "source": source}

    converted = dataset.map(format_row)
    return converted.select_columns(["messages", "source"])

# Convert dataset to OpenAI message format with customizable column names
dataset_fr_dolphin_openai_messages = convert_to_openai_messages(dataset_dolphin_reasoning,
                                                     prompt_col="fr_prompt",
                                                     reasoning_col="reasoning",
                                                     answer_col="fr_answer",
                                                    source="fr-dolphin-reasoning")

# Print results
print(dataset_fr_dolphin_openai_messages)
print(dataset_fr_dolphin_openai_messages[0])

## Fr-s1.1

In [54]:
from datasets import load_dataset
dataset_s1_reasoning = load_dataset("HoangHa/Pensez-v0.1", "fr-s1-reasoning-correct", split='train')

In [None]:
dataset_s1_reasoning.to_json("dataset.jsonl", orient="records", lines=True)

In [None]:
from datasets import load_dataset

target_text = """Particules non relativistes de masse m1, provenant d'une source à température T1, passent à travers un filtre qui sélectionne uniquement celles qui ont une vitesse proche de la moyenne (de toutes les particules de la source), puis elles illuminent une fente de largeur 2,5 micromètres. Sur un écran d'observation, de l'autre côté de la fente, une série de régions claires et sombres est observée, la première région claire (la plus proche du centre de la fente) étant située à un angle de 15 degrés. Plus tard, la même procédure est répétée, mais cette fois, la source est remplacée par une source contenant des particules de masse m2 = 2 * m1 et de température T2 ; la deuxième région claire apparaît maintenant à 20 degrés. Toutes les distances et les angles sont mesurés à partir de, ou par rapport à, une ligne reliant le centre de la fente et le point de l'écran situé juste en face. Quelle est la valeur du rapport T1/T2 ? \nChoix de réponses :\n(A) 2,2\n(B) 1,3\n(C) 3,5\n(D) 1,9"""

for i, entry in enumerate(dataset_s1_reasoning):
    if entry["fr_prompt"] == target_text:
        print("Index:", i)
        break

In [None]:
# remember to change system prompt
python sglang.py \
  --dataset-name "HoangHa/Pensez-v0.1" \
  --dataset-sub "fr-s1-reasoning-correct" \
  --output-file "fr-s1-reasoning-correct-question.jsonl" \
  --prompt-column "question" \
  --uuid-column "question" \
  --api-addr "127.0.0.1:30000" \
  --num-generations 1 \
  --max-tokens 16384 \
  --max-concurrent 200

In [None]:
python sglang.py \
  --dataset-name "HoangHa/Pensez-v0.1" \
  --dataset-sub "fr-s1-reasoning-correct" \
  --output-file "fr-s1-reasoning-correct-answer.jsonl" \
  --prompt-column "deepseek_attempt" \
  --uuid-column "deepseek_attempt" \
  --api-addr "127.0.0.1:30000" \
  --num-generations 1 \
  --max-tokens 16384 \
  --max-concurrent 200

In [None]:
from datasets import load_dataset

def process_generations_dataset(file_path, original_col, new_col):
    """Load a generations JSON-lines file and keep the source text plus its first generation.

    Args:
        file_path: Path handed to ``load_dataset('json', data_files=...)``.
        original_col: Column holding the text that was sent for generation; kept as-is.
        new_col: Column that receives the first entry of each row's ``generations``.

    Returns:
        The 'train' split reduced to the two columns ``[original_col, new_col]``.
    """
    def first_generation(record):
        # An empty (or otherwise falsy) generations value becomes an empty string.
        candidates = record['generations']
        if not candidates:
            return {new_col: ''}
        return {new_col: candidates[0]}

    loaded = load_dataset('json', data_files=file_path, split='train')
    with_first = loaded.map(first_generation)
    return with_first.select_columns([original_col, new_col])

# Process both datasets
dataset_s1_reasoning_question = process_generations_dataset('fr-s1-reasoning-correct-question.jsonl', 'question', 'fr_prompt')
dataset_s1_reasoning_answer = process_generations_dataset('fr-s1-reasoning-correct-answer.jsonl', 'deepseek_attempt', 'fr_answer')

# Print results
print(dataset_s1_reasoning_question)
print(dataset_s1_reasoning_question[0])
print(dataset_s1_reasoning_answer)
print(dataset_s1_reasoning_answer[0])

In [None]:
# Convert to dictionaries for fast lookup
question_dict = {x['question']: x['fr_prompt'] for x in dataset_s1_reasoning_question}
answer_dict = {x['deepseek_attempt']: x['fr_answer'] for x in dataset_s1_reasoning_answer}

# Merge based on exact match
def merge_rows(row):
    """Attach ``fr_prompt`` and ``fr_answer`` to a row by exact-text lookup.

    Reads the cell-level ``question_dict`` (keyed by ``question``) and
    ``answer_dict`` (keyed by ``deepseek_attempt``). Text that is not a key
    yields an empty string. The row is updated in place and returned.
    """
    translations = {
        'fr_prompt': question_dict.get(row['question'], ''),
        'fr_answer': answer_dict.get(row['deepseek_attempt'], ''),
    }
    for column, text in translations.items():
        row[column] = text
    return row

# Apply merging
dataset_s1_reasoning = dataset_s1_reasoning.map(merge_rows)

print(dataset_s1_reasoning)
dataset_s1_reasoning[0]

In [None]:
# dataset_s1_reasoning = dataset_s1_reasoning.remove_columns(["fr_deepseek_attempt", "fr_question"])
dataset_s1_reasoning.push_to_hub("HoangHa/Pensez-v0.1", "fr-s1-reasoning-correct")

In [None]:
def convert_to_openai_messages(dataset, prompt_col="prompt", reasoning_col="reasoning", answer_col="answer", source="default-source"):
    """Turn every row into a two-message (user, assistant) chat record.

    Args:
        dataset: Dataset exposing ``map`` and ``select_columns``.
        prompt_col: Column used as the user message.
        reasoning_col: Column wrapped in ``<think>`` tags at the start of the assistant message.
        answer_col: Column appended after the closing ``</think>`` tag.
        source: Label stored in each record's ``source`` field.

    Returns:
        The mapped dataset reduced to the ``messages`` and ``source`` columns.
    """
    def format_row(row):
        # Reasoning first, then a blank line, then the final answer.
        reply = f"<think>\n{row[reasoning_col]}\n</think>\n\n{row[answer_col]}"
        conversation = [
            {"role": "user", "content": row[prompt_col]},
            {"role": "assistant", "content": reply},
        ]
        return {"messages": conversation, "source": source}

    converted = dataset.map(format_row)
    return converted.select_columns(["messages", "source"])

# Convert dataset to OpenAI message format with a source label
dataset_fr_s1_openai_messages = convert_to_openai_messages(
    dataset_s1_reasoning,
    prompt_col="fr_prompt",
    reasoning_col="deepseek_thinking_trajectory",
    answer_col="fr_answer",
    source="fr-s1-reasoning-correct"
)

# Print results
print(dataset_fr_s1_openai_messages)
print(dataset_fr_s1_openai_messages[0])

## Fr-OpenMath-Currated

In [None]:
from datasets import load_dataset
dataset_openmath_reasoning = load_dataset("HoangHa/Pensez-v0.1", "fr-openmath-currated", split='train')
dataset_openmath_reasoning

In [None]:
from datasets import load_dataset
import re

def process_messages(example):
    """
    Split a conversation into 'question', 'output' and 'reasoning' columns.

    Messages are dicts read through their 'from' and 'value' keys. A user
    message sets the question. For an assistant message, the text between
    <think> and </think> becomes the reasoning, and the text after the first
    </think> (or the whole message when the tag is absent) becomes the output.
    When several messages share a role, the later one overwrites the earlier.

    Args:
        example: A single example from the dataset containing 'messages' field

    Returns:
        dict: Dictionary with new columns 'question', 'output', 'reasoning';
        all three are empty strings when 'messages' is missing or not a list.
    """
    extracted = {'question': "", 'output': "", 'reasoning': ""}

    messages = example['messages'] if 'messages' in example else None
    if not isinstance(messages, list):
        return extracted

    for message in messages:
        sender = message.get('from')
        if sender == 'user':
            extracted['question'] = message.get('value', '').strip()
            continue
        if sender != 'assistant':
            continue

        response = message.get('value', '')

        # Reasoning is only replaced when a complete <think>...</think> pair exists.
        think_span = re.search(r'<think>(.*?)</think>', response, re.DOTALL)
        if think_span is not None:
            extracted['reasoning'] = think_span.group(1).strip()

        # Everything after the first closing tag is the answer; no tag means
        # the whole response is the answer.
        _, closing_tag, tail = response.partition('</think>')
        answer_text = tail.strip() if closing_tag else response.strip()

        # Strip any leftover tags from malformed input.
        extracted['reasoning'] = re.sub(r'<think>|</think>', '', extracted['reasoning']).strip()
        extracted['output'] = re.sub(r'<think>|</think>', '', answer_text).strip()

    return extracted

# Load the dataset
dataset_openmath_reasoning = load_dataset("HoangHa/Pensez-v0.1", "fr-openmath-currated", split='train')

# Apply the processing function to the dataset
dataset_openmath_reasoning = dataset_openmath_reasoning.map(process_messages)

In [None]:
dataset_openmath_reasoning.push_to_hub("HoangHa/Pensez-v0.1","fr-openmath-currated")

In [None]:
from datasets import load_dataset

def process_generations_dataset(file_path, original_col, new_col):
    """Load a generations JSON-lines file and keep the source text plus its first generation.

    Args:
        file_path: Path handed to ``load_dataset('json', data_files=...)``.
        original_col: Column holding the text that was sent for generation; kept as-is.
        new_col: Column that receives the first entry of each row's ``generations``.

    Returns:
        The 'train' split reduced to the two columns ``[original_col, new_col]``.
    """
    def first_generation(record):
        # An empty (or otherwise falsy) generations value becomes an empty string.
        candidates = record['generations']
        if not candidates:
            return {new_col: ''}
        return {new_col: candidates[0]}

    loaded = load_dataset('json', data_files=file_path, split='train')
    with_first = loaded.map(first_generation)
    return with_first.select_columns([original_col, new_col])

# Process both datasets
dataset_openmath_reasoning_question = process_generations_dataset('fr-openmath-question.jsonl', 'question', 'fr_prompt')
dataset_openmath_reasoning_answer = process_generations_dataset('fr-openmath-answer.jsonl', 'output', 'fr_answer')

# Print results
print(dataset_openmath_reasoning_question)
print(dataset_openmath_reasoning_question[0])
print(dataset_openmath_reasoning_answer)
print(dataset_openmath_reasoning_answer[0])

In [None]:
# Convert to dictionaries for fast lookup
question_dict = {x['question']: x['fr_prompt'] for x in dataset_openmath_reasoning_question}
answer_dict = {x['output']: x['fr_answer'] for x in dataset_openmath_reasoning_answer}

# Merge based on exact match
def merge_rows(row):
    """Attach ``fr_prompt`` and ``fr_answer`` to a row by exact-text lookup.

    Reads the cell-level ``question_dict`` (keyed by ``question``) and
    ``answer_dict`` (keyed by ``output``). Text that is not a key yields an
    empty string. The row is updated in place and returned.
    """
    translations = {
        'fr_prompt': question_dict.get(row['question'], ''),
        'fr_answer': answer_dict.get(row['output'], ''),
    }
    for column, text in translations.items():
        row[column] = text
    return row

# Apply merging
dataset_openmath_reasoning = dataset_openmath_reasoning.map(merge_rows)

print(dataset_openmath_reasoning)
dataset_openmath_reasoning[0]

In [None]:
dataset_openmath_reasoning.push_to_hub("HoangHa/Pensez-v0.1", "fr-openmath-currated")

In [None]:
def convert_to_openai_messages(dataset, prompt_col="prompt", reasoning_col="reasoning", answer_col="answer", source="default-source"):
    """Turn every row into a two-message (user, assistant) chat record.

    Args:
        dataset: Dataset exposing ``map`` and ``select_columns``.
        prompt_col: Column used as the user message.
        reasoning_col: Column wrapped in ``<think>`` tags at the start of the assistant message.
        answer_col: Column appended after the closing ``</think>`` tag.
        source: Label stored in each record's ``source`` field.

    Returns:
        The mapped dataset reduced to the ``messages`` and ``source`` columns.
    """
    def format_row(row):
        # Reasoning first, then a blank line, then the final answer.
        reply = f"<think>\n{row[reasoning_col]}\n</think>\n\n{row[answer_col]}"
        conversation = [
            {"role": "user", "content": row[prompt_col]},
            {"role": "assistant", "content": reply},
        ]
        return {"messages": conversation, "source": source}

    converted = dataset.map(format_row)
    return converted.select_columns(["messages", "source"])

# Convert dataset to OpenAI message format with a source label
dataset_fr_openmath_openai_messages = convert_to_openai_messages(
    dataset_openmath_reasoning,
    prompt_col="fr_prompt",
    reasoning_col="reasoning",
    answer_col="fr_answer",
    source="fr-openmath-currated"
)

# Print results
print(dataset_fr_openmath_openai_messages)
print(dataset_fr_openmath_openai_messages[0])

## Fr-single

In [None]:
from datasets import load_dataset
dataset_single_fr = load_dataset("HoangHa/Pensez-v0.1", "fr-single", split='train')
dataset_single_fr

In [None]:
from datasets import load_dataset
import re

def process_messages(example):
    """
    Split a chat example into question, answer and reasoning, and build the 'combine' prompt.

    Messages are dicts read through their 'role' and 'content' keys. A user
    message sets the question. For an assistant message, the text between
    <think> and </think> becomes the reasoning, and the text after the first
    </think> (or the whole message when the tag is absent) becomes the answer.

    Args:
        example: A single example from the dataset containing 'messages' field

    Returns:
        dict: Dictionary with new columns 'question', 'answer', 'reasoning'
        and 'combine'. The same four keys are returned (built from empty
        strings) when 'messages' is missing or not a list, so every row
        produces the same schema.
    """
    # Defaults cover examples with no messages or no assistant turn; without
    # them the return statement would reference an unassigned name.
    question = ""
    answer = ""
    reasoning = ""

    messages = example['messages'] if 'messages' in example else None
    if isinstance(messages, list):
        for message in messages:
            role = message.get('role')
            if role == 'user':
                # Extract the question from user message
                question = message.get('content', '').strip()

            elif role == 'assistant':
                # Extract the full response from assistant
                response = message.get('content', '')

                # Reasoning is the text between <think> and </think>
                reasoning_match = re.search(r'<think>(.*?)</think>', response, re.DOTALL)
                if reasoning_match:
                    reasoning = reasoning_match.group(1).strip()

                # Answer is everything after the first </think>; without the
                # tag, take the whole response
                _, closing_tag, tail = response.partition('</think>')
                answer = tail.strip() if closing_tag else response.strip()

                # Clean up any residual tags (in case of malformed input)
                reasoning = re.sub(r'<think>|</think>', '', reasoning).strip()
                answer = re.sub(r'<think>|</think>', '', answer).strip()

    return {
        'question': question,
        'answer': answer,
        'reasoning': reasoning,
        # "\n" separators: the previous "\Answer" / "\Sure" were invalid
        # escapes that left literal backslashes in the prompt.
        'combine': f"Question:\n{question}\nAnswer:\n{answer}\nSure, this is my thinking process.\n",
    }

# Apply the processing function to the dataset
dataset_single_fr = dataset_single_fr.map(process_messages)
dataset_single_fr

In [None]:
dataset_single_fr[0]['combine']

In [None]:
dataset_single_fr.push_to_hub("HoangHa/Pensez-v0.1", "fr-single")

In [None]:
from datasets import load_dataset

def process_generations_dataset_combine(file_path, new_col):
    """Load a generations JSON-lines file and copy each row's first generation into ``new_col``.

    Args:
        file_path: Path handed to ``load_dataset('json', data_files=...)``.
        new_col: Column that receives the first entry of ``generations``
            (an empty string when that value is empty or falsy).

    Returns:
        The 'train' split with ``new_col`` set; all other columns are kept.
    """
    def first_generation(record):
        candidates = record['generations']
        if not candidates:
            return {new_col: ''}
        return {new_col: candidates[0]}

    return load_dataset('json', data_files=file_path, split='train').map(first_generation)

# Load the generations file; the first generation becomes the 'reasoning' column
dataset_single_fr_json = process_generations_dataset_combine('fr-single.jsonl', 'reasoning')

# Print results
print(dataset_single_fr_json)
dataset_single_fr_json[0]

In [None]:
dataset_single_fr_json.push_to_hub("HoangHa/Pensez-v0.1", "fr-single")

In [None]:
def convert_to_openai_messages(dataset, prompt_col="prompt", reasoning_col="reasoning", answer_col="answer", source="default-source"):
    """Turn every row into a two-message (user, assistant) chat record.

    Args:
        dataset: Dataset exposing ``map`` and ``select_columns``.
        prompt_col: Column used as the user message.
        reasoning_col: Column wrapped in ``<think>`` tags at the start of the assistant message.
        answer_col: Column appended after the closing ``</think>`` tag.
        source: Label stored in each record's ``source`` field.

    Returns:
        The mapped dataset reduced to the ``messages`` and ``source`` columns.
    """
    def format_row(row):
        # Reasoning first, then a blank line, then the final answer.
        reply = f"<think>\n{row[reasoning_col]}\n</think>\n\n{row[answer_col]}"
        conversation = [
            {"role": "user", "content": row[prompt_col]},
            {"role": "assistant", "content": reply},
        ]
        return {"messages": conversation, "source": source}

    converted = dataset.map(format_row)
    return converted.select_columns(["messages", "source"])

# Convert dataset to OpenAI message format with a source label
dataset_single_fr_openai_messages = convert_to_openai_messages(
    dataset_single_fr,
    prompt_col="question",
    reasoning_col="reasoning",
    answer_col="answer",
    source="fr-single"
)

# Print results
print(dataset_single_fr_openai_messages)
dataset_single_fr_openai_messages[0]

## Fr-llama-single

In [None]:
from datasets import load_dataset
dataset_llama_single_fr = load_dataset("HoangHa/Pensez-v0.1", "fr-llama-single", split='train')
dataset_llama_single_fr

In [None]:
def add_combine_column(example):
    """Add a ``combine`` prompt built from the row's instruction and response.

    The row is updated in place and returned.
    """
    instruction = example['instruction']
    response = example['response']
    example['combine'] = f"Question:\n{instruction}\nAnswer:\n{response}\nSure, this is my thinking process.\n"
    return example

# Apply the transformation to create the new column
dataset_llama_single_fr = dataset_llama_single_fr.map(add_combine_column)

# Check the result
print(dataset_llama_single_fr[0]['combine'])

In [None]:
from datasets import load_dataset

def process_generations_dataset_combine(file_path, new_col):
    """Load a generations JSON-lines file and copy each row's first generation into ``new_col``.

    Args:
        file_path: Path handed to ``load_dataset('json', data_files=...)``.
        new_col: Column that receives the first entry of ``generations``
            (an empty string when that value is empty or falsy).

    Returns:
        The 'train' split with ``new_col`` set; all other columns are kept.
    """
    def first_generation(record):
        candidates = record['generations']
        if not candidates:
            return {new_col: ''}
        return {new_col: candidates[0]}

    return load_dataset('json', data_files=file_path, split='train').map(first_generation)

# Load the generations file; the first generation becomes the 'reasoning' column
dataset_llama_single_fr_json = process_generations_dataset_combine('fr-llama-single.jsonl', 'reasoning')

# Print results
print(dataset_llama_single_fr_json)
dataset_llama_single_fr_json[0]

In [None]:
dataset_llama_single_fr_json.push_to_hub("HoangHa/Pensez-v0.1", "fr-llama-single")

In [None]:
def convert_to_openai_messages(dataset, prompt_col="prompt", reasoning_col="reasoning", answer_col="answer", source="default-source"):
    """Turn every row into a two-message (user, assistant) chat record.

    Args:
        dataset: Dataset exposing ``map`` and ``select_columns``.
        prompt_col: Column used as the user message.
        reasoning_col: Column wrapped in ``<think>`` tags at the start of the assistant message.
        answer_col: Column appended after the closing ``</think>`` tag.
        source: Label stored in each record's ``source`` field.

    Returns:
        The mapped dataset reduced to the ``messages`` and ``source`` columns.
    """
    def format_row(row):
        # Reasoning first, then a blank line, then the final answer.
        reply = f"<think>\n{row[reasoning_col]}\n</think>\n\n{row[answer_col]}"
        conversation = [
            {"role": "user", "content": row[prompt_col]},
            {"role": "assistant", "content": reply},
        ]
        return {"messages": conversation, "source": source}

    converted = dataset.map(format_row)
    return converted.select_columns(["messages", "source"])

# Convert dataset to OpenAI message format with a source label
dataset_single_llama_fr_openai_messages = convert_to_openai_messages(
    dataset_llama_single_fr,
    prompt_col="instruction",
    reasoning_col="reasoning",
    answer_col="response",
    source="fr-llama-single"
)

# Print results
print(dataset_single_llama_fr_openai_messages)
dataset_single_llama_fr_openai_messages[0]

## Fr-long

In [None]:
from datasets import load_dataset
dataset_long_fr = load_dataset("HoangHa/Pensez-v0.1", "fr-long", split='train')
dataset_long_fr

In [None]:
from datasets import load_dataset
import re

def process_messages(example):
    """
    Split a chat example into question, answer and reasoning, and build the 'combine' prompt.

    Messages are dicts read through their 'role' and 'content' keys. A user
    message sets the question. For an assistant message, the text between
    <think> and </think> becomes the reasoning, and the text after the first
    </think> (or the whole message when the tag is absent) becomes the answer.

    Args:
        example: A single example from the dataset containing 'messages' field

    Returns:
        dict: Dictionary with new columns 'question', 'answer', 'reasoning'
        and 'combine'. The same four keys are returned (built from empty
        strings) when 'messages' is missing or not a list, so every row
        produces the same schema.
    """
    # Defaults cover examples with no messages or no assistant turn; without
    # them the return statement would reference an unassigned name.
    question = ""
    answer = ""
    reasoning = ""

    messages = example['messages'] if 'messages' in example else None
    if isinstance(messages, list):
        for message in messages:
            role = message.get('role')
            if role == 'user':
                # Extract the question from user message
                question = message.get('content', '').strip()

            elif role == 'assistant':
                # Extract the full response from assistant
                response = message.get('content', '')

                # Reasoning is the text between <think> and </think>
                reasoning_match = re.search(r'<think>(.*?)</think>', response, re.DOTALL)
                if reasoning_match:
                    reasoning = reasoning_match.group(1).strip()

                # Answer is everything after the first </think>; without the
                # tag, take the whole response
                _, closing_tag, tail = response.partition('</think>')
                answer = tail.strip() if closing_tag else response.strip()

                # Clean up any residual tags (in case of malformed input)
                reasoning = re.sub(r'<think>|</think>', '', reasoning).strip()
                answer = re.sub(r'<think>|</think>', '', answer).strip()

    return {
        'question': question,
        'answer': answer,
        'reasoning': reasoning,
        # "\n" separators: the previous "\Answer" / "\Sure" were invalid
        # escapes that left literal backslashes in the prompt.
        'combine': f"Question:\n{question}\nAnswer:\n{answer}\nSure, this is my thinking process.\n",
    }

# Apply the processing function to the dataset
dataset_long_fr = dataset_long_fr.map(process_messages)
dataset_long_fr

In [None]:
dataset_long_fr.push_to_hub("HoangHa/Pensez-v0.1", "fr-long")

In [None]:
from datasets import load_dataset

def process_generations_dataset_combine(file_path, new_col):
    """Load a generations JSON-lines file and replace ``generations`` with its first entry.

    Args:
        file_path: Path handed to ``load_dataset('json', data_files=...)``.
        new_col: Column that receives the first entry of ``generations``
            (an empty string when that value is empty or falsy).

    Returns:
        The 'train' split with ``new_col`` set and the ``generations`` column removed.
    """
    def first_generation(record):
        candidates = record['generations']
        if not candidates:
            return {new_col: ''}
        return {new_col: candidates[0]}

    loaded = load_dataset('json', data_files=file_path, split='train')
    return loaded.map(first_generation).remove_columns("generations")

# Load the generations file; the first generation becomes the 'reasoning' column
dataset_long_fr_json = process_generations_dataset_combine('fr-long.jsonl', 'reasoning')
dataset_long_fr_json =  dataset_long_fr_json.remove_columns("combine")
# Print results
print(dataset_long_fr_json)
dataset_long_fr_json[0]

In [None]:
dataset_long_fr_json.push_to_hub("HoangHa/Pensez-v0.1", "fr-long")

In [None]:
def convert_to_openai_messages(dataset, prompt_col="prompt", reasoning_col="reasoning", answer_col="answer", source="default-source"):
    """Turn every row into a two-message (user, assistant) chat record.

    Args:
        dataset: Dataset exposing ``map`` and ``select_columns``.
        prompt_col: Column used as the user message.
        reasoning_col: Column wrapped in ``<think>`` tags at the start of the assistant message.
        answer_col: Column appended after the closing ``</think>`` tag.
        source: Label stored in each record's ``source`` field.

    Returns:
        The mapped dataset reduced to the ``messages`` and ``source`` columns.
    """
    def format_row(row):
        # Reasoning first, then a blank line, then the final answer.
        reply = f"<think>\n{row[reasoning_col]}\n</think>\n\n{row[answer_col]}"
        conversation = [
            {"role": "user", "content": row[prompt_col]},
            {"role": "assistant", "content": reply},
        ]
        return {"messages": conversation, "source": source}

    converted = dataset.map(format_row)
    return converted.select_columns(["messages", "source"])

# Convert dataset to OpenAI message format with a source label
dataset_long_fr_openai_messages = convert_to_openai_messages(
    dataset_long_fr,
    prompt_col="question",
    reasoning_col="reasoning",
    answer_col="answer",
    source="fr-long"
)

# Print results
print(dataset_long_fr_openai_messages)
dataset_long_fr_openai_messages[1]

## Fr-qwen-single

In [None]:
from datasets import load_dataset
dataset_qwen_single_fr = load_dataset("HoangHa/Pensez-v0.1", "fr-qwen-single", split='train')
dataset_qwen_single_fr

In [None]:
def add_combine_column(example):
    """Add a ``combine`` prompt built from the row's instruction and response.

    The row is updated in place and returned.
    """
    instruction = example['instruction']
    response = example['response']
    example['combine'] = f"Question:\n{instruction}\nAnswer:\n{response}\nSure, this is my thinking process.\n"
    return example

# Apply the transformation to create the new column
dataset_qwen_single_fr = dataset_qwen_single_fr.map(add_combine_column)

# Check the result
print(dataset_qwen_single_fr[0]['combine'])

In [None]:
dataset_qwen_single_fr.push_to_hub("HoangHa/Pensez-v0.1", "fr-qwen-single")

In [None]:
from datasets import load_dataset

def process_generations_dataset_combine(file_path, new_col):
    """Load a generations JSON-lines file and replace ``generations`` with its first entry.

    Args:
        file_path: Path handed to ``load_dataset('json', data_files=...)``.
        new_col: Column that receives the first entry of ``generations``
            (an empty string when that value is empty or falsy).

    Returns:
        The 'train' split with ``new_col`` set and the ``generations`` column removed.
    """
    def first_generation(record):
        candidates = record['generations']
        if not candidates:
            return {new_col: ''}
        return {new_col: candidates[0]}

    loaded = load_dataset('json', data_files=file_path, split='train')
    return loaded.map(first_generation).remove_columns("generations")

# Load the generations file; the first generation becomes the 'reasoning' column
dataset_qwen_single_fr_json = process_generations_dataset_combine('fr-qwen-single.jsonl', 'reasoning')
dataset_qwen_single_fr_json = dataset_qwen_single_fr_json.remove_columns("combine")
# Print results
print(dataset_qwen_single_fr_json)
dataset_qwen_single_fr_json[0]

In [None]:
dataset_qwen_single_fr_json.push_to_hub("HoangHa/Pensez-v0.1", "fr-qwen-single")

In [None]:
def convert_to_openai_messages(dataset, prompt_col="prompt", reasoning_col="reasoning", answer_col="answer", source="default-source"):
    """Turn every row into a two-message (user, assistant) chat record.

    Args:
        dataset: Dataset exposing ``map`` and ``select_columns``.
        prompt_col: Column used as the user message.
        reasoning_col: Column wrapped in ``<think>`` tags at the start of the assistant message.
        answer_col: Column appended after the closing ``</think>`` tag.
        source: Label stored in each record's ``source`` field.

    Returns:
        The mapped dataset reduced to the ``messages`` and ``source`` columns.
    """
    def format_row(row):
        # Reasoning first, then a blank line, then the final answer.
        reply = f"<think>\n{row[reasoning_col]}\n</think>\n\n{row[answer_col]}"
        conversation = [
            {"role": "user", "content": row[prompt_col]},
            {"role": "assistant", "content": reply},
        ]
        return {"messages": conversation, "source": source}

    converted = dataset.map(format_row)
    return converted.select_columns(["messages", "source"])

# Convert dataset to OpenAI message format with a source label
dataset_qwen_single_fr_openai_messages = convert_to_openai_messages(
    dataset_qwen_single_fr,
    prompt_col="instruction",
    reasoning_col="reasoning",
    answer_col="response",
    source="fr-qwen-single"
)

# Print results
print(dataset_qwen_single_fr_openai_messages)
dataset_qwen_single_fr_openai_messages[0]

## Fr-mt

In [None]:
from datasets import load_dataset
dataset_mt_fr = load_dataset("HoangHa/Pensez-v0.1", "fr-mt", split='train')
dataset_mt_fr

In [None]:
def add_combine_column(example):
    """Add a ``combine_1`` prompt built from the first-turn instruction and response.

    The row is updated in place and returned.
    """
    instruction = example['instruction']
    response = example['response']
    example['combine_1'] = f"Question:\n{instruction}\nAnswer:\n{response}\nSure, this is my thinking process.\n"
    return example

# Apply the transformation to create the new column
dataset_mt_fr_1 = dataset_mt_fr.map(add_combine_column)

# Check the result
print(dataset_mt_fr_1[0]['combine_1'])

In [None]:
dataset_mt_fr_1.push_to_hub("HoangHa/Pensez-v0.1", "fr-mt")

In [None]:
from datasets import load_dataset

def process_generations_dataset_combine(file_path, new_col):
    """Load a generations JSON-lines file and replace ``generations`` with its first entry.

    Args:
        file_path: Path handed to ``load_dataset('json', data_files=...)``.
        new_col: Column that receives the first entry of ``generations``
            (an empty string when that value is empty or falsy).

    Returns:
        The 'train' split with ``new_col`` set and the ``generations`` column removed.
    """
    def first_generation(record):
        candidates = record['generations']
        if not candidates:
            return {new_col: ''}
        return {new_col: candidates[0]}

    loaded = load_dataset('json', data_files=file_path, split='train')
    return loaded.map(first_generation).remove_columns("generations")

# Load the first-turn generations file; the first generation becomes 'reasoning_1'
dataset_mt_fr_1_json = process_generations_dataset_combine('fr-mt1.jsonl', 'reasoning_1')
dataset_mt_fr_1_json = dataset_mt_fr_1_json.remove_columns("combine_1")
# Print results
print(dataset_mt_fr_1_json)
dataset_mt_fr_1_json[0]

In [None]:
def add_combine_column(example):
    """Add a ``combine_2`` prompt covering both turns of the conversation.

    The first turn is rendered as question, first-turn reasoning, then answer;
    the second turn as question then answer. The row is updated in place and
    returned.
    """
    first_turn = f"Question:\n{example['instruction']}\n{example['reasoning_1']}\nAnswer:\n{example['response']}\n"
    second_turn = f"Question:\n{example['instruction_2']}\nAnswer:\n{example['response_2']}\n"
    example['combine_2'] = first_turn + second_turn + "Sure, this is my thinking process.\n"
    return example

# Apply the transformation to create the new column
dataset_mt_fr_2 = dataset_mt_fr_1_json.map(add_combine_column)

# Check the result
print(dataset_mt_fr_2[0]['combine_2'])

In [None]:
dataset_mt_fr_2.push_to_hub("HoangHa/Pensez-v0.1", "fr-mt")

In [None]:
from datasets import load_dataset

def process_generations_dataset_combine(file_path, new_col):
    """Load a generations JSON-lines file and replace ``generations`` with its first entry.

    Args:
        file_path: Path handed to ``load_dataset('json', data_files=...)``.
        new_col: Column that receives the first entry of ``generations``
            (an empty string when that value is empty or falsy).

    Returns:
        The 'train' split with ``new_col`` set and the ``generations`` column removed.
    """
    def first_generation(record):
        candidates = record['generations']
        if not candidates:
            return {new_col: ''}
        return {new_col: candidates[0]}

    loaded = load_dataset('json', data_files=file_path, split='train')
    return loaded.map(first_generation).remove_columns("generations")

# Load the second-turn generations file; the first generation becomes 'reasoning_2'
dataset_mt_fr_2_json = process_generations_dataset_combine('fr-mt2.jsonl', 'reasoning_2')
dataset_mt_fr_2_json = dataset_mt_fr_2_json.remove_columns("combine_2")
# Print results
print(dataset_mt_fr_2_json)
dataset_mt_fr_2_json[0]

In [None]:
dataset_mt_fr_2_json

In [None]:
dataset_mt_fr_2_json.push_to_hub("HoangHa/Pensez-v0.1", "fr-mt")

In [None]:
def convert_to_openai_messages_multiturn(
    dataset,
    source_label="fr-mt"
):
    """Turn every row into a chat record of up to two user/assistant turns.

    A turn is emitted only when its user column is present and truthy. The
    assistant message is added when its response column is present; its
    reasoning, when present and truthy, is prepended inside ``<think>`` tags.

    Args:
        dataset: Dataset exposing ``map`` and ``select_columns``.
        source_label: Label stored in each record's ``source`` field.

    Returns:
        The mapped dataset reduced to the ``messages`` and ``source`` columns.
    """
    # (user column, assistant column, reasoning column) for each turn, in order.
    turn_columns = (
        ("instruction", "response", "reasoning_1"),
        ("instruction_2", "response_2", "reasoning_2"),
    )

    def format_row(row):
        messages = []
        for user_col, reply_col, reasoning_col in turn_columns:
            if not (user_col in row and row[user_col]):
                continue
            messages.append({"role": "user", "content": row[user_col]})

            if reply_col not in row:
                continue
            has_reasoning = reasoning_col in row and row[reasoning_col]
            prefix = f"<think>\n{row[reasoning_col]}\n</think>\n\n" if has_reasoning else ""
            messages.append({"role": "assistant", "content": prefix + f"{row[reply_col]}"})

        return {"messages": messages, "source": source_label}

    return dataset.map(format_row).select_columns(["messages", "source"])

# Example usage:
# Convert the dataset that carries the reasoning_1 / reasoning_2 columns built
# in the cells above. dataset_mt_fr was loaded before those columns were
# generated here and may not contain them; the converter silently omits
# missing reasoning, so the wrong input would not raise.
dataset_mt_fr_openai_messages = convert_to_openai_messages_multiturn(
    dataset_mt_fr_2_json,
    source_label="fr-mt"
)

In [None]:
dataset_mt_fr_openai_messages[10]

# English

## En-limo (Gen output)

In [None]:
from datasets import load_dataset
dataset_limo_en = load_dataset("HoangHa/Pensez-v0.1", "en-limo", split='train')
dataset_limo_en

In [None]:
dataset_limo_en[0]

In [None]:
def add_combine_column(example):
    """Add a ``combine`` prompt built from the row's question and solution.

    ``combine`` is set to None when either ``question`` or ``solution`` is
    None. The row is updated in place and returned.
    """
    if example['question'] is None or example['solution'] is None:
        example['combine'] = None
    else:
        # "\nThinking process": the previous "\Thinking" was an invalid escape
        # that put a literal backslash in the prompt instead of a newline.
        # NOTE(review): "my the conclusion" reads like a typo for "my conclusion";
        # left unchanged because it is part of the generation prompt.
        example['combine'] = f"Question:\n{example['question']}\nThinking process:\n{example['solution']}\nSure, this is my the conclusion.\n"
    return example
# Apply the transformation to create the new column
dataset_limo_en = dataset_limo_en.map(add_combine_column)

# Check the result
print(dataset_limo_en[0]['combine'])

from datasets import load_dataset
from transformers import AutoTokenizer

def count_tokens(dataset, model_path="HoangHa/Pensez-v0.1-init"):
    """Add a ``token_count`` column: the number of tokens in question + solution.

    Args:
        dataset: Dataset exposing ``map``; rows are read through their
            ``question`` and ``solution`` fields.
        model_path: Identifier passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        The result of ``dataset.map`` with ``token_count`` added to every row.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    def tokenize_and_count(example):
        # A None field counts as empty text, so a partially filled row gets a
        # count instead of raising TypeError on str + None.
        text = (example["question"] or "") + (example["solution"] or "")
        return {"token_count": len(tokenizer(text).input_ids)}

    return dataset.map(tokenize_and_count)

# Load and process the dataset
dataset_with_token_counts = count_tokens(dataset_limo_en)
print(dataset_with_token_counts)

max(dataset_with_token_counts['token_count'])

dataset_with_token_counts.push_to_hub("HoangHa/Pensez-v0.1", "en-limo")

In [None]:
# generate + remember to change system prompt
python sglang.py \
  --dataset-name "HoangHa/Pensez-v0.1" \
  --dataset-sub "en-limo" \
  --output-file "en-limo.jsonl" \
  --prompt-column "combine" \
  --uuid-column "combine" \
  --api-addr "127.0.0.1:30000" \
  --num-generations 1 \
  --max-tokens 16384 \
  --max-concurrent 200

In [None]:
from datasets import load_dataset

def process_generations_dataset_combine(file_path, new_col):
    """
    Load a JSON generations file and keep only the first generation per row.

    Args:
        file_path: Path handed to load_dataset('json', data_files=...).
        new_col: Name of the column that receives the first entry of
            'generations' ('' when 'generations' is empty or falsy).

    Returns:
        The 'train' split with new_col added and 'generations' removed.
    """
    def first_generation(record):
        candidates = record['generations']
        return {new_col: candidates[0] if candidates else ''}

    loaded = load_dataset('json', data_files=file_path, split='train')
    return loaded.map(first_generation).remove_columns("generations")

# Process both datasets
dataset_limo_en_json = process_generations_dataset_combine('en-limo-new.jsonl', 'output')
dataset_limo_en_json = dataset_limo_en_json.remove_columns("combine")
# Print results
print(dataset_limo_en_json)
dataset_limo_en_json[0]

In [None]:
dataset_limo_en_json.push_to_hub("HoangHa/Pensez-v0.1", "en-limo")

In [None]:
def convert_to_openai_messages(dataset, prompt_col="prompt", reasoning_col="reasoning", answer_col="answer", source="default-source"):
    """
    Convert single-turn rows into a two-message chat plus a source label.

    Args:
        dataset: Object exposing .map(fn) and .select_columns(list).
        prompt_col: Column whose value becomes the user message.
        reasoning_col: Column wrapped in <think>...</think> in the assistant message.
        answer_col: Column appended after the closing </think> tag.
        source: Label stored in the 'source' column of every row.

    Returns:
        The mapped dataset restricted to the 'messages' and 'source' columns.
    """
    def format_row(row):
        user_turn = {"role": "user", "content": row[prompt_col]}
        assistant_turn = {
            "role": "assistant",
            "content": f"<think>\n{row[reasoning_col]}\n</think>\n\n{row[answer_col]}",
        }
        return {"messages": [user_turn, assistant_turn], "source": source}

    converted = dataset.map(format_row)
    return converted.select_columns(["messages", "source"])

# Convert dataset to OpenAI message format with a source label
dataset_limo_en_openai_messages = convert_to_openai_messages(
    dataset_limo_en_json,
    prompt_col="question",
    reasoning_col="solution",
    answer_col="output",
    source="en-limo"
)

# Print results
print(dataset_limo_en_openai_messages)
dataset_limo_en_openai_messages[0]

## En-long

### Done

In [None]:
from datasets import load_dataset
dataset_long_en = load_dataset("HoangHa/Pensez-v0.1", "en-long", split='train')

In [None]:
dataset_long_en[0]['reasoning']

In [None]:
from datasets import load_dataset
import re

def process_messages(example):
    """
    Flatten the 'messages' field into 'question', 'answer', 'reasoning' and 'combine' columns.

    The last user message becomes the question. For an assistant message, the text
    between <think> and </think> becomes the reasoning, and everything after the
    first </think> becomes the answer (the whole message when there is no </think>).

    Args:
        example: A single example from the dataset containing a 'messages' field

    Returns:
        dict: Always the same four keys -- 'question', 'answer', 'reasoning',
        'combine' -- so every row yields the same schema. 'combine' is the prompt
        built from the question and answer, or '' when 'messages' is missing or
        is not a list.
    """
    close_tag = '</think>'

    # Initialize default values so every name used in the return is always bound
    question = ""
    output = ""
    reasoning = ""

    # Check if 'messages' exists and is a list
    if 'messages' not in example or not isinstance(example['messages'], list):
        return {'question': question, 'answer': output, 'reasoning': reasoning, 'combine': ""}

    # Process each message in the list (later messages overwrite earlier ones)
    for message in example['messages']:
        role = message.get('role')
        if role == 'user':
            # Extract the question from user message; tolerate a None content
            question = (message.get('content') or '').strip()

        elif role == 'assistant':
            # Extract the full response from assistant; tolerate a None content
            response = message.get('content') or ''

            # Reasoning is the text between <think> and </think>
            reasoning_match = re.search(r'<think>(.*?)</think>', response, re.DOTALL)
            if reasoning_match:
                reasoning = reasoning_match.group(1).strip()

            # Output is everything after </think>, if it exists
            output_start = response.find(close_tag)
            if output_start != -1:
                output = response[output_start + len(close_tag):].strip()
            else:
                output = response.strip()  # If no </think>, take the whole response

            # Clean up any residual tags (in case of malformed input)
            reasoning = re.sub(r'<think>|</think>', '', reasoning).strip()
            output = re.sub(r'<think>|</think>', '', output).strip()

    return {
        'question': question,
        'answer': output,
        'reasoning': reasoning,
        # Each section header starts on its own line ("\n", not an invalid "\A"/"\S" escape)
        'combine': "Question:\n" + question + "\nAnswer:\n" + output + "\nSure, this is my thinking process.\n"
    }

# Apply the processing function to the dataset
dataset_long_en = dataset_long_en.map(process_messages)
dataset_long_en

In [None]:
dataset_long_en.push_to_hub("HoangHa/Pensez-v0.1", "en-long")

### WIP

In [None]:
python sglang.py \
  --dataset-name "HoangHa/Pensez-v0.1" \
  --dataset-sub "en-long" \
  --output-file "en-long.jsonl" \
  --prompt-column "combine" \
  --uuid-column "combine" \
  --api-addr "127.0.0.1:30000" \
  --num-generations 1 \
  --max-tokens 16384 \
  --max-concurrent 200

In [None]:
from datasets import load_dataset

def process_generations_dataset_combine(file_path, new_col):
    """
    Load a JSON generations file and keep only the first generation per row.

    Args:
        file_path: Path handed to load_dataset('json', data_files=...).
        new_col: Name of the column that receives the first entry of
            'generations' ('' when 'generations' is empty or falsy).

    Returns:
        The 'train' split with new_col added and 'generations' removed.
    """
    def first_generation(record):
        candidates = record['generations']
        return {new_col: candidates[0] if candidates else ''}

    loaded = load_dataset('json', data_files=file_path, split='train')
    return loaded.map(first_generation).remove_columns("generations")

# Process both datasets
dataset_long_en_json = process_generations_dataset_combine('en-long.jsonl', 'reasoning')
dataset_long_en_json =  dataset_long_en_json.remove_columns("combine")
# Print results
print(dataset_long_en_json)
dataset_long_en_json[0]

In [None]:
dataset_long_en.push_to_hub("HoangHa/Pensez-v0.1", "en-long")

In [None]:
def convert_to_openai_messages(dataset, prompt_col="prompt", reasoning_col="reasoning", answer_col="answer", source="default-source"):
    """
    Convert single-turn rows into a two-message chat plus a source label.

    Args:
        dataset: Object exposing .map(fn) and .select_columns(list).
        prompt_col: Column whose value becomes the user message.
        reasoning_col: Column wrapped in <think>...</think> in the assistant message.
        answer_col: Column appended after the closing </think> tag.
        source: Label stored in the 'source' column of every row.

    Returns:
        The mapped dataset restricted to the 'messages' and 'source' columns.
    """
    def format_row(row):
        user_turn = {"role": "user", "content": row[prompt_col]}
        assistant_turn = {
            "role": "assistant",
            "content": f"<think>\n{row[reasoning_col]}\n</think>\n\n{row[answer_col]}",
        }
        return {"messages": [user_turn, assistant_turn], "source": source}

    converted = dataset.map(format_row)
    return converted.select_columns(["messages", "source"])

# Convert dataset to OpenAI message format with a source label
dataset_long_en_openai_messages = convert_to_openai_messages(
    dataset_long_en,
    prompt_col="question",
    reasoning_col="reasoning",
    answer_col="answer",
    source="en-long"
)

# Print results
print(dataset_long_en_openai_messages)
dataset_long_en_openai_messages[0]

## En-single

### Done

In [None]:
from datasets import load_dataset
dataset_single_en = load_dataset("HoangHa/Pensez-v0.1", "en-single", split='train')
dataset_single_en

In [None]:
from datasets import load_dataset
import re

def process_messages(example):
    """
    Flatten the 'messages' field into 'question', 'answer', 'reasoning' and 'combine' columns.

    The last user message becomes the question. For an assistant message, the text
    between <think> and </think> becomes the reasoning, and everything after the
    first </think> becomes the answer (the whole message when there is no </think>).

    Args:
        example: A single example from the dataset containing a 'messages' field

    Returns:
        dict: Always the same four keys -- 'question', 'answer', 'reasoning',
        'combine' -- so every row yields the same schema. 'combine' is the prompt
        built from the question and answer, or '' when 'messages' is missing or
        is not a list.
    """
    close_tag = '</think>'

    # Initialize default values so every name used in the return is always bound
    question = ""
    output = ""
    reasoning = ""

    # Check if 'messages' exists and is a list
    if 'messages' not in example or not isinstance(example['messages'], list):
        return {'question': question, 'answer': output, 'reasoning': reasoning, 'combine': ""}

    # Process each message in the list (later messages overwrite earlier ones)
    for message in example['messages']:
        role = message.get('role')
        if role == 'user':
            # Extract the question from user message; tolerate a None content
            question = (message.get('content') or '').strip()

        elif role == 'assistant':
            # Extract the full response from assistant; tolerate a None content
            response = message.get('content') or ''

            # Reasoning is the text between <think> and </think>
            reasoning_match = re.search(r'<think>(.*?)</think>', response, re.DOTALL)
            if reasoning_match:
                reasoning = reasoning_match.group(1).strip()

            # Output is everything after </think>, if it exists
            output_start = response.find(close_tag)
            if output_start != -1:
                output = response[output_start + len(close_tag):].strip()
            else:
                output = response.strip()  # If no </think>, take the whole response

            # Clean up any residual tags (in case of malformed input)
            reasoning = re.sub(r'<think>|</think>', '', reasoning).strip()
            output = re.sub(r'<think>|</think>', '', output).strip()

    return {
        'question': question,
        'answer': output,
        'reasoning': reasoning,
        # Each section header starts on its own line ("\n", not an invalid "\A"/"\S" escape)
        'combine': "Question:\n" + question + "\nAnswer:\n" + output + "\nSure, this is my thinking process.\n"
    }

# Apply the processing function to the dataset
dataset_single_en = dataset_single_en.map(process_messages)
dataset_single_en


dataset_single_en.push_to_hub("HoangHa/Pensez-v0.1", "en-single")

### WIP

In [None]:
python sglang.py \
  --dataset-name "HoangHa/Pensez-v0.1" \
  --dataset-sub "en-single" \
  --output-file "en-single.jsonl" \
  --prompt-column "combine" \
  --uuid-column "combine" \
  --api-addr "127.0.0.1:30000" \
  --num-generations 1 \
  --max-tokens 16384 \
  --max-concurrent 200

In [None]:
from datasets import load_dataset

def process_generations_dataset_combine(file_path, new_col):
    """
    Load a JSON generations file and keep only the first generation per row.

    Args:
        file_path: Path handed to load_dataset('json', data_files=...).
        new_col: Name of the column that receives the first entry of
            'generations' ('' when 'generations' is empty or falsy).

    Returns:
        The 'train' split with new_col added and 'generations' removed.
    """
    def first_generation(record):
        candidates = record['generations']
        return {new_col: candidates[0] if candidates else ''}

    loaded = load_dataset('json', data_files=file_path, split='train')
    return loaded.map(first_generation).remove_columns("generations")

# Process both datasets
dataset_single_en_json = process_generations_dataset_combine('en-single.jsonl', 'reasoning')
dataset_single_en_json =  dataset_single_en_json.remove_columns("combine")
# Print results
print(dataset_single_en_json)
dataset_single_en_json[0]

In [None]:
dataset_single_en_json.push_to_hub("HoangHa/Pensez-v0.1", "en-single")

In [None]:
def convert_to_openai_messages(dataset, prompt_col="prompt", reasoning_col="reasoning", answer_col="answer", source="default-source"):
    """
    Convert single-turn rows into a two-message chat plus a source label.

    Args:
        dataset: Object exposing .map(fn) and .select_columns(list).
        prompt_col: Column whose value becomes the user message.
        reasoning_col: Column wrapped in <think>...</think> in the assistant message.
        answer_col: Column appended after the closing </think> tag.
        source: Label stored in the 'source' column of every row.

    Returns:
        The mapped dataset restricted to the 'messages' and 'source' columns.
    """
    def format_row(row):
        # The answer follows the closing tag directly; no stray "<" is prepended,
        # matching the other convert_to_openai_messages cells in this notebook.
        assistant_content = f"<think>\n{row[reasoning_col]}\n</think>\n\n{row[answer_col]}"

        messages = [
            {"role": "user", "content": row[prompt_col]},
            {"role": "assistant", "content": assistant_content}
        ]
        return {"messages": messages, "source": source}

    return dataset.map(format_row).select_columns(["messages", "source"])

# Convert dataset to OpenAI message format with a source label
dataset_single_en_openai_messages = convert_to_openai_messages(
    dataset_single_en,
    prompt_col="question",
    reasoning_col="reasoning",
    answer_col="answer",
    source="en-single"
)

# Print results
print(dataset_single_en_openai_messages)
dataset_single_en_openai_messages[0]

## En-single-reasoning-correct

In [None]:
from datasets import load_dataset
dataset_single_reasoning_en = load_dataset("HoangHa/Pensez-v0.1", "en-single-reasoning-correct", split='train')
# dataset_single_reasoning_en = dataset_single_reasoning_en.rename_column("messages", "prompt")

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

def count_tokens(dataset, model_path="HoangHa/Pensez-v0.1-init"):
    """
    Add a 'token_count' column holding the tokenized length of prompt + reasoning + answer.

    Args:
        dataset: Dataset whose rows have 'prompt', 'reasoning' and 'answer' string fields.
        model_path: Name or path passed to AutoTokenizer.from_pretrained.

    Returns:
        The result of dataset.map with the extra 'token_count' column.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    def tokenize_and_count(example):
        # The three fields are concatenated with no separator before tokenizing
        encoded = tokenizer(example["prompt"] + example["reasoning"] + example["answer"])
        n_tokens = len(encoded.input_ids)
        return {"token_count": n_tokens}

    counted = dataset.map(tokenize_and_count)
    return counted

# Load and process the dataset
dataset_with_token_counts = count_tokens(dataset_single_reasoning_en)
print(dataset_with_token_counts)

In [None]:
dataset_with_token_counts.push_to_hub("HoangHa/Pensez-v0.1", "en-single-reasoning-correct")

In [None]:
def convert_to_openai_messages(dataset, prompt_col="prompt", reasoning_col="reasoning", answer_col="answer", source="default-source"):
    """
    Convert single-turn rows into a two-message chat plus a source label.

    Args:
        dataset: Object exposing .map(fn) and .select_columns(list).
        prompt_col: Column whose value becomes the user message.
        reasoning_col: Column wrapped in <think>...</think> in the assistant message.
        answer_col: Column appended after the closing </think> tag.
        source: Label stored in the 'source' column of every row.

    Returns:
        The mapped dataset restricted to the 'messages' and 'source' columns.
    """
    def format_row(row):
        user_turn = {"role": "user", "content": row[prompt_col]}
        assistant_turn = {
            "role": "assistant",
            "content": f"<think>\n{row[reasoning_col]}\n</think>\n\n{row[answer_col]}",
        }
        return {"messages": [user_turn, assistant_turn], "source": source}

    converted = dataset.map(format_row)
    return converted.select_columns(["messages", "source"])

# Convert dataset to OpenAI message format with a source label
dataset_single_reasoning_en_openai_messages = convert_to_openai_messages(
    dataset_single_reasoning_en,
    prompt_col="prompt",
    reasoning_col="reasoning",
    answer_col="answer",
    source="en-single-reasoning-correct"
)

# Print results
print(dataset_single_reasoning_en_openai_messages)
dataset_single_reasoning_en_openai_messages[0]

## En-mt (gen think)

In [None]:
from datasets import load_dataset
dataset_en_mt = load_dataset("HoangHa/Pensez-v0.1", "en-mt", split='train')
dataset_en_mt

### Turn 1

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

def add_combine_column(example):
    """
    Add the turn-1 prompt column 'combine_1' to a row.

    'combine_1' is None when 'instruction_1' or 'answer_1' is None; otherwise it
    is the question/answer prompt text. The row is modified and returned.
    """
    incomplete = any(example[key] is None for key in ('instruction_1', 'answer_1'))
    if incomplete:
        example['combine_1'] = None
        return example

    example['combine_1'] = f"Question:\n{example['instruction_1']}\nAnswer:\n{example['answer_1']}\nSure, this is my thinking process.\n"
    return example

# Apply the transformation to create the new column
dataset_en_mt_1 = dataset_en_mt.map(add_combine_column)

# Check the result
print(dataset_en_mt_1[0]['combine_1'])

In [None]:
dataset_en_mt_1.push_to_hub("HoangHa/Pensez-v0.1", "en-mt")

In [None]:
# generate + remember to change system prompt
python sglang.py \
  --dataset-name "HoangHa/Pensez-v0.1" \
  --dataset-sub "en-mt" \
  --output-file "en-mt-1.jsonl" \
  --prompt-column "combine_1" \
  --uuid-column "combine_1" \
  --api-addr "127.0.0.1:30000" \
  --num-generations 1 \
  --max-tokens 16384 \
  --max-concurrent 200

In [None]:
from datasets import load_dataset

def process_generations_dataset_combine(file_path, new_col):
    """
    Load a JSON generations file and keep only the first generation per row.

    Args:
        file_path: Path handed to load_dataset('json', data_files=...).
        new_col: Name of the column that receives the first entry of
            'generations' ('' when 'generations' is empty or falsy).

    Returns:
        The 'train' split with new_col added and 'generations' removed.
    """
    def first_generation(record):
        candidates = record['generations']
        return {new_col: candidates[0] if candidates else ''}

    loaded = load_dataset('json', data_files=file_path, split='train')
    return loaded.map(first_generation).remove_columns("generations")

# Process both datasets
dataset_mt_en_1_json = process_generations_dataset_combine('en-mt-1.jsonl', 'reasoning_1')
dataset_mt_en_1_json = dataset_mt_en_1_json.remove_columns("combine_1")
# Print results
print(dataset_mt_en_1_json)
dataset_mt_en_1_json[0]

### Turn 2

In [None]:
def add_combine_column(example):
    """
    Add the turn-2 prompt column 'combine_2' to a row.

    'combine_2' is None when any of the turn-1 fields (instruction, reasoning,
    answer) or turn-2 fields (instruction, answer) is None; otherwise it is the
    two-turn prompt text. The row is modified and returned.
    """
    required_keys = ('instruction_1', 'reasoning_1', 'answer_1', 'instruction_2', 'answer_2')
    for key in required_keys:
        if example[key] is None:
            example['combine_2'] = None
            return example

    example['combine_2'] = f"Question:\n{example['instruction_1']}\nAnswer:\n{example['reasoning_1']}\n{example['answer_1']}\nQuestion:\n{example['instruction_2']}\nAnswer:\n{example['answer_2']}\nSure, this is my thinking process.\n"
    return example


# Apply the transformation to create the new column
dataset_mt_fr_2 = dataset_mt_fr_1_json.map(add_combine_column)

# Check the result
print(dataset_mt_fr_2[0]['combine_2'])

In [None]:
dataset_mt_fr_2.push_to_hub("HoangHa/Pensez-v0.1", "en-mt")

In [None]:
# generate + remember to change system prompt
python sglang.py \
  --dataset-name "HoangHa/Pensez-v0.1" \
  --dataset-sub "en-mt" \
  --output-file "en-mt-2.jsonl" \
  --prompt-column "combine_2" \
  --uuid-column "combine_2" \
  --api-addr "127.0.0.1:30000" \
  --num-generations 1 \
  --max-tokens 16384 \
  --max-concurrent 200

In [None]:
from datasets import load_dataset

def process_generations_dataset_combine(file_path, new_col):
    """
    Load a JSON generations file and keep only the first generation per row.

    Args:
        file_path: Path handed to load_dataset('json', data_files=...).
        new_col: Name of the column that receives the first entry of
            'generations' ('' when 'generations' is empty or falsy).

    Returns:
        The 'train' split with new_col added and 'generations' removed.
    """
    def first_generation(record):
        candidates = record['generations']
        return {new_col: candidates[0] if candidates else ''}

    loaded = load_dataset('json', data_files=file_path, split='train')
    return loaded.map(first_generation).remove_columns("generations")

# Process both datasets
dataset_mt_en_2_json = process_generations_dataset_combine('en-mt-2.jsonl', 'reasoning_2')
dataset_mt_en_2_json = dataset_mt_en_2_json.remove_columns("combine_2")
# Print results
print(dataset_mt_en_2_json)
dataset_mt_en_2_json[0]

### Turn 3

In [None]:
def add_combine_column_3(example):
    """
    Add the turn-3 prompt column 'combine_3' to a row.

    'combine_3' is None when any field of turns 1-2 (instruction, reasoning,
    answer) or the turn-3 instruction/answer is None; otherwise it is the
    three-turn prompt text. The row is modified and returned.
    """
    required_keys = (
        'instruction_1', 'reasoning_1', 'answer_1',
        'instruction_2', 'reasoning_2', 'answer_2',
        'instruction_3', 'answer_3',
    )
    for key in required_keys:
        if example[key] is None:
            example['combine_3'] = None
            return example

    example['combine_3'] = f"Question:\n{example['instruction_1']}\nAnswer:\n{example['reasoning_1']}\n{example['answer_1']}\nQuestion:\n{example['instruction_2']}\nAnswer:\n{example['reasoning_2']}\n{example['answer_2']}\nQuestion:\n{example['instruction_3']}\nAnswer:\n{example['answer_3']}\nSure, this is my thinking process.\n"
    return example


# Apply the transformation to create the new column
dataset_mt_en_3 = dataset_mt_en_2_json.map(add_combine_column_3)

# Check the result
print(dataset_mt_en_3[0]['combine_3'])

In [None]:
dataset_mt_en_3[50]['combine_3']

In [None]:
dataset_mt_en_3.push_to_hub("HoangHa/Pensez-v0.1", "en-mt")

In [None]:
# generate + remember to change system prompt
python sglang.py \
  --dataset-name "HoangHa/Pensez-v0.1" \
  --dataset-sub "en-mt" \
  --output-file "en-mt-3.jsonl" \
  --prompt-column "combine_3" \
  --uuid-column "combine_3" \
  --api-addr "127.0.0.1:30000" \
  --num-generations 1 \
  --max-tokens 16384 \
  --max-concurrent 200

In [None]:
# generate + remember to change system prompt
python sglang.py \
  --dataset-name "OpenMedical/m1-stage1-medical-reasoning" \
  --dataset-sub "r1-32b" \
  --output-file "question.jsonl" \
  --prompt-column "question" \
  --uuid-column "question" \
  --api-addr "127.0.0.1:30000" \
  --num-generations 1 \
  --max-tokens 2048 \
  --max-concurrent 200

In [None]:
from datasets import load_dataset

def process_generations_dataset_combine(file_path, new_col):
    """
    Load a JSON generations file and keep only the first generation per row.

    Args:
        file_path: Path handed to load_dataset('json', data_files=...).
        new_col: Name of the column that receives the first entry of
            'generations' ('' when 'generations' is empty or falsy).

    Returns:
        The 'train' split with new_col added and 'generations' removed.
    """
    def first_generation(record):
        candidates = record['generations']
        return {new_col: candidates[0] if candidates else ''}

    loaded = load_dataset('json', data_files=file_path, split='train')
    return loaded.map(first_generation).remove_columns("generations")

# Process both datasets
dataset_mt_en_3_json = process_generations_dataset_combine('en-mt-3.jsonl', 'reasoning_3')
dataset_mt_en_3_json = dataset_mt_en_3_json.remove_columns("combine_3")
# Print results
print(dataset_mt_en_3_json)
dataset_mt_en_3_json[0]

### Turn 4

In [None]:
def add_combine_column_4(example):
    """
    Add the turn-4 prompt column 'combine_4' to a row.

    'combine_4' is None when any field of turns 1-3 (instruction, reasoning,
    answer) or the turn-4 instruction/answer is None; otherwise it is the
    four-turn prompt text. The row is modified and returned.
    """
    required_keys = (
        'instruction_1', 'reasoning_1', 'answer_1',
        'instruction_2', 'reasoning_2', 'answer_2',
        'instruction_3', 'reasoning_3', 'answer_3',
        'instruction_4', 'answer_4',
    )
    for key in required_keys:
        if example[key] is None:
            example['combine_4'] = None
            return example

    example['combine_4'] = f"Question:\n{example['instruction_1']}\nAnswer:\n{example['reasoning_1']}\n{example['answer_1']}\nQuestion:\n{example['instruction_2']}\nAnswer:\n{example['reasoning_2']}\n{example['answer_2']}\nQuestion:\n{example['instruction_3']}\nAnswer:\n{example['reasoning_3']}\n{example['answer_3']}\nQuestion:\n{example['instruction_4']}\nAnswer:\n{example['answer_4']}\nSure, this is my thinking process.\n"
    return example


# Apply the transformation to create the new column
dataset_mt_en_4 = dataset_mt_en_3_json.map(add_combine_column_4)

# Check the result
print(dataset_mt_en_4[0]['combine_4'])

In [None]:
dataset_mt_en_4[50]

In [None]:
# Push the dataset that carries 'combine_4' (the prompt column the next
# generation step reads); dataset_mt_en_4_json is only created after generation.
dataset_mt_en_4.push_to_hub("HoangHa/Pensez-v0.1", "en-mt")

In [None]:
# generate + remember to change system prompt
python sglang.py \
  --dataset-name "HoangHa/Pensez-v0.1" \
  --dataset-sub "en-mt" \
  --output-file "en-mt-4.jsonl" \
  --prompt-column "combine_4" \
  --uuid-column "combine_4" \
  --api-addr "127.0.0.1:30000" \
  --num-generations 1 \
  --max-tokens 16384 \
  --max-concurrent 200

In [None]:
from datasets import load_dataset

def process_generations_dataset_combine(file_path, new_col):
    """
    Load a JSON generations file and keep only the first generation per row.

    Args:
        file_path: Path handed to load_dataset('json', data_files=...).
        new_col: Name of the column that receives the first entry of
            'generations' ('' when 'generations' is empty or falsy).

    Returns:
        The 'train' split with new_col added and 'generations' removed.
    """
    def first_generation(record):
        candidates = record['generations']
        return {new_col: candidates[0] if candidates else ''}

    loaded = load_dataset('json', data_files=file_path, split='train')
    return loaded.map(first_generation).remove_columns("generations")

# Process both datasets
dataset_mt_en_4_json = process_generations_dataset_combine('en-mt-4.jsonl', 'reasoning_4')
dataset_mt_en_4_json = dataset_mt_en_4_json.remove_columns("combine_4")
# Print results
print(dataset_mt_en_4_json)
# dataset_mt_en_4_json[0]

In [None]:
def convert_to_openai_messages(dataset, base_instruction_col="instruction", base_reasoning_col="reasoning", base_answer_col="answer", source="default-source", max_turns=4):
    """
    Convert multi-turn rows into a chat message list plus a source label.

    Turn N reads the columns '<base>_N' for the instruction, reasoning and answer
    bases. A turn is emitted only when all three columns exist in the row and
    each value is neither None nor ''; other turns are skipped.

    Args:
        dataset: Object exposing .map(fn) and .select_columns(list).
        base_instruction_col: Column-name prefix for user messages.
        base_reasoning_col: Column-name prefix for the <think> content.
        base_answer_col: Column-name prefix for the text after </think>.
        source: Label stored in the 'source' column of every row.
        max_turns: Highest turn number to look for (turns are 1-based).

    Returns:
        The mapped dataset restricted to the 'messages' and 'source' columns.
    """
    bases = (base_instruction_col, base_reasoning_col, base_answer_col)

    def format_row(row):
        messages = []

        for turn in range(1, max_turns + 1):
            turn_cols = [f"{base}_{turn}" for base in bases]

            # Skip the turn unless every column is present and filled in
            if any(col not in row.keys() for col in turn_cols):
                continue
            if not all(row[col] is not None and row[col] != "" for col in turn_cols):
                continue

            instruction_col, reasoning_col, answer_col = turn_cols
            user_turn = {"role": "user", "content": row[instruction_col]}
            assistant_turn = {
                "role": "assistant",
                "content": f"<think>\n{row[reasoning_col]}\n</think>\n\n{row[answer_col]}",
            }
            messages.extend((user_turn, assistant_turn))

        return {"messages": messages, "source": source}

    converted = dataset.map(format_row)
    return converted.select_columns(["messages", "source"])

# Convert dataset to OpenAI message format with a source label
dataset_mt_en_4_openai_messages = convert_to_openai_messages(
    dataset_mt_en_4_json,
    base_instruction_col="instruction",
    base_reasoning_col="reasoning",
    base_answer_col="answer",
    source="mt-en-4",
    max_turns=4  # Since your dataset has 4 turns
)

# Print results
print(dataset_mt_en_4_openai_messages)
dataset_mt_en_4_openai_messages[0]

# Merge

In [None]:
from datasets import load_dataset, concatenate_datasets
dataset = concatenate_datasets([#fr
                                dataset_fr_s1_openai_messages,
                                dataset_fr_dolphin_openai_messages,
                                dataset_fr_openmath_openai_messages,
                                dataset_single_fr_openai_messages,
                                dataset_single_llama_fr_openai_messages,
                                dataset_long_fr_openai_messages,
                                dataset_qwen_single_fr_openai_messages,
                                dataset_mt_fr_openai_messages,
                                # en
                                dataset_long_en_openai_messages,
                                dataset_single_reasoning_en_openai_messages,
                                dataset_single_en_openai_messages,
                                dataset_limo_en_openai_messages,
                                dataset_mt_en_4_openai_messages
                               ])
print(dataset)
dataset[0]

In [57]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
model_path =  "HoangHa/Pensez-v0.1-init"

tokenizer = AutoTokenizer.from_pretrained(model_path)

def count_tokens(example, tokenizer):
    """
    Store the summed token length of every message's content in 'token_count'.

    Args:
        example: Row with a 'messages' list; each message has a 'content' entry.
        tokenizer: Callable invoked as tokenizer(text, return_tensors="pt",
            truncation=False) whose result is indexed with ['input_ids'][0].

    Returns:
        The same row, with 'token_count' set (0 when 'messages' is empty).
    """
    example['token_count'] = sum(
        len(tokenizer(message['content'], return_tensors="pt", truncation=False)['input_ids'][0])
        for message in example['messages']
    )
    return example
    
dataset = dataset.map(
    lambda example: count_tokens(example, tokenizer), 
    num_proc=16
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Per-conversation token totals written by the `count_tokens` map step.
# Reading the column directly avoids materialising every full row.
token_lengths = list(dataset["token_count"])

# Plot distribution of token counts (the values are tokens, not messages).
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(token_lengths, bins=20, kde=True, ax=ax)
ax.set_xlabel("Number of Tokens per Conversation")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Tokens per Conversation")
plt.show()

In [None]:
# Get the indices of examples where token_count exceeds the token budget.
# 16384 is the same value passed as --max-tokens to the generation commands
# earlier in this notebook.
MAX_TOKEN_COUNT = 16384

# Scan only the token_count column instead of decoding every full row.
exceeding_indices = [
    i for i, n_tokens in enumerate(dataset['token_count'])
    if n_tokens > MAX_TOKEN_COUNT
]

print(f"Indices with token_count > {MAX_TOKEN_COUNT}:", exceeding_indices)

In [66]:
# Hand-written replacement assistant message: the reasoning is wrapped in
# <think>...</think>, followed by a blank line and a short final summary.
# The text is split into adjacent string literals (joined with no separator)
# so that no single literal spans a physical line break.
new_assistant_response = {
    'content': (
        "<think>\nLet's break down this problem step-by-step, combining physics principles and calculations. **1. Understanding the Physics Principles** * **De Broglie Wavelength:** The wave nature of particles is crucial here. The de Broglie wavelength (λ) is given by λ = h/p, where h is Planck's constant and p is the momentum of the particle. * **Momentum and Kinetic Energy:** For non-relativistic particles, kinetic energy (KE) = 1/2 * mv², and momentum p = mv. Therefore, KE = p²/2m. * **Average Kinetic Energy and Temperature:** For a gas in thermal equilibrium, the average kinetic energy of the particles is proportional to the temperature: KE_avg = (f/2)kT, where f is the number of degrees of freedom (3 for translational motion in this case), k is the Boltzmann constant, and T is the temperature. * **Average Speed:** The particles selected by the filter have a speed close to the average speed. We can relate this average speed to the temperature. * **Single-Slit Diffraction:** The phenomenon of bright and dark regions on the screen indicates single-slit diffraction. The condition for constructive interference (bright regions) in single-slit diffraction is given by: d sin θ = (m + 1/2)λ, where d is the slit width, θ is the angle of the bright fringe, and m is the order of the fringe (m = 0 for the first bright region, m = 1 for the second, etc.). **2. Analyzing the First Scenario (Mass m1, Temperature T1)** * **Average Speed:** The average kinetic energy is (3/2)kT1 = (1/2)m1 * v1_avg². Therefore, v1_avg = sqrt(3kT1/m1). * **De Broglie Wavelength:** The de Broglie wavelength of the selected particles is λ1 = h / (m1 * v1_avg) = h / (m1 * sqrt(3kT1/m1)) = h / sqrt(3km1T1). * **First Bright Region:** For the first bright region (m=0), the condition for constructive interference is: d sin θ1 = (0 + 1/2)λ1 = λ1/2. * **Substituting λ1:** d sin θ1 = h / (2 * sqrt(3km1T1)). **3. "
        "Analyzing the Second Scenario (Mass m2, Temperature T2)** * **Average Speed:** The average kinetic energy is (3/2)kT2 = (1/2)m2 * v2_avg². Therefore, v2_avg = sqrt(3kT2/m2). * **De Broglie Wavelength:** The de Broglie wavelength of the selected particles is λ2 = h / (m2 * v2_avg) = h / (m2 * sqrt(3kT2/m2)) = h / sqrt(3km2T2). * **Second Bright Region:** For the second bright region (m=1), the condition for constructive interference is: d sin θ2 = (1 + 1/2)λ2 = 3λ2/2. * **Substituting λ2:** d sin θ2 = 3h / (2 * sqrt(3km2T2)). **4. Setting up the Equations and Solving for T1/T2** We have two equations: Equation 1: d sin θ1 = h / (2 * sqrt(3km1T1)) Equation 2: d sin θ2 = 3h / (2 * sqrt(3km2T2)) We are given d = 2.5 micrometers, θ1 = 15 degrees, θ2 = 20 degrees, and m2 = 2m1. We want to find T1/T2. Divide Equation 1 by Equation 2: (d sin θ1) / (d sin θ2) = [h / (2 * sqrt(3km1T1))] / [3h / (2 * sqrt(3km2T2))] Simplify: sin θ1 / sin θ2 = (1/3) * sqrt(3km2T2) / sqrt(3km1T1) sin θ1 / sin θ2 = (1/3) * sqrt(m2T2 / (m1T1)) Substitute m2 = 2m1: sin θ1 / sin θ2 = (1/3) * sqrt(2m1T2 / (m1T1)) sin θ1 / sin θ2 = (1/3) * sqrt(2 * (T2/T1)) Now, plug in the values for θ1 and θ2: sin(15°) / sin(20°) = (1/3) * sqrt(2 * (T2/T1)) Calculate the sine values: sin(15°) ≈ 0.2588 sin(20°) ≈ 0.3420 0.2588 / 0.3420 = (1/3) * sqrt(2 * (T2/T1)) 0.7567 ≈ (1/3) * sqrt(2 * (T2/T1)) Multiply by 3: 2.2701 ≈ sqrt(2 * (T2/T1)) Square both sides: (2.2701)² ≈ 2 * (T2/T1) 5.153 ≈ 2 * (T2/T1) Solve for T1/T2: T1/T2 ≈ 2 / 5.153 T1/T2 ≈ 0.388 Something is wrong, I must have confused the bright fringe order. Let's revisit the bright fringe condition. **Revisiting the Bright Fringe Condition** The condition for *constructive* interference in single-slit diffraction is actually more complex and involves minima. However, the problem statement refers to \"bright regions\", which are often approximated using a simplified approach related to the angles of the minima. "
        "A better approach might be to consider the path difference for the *first* bright fringe away from the central maximum. Alternatively, we might be interpreting \"first bright one\" as the first *principal maximum*, which occurs at the center (m=0), and the next bright regions correspond to the secondary maxima. However, in single slit diffraction, the secondary maxima are much weaker than the central maximum. The wording \"a series of bright and dark regions\" strongly suggests diffraction minima. Let's consider the condition for the *minima* in single-slit diffraction, as the bright fringes lie between them. The condition for the *m-th minimum* is d sin θ = mλ. The first bright region is roughly halfway between the 0th and 1st minimum. **Revised Approach using Minima** * **First Scenario (m1, T1):** * λ1 = h / sqrt(3km1T1) * The first bright region is roughly where the path difference is λ/2. This occurs approximately where the angle is such that it's halfway to the first minimum. The angle for the first minimum is d sin θ_min1 = 1 * λ1. The angle for the first bright fringe is more complex to precisely determine, but let's assume the problem uses the simplified condition for constructive interference, d sin θ = (m + 1/2)λ. Let's stick with the constructive interference condition as initially interpreted, because the answer choices suggest a straightforward calculation is intended. The wording \"first bright one (closest to the center of the slit)\" implies m=0. \"the second bright region\" in the second scenario implies m=1. **Double Checking the Calculations** sin(15°) / sin(20°) = (1/3) * sqrt(2 * (T2/T1)) 0.7567 = (1/3) * sqrt(2 * (T2/T1)) 2.2701 = sqrt(2 * (T2/T1)) 5.153 = 2 * (T2/T1) T2/T1 = 5.153 / 2 = 2.5765 T1/T2 = 1 / 2.5765 = 0.388 There must be a mistake in the interpretation of the bright fringe order. "
        "The \"first bright one\" usually refers to the m=1 bright fringe in double-slit interference, but in single-slit diffraction, it refers to the regions between the minima. Let's consider the angles given are for the *first diffraction maximum* away from the central maximum. The condition for this is approximately given by the constructive interference formula. **Re-evaluating the Bright Fringe Order** In single-slit diffraction, the central maximum is at θ = 0. The subsequent bright fringes are located approximately halfway between the minima. * **Minima:** d sin θ = mλ (m = 1, 2, 3, ...) * **Bright fringes (approximate):** d sin θ ≈ (m + 1/2)λ For the first bright region (closest to the center, excluding the central max), m = 1. For the second bright region, m = 2. **Scenario 1 (First bright region):** d sin θ1 = (1 + 1/2)λ1 = (3/2)λ1 d sin 15° = (3/2) * h / sqrt(3km1T1) **Scenario 2 (Second bright region):** d sin θ2 = (2 + 1/2)λ2 = (5/2)λ2 d sin 20° = (5/2) * h / sqrt(3km2T2) Now, divide the two equations: (d sin 15°) / (d sin 20°) = [(3/2) * h / sqrt(3km1T1)] / [(5/2) * h / sqrt(3km2T2)] sin 15° / sin 20° = (3/5) * sqrt(3km2T2) / sqrt(3km1T1) sin 15° / sin 20° = (3/5) * sqrt(m2T2 / (m1T1)) Substitute m2 = 2m1: sin 15° / sin 20° = (3/5) * sqrt(2m1T2 / (m1T1)) sin 15° / sin 20° = (3/5) * sqrt(2 * (T2/T1)) 0.7567 = (3/5) * sqrt(2 * (T2/T1)) Multiply by 5/3: 0.7567 * (5/3) = sqrt(2 * (T2/T1)) 1.261 = sqrt(2 * (T2/T1)) Square both sides: (1.261)² = 2 * (T2/T1) 1.59 = 2 * (T2/T1) T2/T1 = 1.59 / 2 = 0.795 T1/T2 = 1 / 0.795 = 1.257 This is close to option (B) 1.3. Let's verify the assumption about the bright fringe order. The \"first bright one closest to the center\" is typically the first one *outside* the central maximum. Consider the diffraction pattern intensity: the central maximum is the brightest, and the subsequent maxima are less intense. The positions are roughly halfway between the minima. "
        "If the \"first bright one\" indeed refers to m=0 in the constructive interference formula: Scenario 1: d sin 15° = (1/2)λ1 Scenario 2: d sin 20° = (3/2)λ2 Dividing: sin 15° / sin 20° = (1/3) * sqrt(m2T2 / (m1T1)) 0.7567 = (1/3) * sqrt(2 * T2/T1) 2.2701 = sqrt(2 * T2/T1) 5.153 = 2 * T2/T1 T2/T1 = 2.5765 T1/T2 = 0.388 (This doesn't match any options.) Let's assume the problem implicitly refers to the bright fringes as described by the constructive interference formula. The wording is slightly ambiguous for single-slit diffraction, where the central maximum is the m=0 case. If \"first bright one\" means the first *secondary* maximum, it corresponds to roughly m=1 in the constructive interference approximation. If the problem intends the standard single-slit diffraction minima approach, the bright fringes are between them. The first bright fringe is roughly at the angle where sin θ = 3λ/2d. Let's stick with the constructive interference interpretation as it leads to a plausible answer. The phrase \"first bright one (closest to the center of the slit)\" likely implies the first order bright fringe beyond the central maximum. With the assumption that the bright fringes follow d sin θ = (m + 1/2)λ: First bright region: m=0 (This would be the central maximum, but the problem implies it's the *first one* observed as you move away from the center.) Let's interpret \"first bright one\" as the first side maximum. If the bright fringes are exactly halfway between the minima: First bright fringe position is related to the first minimum at d sin θ = λ. The bright fringe is around sin θ ≈ 3λ/2d. If the first bright one corresponds to the condition d sin θ = 3λ/2, and the second bright one to d sin θ = 5λ/2. "
        "Scenario 1: d sin 15° = 3λ1/2 Scenario 2: d sin 20° = 5λ2/2 sin 15° / sin 20° = (3λ1/2) / (5λ2/2) = 3λ1 / 5λ2 0.7567 = (3/5) * sqrt(m2T2 / (m1T1)) 0.7567 = (3/5) * sqrt(2 T2 / T1) 1.261 = sqrt(2 T2 / T1) 1.59 = 2 T2 / T1 T2 / T1 = 0.795 T1 / T2 = 1.257 This matches option (B) if rounded. The key is the interpretation of \"first bright one\". Given the answer choices, the constructive interference approximation seems to be what the question setters intended. Solution: The condition for constructive interference (bright regions) in single-slit diffraction can be approximated by: d sin θ = (m + 1/2)λ where d is the slit width, θ is the angle of the bright fringe, λ is the de Broglie wavelength, and m is the order of the fringe (m = 0 for the first bright region, m = 1 for the second, etc.). The de Broglie wavelength is given by λ = h/p = h/(mv), and the average kinetic energy is (3/2)kT = (1/2)mv², so v = sqrt(3kT/m). Thus, λ = h / (m * sqrt(3kT/m)) = h / sqrt(3mkT). For the first scenario (mass m1, temperature T1), the first bright region (m=0) is at θ1 = 15 degrees: d sin θ1 = (0 + 1/2)λ1 = (1/2) * h / sqrt(3km1T1) (Equation 1) For the second scenario (mass m2 = 2m1, temperature T2), the second bright region (m=1) is at θ2 = 20 degrees: d sin θ2 = (1 + 1/2)λ2 = (3/2) * h / sqrt(3km2T2) (Equation 2) Divide Equation 1 by Equation 2: (d sin θ1) / (d sin θ2) = [(1/2) * h / sqrt(3km1T1)] / [(3/2) * h / sqrt(3km2T2)] sin θ1 / sin θ2 = (1/3) * sqrt(3km2T2) / sqrt(3km1T1) sin θ1 / sin θ2 = (1/3) * sqrt(m2T2 / (m1T1)) Substitute the given values θ1 = 15°, θ2 = 20°, and m2 = 2m1: sin(15°) / sin(20°) = (1/3) * sqrt(2m1T2 / (m1T1)) 0.2588 / 0.3420 = (1/3) * sqrt(2 * (T2/T1)) 0.7567 ≈ (1/3) * sqrt(2 * (T2/T1)) 2.2701 ≈ sqrt(2 * (T2/T1)) Squaring both sides: 5.153 ≈ 2 * (T2/T1) T2/T1 ≈ 5.153 / 2 = 2.5765 T1/T2 ≈ 1 / 2.5765 ≈ 0.388 This interpretation assumed m=0 for the \"first bright one\". "
        # The separator after </think> is a real blank line ("\n\n"); the
        # doubled backslash in \\boxed is intentional (a LaTeX command).
        "If \"first bright one\" refers to the first side maximum, then for the first scenario m=1, and for the second scenario the \"second bright region\" would be m=2. d sin θ1 = (1 + 1/2)λ1 = 3λ1/2 d sin θ2 = (2 + 1/2)λ2 = 5λ2/2 sin 15° / sin 20° = (3λ1/2) / (5λ2/2) = (3/5) * sqrt(m2T2 / (m1T1)) 0.7567 = (3/5) * sqrt(2 T2 / T1) 1.261 = sqrt(2 T2 / T1) 1.59 = 2 T2 / T1 T2 / T1 = 0.795 T1 / T2 = 1.258 Final Answer: The final answer is $\\boxed{1.3}$</think>\n\nThe problem involves understanding the interplay between De Broglie wavelength, single-slit diffraction, and the relationship between kinetic energy and temperature. The key is correctly interpreting the conditions for bright fringes in single-slit diffraction. By analyzing the two scenarios (mass m1, temperature T1, first bright fringe; and mass m2, temperature T2, second bright fringe), and using the approximation d sin θ = (m + 1/2)λ for the bright fringes (where 'm' is re-interpreted to mean the order number from the *central maximum*), we set up two equations. Given that m2 = 2m1, θ1 = 15 degrees, and θ2 = 20 degrees, by dividing these equations and solving for the ratio of temperatures, we interpret the 'first bright one' as m=1 and 'second bright region' as m=2. We find sin(15°)/sin(20°) = (3/5)sqrt(2T2/T1). Solving this gives T1/T2 ≈ 1.258, which is closest to 1.3."
    ),
    'role': 'assistant',
}

In [None]:
# Row whose assistant turn is replaced by `new_assistant_response`.
# NOTE(review): presumably one of the over-length rows listed in
# `exceeding_indices` — confirm before re-running on a rebuilt dataset.
REPLACED_EXAMPLE_INDEX = 60


def replace_assistant_turn(example, idx):
    """Swap the assistant turn of the targeted row; return other rows unchanged.

    `new_assistant_response` is already a complete message dict with
    'content' and 'role' keys, so it is inserted as-is. Wrapping it in
    another {'content': ..., 'role': ...} dict would make the message
    content a dict instead of a string.

    Args:
        example: Dataset row with a 'messages' list; the first message is kept.
        idx: Row index supplied by `Dataset.map(..., with_indices=True)`.

    Returns:
        The original row, or a dict carrying the rewritten 'messages' list.
    """
    if idx != REPLACED_EXAMPLE_INDEX:
        return example
    # Only 'messages' is rewritten; any existing token_count on this row is
    # left as it was and no longer reflects the new assistant text.
    return {'messages': [example['messages'][0], new_assistant_response]}


modified_dataset = dataset.map(replace_assistant_turn, with_indices=True)

In [None]:
# Spot-check row 1000 of `dataset` (not `modified_dataset`).
dataset[1000]

In [None]:
# Publish the dataset that carries the replaced assistant turn; pushing
# `dataset` here would upload the rows without that replacement.
modified_dataset.push_to_hub("HoangHa/Pensez-v0.1-fr")