In [1]:
import json
import re
from pathlib import Path
from typing import Dict, List, Tuple
from tqdm import tqdm
import random

## 1. Ph√¢n t√≠ch Strategy theo t·ª´ng m√¥n h·ªçc

M·ªói m√¥n h·ªçc c√≥ c√°ch suy lu·∫≠n kh√°c nhau:

In [2]:
# ƒê·ªãnh nghƒ©a reasoning strategy cho t·ª´ng m√¥n

REASONING_STRATEGIES = {
    "Mathematics": {
        "description": "To√°n h·ªçc - Gi·∫£i t·ª´ng b∆∞·ªõc, c√¥ng th·ª©c, t√≠nh to√°n",
        "steps": [
            "X√°c ƒë·ªãnh d·∫°ng b√†i to√°n v√† c√¥ng th·ª©c c·∫ßn d√πng",
            "Li·ªát k√™ c√°c d·ªØ ki·ªán ƒë√£ cho",
            "Th·ª±c hi·ªán c√°c b∆∞·ªõc t√≠nh to√°n",
            "Ki·ªÉm tra v√† k·∫øt lu·∫≠n"
        ],
        "keywords": ["t√≠nh", "ph∆∞∆°ng tr√¨nh", "c√¥ng th·ª©c", "bi·ªÉu th·ª©c", "gi·∫£i", "t√¨m", "ƒë·∫°o h√†m", "t√≠ch ph√¢n"]
    },
    
    "Physics": {
        "description": "V·∫≠t l√Ω - Ph√¢n t√≠ch hi·ªán t∆∞·ª£ng, √°p d·ª•ng ƒë·ªãnh lu·∫≠t",
        "steps": [
            "X√°c ƒë·ªãnh hi·ªán t∆∞·ª£ng v·∫≠t l√Ω v√† ƒë·ªãnh lu·∫≠t li√™n quan",
            "Ph√¢n t√≠ch c√°c ƒë·∫°i l∆∞·ª£ng v√† m·ªëi quan h·ªá",
            "√Åp d·ª•ng c√¥ng th·ª©c ho·∫∑c ƒë·ªãnh lu·∫≠t",
            "So s√°nh v·ªõi c√°c ƒë√°p √°n v√† k·∫øt lu·∫≠n"
        ],
        "keywords": ["ƒë·ªãnh lu·∫≠t", "l·ª±c", "nƒÉng l∆∞·ª£ng", "chuy·ªÉn ƒë·ªông", "ƒëi·ªán", "t·ª´", "quang", "nhi·ªát"]
    },
    
    "Chemistry": {
        "description": "H√≥a h·ªçc - Ph∆∞∆°ng tr√¨nh, c√¢n b·∫±ng, t√≠nh to√°n mol",
        "steps": [
            "X√°c ƒë·ªãnh ch·∫•t tham gia v√† ph·∫£n ·ª©ng h√≥a h·ªçc",
            "Vi·∫øt v√† c√¢n b·∫±ng ph∆∞∆°ng tr√¨nh (n·∫øu c√≥)",
            "T√≠nh to√°n theo mol, kh·ªëi l∆∞·ª£ng ho·∫∑c th·ªÉ t√≠ch",
            "Ph√¢n t√≠ch t√≠nh ch·∫•t v√† k·∫øt lu·∫≠n"
        ],
        "keywords": ["ph·∫£n ·ª©ng", "ch·∫•t", "mol", "kh·ªëi l∆∞·ª£ng", "dung d·ªãch", "axit", "baz∆°", "oxi h√≥a", "kh·ª≠"]
    },
    
    "Biology": {
        "description": "Sinh h·ªçc - Ph√¢n t√≠ch ƒë·∫∑c ƒëi·ªÉm, ch·ª©c nƒÉng sinh h·ªçc",
        "steps": [
            "X√°c ƒë·ªãnh kh√°i ni·ªám ho·∫∑c qu√° tr√¨nh sinh h·ªçc",
            "Ph√¢n t√≠ch ƒë·∫∑c ƒëi·ªÉm v√† ch·ª©c nƒÉng",
            "So s√°nh c√°c ƒë√°p √°n v·ªõi ki·∫øn th·ª©c sinh h·ªçc",
            "Lo·∫°i tr·ª´ c√°c ƒë√°p √°n sai v√† k·∫øt lu·∫≠n"
        ],
        "keywords": ["t·∫ø b√†o", "gen", "di truy·ªÅn", "sinh s·∫£n", "h√¥ h·∫•p", "quang h·ª£p", "ti·∫øn h√≥a", "sinh th√°i"]
    },
    
    "Geography": {
        "description": "ƒê·ªãa l√Ω - Ph√¢n t√≠ch v·ªã tr√≠, ƒë·∫∑c ƒëi·ªÉm ƒë·ªãa l√Ω",
        "steps": [
            "X√°c ƒë·ªãnh v√πng ƒë·ªãa l√Ω ho·∫∑c hi·ªán t∆∞·ª£ng",
            "Ph√¢n t√≠ch c√°c y·∫øu t·ªë ƒë·ªãa l√Ω (kh√≠ h·∫≠u, ƒë·ªãa h√¨nh, t√†i nguy√™n)",
            "So s√°nh v·ªõi c√°c ƒë·∫∑c ƒëi·ªÉm trong ƒë√°p √°n",
            "K·∫øt lu·∫≠n d·ª±a tr√™n ki·∫øn th·ª©c ƒë·ªãa l√Ω"
        ],
        "keywords": ["v√πng", "kh√≠ h·∫≠u", "ƒë·ªãa h√¨nh", "t√†i nguy√™n", "d√¢n c∆∞", "kinh t·∫ø", "m√¥i tr∆∞·ªùng"]
    },
    
    "History": {
        "description": "L·ªãch s·ª≠ - Ph√¢n t√≠ch s·ª± ki·ªán, nguy√™n nh√¢n, h·∫≠u qu·∫£",
        "steps": [
            "X√°c ƒë·ªãnh s·ª± ki·ªán l·ªãch s·ª≠ v√† th·ªùi gian",
            "Ph√¢n t√≠ch nguy√™n nh√¢n v√† b·ªëi c·∫£nh",
            "Xem x√©t h·∫≠u qu·∫£ v√† √Ω nghƒ©a",
            "So s√°nh v·ªõi c√°c ƒë√°p √°n v√† k·∫øt lu·∫≠n"
        ],
        "keywords": ["s·ª± ki·ªán", "th·ªùi k·ª≥", "chi·∫øn tranh", "c√°ch m·∫°ng", "phong tr√†o", "tri·ªÅu ƒë·∫°i", "nƒÉm"]
    },
    
    "Literature": {
        "description": "Ng·ªØ vƒÉn - Ph√¢n t√≠ch vƒÉn h·ªçc, tu t·ª´, √Ω nghƒ©a",
        "steps": [
            "X√°c ƒë·ªãnh t√°c ph·∫©m, t√°c gi·∫£ ho·∫∑c ƒëo·∫°n vƒÉn",
            "Ph√¢n t√≠ch n·ªôi dung, h√¨nh th·ª©c v√† ngh·ªá thu·∫≠t",
            "Xem x√©t bi·ªán ph√°p tu t·ª´ v√† gi√° tr·ªã vƒÉn h·ªçc",
            "K·∫øt lu·∫≠n v·ªÅ √Ω nghƒ©a v√† th√¥ng ƒëi·ªáp"
        ],
        "keywords": ["t√°c ph·∫©m", "t√°c gi·∫£", "th∆°", "vƒÉn xu√¥i", "bi·ªán ph√°p", "h√¨nh t∆∞·ª£ng", "ch·ªß ƒë·ªÅ"]
    },
    
    "CivicEducation": {
        "description": "Gi√°o d·ª•c c√¥ng d√¢n - Ph√¢n t√≠ch lu·∫≠t ph√°p, ƒë·∫°o ƒë·ª©c",
        "steps": [
            "X√°c ƒë·ªãnh kh√°i ni·ªám ho·∫∑c quy ƒë·ªãnh ph√°p lu·∫≠t",
            "Ph√¢n t√≠ch quy·ªÅn, nghƒ©a v·ª• v√† tr√°ch nhi·ªám",
            "So s√°nh v·ªõi c√°c gi√° tr·ªã ƒë·∫°o ƒë·ª©c v√† x√£ h·ªôi",
            "K·∫øt lu·∫≠n d·ª±a tr√™n chu·∫©n m·ª±c ph√°p l√Ω v√† ƒë·∫°o ƒë·ª©c"
        ],
        "keywords": ["quy·ªÅn", "nghƒ©a v·ª•", "ph√°p lu·∫≠t", "hi·∫øn ph√°p", "ƒë·∫°o ƒë·ª©c", "c√¥ng d√¢n", "x√£ h·ªôi"]
    },
    
    "English": {
        "description": "Ti·∫øng Anh - Grammar, vocabulary, context",
        "steps": [
            "Identify the grammar rule or vocabulary context",
            "Analyze the sentence structure and meaning",
            "Evaluate each option against the context",
            "Select the most appropriate answer"
        ],
        "keywords": ["grammar", "tense", "vocabulary", "preposition", "phrase", "clause"]
    }
}

# Hi·ªÉn th·ªã strategies
for subject, strategy in REASONING_STRATEGIES.items():
    print(f"\n{'='*60}")
    print(f"üìö {subject}: {strategy['description']}")
    print(f"{'='*60}")
    for i, step in enumerate(strategy['steps'], 1):
        print(f"  {i}. {step}")


üìö Mathematics: To√°n h·ªçc - Gi·∫£i t·ª´ng b∆∞·ªõc, c√¥ng th·ª©c, t√≠nh to√°n
  1. X√°c ƒë·ªãnh d·∫°ng b√†i to√°n v√† c√¥ng th·ª©c c·∫ßn d√πng
  2. Li·ªát k√™ c√°c d·ªØ ki·ªán ƒë√£ cho
  3. Th·ª±c hi·ªán c√°c b∆∞·ªõc t√≠nh to√°n
  4. Ki·ªÉm tra v√† k·∫øt lu·∫≠n

üìö Physics: V·∫≠t l√Ω - Ph√¢n t√≠ch hi·ªán t∆∞·ª£ng, √°p d·ª•ng ƒë·ªãnh lu·∫≠t
  1. X√°c ƒë·ªãnh hi·ªán t∆∞·ª£ng v·∫≠t l√Ω v√† ƒë·ªãnh lu·∫≠t li√™n quan
  2. Ph√¢n t√≠ch c√°c ƒë·∫°i l∆∞·ª£ng v√† m·ªëi quan h·ªá
  3. √Åp d·ª•ng c√¥ng th·ª©c ho·∫∑c ƒë·ªãnh lu·∫≠t
  4. So s√°nh v·ªõi c√°c ƒë√°p √°n v√† k·∫øt lu·∫≠n

üìö Chemistry: H√≥a h·ªçc - Ph∆∞∆°ng tr√¨nh, c√¢n b·∫±ng, t√≠nh to√°n mol
  1. X√°c ƒë·ªãnh ch·∫•t tham gia v√† ph·∫£n ·ª©ng h√≥a h·ªçc
  2. Vi·∫øt v√† c√¢n b·∫±ng ph∆∞∆°ng tr√¨nh (n·∫øu c√≥)
  3. T√≠nh to√°n theo mol, kh·ªëi l∆∞·ª£ng ho·∫∑c th·ªÉ t√≠ch
  4. Ph√¢n t√≠ch t√≠nh ch·∫•t v√† k·∫øt lu·∫≠n

üìö Biology: Sinh h·ªçc - Ph√¢n t√≠ch ƒë·∫∑c ƒëi·ªÉm, ch·ª©c nƒÉng sinh h·ªçc
  1. X√°c ƒë·ªãnh kh√°i ni·ªám ho·

## 2. Load v√† ph√¢n t√≠ch d·ªØ li·ªáu

In [3]:
# Load data
input_file = Path("../data/sft_dataset_vnhsge/train_sft.jsonl")

data = []
with open(input_file, 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line))

print(f"Loaded {len(data)} samples")

# Ph√¢n t√≠ch ph√¢n ph·ªëi m√¥n h·ªçc
from collections import Counter

subjects = [item['subject'] for item in data]
subject_counts = Counter(subjects)

print("\nSubject distribution:")
for subject, count in sorted(subject_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"{subject:20s}: {count:4d} samples")

Loaded 1573 samples

Subject distribution:
English             :  250 samples
CivicEducation      :  200 samples
History             :  200 samples
Chemistry           :  196 samples
Biology             :  190 samples
Geography           :  190 samples
Mathematics         :  180 samples
Physics             :  167 samples


## 3. C√°c template reasoning theo m√¥n h·ªçc

T·∫°o templates c·ª• th·ªÉ cho t·ª´ng m√¥n:

In [4]:
def generate_reasoning_template(subject: str, question: str, options: Dict[str, str], correct_answer: str) -> str:
    """
    Generate reasoning template based on subject
    
    Args:
        subject: M√¥n h·ªçc
        question: C√¢u h·ªèi
        options: Dict c·ªßa c√°c options {"A": "...", "B": "...", ...}
        correct_answer: ƒê√°p √°n ƒë√∫ng ("A", "B", "C", "D")
    
    Returns:
        Reasoning text v·ªõi <think> tags
    """
    
    if subject == "Mathematics":
        reasoning = f"""<think>
X√°c ƒë·ªãnh d·∫°ng b√†i v√† c√¥ng th·ª©c c·∫ßn d√πng
Ph√¢n t√≠ch d·ªØ ki·ªán v√† m·ªëi quan h·ªá
√Åp d·ª•ng c√¥ng th·ª©c v√† t√≠nh to√°n
ƒê√°p √°n {correct_answer} ph√π h·ª£p v·ªõi k·∫øt qu·∫£
</think>"""
    
    elif subject == "Physics":
        reasoning = f"""<think>
X√°c ƒë·ªãnh hi·ªán t∆∞·ª£ng v·∫≠t l√Ω v√† ƒë·ªãnh lu·∫≠t li√™n quan
Ph√¢n t√≠ch c√°c ƒë·∫°i l∆∞·ª£ng v√† m·ªëi quan h·ªá
√Åp d·ª•ng ƒë·ªãnh lu·∫≠t v√† t√≠nh to√°n
ƒê√°p √°n {correct_answer} ƒë√∫ng theo ƒë·ªãnh lu·∫≠t v·∫≠t l√Ω
</think>"""
    
    elif subject == "Chemistry":
        reasoning = f"""<think>
X√°c ƒë·ªãnh ch·∫•t tham gia v√† lo·∫°i ph·∫£n ·ª©ng
Ph√¢n t√≠ch t√≠nh ch·∫•t h√≥a h·ªçc
Vi·∫øt v√† c√¢n b·∫±ng ph∆∞∆°ng tr√¨nh (n·∫øu c·∫ßn)
ƒê√°p √°n {correct_answer} ph√π h·ª£p v·ªõi t√≠nh ch·∫•t/ph·∫£n ·ª©ng
</think>"""
    
    elif subject == "Biology":
        reasoning = f"""<think>
X√°c ƒë·ªãnh kh√°i ni·ªám/qu√° tr√¨nh sinh h·ªçc
Ph√¢n t√≠ch c·∫•u tr√∫c v√† ch·ª©c nƒÉng
So s√°nh c√°c ƒë√°p √°n v·ªõi ki·∫øn th·ª©c sinh h·ªçc
ƒê√°p √°n {correct_answer} ch√≠nh x√°c
</think>"""
    
    elif subject == "Geography":
        reasoning = f"""<think>
X√°c ƒë·ªãnh v√πng ƒë·ªãa l√Ω ho·∫∑c hi·ªán t∆∞·ª£ng
Ph√¢n t√≠ch y·∫øu t·ªë ƒë·ªãa l√Ω (kh√≠ h·∫≠u, ƒë·ªãa h√¨nh, t√†i nguy√™n)
So s√°nh v·ªõi ƒë·∫∑c ƒëi·ªÉm c√°c ƒë√°p √°n
ƒê√°p √°n {correct_answer} ƒë√∫ng
</think>"""
    
    elif subject == "History":
        reasoning = f"""<think>
X√°c ƒë·ªãnh s·ª± ki·ªán, th·ªùi k·ª≥ v√† b·ªëi c·∫£nh l·ªãch s·ª≠
Ph√¢n t√≠ch nguy√™n nh√¢n v√† di·ªÖn bi·∫øn
Xem x√©t h·∫≠u qu·∫£ v√† √Ω nghƒ©a
ƒê√°p √°n {correct_answer} ph√π h·ª£p v·ªõi s·ª± ki·ªán l·ªãch s·ª≠
</think>"""
    
    elif subject == "Literature":
        reasoning = f"""<think>
X√°c ƒë·ªãnh t√°c ph·∫©m, t√°c gi·∫£ v√† th·ªÉ lo·∫°i
Ph√¢n t√≠ch n·ªôi dung, ngh·ªá thu·∫≠t v√† bi·ªán ph√°p tu t·ª´
ƒê√°nh gi√° gi√° tr·ªã vƒÉn h·ªçc v√† th√¥ng ƒëi·ªáp
ƒê√°p √°n {correct_answer} ph√π h·ª£p
</think>"""
    
    elif subject == "CivicEducation":
        reasoning = f"""<think>
X√°c ƒë·ªãnh kh√°i ni·ªám ho·∫∑c quy ƒë·ªãnh ph√°p lu·∫≠t
Ph√¢n t√≠ch quy·ªÅn, nghƒ©a v·ª• v√† tr√°ch nhi·ªám
So s√°nh v·ªõi chu·∫©n m·ª±c ph√°p l√Ω v√† ƒë·∫°o ƒë·ª©c
ƒê√°p √°n {correct_answer} ƒë√∫ng theo quy ƒë·ªãnh
</think>"""
    
    elif subject == "English":
        reasoning = f"""<think>
Identify question type (grammar/vocabulary)
Analyze sentence structure and context
Evaluate each option against the rule
Answer {correct_answer} is correct
</think>"""
    
    else:
        # Generic template for unknown subjects
        reasoning = f"""<think>
Ph√¢n t√≠ch c√¢u h·ªèi v√† y√™u c·∫ßu
Xem x√©t v√† so s√°nh c√°c ƒë√°p √°n
Lo·∫°i tr·ª´ ƒë√°p √°n kh√¥ng ph√π h·ª£p
ƒê√°p √°n {correct_answer} l√† ƒë√°p √°n ƒë√∫ng
</think>"""
    
    return reasoning.strip()


# Test v·ªõi m·ªôt v√†i samples
print("\nTesting reasoning generation:\n")
test_sample = data[0]
print(f"Subject: {test_sample['subject']}")
print(f"Question: {test_sample['messages'][1]['content'][:100]}...")
print(f"\nGenerated reasoning preview (first 300 chars):")

# Parse question to get options
def parse_question(content):
    lines = content.strip().split('\n')
    question_parts = []
    options = {}
    expected_labels = ['A', 'B', 'C', 'D']
    current_label_idx = 0
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
        
        # Match c·∫£ c√≥ v√† kh√¥ng c√≥ d·∫•u ch·∫•m: A. text ho·∫∑c A text
        match = re.match(r'^([A-D])[\.\:\s]*\s*(.*)', line)
        if match:
            label = match.group(1)
            text = match.group(2).strip()
            
            # Skip n·∫øu text r·ªóng
            if not text:
                continue
            
            # Auto-fix: Lu√¥n d√πng label theo th·ª© t·ª± expected
            if current_label_idx < len(expected_labels):
                actual_label = expected_labels[current_label_idx]
                
                if label in options or label != actual_label:
                    # print(f"‚ö†Ô∏è Warning: Found '{label}', using '{actual_label}' instead")
                    label = actual_label
                
                options[label] = text
                current_label_idx += 1
        else:
            # N·∫øu kh√¥ng ph·∫£i option, coi l√† ph·∫ßn question
            question_parts.append(line)
    
    return ' '.join(question_parts), options

q, opts = parse_question(test_sample['messages'][1]['content'])
correct = json.loads(test_sample['messages'][2]['content'])['answer']
reasoning = generate_reasoning_template(test_sample['subject'], q, opts, correct)
print(reasoning)


Testing reasoning generation:

Subject: Biology
Question: C√¢u 81: C√≥ th·ªÉ s·ª≠ d·ª•ng h√≥a ch·∫•t n√†o sau ƒë√¢y ƒë·ªÉ ph√°t hi·ªán qu√° tr√¨nh h√¥ h·∫•p ·ªü th·ª±c v·∫≠t th·∫£i ra kh√≠ CO2...

Generated reasoning preview (first 300 chars):
<think>
X√°c ƒë·ªãnh kh√°i ni·ªám/qu√° tr√¨nh sinh h·ªçc
Ph√¢n t√≠ch c·∫•u tr√∫c v√† ch·ª©c nƒÉng
So s√°nh c√°c ƒë√°p √°n v·ªõi ki·∫øn th·ª©c sinh h·ªçc
ƒê√°p √°n B ch√≠nh x√°c
</think>


## 4. Augment dataset v·ªõi reasoning

In [5]:
def add_reasoning_to_dataset(input_file: Path, output_file: Path, sample_ratio: float = 1.0):
    """
    Th√™m reasoning v√†o dataset
    
    Args:
        input_file: File JSONL ƒë·∫ßu v√†o
        output_file: File JSONL ƒë·∫ßu ra
        sample_ratio: T·ª∑ l·ªá samples c·∫ßn th√™m reasoning (0.0-1.0)
    """
    # Load data
    data = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    
    print(f"Loaded {len(data)} samples")
    
    # Process data
    augmented_data = []
    skipped = 0
    
    for item in tqdm(data, desc="Adding reasoning"):
        try:
            # Parse question
            user_content = item['messages'][1]['content']
            question, options = parse_question(user_content)
            
            if len(options) != 4:
                # Gi·ªØ nguy√™n n·∫øu kh√¥ng parse ƒë∆∞·ª£c
                augmented_data.append(item)
                print(item)
                skipped += 1
                continue
            
            # Get correct answer
            correct_answer = json.loads(item['messages'][2]['content'])['answer']
            
            # Decide whether to add reasoning
            if random.random() > sample_ratio:
                # Gi·ªØ nguy√™n
                augmented_data.append(item)
                continue
            
            # Generate reasoning
            reasoning = generate_reasoning_template(
                subject=item['subject'],
                question=question,
                options=options,
                correct_answer=correct_answer
            )
            
            # ‚úÖ NEW FORMAT: reasoning n·∫±m ngo√†i JSON
            # Format: <think>...</think>\n{"answer":"A"}
            answer_json = json.dumps({"answer": correct_answer}, ensure_ascii=False)
            new_response_content = f"{reasoning}\n{answer_json}"
            
            # Create new item
            new_item = {
                "messages": [
                    item['messages'][0],  # System message
                    item['messages'][1],  # User message
                    {
                        "role": "assistant",
                        "content": new_response_content
                    }
                ],
                "id": item['id'],
                "subject": item['subject']
            }
            
            augmented_data.append(new_item)
        
        except Exception as e:
            print(f"\nError processing {item.get('id', 'unknown')}: {e}")
            augmented_data.append(item)  # Gi·ªØ nguy√™n n·∫øu l·ªói
            print(item)
            skipped += 1
    
    print(f"\nProcessed {len(augmented_data)} samples")
    print(f"Skipped: {skipped}")
    print(f"Added reasoning to: {len(augmented_data) - skipped} samples")
    
    # Save
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in augmented_data:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')
    
    print(f"\nSaved to {output_file}")
    return augmented_data

In [6]:
# Th·ª±c hi·ªán augmentation
output_file = Path("../data/sft_dataset_vnhsge/train_sft_with_reasoning.jsonl")

augmented_data = add_reasoning_to_dataset(
    input_file=input_file,
    output_file=output_file,
    sample_ratio=1.0  # 1.0 = th√™m reasoning cho 100% samples, 0.5 = 50%, etc.
)

Loaded 1573 samples


Adding reasoning: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1573/1573 [00:00<00:00, 74379.00it/s]


Processed 1573 samples
Skipped: 0
Added reasoning to: 1573 samples

Saved to ..\data\sft_dataset_vnhsge\train_sft_with_reasoning.jsonl





## 5. Ki·ªÉm tra k·∫øt qu·∫£

In [7]:
# Ki·ªÉm tra m·ªôt v√†i samples
print("\n" + "="*80)
print("Sample data with reasoning:")
print("="*80)

# Group by subject
from collections import defaultdict
samples_by_subject = defaultdict(list)
for item in augmented_data:
    samples_by_subject[item['subject']].append(item)

# Show one sample per subject
for subject in ["Mathematics", "Physics", "Chemistry", "Biology"]:
    if subject in samples_by_subject:
        sample = samples_by_subject[subject][0]
        print(f"\n{'='*80}")
        print(f"üìö Subject: {subject}")
        print(f"ID: {sample['id']}")
        print(f"{'='*80}")
        print(f"\nüìù Question:\n{sample['messages'][1]['content'][:200]}...")
        print(f"\nü§î Assistant Response:")
        response_content = sample['messages'][2]['content']
        
        # Check if response has reasoning (contains <think>)
        if '<think>' in response_content:
            print(response_content[:500] + "...")
        else:
            # Old format or no reasoning
            try:
                response = json.loads(response_content)
                print(f"Answer: {response['answer']}")
            except:
                print(response_content[:200])


Sample data with reasoning:

üìö Subject: Mathematics
ID: MET_Math_IE_2019_1

üìù Question:
C√¢u 1) Th·ªÉ t√≠ch c·ªßa kh·ªëi l·∫≠p ph∆∞∆°ng c·∫°nh 2a b·∫±ng:
A. 8a^3. 
B. 2a^3. 
C. a^3.¬†¬† 
D. 6a^3....

ü§î Assistant Response:
<think>
X√°c ƒë·ªãnh d·∫°ng b√†i v√† c√¥ng th·ª©c c·∫ßn d√πng
Ph√¢n t√≠ch d·ªØ ki·ªán v√† m·ªëi quan h·ªá
√Åp d·ª•ng c√¥ng th·ª©c v√† t√≠nh to√°n
ƒê√°p √°n A ph√π h·ª£p v·ªõi k·∫øt qu·∫£
</think>
{"answer": "A"}...

üìö Subject: Physics
ID: MET_Phy_IE_2019_1

üìù Question:
C√¢u 1. 	M·ªôt v·∫≠t dao ƒë·ªông ƒëi·ªÅu ho√† theo ph∆∞∆°ng tr√¨nh: 
x = Acos(\omega*t+\varphi) v·ªõi A > 0 v√† omega > 0. Pha c·ªßa dao ƒë·ªông ·ªü th·ªùi ƒëi·ªÉm t l√† 
A. \omega
B. cos(\omega*t+ \varphi)
C. \omega*t+ \varphi
D. ...

ü§î Assistant Response:
<think>
X√°c ƒë·ªãnh hi·ªán t∆∞·ª£ng v·∫≠t l√Ω v√† ƒë·ªãnh lu·∫≠t li√™n quan
Ph√¢n t√≠ch c√°c ƒë·∫°i l∆∞·ª£ng v√† m·ªëi quan h·ªá
√Åp d·ª•ng ƒë·ªãnh lu·∫≠t v√† t√≠nh to√°n
ƒê√°p √°n C ƒë√∫ng theo ƒë·ªãnh lu·∫≠t v·∫≠t l√Ω
</think>
{

## 6. C·∫≠p nh·∫≠t system prompt

Khi training v·ªõi reasoning, c·∫ßn c·∫≠p nh·∫≠t system prompt:

In [8]:
# System prompt m·ªõi
NEW_SYSTEM_PROMPT = """B·∫°n l√† tr·ª£ l√Ω tr·∫£ l·ªùi tr·∫Øc nghi·ªám th√¥ng minh. 
H√£y suy lu·∫≠n t·ª´ng b∆∞·ªõc trong th·∫ª <think>, sau ƒë√≥ tr·∫£ JSON v·ªõi ƒë√°p √°n.
Format: 
<think>B∆∞·ªõc 1: ...
B∆∞·ªõc 2: ...
K·∫øt lu·∫≠n: ...</think>
{"answer":"A"}"""

print("New system prompt:")
print(NEW_SYSTEM_PROMPT)
print("\n" + "="*80)

# Update system message in all samples
def update_system_prompt(data, new_prompt):
    for item in data:
        item['messages'][0]['content'] = new_prompt
    return data

# Apply
augmented_data = update_system_prompt(augmented_data, NEW_SYSTEM_PROMPT)

# Save again
with open(output_file, 'w', encoding='utf-8') as f:
    for item in augmented_data:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

print(f"Updated system prompt and saved to {output_file}")

New system prompt:
B·∫°n l√† tr·ª£ l√Ω tr·∫£ l·ªùi tr·∫Øc nghi·ªám th√¥ng minh. 
H√£y suy lu·∫≠n t·ª´ng b∆∞·ªõc trong th·∫ª <think>, sau ƒë√≥ tr·∫£ JSON v·ªõi ƒë√°p √°n.
Format: 
<think>B∆∞·ªõc 1: ...
B∆∞·ªõc 2: ...
K·∫øt lu·∫≠n: ...</think>
{"answer":"A"}

Updated system prompt and saved to ..\data\sft_dataset_vnhsge\train_sft_with_reasoning.jsonl


## 7. Statistics v√† ph√¢n t√≠ch

In [9]:
# T√≠nh to√°n statistics
import numpy as np

reasoning_lengths = []
total_lengths = []

for item in augmented_data:
    response_content = item['messages'][2]['content']
    
    # Check if response has reasoning (new format)
    if '<think>' in response_content:
        # Extract reasoning part (from <think> to </think>)
        import re
        match = re.search(r'<think>.*?</think>', response_content, re.DOTALL)
        if match:
            reasoning_text = match.group(0)
            reasoning_lengths.append(len(reasoning_text))
            total_lengths.append(len(item['messages'][1]['content']) + len(reasoning_text))

print(f"Samples with reasoning: {len(reasoning_lengths)}")
print(f"\nReasoning length statistics:")
if reasoning_lengths:
    print(f"  Mean: {np.mean(reasoning_lengths):.0f} characters")
    print(f"  Median: {np.median(reasoning_lengths):.0f} characters")
    print(f"  Min: {np.min(reasoning_lengths):.0f} characters")
    print(f"  Max: {np.max(reasoning_lengths):.0f} characters")

    print(f"\nTotal length (question + reasoning) statistics:")
    print(f"  Mean: {np.mean(total_lengths):.0f} characters")
    print(f"  Max: {np.max(total_lengths):.0f} characters")

    # ∆Ø·ªõc t√≠nh tokens (roughly 1 token ~ 4 characters for Vietnamese)
    avg_tokens = np.mean(total_lengths) / 4
    print(f"\nEstimated average tokens: ~{avg_tokens:.0f} tokens")
else:
    print("No samples with reasoning found")

Samples with reasoning: 1573

Reasoning length statistics:
  Mean: 159 characters
  Median: 157 characters
  Min: 147 characters
  Max: 172 characters

Total length (question + reasoning) statistics:
  Mean: 516 characters
  Max: 3272 characters

Estimated average tokens: ~129 tokens


## 8. T·∫°o mixed dataset (c√≥ v√† kh√¥ng c√≥ reasoning)

C√≥ th·ªÉ t·∫°o dataset k·∫øt h·ª£p ƒë·ªÉ model h·ªçc c·∫£ hai c√°ch:

In [10]:
# T·∫°o mixed dataset: 50% c√≥ reasoning, 50% kh√¥ng c√≥
mixed_output = Path("../data/sft_dataset_vnhsge/train_sft_mixed_reasoning.jsonl")

print("Creating mixed dataset (50% with reasoning, 50% without)...")
augmented_mixed = add_reasoning_to_dataset(
    input_file=input_file,
    output_file=mixed_output,
    sample_ratio=0.5  # 50% c√≥ reasoning
)

# Count samples with reasoning (new format)
with_reasoning = sum(1 for item in augmented_mixed 
                     if '<think>' in item['messages'][2]['content'])
print(f"\nMixed dataset statistics:")
print(f"  Total samples: {len(augmented_mixed)}")
print(f"  With reasoning: {with_reasoning} ({with_reasoning/len(augmented_mixed)*100:.1f}%)")
print(f"  Without reasoning: {len(augmented_mixed)-with_reasoning} ({(len(augmented_mixed)-with_reasoning)/len(augmented_mixed)*100:.1f}%)")

Creating mixed dataset (50% with reasoning, 50% without)...
Loaded 1573 samples


Adding reasoning: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1573/1573 [00:00<00:00, 56284.25it/s]


Processed 1573 samples
Skipped: 0
Added reasoning to: 1573 samples

Saved to ..\data\sft_dataset_vnhsge\train_sft_mixed_reasoning.jsonl

Mixed dataset statistics:
  Total samples: 1573
  With reasoning: 786 (50.0%)
  Without reasoning: 787 (50.0%)





## T·ªïng k·∫øt

**Files ƒë√£ t·∫°o:**
1. `train_sft_with_reasoning.jsonl` - 100% samples c√≥ reasoning
2. `train_sft_mixed_reasoning.jsonl` - 50% c√≥ reasoning, 50% kh√¥ng

**L·ª£i √≠ch:**
- Model h·ªçc c√°ch suy lu·∫≠n c√≥ c·∫•u tr√∫c
- TƒÉng t√≠nh gi·∫£i th√≠ch ƒë∆∞·ª£c
- C√≥ th·ªÉ c·∫£i thi·ªán accuracy nh·ªù reasoning steps

**L∆∞u √Ω khi training:**
- C·∫ßn tƒÉng `max_seq_length` v√¨ c√≥ th√™m reasoning
- C√≥ th·ªÉ c·∫ßn ƒëi·ªÅu ch·ªânh `batch_size` do sequence d√†i h∆°n
- N√™n d√πng `completion_only_loss=True` ƒë·ªÉ ch·ªâ t√≠nh loss tr√™n reasoning + answer