In [None]:
import json
import pandas as pd
import os

# Location of the source dataset, relative to the notebook's working directory.
# Built with os.path.join instead of a backslash literal: "\.", "\s" and "\d" are
# invalid escape sequences in a normal string, and the joined form also resolves
# on non-Windows systems.
FILE_NAME = os.path.join("..", "..", "storage", "data", "dataset.json")
def load_data(file_path):
    """Read a JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read (opened as UTF-8 text).

    Returns:
        The parsed JSON value, or None when the file does not exist, is not
        valid JSON, or any other exception occurs while loading. A status or
        error message is printed in every case.
    """
    result = None
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                parsed = json.load(handle)
            # len() stays inside the try so a top-level value without a length
            # is reported through the generic handler rather than raising.
            print(f"Đã tải thành công {len(parsed)} đối tượng Project.")
            result = parsed
        except json.JSONDecodeError as err:
            print(f"Lỗi khi đọc JSON: {err}")
        except Exception as err:
            print(f"Lỗi không xác định khi tải dữ liệu: {err}")
    else:
        print(f"Lỗi: Không tìm thấy tệp tại đường dẫn: {file_path}")
    return result

# Load the dataset from FILE_NAME; load_data returns None if loading fails
data_json = load_data(FILE_NAME)

Đã tải thành công 5 đối tượng Project.


In [None]:
# System-role messages, one per report type; each is passed as `system_role`
# to format_prompt when the corresponding prompt is built.
SYSTEM_DAILY = "You are a team leader writing internal daily progress notes."
SYSTEM_IN_PROGRESS = "You are a software engineer reporting task progress to your manager."
SYSTEM_COMPLETED = "You are a software engineer writing completion reports."

def format_prompt(system_role: str, user_instruction: str, assistant_response: str) -> str:
    """Assemble a three-turn prompt string using Llama 3 style special tokens.

    Args:
        system_role: Text of the system turn.
        user_instruction: Text of the user turn.
        assistant_response: Text of the assistant turn.

    Returns:
        "<|begin_of_text|>" followed by the system, user and assistant turns.
        Each turn is a header line, the content, and a closing "<|eot_id|>"
        line; the pieces are separated by single newlines.

    NOTE(review): Meta's published Llama 3 template appears to use a blank
    line after each header and no newline before <|eot_id|> - confirm this
    layout matches the chat template used at training/inference time.
    """
    turns = (
        ("system", system_role),
        ("user", user_instruction),
        ("assistant", assistant_response),
    )
    rendered = [
        f"<|start_header_id|>{role}<|end_header_id|>\n{content}\n<|eot_id|>"
        for role, content in turns
    ]
    return "<|begin_of_text|>\n" + "\n".join(rendered)

def _child_list(node, key):
    """Return the value stored under ``key``, treating a missing or null/empty value as []."""
    return node.get(key) or []


def _task_context(project, phase, collection, task):
    """Render the Project/Phase/Collection/Task lines shared by every prompt type."""
    return (
        f"Project: {project.get('project_name', 'N/A')} ({project.get('project_description', 'N/A')})\n"
        f"Phase: {phase.get('phase_name', 'N/A')} ({phase.get('phase_description', 'N/A')})\n"
        f"Collection: {collection.get('collection_name', 'N/A')} ({collection.get('collection_description', 'N/A')})\n"
        f"Task: {task.get('task_name', 'N/A')} ({task.get('task_description', 'N/A')})\n"
    )


def generate_finetuning_dataset(data):
    """Convert the project hierarchy into fine-tuning prompts and count them by type.

    The input is walked as projects -> "phases" -> "collections" -> "tasks" ->
    "subtasks". A prompt is emitted for every non-empty task "daily_report"
    and for every non-empty subtask "processing_report" / "completed_report".
    Containers that are missing or null are treated as empty.

    Args:
        data: Iterable of project dicts (as loaded from the dataset JSON).
            A falsy value (None, empty list) yields an empty result.

    Returns:
        A tuple ``(finetuning_dataset, report_counts)`` where
        ``finetuning_dataset`` is a list of ``{"prompt": str}`` records and
        ``report_counts`` maps "Daily Report", "Processing Report" and
        "Completed Report" to the number of prompts of that type.
    """
    finetuning_dataset = []
    report_counts = {
        "Daily Report": 0,
        "Processing Report": 0,
        "Completed Report": 0,
    }

    if not data:
        return finetuning_dataset, report_counts

    for project in data:
        for phase in _child_list(project, "phases"):
            for collection in _child_list(phase, "collections"):
                for task in _child_list(collection, "tasks"):
                    context = _task_context(project, phase, collection, task)
                    subtasks = _child_list(task, "subtasks")

                    # --- Type A: daily report (task level) ---
                    daily_report_text = task.get("daily_report")
                    if daily_report_text:
                        subtasks_info = "\n".join(
                            f"- {s.get('subtask_name', 'N/A')} - Status: {s.get('status', 'N/A')}"
                            for s in subtasks
                        )
                        user_instruction = (
                            "Generate DAILY_PROGRESS_NOTE based on the following project context:\n\n"
                            + context
                            + f"Subtasks Status:\n{subtasks_info}"
                        )
                        prompt = format_prompt(SYSTEM_DAILY, user_instruction, daily_report_text)
                        finetuning_dataset.append({"prompt": prompt})
                        report_counts["Daily Report"] += 1

                    # --- Types B & C: processing / completed report (subtask level) ---
                    for subtask in subtasks:
                        user_instruction_base = context + (
                            f"Subtask: {subtask.get('subtask_name', 'N/A')} ({subtask.get('subtask_description', 'N/A')})"
                        )

                        # B. Processing report
                        processing_report_text = subtask.get("processing_report")
                        if processing_report_text:
                            user_instruction = (
                                "Generate IN_PROGRESS_EMAIL based on the following task context:\n\n"
                                + user_instruction_base
                            )
                            prompt = format_prompt(SYSTEM_IN_PROGRESS, user_instruction, processing_report_text)
                            finetuning_dataset.append({"prompt": prompt})
                            report_counts["Processing Report"] += 1

                        # C. Completed report
                        completed_report_text = subtask.get("completed_report")
                        if completed_report_text:
                            user_instruction = (
                                "Generate COMPLETED_EMAIL based on the following task context:\n\n"
                                + user_instruction_base
                            )
                            prompt = format_prompt(SYSTEM_COMPLETED, user_instruction, completed_report_text)
                            finetuning_dataset.append({"prompt": prompt})
                            report_counts["Completed Report"] += 1

    return finetuning_dataset, report_counts

In [None]:
if data_json:
    # Convert the loaded hierarchy into prompt records and per-type counts
    final_dataset, counts = generate_finetuning_dataset(data_json)

    print("\n--- KẾT QUẢ CHUYỂN ĐỔI VÀ THỐNG KÊ ---")

    print(f"\nTổng số lượng Prompt đã tạo: {len(final_dataset)}")

    # One-column frame (report type -> count) for rich notebook display
    counts_df = pd.Series(counts, name="Số lượng Prompts").to_frame()

    print("\nThống kê theo Loại Report:")
    display(counts_df)

    # Preview the first prompts (at most three)
    print("\n--- 3 MẪU PROMPT ĐẦU TIÊN ---")
    for sample_no, record in enumerate(final_dataset[:3], start=1):
        print(f"\n[Mẫu {sample_no}]:\n{record['prompt']}")
        print("-" * 20)

    # Save the dataset. The path is built with os.path.join because the former
    # backslash literal contained "\f", which Python interprets as a form-feed
    # character and which corrupted the output file name.
    output_file = os.path.join("..", "..", "storage", "data", "finetuning_dataset_prompts.json")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(final_dataset, f, ensure_ascii=False, indent=2)
    print(f"\nĐã lưu dataset cuối cùng vào tệp: {output_file}")


--- KẾT QUẢ CHUYỂN ĐỔI VÀ THỐNG KÊ ---

Tổng số lượng Prompt (mẫu đào tạo) đã tạo: 1797

Thống kê theo Loại Report:


Unnamed: 0,Số lượng Prompts
Daily Report,278
Processing Report,712
Completed Report,807



--- 3 MẪU PROMPT ĐẦU TIÊN ---

[Mẫu 1]:
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are a team leader writing internal daily progress notes.
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
Generate DAILY_PROGRESS_NOTE based on the following project context:

Project: HealthTrack Mobile App (A secure mobile application that allows patients to monitor vital signs, schedule appointments, and access electronic health records.)
Phase: Project Initiation & Requirements Gathering (Define project scope, objectives, and stakeholder expectations. Conduct workshops with healthcare professionals, patients, and IT staff to capture functional and non‑functional requirements, including data privacy and regulatory compliance.)
Collection: Stakeholder Requirement Workshops (Facilitate structured workshops with healthcare professionals, patients, and IT staff to elicit detailed functional and non‑functional requirements.)
Task: Conduct Stakeholder Interviews (Arrange and conduc