# README
This script creates a JSON file (`alpaca_data.json`) of question-answer pairs in Alpaca format (instruction / input / output).

Run this after `md_data_post_process.ipynb`

---

In [18]:
import os
import re
import json

def clean_instruction(text):
    """
    Turns a markdown header line into plain instruction text.

    Emphasis markers (**bold**, *italic*, __bold__, _italic_) are removed while
    the emphasised text is kept, then the leading '#' characters and the
    surrounding whitespace are dropped.

    Underscore markers are only treated as emphasis when they are not inside a
    word, so identifiers such as ``config_file_name`` keep their underscores.

    Args:
        text: A single line of markdown, typically a header such as
            "## **Section 4: Overview**".

    Returns:
        The text without emphasis markers, leading '#' characters, or
        leading/trailing whitespace.
    """
    # Double markers go first so "**bold**" is not half-consumed by the
    # single-asterisk rule.
    text = re.sub(r"\*\*(.*?)\*\*", r"\1", text)
    text = re.sub(r"\*(.*?)\*", r"\1", text)

    # The lookarounds require the delimiters to sit outside a word; without
    # them "my_file_name" would be rewritten to "myfilename".
    text = re.sub(r"(?<!\w)__(.*?)__(?!\w)", r"\1", text)
    text = re.sub(r"(?<!\w)_([^_]+)_(?!\w)", r"\1", text)

    # Strip whitespace first so an indented header ("  ## Title") still loses
    # its '#' prefix, then trim the space that followed the hashes.
    return text.strip().lstrip("#").strip()





def split_header_content(text):
    """
    Separates markdown text into "Section" header lines and the lines after them.

    A line counts as a section header when it starts with '#' characters and
    contains the word "Section". Every such line is collected in the first
    list. Every other line that appears after the first section header is
    collected in the second list; lines before the first section header are
    discarded.

    Returns:
        A tuple ``(header, content)`` of two lists of lines.
    """
    section_headers, body_lines = [], []
    seen_header = False

    for row in text.splitlines():
        is_section_header = (
            "Section" in row and re.match(r"^#{1,6}\s*", row) is not None
        )
        if is_section_header:
            section_headers.append(row)
            seen_header = True
            continue

        # Body lines only count once a section header has been encountered.
        if seen_header:
            body_lines.append(row)

    return section_headers, body_lines




def get_alpaca_data(root_directory):
    """
    Builds Alpaca-style records (instruction / input / output) from markdown files.

    Each directory under ``root_directory`` is handled on its own:

    * A ".md" file whose name does not contain "section" (case-insensitive)
      supplies the "input" text shared by every record of that directory.
    * Each ".md" file whose name contains "section" yields one candidate
      record: its first "Section" header, cleaned of markdown, becomes the
      "instruction" and the lines after that header become the "output".

    Directories and files are visited in sorted name order so that repeated
    runs produce the same list regardless of the order in which the
    filesystem enumerates entries. If a directory holds more than one common
    input file, the last one in sorted order is used.

    Args:
        root_directory: Path of the directory tree to scan.

    Returns:
        A list of dicts with the keys "instruction", "input" and "output".
        Directories without a non-empty common input, and files without any
        output text, contribute nothing.
    """
    alpaca_list = []

    for root, dirs, files in os.walk(root_directory):
        # os.walk reports names in arbitrary order; sorting the directory
        # list in place (top-down walk) fixes the traversal order as well.
        dirs.sort()

        common_input = None
        instruction_files = []

        # First, separate the common input file from the instruction files.
        for file in sorted(files):
            if not file.endswith(".md"):
                continue
            file_path = os.path.join(root, file)
            if "section" in file.lower():
                # Files with "section" in the name are instruction/output files.
                instruction_files.append(file_path)
            else:
                # Any other markdown file is the common input.
                with open(file_path, 'r', encoding='utf-8') as f:
                    common_input = f.read().strip()

        # Without a (non-empty) common input there is nothing to pair with.
        if not common_input:
            continue

        for inst_file in instruction_files:
            with open(inst_file, 'r', encoding='utf-8') as f:
                document_text = f.read().strip()

            header, content = split_header_content(document_text)

            # Use the first header as the instruction.
            if header:
                instruction = clean_instruction(header[0])
            else:
                instruction = "No instruction found for file " + inst_file

            output_text = "\n".join(content).strip()

            # Only add if both instruction and output are present.
            if instruction and output_text:
                alpaca_list.append({
                    "instruction": instruction,
                    "input": common_input,
                    "output": output_text,
                })

    return alpaca_list





# Root of the cleaned markdown tree; the JSON result is written into it.
root_directory = "./../../Output_table_cleaned2"
output_json_file = os.path.join(root_directory, "alpaca_data.json")

# Collect one record per instruction file found under the root.
alpaca_data = get_alpaca_data(root_directory)

# Serialise the records, keeping non-ASCII characters as-is, then write the
# whole document in a single call.
serialized = json.dumps(alpaca_data, indent=2, ensure_ascii=False)
with open(output_json_file, "w", encoding="utf-8") as out_f:
    out_f.write(serialized)

print(f"Alpaca format JSON saved to: {output_json_file}")


Alpaca format JSON saved to: ./../../Output_table_cleaned2\alpaca_data.json
