# 

# README
This script transforms Markdown files into question-answer pairs.

`filename.md` contains Sections 1 to 3 as input.

`filename(Section X).md` contains content from individual sections, used as instructions and answers.

Run this before `md_to_json.ipynb`

---

# Process tabular data

In [1]:
import os


# Tree of converted Markdown files, and the mirror tree that receives the repaired copies.
input_dir = "./../../Output_M"
output_dir = "./../../Output_table_cleaned"
os.makedirs(output_dir, exist_ok=True)


for root, _, files in os.walk(input_dir):
    for file in files:
        if not file.endswith(".md"):
            continue

        input_file = os.path.join(root, file)

        # Recreate the file's position relative to input_dir underneath output_dir.
        relative_path = os.path.relpath(input_file, input_dir)
        output_file = os.path.join(output_dir, relative_path)
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

        with open(input_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        cleaned_lines = []
        i = 0
        # The pattern spans four consecutive lines, so scanning stops once
        # fewer than four lines remain.
        while i < len(lines) - 3:
            broken_header = (
                lines[i].startswith("|")
                and lines[i + 1].strip() == ""
                and lines[i + 2].startswith("|")
                and lines[i + 3].startswith("|-")
            )
            if not broken_header:
                cleaned_lines.append(lines[i])
                i += 1
                continue

            print("Found misbehaved table header at line", i, "in", file)
            # Keep the two table rows; drop the blank line between them and
            # the "|-" separator row that follows.
            cleaned_lines.append(lines[i])
            cleaned_lines.append(lines[i + 2])
            i += 4

        # Copy through whatever the four-line window never reached.
        cleaned_lines.extend(lines[i:])

        # Write the repaired copy.
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(cleaned_lines)

Found misbehaved table header at line 77 in WisTMP 8026 (1011-01-30) (1).md
Found misbehaved table header at line 98 in WisTMP 8026 (1011-01-30) (1).md
Found misbehaved table header at line 144 in WisTMP 8026 (1011-01-30) (1).md
Found misbehaved table header at line 53 in WisTMP 7999 (2695-07-01).md
Found misbehaved table header at line 53 in WisTMP 7605 (1610-41-00).md
Found misbehaved table header at line 87 in WisTMP 7605 (1610-41-00).md
Found misbehaved table header at line 106 in WisTMP 7605 (1610-41-00).md
Found misbehaved table header at line 160 in WisTMP 7605 (1610-41-00).md
Found misbehaved table header at line 53 in WisTMP 7699 (1170-20-02) (1).md
Found misbehaved table header at line 162 in WisTMP 7699 (1170-20-02) (1).md
Found misbehaved table header at line 53 in WisTMP 7682 (9140-12-31) (2).md
Found misbehaved table header at line 108 in WisTMP 7682 (9140-12-31) (2).md
Found misbehaved table header at line 138 in WisTMP 7682 (9140-12-31) (2).md
Found misbehaved table hea

# Extract comments (no longer needed)

In [8]:
import os
import re

def remove_date_author(text):
    """
    Remove author/date attributions from the text.

    Every parenthesized expression that starts with "By " is deleted,
    together with any whitespace directly in front of it.

    Args:
        text: The text to clean.

    Returns:
        The text with all "(By ...)" expressions removed.
    """
    return re.sub(r"\s*\(By [^)]+\)", "", text)


def extract_section_name_and_comments(text):
    """
    Extract (section name, section comment) tuples from the document text.

    Assumes that:
      - A section header is a markdown header line whose bold text starts
        with "Section <number> - ..." (e.g., **Section 4 - Work Zone Strategies**).
      - A line containing **Section Comments** starts a comment block; that
        line itself is not part of the comment.
      - A comment block ends at the next line starting with "#", at the next
        **Section Comments** line, or at the end of the text.

    Each comment block is attributed to the section header seen most recently
    *before* the block ended. A section header that terminates a block only
    becomes the current section after the block has been stored.

    Args:
        text: Full markdown text of one document.

    Returns:
        A list of tuples (section name, comment text). The section name is
        None when no section header preceded the comment block. The comment
        text is stripped and has "(By ...)" attributions removed.
    """
    current_section = None
    results = []
    capturing = False
    comment_lines = []

    def flush():
        # Store the pending comment block (if any) under the current section.
        if comment_lines:
            comment_text = "\n".join(comment_lines).strip()
            results.append((current_section, remove_date_author(comment_text)))
            comment_lines.clear()

    for line in text.splitlines():
        section_header_match = re.match(
            r"^#{1,6}\s*\*\*(Section\s+\d+\s*-\s*.*?)\*\*", line
        )

        if re.search(r"\*\*Section Comments\*\*", line):
            # A new comment block starts; close the previous one first so it
            # keeps the section it was collected under.
            flush()
            if section_header_match:
                current_section = section_header_match.group(1)
            capturing = True
            continue

        # Any markdown header (a line starting with "#") ends the block.
        if capturing and line.startswith("#"):
            flush()
            capturing = False

        # Update the section only after the pending block has been stored;
        # otherwise the block would be credited to the *next* section.
        if section_header_match:
            current_section = section_header_match.group(1)

        if capturing:
            comment_lines.append(line)

    # The text ended while a comment block was still open.
    if capturing:
        flush()

    return results

# Root of the table-cleaned markdown tree that is scanned for comments
input_base_dir = "./../../Output_table_cleaned"

# Walk the tree and handle every markdown file found
for root, _, files in os.walk(input_base_dir):
    for file in files:
        if not file.endswith(".md"):
            continue

        input_file = os.path.join(root, file)

        # The extracted comments go next to the input, with "_extracted" in the name
        output_file = os.path.join(root, file.replace(".md", "_extracted.md"))

        with open(input_file, 'r', encoding='utf-8') as f:
            document_text = f.read()

        section_data = extract_section_name_and_comments(document_text)

        # Files without any comment block produce no output file
        if not section_data:
            continue

        with open(output_file, 'w', encoding='utf-8') as out_file:
            for section_name, comment in section_data:
                out_file.write(f"{section_name}\n{comment}\n")

        print(f"Processed: {input_file} -> {output_file}")


Processed: ./../../Output_table_cleaned/WisTMP 7999 (2695-07-01)/WisTMP 7999 (2695-07-01).md -> ./../../Output_table_cleaned/WisTMP 7999 (2695-07-01)/WisTMP 7999 (2695-07-01)_extracted.md
Processed: ./../../Output_table_cleaned/WisTMP 7834 (2250-14-11)/WisTMP 7834 (2250-14-11).md -> ./../../Output_table_cleaned/WisTMP 7834 (2250-14-11)/WisTMP 7834 (2250-14-11)_extracted.md
Processed: ./../../Output_table_cleaned/WisTMP 8172 (4236-02-00)/WisTMP 8172 (4236-02-00).md -> ./../../Output_table_cleaned/WisTMP 8172 (4236-02-00)/WisTMP 8172 (4236-02-00)_extracted.md
Processed: ./../../Output_table_cleaned/WisTMP 7828 (1330-29-00)/WisTMP 7828 (1330-29-00).md -> ./../../Output_table_cleaned/WisTMP 7828 (1330-29-00)/WisTMP 7828 (1330-29-00)_extracted.md
Processed: ./../../Output_table_cleaned/WisTMP 8941 (8941-05-01)/WisTMP 8941 (8941-05-01).md -> ./../../Output_table_cleaned/WisTMP 8941 (8941-05-01)/WisTMP 8941 (8941-05-01)_extracted.md
Processed: ./../../Output_table_cleaned/WisTMP 8741 (NO DESI

# Remove and extract comments (no longer used)

In [None]:
import os
import re

def remove_date_author(text):
    """Delete every "(By ...)" attribution and the whitespace directly before it."""
    attribution = re.compile(r"\s*\(By [^)]+\)")
    return attribution.sub("", text)

def extract_and_remove_comments(text):
    """
    Pull the "Section Comments" blocks out of a markdown document.

    A block starts at a line containing **Section Comments** and ends at the
    next markdown header line, at the next **Section Comments** line, or at
    the end of the text. The marker line and the block body are left out of
    the cleaned text; the header line that ends a block is kept.

    Returns:
      - results: a list of tuples (section name, comment text), where the
        section name is the most recent "Section <number> - ..." header seen
        outside a comment block (None if there was none)
      - cleaned_text: the remaining lines joined with newlines
    """
    section_header = re.compile(r"^#{1,6}\s*\*\*(Section\s+\d+\s*-\s*.*?)\*\*")
    comments_marker = re.compile(r"\*\*Section Comments\*\*")
    any_header = re.compile(r"^#{1,6}\s*")

    current_section = None
    results = []
    kept = []
    pending = []
    in_comments = False

    def store_pending():
        # Record the collected block under the section that is current right now.
        body = "\n".join(pending).strip()
        results.append((current_section, remove_date_author(body)))

    for line in text.splitlines():
        # Section headers are only tracked here while outside a comment block.
        if not in_comments:
            found = section_header.match(line)
            if found:
                current_section = found.group(1)

        # A marker line opens a block (closing an already open one first)
        # and is never copied to the cleaned text.
        if comments_marker.search(line):
            if in_comments and pending:
                store_pending()
                pending = []
            in_comments = True
            continue

        if not in_comments:
            kept.append(line)
            continue

        if not any_header.match(line):
            pending.append(line)
            continue

        # A header line closes the open block and is itself kept as regular
        # text; it may also name the next section.
        if pending:
            store_pending()
        in_comments = False
        pending = []
        found = section_header.match(line)
        if found:
            current_section = found.group(1)
        kept.append(line)

    # The text ended inside a block.
    if in_comments and pending:
        store_pending()

    return results, "\n".join(kept)


# Root of the markdown tree whose files are rewritten in place
input_base_dir = "./../../Output_M"

# Walk the tree and handle every markdown file found.
for root, _, files in os.walk(input_base_dir):
    for file in files:
        if not file.endswith(".md"):
            continue

        input_file = os.path.join(root, file)
        # Extracted comments are stored beside the original file.
        output_file = os.path.join(root, file.replace(".md", "_extracted.md"))

        with open(input_file, 'r', encoding='utf-8') as f:
            document_text = f.read()

        section_data, cleaned_text = extract_and_remove_comments(document_text)

        # An "_extracted" file is only written when comments were found.
        if section_data:
            with open(output_file, 'w', encoding='utf-8') as out_file:
                for section_name, comment in section_data:
                    out_file.write(f"{section_name}\n{comment}\n")
            print(f"Extracted comments: {input_file} -> {output_file}")

        # The original is always rewritten with the comment blocks stripped.
        with open(input_file, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)
        print(f"Updated original file: {input_file}")


Updated original file: ./../../Output_M/WisTMP 8026 (1011-01-30) (1)/WisTMP 8026 (1011-01-30) (1).md
Extracted comments: ./../../Output_M/WisTMP 7999 (2695-07-01)/WisTMP 7999 (2695-07-01).md -> ./../../Output_M/WisTMP 7999 (2695-07-01)/WisTMP 7999 (2695-07-01)_extracted.md
Updated original file: ./../../Output_M/WisTMP 7999 (2695-07-01)/WisTMP 7999 (2695-07-01).md
Updated original file: ./../../Output_M/WisTMP 7605 (1610-41-00)/WisTMP 7605 (1610-41-00).md
Updated original file: ./../../Output_M/WisTMP 7699 (1170-20-02) (1)/WisTMP 7699 (1170-20-02) (1).md
Updated original file: ./../../Output_M/WisTMP 8551 (5808-00-06) (1)/WisTMP 8551 (5808-00-06) (1).md
Updated original file: ./../../Output_M/WisTMP 7682 (9140-12-31) (2)/WisTMP 7682 (9140-12-31) (2).md
Extracted comments: ./../../Output_M/WisTMP 7834 (2250-14-11)/WisTMP 7834 (2250-14-11).md -> ./../../Output_M/WisTMP 7834 (2250-14-11)/WisTMP 7834 (2250-14-11)_extracted.md
Updated original file: ./../../Output_M/WisTMP 7834 (2250-14-11)

# Remove all extracted comment files

In [2]:
import os

# Root of the tree to purge
input_base_dir = "./../../Output_table_cleaned2"

# Visit every directory and delete the files whose name ends in "extracted.md".
for root, _, files in os.walk(input_base_dir):
    for file in [name for name in files if name.endswith("extracted.md")]:
        os.remove(os.path.join(root, file))
        print(f"Deleted: {file}")

Deleted: WisTMP 7041 (5730-00-00)_extracted.md
Deleted: WisTMP 7042 (9545-00-01)_extracted.md
Deleted: WisTMP 7043 (5130-00-00)_extracted.md
Deleted: WisTMP 7044 (1021-03-10)_extracted.md
Deleted: WisTMP 7046 (5579-00-03)_extracted.md
Deleted: WisTMP 7047 (3831-00-01)_extracted.md
Deleted: WisTMP 7048 (1180-00-05)_extracted.md
Deleted: WisTMP 7051 (1520-00-03)_extracted.md
Deleted: WisTMP 7054 (6220-00-31)_extracted.md
Deleted: WisTMP 7056 (6370-00-02)_extracted.md
Deleted: WisTMP 7064 (5540-02-00)_extracted.md
Deleted: WisTMP 7066 (3700-50-50)_extracted.md
Deleted: WisTMP 7067 (6150-00-02)_extracted.md
Deleted: WisTMP 7071 (8866-00-02)_extracted.md
Deleted: WisTMP 7074 (6540-11-00)_extracted.md
Deleted: WisTMP 7075 (6540-10-00)_extracted.md
Deleted: WisTMP 7076 (3830-02-00)_extracted.md
Deleted: WisTMP 7078 (3700-20-51)_extracted.md
Deleted: WisTMP 7079 (4867-03-00)_extracted.md
Deleted: WisTMP 7080 (2984-14-02)_extracted.md
Deleted: WisTMP 7081 (1204-05-03)_extracted.md
Deleted: WisT

# Remove all directories that contain non-approved files

In [6]:
import shutil

# Base directory for input files
input_base_dir = "./../../Output_table_cleaned2"

# Walk the tree top-down and delete every directory whose name ends in
# "(1)" or "(2)".
for root, dirs, _ in os.walk(input_base_dir):
    # Iterate over a snapshot: `dirs` is edited in place below so that
    # os.walk does not try to descend into a directory that no longer exists.
    for dir_name in list(dirs):
        if not dir_name.endswith(("(1)", "(2)")):
            continue

        full_path = os.path.join(root, dir_name)
        try:
            shutil.rmtree(full_path)
        except OSError as e:
            # Best effort: report the failure and carry on with the rest.
            # The directory stays in `dirs`, so the walk still visits it.
            print(f"Failed to delete {full_path}: {e}")
        else:
            print(f"Deleted: {full_path}")
            dirs.remove(dir_name)

Deleted: ./../../Output_table_cleaned2\WisTMP 7041 (5730-00-00) (1)
Deleted: ./../../Output_table_cleaned2\WisTMP 7041 (5730-00-00) (2)
Deleted: ./../../Output_table_cleaned2\WisTMP 7042 (9545-00-01) (1)
Deleted: ./../../Output_table_cleaned2\WisTMP 7042 (9545-00-01) (2)
Deleted: ./../../Output_table_cleaned2\WisTMP 7043 (5130-00-00) (1)
Deleted: ./../../Output_table_cleaned2\WisTMP 7043 (5130-00-00) (2)
Deleted: ./../../Output_table_cleaned2\WisTMP 7044 (1021-03-10) (1)
Deleted: ./../../Output_table_cleaned2\WisTMP 7044 (1021-03-10) (2)
Deleted: ./../../Output_table_cleaned2\WisTMP 7046 (5579-00-03) (1)
Deleted: ./../../Output_table_cleaned2\WisTMP 7046 (5579-00-03) (2)
Deleted: ./../../Output_table_cleaned2\WisTMP 7047 (3831-00-01) (1)
Deleted: ./../../Output_table_cleaned2\WisTMP 7047 (3831-00-01) (2)
Deleted: ./../../Output_table_cleaned2\WisTMP 7048 (1180-00-05) (1)
Deleted: ./../../Output_table_cleaned2\WisTMP 7048 (1180-00-05) (2)
Deleted: ./../../Output_table_cleaned2\WisTMP 70

# Remove and Extract section 4 to 9 into QA pairs

In [3]:
import os
import re



def extract_and_remove_section(text, section_name):
    """
    Split a named section out of a markdown document.

    Capturing starts at a markdown header line (a line starting with "#")
    that contains ``section_name`` as a plain substring. That header line is
    remembered but copied neither to the captured text nor to the cleaned
    text. While capturing, header lines are handled as follows:

      - a header containing "Attachments" ends the capture and is discarded;
      - a header containing "Section" and a "+" is treated as a subsection
        and stays inside the captured text;
      - any other header containing "Section" ends the capture and is kept
        in the cleaned text;
      - every other line, including headers without these words, is captured.

    Capturing can start again later if another matching header appears; each
    non-empty captured block becomes its own entry in the results.

    Args:
        text: Full markdown text of one document.
        section_name: Text to look for in header lines, e.g. "Section 4".

    Returns:
        A tuple (current_section, results, cleaned_text):
          - current_section: the last header line that started a capture,
            or None if no header matched;
          - results: list of captured blocks, each stripped of surrounding
            whitespace;
          - cleaned_text: the lines that were not captured, joined with
            newlines.
    """
    current_section = None
    results = []
    capturing = False
    section_info = []
    cleaned_lines = []

    def flush():
        # Store the captured block (if any) and always start the next one
        # empty, so a later capture never repeats text from an earlier one.
        if section_info:
            results.append("\n".join(section_info).strip())
        section_info.clear()

    for line in text.splitlines():
        is_header = line.startswith("#")

        if not capturing:
            if is_header and section_name in line:
                current_section = line
                capturing = True
            else:
                cleaned_lines.append(line)
            continue

        if is_header:
            if "Attachments" in line:
                # NOTE(review): the Attachments header itself is dropped, but
                # the lines that follow it end up in the cleaned text --
                # confirm this is intended.
                flush()
                capturing = False
                continue
            # A "+" marks a subsection header, which stays in the capture.
            if "Section" in line and "+" not in line:
                flush()
                capturing = False
                cleaned_lines.append(line)
                continue

        section_info.append(line)

    # The text ended while still capturing.
    if capturing:
        flush()

    return current_section, results, "\n".join(cleaned_lines)





def main_loop(input_base_dir, section_marker):
    """
    Move one section out of every source markdown file under a directory tree.

    Only files ending in ".md" whose name does not contain "section"
    (case-insensitive) are handled. For each of them the section is cut out
    with extract_and_remove_section; if anything was captured it is written
    to a sibling file whose name has ``section_marker`` inserted before
    ".md". The source file is always overwritten with the remaining text.

    Args:
        input_base_dir: Root directory that is walked recursively.
        section_marker: Section text to look for, e.g. "Section 4"; also
            used in the name of the output file.
    """
    for root, _, files in os.walk(input_base_dir):
        for file in files:
            # Skip non-markdown files and files already produced for a section.
            if not file.endswith(".md") or "section" in file.lower():
                continue

            input_file = os.path.join(root, file)
            # The extracted section is stored beside the source file.
            output_file = os.path.join(root, file.replace(".md", f"{section_marker}.md"))

            with open(input_file, 'r', encoding='utf-8') as f:
                document_text = f.read()

            current_section, section_data, cleaned_text = extract_and_remove_section(
                document_text, section_marker
            )

            # Header line first, then the captured blocks separated by blank lines.
            if section_data:
                with open(output_file, 'w', encoding='utf-8') as out_file:
                    out_file.write(f"{current_section}\n")
                    out_file.write("\n\n".join(section_data))

            # The source file keeps only what was not extracted.
            with open(input_file, 'w', encoding='utf-8') as f:
                f.write(cleaned_text)


   


# Root of the tree whose files are split into sections
input_base_dir = "./../../Output_table_cleaned2"

# One pass per section, in ascending order. `output_file` holds the section
# marker that main_loop searches for and puts into the output file name.
for output_file in (
    "Section 4",
    "Section 5",
    "Section 6",
    "Section 7",
    "Section 8",
    "Section 9",
):
    main_loop(input_base_dir, output_file)


Extracting Section 4
section ## **Section 4 - Work Zone Strategies**
section ## **Section 4 - Work Zone Strategies**
section # **Section 4 - Work Zone Strategies**
section # **Section 4 - Work Zone Strategies**
section # **Section 4 - Work Zone Strategies**
section # **Section 4 - Work Zone Strategies**
section ### **Section 4 - Work Zone Strategies**
section # **Section 4 - Work Zone Strategies**
section # **Section 4 - Work Zone Strategies**
section ### **Section 4 - Work Zone Strategies**
section ## **Section 4 - Work Zone Strategies**
section ## **Section 4 - Work Zone Strategies**
section # **Section 4 - Work Zone Strategies**
section ### **Section 4 - Work Zone Strategies**
section # **Section 4 - Work Zone Strategies**
section # **Section 4 - Work Zone Strategies**
section ### **Section 4 - Work Zone Strategies**
section # **Section 4 - Work Zone Strategies**
section ### **Section 4 - Work Zone Strategies**
section # **Section 4 - Work Zone Strategies**
section ## **Section 4 - 

# Remove the sectional attachments at the end

In [6]:
import os
import re



def remove_attachments_section(text):
    """
    Drop every "Attachments" section from a markdown document.

    A section starts at a markdown header line containing "Attachments" and
    runs until the next header line that does not contain "Attachments".
    The closing header is kept; the attachments header(s) and everything
    between them and the closing header are removed.

    Returns the remaining lines joined with newlines.
    """
    header = re.compile(r"^#{1,6}\s*")
    kept = []
    skipping = False

    for line in text.splitlines():
        if header.match(line):
            if "Attachments" in line:
                # Opens a skipped section, or extends the one already open;
                # either way the header line itself is dropped.
                skipping = True
                continue
            # Any other header ends a skipped section and is kept below.
            skipping = False

        if not skipping:
            kept.append(line)

    return "\n".join(kept)




def main_loop(input_base_dir):
    """
    Strip the "Attachments" sections from every source markdown file in a tree.

    Only files ending in ".md" whose name does not contain "section"
    (case-insensitive) are handled. Each one is read, passed through
    remove_attachments_section and overwritten with the result.

    Args:
        input_base_dir: Root directory that is walked recursively.
    """
    for root, _, files in os.walk(input_base_dir):
        for file in files:
            # Skip non-markdown files and the per-section files.
            if not file.endswith(".md") or "section" in file.lower():
                continue

            input_file = os.path.join(root, file)

            with open(input_file, 'r', encoding='utf-8') as f:
                document_text = f.read()

            cleaned_text = remove_attachments_section(document_text)

            # Rewrite the file in place without its attachments sections.
            with open(input_file, 'w', encoding='utf-8') as f:
                f.write(cleaned_text)

            print(f"Updated original file: {input_file}")


   


# Root directory of the markdown tree; main_loop walks it recursively and
# overwrites each matching file in place.
input_base_dir = "./../../Output_table_cleaned2"

# Strip the "Attachments" sections from every source markdown file in the tree.
main_loop(input_base_dir)




Updated original file: ./../../Output_table_cleaned2\WisTMP 7041 (5730-00-00)\WisTMP 7041 (5730-00-00).md
Updated original file: ./../../Output_table_cleaned2\WisTMP 7042 (9545-00-01)\WisTMP 7042 (9545-00-01).md
Updated original file: ./../../Output_table_cleaned2\WisTMP 7043 (5130-00-00)\WisTMP 7043 (5130-00-00).md
Updated original file: ./../../Output_table_cleaned2\WisTMP 7044 (1021-03-10)\WisTMP 7044 (1021-03-10).md
Updated original file: ./../../Output_table_cleaned2\WisTMP 7046 (5579-00-03)\WisTMP 7046 (5579-00-03).md
Updated original file: ./../../Output_table_cleaned2\WisTMP 7047 (3831-00-01)\WisTMP 7047 (3831-00-01).md
Updated original file: ./../../Output_table_cleaned2\WisTMP 7048 (1180-00-05)\WisTMP 7048 (1180-00-05).md
Updated original file: ./../../Output_table_cleaned2\WisTMP 7049 (4323-08-01)\WisTMP 7049 (4323-08-01).md
Updated original file: ./../../Output_table_cleaned2\WisTMP 7050 (5990-00-80)\WisTMP 7050 (5990-00-80).md
Updated original file: ./../../Output_table_cl