# Data Processing 

In [1]:
import os
import re
import json
from pathlib import Path

# Folder that is searched for the "*.md" input files.
DATA_DIR = Path("./data")
# Folder that receives the generated JSON files; created here if it is missing.
OUTPUT_DIR = Path("./parsed")
OUTPUT_DIR.mkdir(exist_ok=True)

def parse_markdown_file(filepath):
    """Parse one channel export written in Markdown into a list of threads.

    Every line is stripped and then classified as one of:

    * ``# Channel <name>``: sets the channel name stored on threads that
      start after it.
    * ``## Thread: <title>``: starts a new thread.
    * ``**<user>** [<timestamp>]``: starts a new message in the current
      thread.
    * any other non-empty line: continuation text of the current message.

    Args:
        filepath: Path of a UTF-8 encoded Markdown file.

    Returns:
        A list of dicts in file order, each with the keys ``"channel"``,
        ``"thread_title"`` and ``"messages"``. Every message is a dict with
        the keys ``"user"``, ``"timestamp"`` and ``"text"``; continuation
        lines are joined with single spaces (no trailing space).
    """
    channel_header = "# Channel"
    thread_prefix = "## Thread: "
    # A single pattern both recognises a message header and extracts its
    # fields, so recognition and extraction can never disagree.
    # NOTE(review): anything after the closing bracket on a header line is
    # discarded, as before; confirm the export never puts message text there.
    message_pattern = re.compile(r"\*\*(.+)\*\* \[(.+?)\]")

    channel = ""
    threads = []
    current_thread = None
    current_message = None

    with open(filepath, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            # Require the separating space (or a bare header) so headings
            # such as "# Channels ..." are not mistaken for a channel name.
            if line == channel_header or line.startswith(channel_header + " "):
                channel = line[len(channel_header):].strip()
                continue

            if line.startswith(thread_prefix):
                # Slice the prefix off instead of str.replace(), which would
                # also delete the same text if it recurred inside the title.
                current_thread = {
                    "channel": channel,
                    "thread_title": line[len(thread_prefix):].strip(),
                    "messages": [],
                }
                threads.append(current_thread)
                # Text that precedes this thread's first message must not be
                # glued onto the last message of the previous thread.
                current_message = None
                continue

            match = message_pattern.match(line)
            if match:
                if current_thread is None:
                    # A message before any thread header used to raise
                    # TypeError; keep it in an untitled thread instead.
                    current_thread = {
                        "channel": channel,
                        "thread_title": "",
                        "messages": [],
                    }
                    threads.append(current_thread)
                user, timestamp = match.groups()
                current_message = {
                    "user": user.strip(),
                    "timestamp": timestamp.strip(),
                    "text": "",
                }
                current_thread["messages"].append(current_message)
                continue

            # Continuation line: only meaningful once a message has started.
            if current_message is not None:
                if current_message["text"]:
                    current_message["text"] += " " + line
                else:
                    current_message["text"] = line

    return threads

def parse_and_save_each_file():
    """Convert every ``*.md`` file in ``DATA_DIR`` to its own JSON file.

    Each input is parsed with ``parse_markdown_file`` and written to
    ``OUTPUT_DIR`` under the input's stem plus ``.json``. A one-line summary
    is printed for every file written.
    """
    for md_path in DATA_DIR.glob("*.md"):
        output_path = OUTPUT_DIR / f"{md_path.stem}.json"
        threads = parse_markdown_file(md_path)
        with output_path.open("w", encoding="utf-8") as out:
            json.dump(threads, out, indent=2)
        print(f"✅ Saved {len(threads)} threads to {output_path.name}")

# Run the per-file conversion when this code is executed as the main module.
if __name__ == "__main__":
    parse_and_save_each_file()


✅ Saved 4 threads to week-3-channel.json
✅ Saved 5 threads to aws-channel.json
✅ Saved 5 threads to week-2-channel.json
✅ Saved 5 threads to week-1-channel.json


In [2]:
import os
import re
import json
from pathlib import Path

# Folder that is searched for the "*.md" input files.
DATA_DIR = Path("./data")

def parse_markdown_file(filepath):
    """Parse one channel export written in Markdown into a list of threads.

    Every line is stripped and then classified as one of:

    * ``# Channel <name>``: sets the channel name stored on threads that
      start after it.
    * ``## Thread: <title>``: starts a new thread.
    * ``**<user>** [<timestamp>]``: starts a new message in the current
      thread.
    * any other non-empty line: continuation text of the current message.

    Note: this notebook defines a function with this name in more than one
    cell; the definition executed last is the one in effect.

    Args:
        filepath: Path of a UTF-8 encoded Markdown file.

    Returns:
        A list of dicts in file order, each with the keys ``"channel"``,
        ``"thread_title"`` and ``"messages"``. Every message is a dict with
        the keys ``"user"``, ``"timestamp"`` and ``"text"``; continuation
        lines are joined with single spaces (no trailing space).
    """
    channel_header = "# Channel"
    thread_prefix = "## Thread: "
    # A single pattern both recognises a message header and extracts its
    # fields, so recognition and extraction can never disagree.
    # NOTE(review): anything after the closing bracket on a header line is
    # discarded, as before; confirm the export never puts message text there.
    message_pattern = re.compile(r"\*\*(.+)\*\* \[(.+?)\]")

    channel = ""
    threads = []
    current_thread = None
    current_message = None

    with open(filepath, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            # The header test and the slice below use the same prefix, so a
            # heading such as "# Channels overview" is no longer read as the
            # channel name "overview".
            if line == channel_header or line.startswith(channel_header + " "):
                channel = line[len(channel_header):].strip()
                continue

            if line.startswith(thread_prefix):
                # Slice the prefix off instead of str.replace(), which would
                # also delete the same text if it recurred inside the title.
                current_thread = {
                    "channel": channel,
                    "thread_title": line[len(thread_prefix):].strip(),
                    "messages": [],
                }
                threads.append(current_thread)
                # Text that precedes this thread's first message must not be
                # glued onto the last message of the previous thread.
                current_message = None
                continue

            match = message_pattern.match(line)
            if match:
                if current_thread is None:
                    # A message before any thread header used to raise
                    # TypeError; keep it in an untitled thread instead.
                    current_thread = {
                        "channel": channel,
                        "thread_title": "",
                        "messages": [],
                    }
                    threads.append(current_thread)
                user, timestamp = match.groups()
                current_message = {
                    "user": user.strip(),
                    "timestamp": timestamp.strip(),
                    "text": "",
                }
                current_thread["messages"].append(current_message)
                continue

            # Continuation line: only meaningful once a message has started.
            if current_message is not None:
                if current_message["text"]:
                    current_message["text"] += " " + line
                else:
                    current_message["text"] = line

    return threads

def parse_all_files():
    """Parse every ``*.md`` file in ``DATA_DIR`` and return all threads in one list.

    Files are processed in sorted path order. Directory listing order depends
    on the filesystem, so without sorting the order of the combined list (and
    of any JSON written from it) could differ between runs and machines.

    Returns:
        A list holding the thread dicts of every file, concatenated in
        sorted-path order.
    """
    all_threads = []

    for file in sorted(DATA_DIR.glob("*.md")):
        all_threads.extend(parse_markdown_file(file))

    return all_threads

def save_as_json(data, output_path="parsed/parsed_threads.json"):
    """Write ``data`` to ``output_path`` as indented UTF-8 JSON.

    The parent directory is created first when it does not exist, so the
    call does not fail with ``FileNotFoundError`` if nothing has created
    the default ``parsed/`` folder beforehand.

    Args:
        data: Any object ``json.dump`` can serialise.
        output_path: Destination file, as a string or path-like object.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

# Build the combined thread list and write it to the default JSON path when
# this code is executed as the main module.
if __name__ == "__main__":
    parsed_threads = parse_all_files()
    save_as_json(parsed_threads)
    print(f"✅ Parsed {len(parsed_threads)} threads and saved to parsed_threads.json")


✅ Parsed 19 threads and saved to parsed_threads.json
