## Script Overview

This script recursively converts various document formats to Markdown while preserving the original folder structure.

Code created by Haky using gpt-oss-120b, then further formatted and annotated by Claude Code.

## Import Required Libraries

Import all necessary modules for file conversion, path manipulation, and progress tracking.

In [None]:

# Core utilities
%pip install tqdm                # nice progress bars
## pip install python-docx         # read .docx (needed only by the fallback used when pandoc fails or is missing)
## pip install markdownify        # turn HTML → Markdown (fallback)

# Excel → DataFrames → Markdown
%pip install pandas openpyxl xlrd

# PDF text extraction
%pip install pdfminer.six

# Powerpoint conversion
%pip install --upgrade python-pptx

# tabulate (required by pandas' DataFrame.to_markdown, used for the Excel sheets)
%pip install tabulate

# Optional: use pandoc for higher‑quality conversion (recommended)
#   - Install pandoc from https://pandoc.org/installing.html
#   - Make sure `pandoc` is on your PATH.

Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1
Note: you may need to restart the kernel to use updated packages.
Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting xlrd
  Downloading xlrd-2.0.2-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading xlrd-2.0.2-py2.py3-none-any.whl (96 kB)
Using cached et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: xlrd, et-xmlfile, openpyxl
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [openpyxl]2/3[0m [openpyxl]
[1A[2KSuccessfully installed et-xmlfile-2.0.0 openpyxl-3.1.5 xlrd-2.0.2
Note: you may need to restart the kernel to use updated packages.


In [4]:
import argparse
import os
import shutil
import subprocess
from pathlib import Path

import pandas as pd
from tqdm import tqdm
from pdfminer.high_level import extract_text

## DOCX Conversion Functions

Functions for converting Word documents to Markdown using pandoc (primary method) or python-docx (fallback).

In [5]:
def run_pandoc(input_path: Path, output_path: Path) -> bool:
    """Convert a file to GitHub-flavoured Markdown by calling pandoc.

    Args:
        input_path: File to convert. A ``.docx`` suffix (in any letter case)
            selects pandoc's ``docx`` reader; every other suffix is read as
            ``markdown``.
        output_path: Path that pandoc is asked to write the ``gfm`` output to.

    Returns:
        True if pandoc ran and exited with status 0. False if the ``pandoc``
        executable could not be found or it exited with a non-zero status.
    """
    # Compare case-insensitively: callers dispatch on the lower-cased suffix,
    # so "REPORT.DOCX" must not be handed to pandoc's markdown reader.
    source_format = "docx" if input_path.suffix.lower() == ".docx" else "markdown"
    command = [
        "pandoc", str(input_path),
        "-f", source_format,
        "-t", "gfm",
        "-o", str(output_path),
    ]
    try:
        # pandoc's own output is discarded; only the exit status matters here.
        subprocess.run(
            command,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (subprocess.CalledProcessError, FileNotFoundError):
        return False
    return True


def docx_to_md_fallback(docx_path: Path) -> str:
    """Return the paragraph text of a .docx file as bare-bones Markdown.

    Only ``document.paragraphs`` is read. Each paragraph is stripped of
    surrounding whitespace, blank paragraphs are dropped, and the remaining
    ones are joined with one empty line between them.
    """
    # Imported inside the function so the dependency is only required when
    # this fallback is actually used.
    from docx import Document

    kept = []
    for paragraph in Document(docx_path).paragraphs:
        stripped = paragraph.text.strip()
        if stripped:
            kept.append(stripped)
    return "\n\n".join(kept)

## Excel Conversion Function

Converts Excel files (.xlsx/.xls) by creating a separate Markdown file for each sheet.

In [6]:
def excel_to_md(excel_path: Path, dest_dir: Path) -> None:
    """Write each sheet of an Excel workbook to its own .md file.

    For every sheet a file named ``<workbook stem>_<sheet name>.md`` is
    written into ``dest_dir``. Characters in the sheet name other than
    alphanumerics, space, ``_`` and ``-`` are replaced by ``_``.

    Args:
        excel_path: Workbook to read.
        dest_dir: Directory that receives the per-sheet Markdown files.
    """
    # Open the workbook in a ``with`` block so its file handle is released
    # even if reading or writing one of the sheets raises.
    with pd.ExcelFile(excel_path) as xls:
        for sheet in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet)
            # Sheet names may contain characters that are awkward or invalid
            # in file names (e.g. "/"), so sanitise them first.
            safe_sheet = "".join(
                c if c.isalnum() or c in " _-" else "_" for c in sheet
            )
            out_path = dest_dir / f"{excel_path.stem}_{safe_sheet}.md"
            with open(out_path, "w", encoding="utf-8") as fh:
                fh.write(df.to_markdown(index=False))

## PDF Conversion Function

Extracts plain text from PDF files using pdfminer.six.

In [7]:
def pdf_to_md(pdf_path: Path) -> str:
    """Return the text of the PDF at ``pdf_path``.

    The path is converted to a string and passed to ``extract_text``; its
    result is returned unchanged.
    """
    pdf_text = extract_text(str(pdf_path))
    return pdf_text

## PowerPoint Conversion Function

Converts PowerPoint (.pptx) presentations to Markdown format:
- Each slide gets a level-2 heading (`## Slide N: Title`, or `## Slide N` when the slide has no title)
- Bulleted paragraphs become list items with proper indentation
- Slide notes are added as blockquotes

In [8]:
def pptx_to_md(pptx_path: Path) -> str:
    """
    Convert a PowerPoint (.pptx) file to markdown.

    For each slide, in order, the output contains:
    - a level-2 heading ``## Slide N``, or ``## Slide N: Title`` when the
      slide's title placeholder holds text
    - every non-empty paragraph of every other text shape as a ``* `` list
      item, indented two spaces per outline level so that nested bullets
      form a nested markdown list
    - the slide notes (if any) as a blockquote introduced by ``> **Notes:**``
    - one blank line separating it from the next slide

    Args:
        pptx_path: Presentation to read.

    Returns:
        The markdown text for the whole presentation.
    """
    from pptx import Presentation

    prs = Presentation(pptx_path)
    md_lines = []

    for idx, slide in enumerate(prs.slides, start=1):
        # ---- Title ---------------------------------------------------------
        # Look the title placeholder up once; it is None when the slide has
        # no title placeholder.
        title_shape = slide.shapes.title
        title = title_shape.text.strip() if title_shape is not None else ""

        md_lines.append(f"## Slide {idx}" + (f": {title}" if title else ""))

        # ---- Body ----------------------------------------------------------
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            # Skip the placeholder that we already used as title
            if title_shape is not None and shape == title_shape:
                continue

            for para in shape.text_frame.paragraphs:
                line_text = para.text.strip()
                if not line_text:
                    continue
                # para.level is the outline depth (0 = top-level bullet).
                # Nesting is expressed by indentation, not by repeating the
                # bullet marker.
                indent = "  " * para.level
                md_lines.append(f"{indent}* {line_text}")

        # ---- Notes ---------------------------------------------------------
        if slide.has_notes_slide and slide.notes_slide.notes_text_frame:
            notes = slide.notes_slide.notes_text_frame.text.strip()
            if notes:
                # The leading "\n" puts an empty line before the blockquote.
                md_lines.append("\n> **Notes:**")
                md_lines.extend(f"> {note_line}" for note_line in notes.splitlines())

        md_lines.append("")  # blank line between slides

    return "\n".join(md_lines)

## Image Copy Function

Copies image files (PNG, JPG, GIF, SVG) unchanged to the destination directory.

In [9]:
def copy_image(src: Path, dst: Path) -> None:
    """Copy the file at ``src`` to ``dst`` without converting it.

    Delegates to ``shutil.copy2`` with its default symlink handling.
    """
    shutil.copy2(src, dst, follow_symlinks=True)

## File Conversion Router

Main function that determines the file type and calls the appropriate conversion function. Creates a destination directory structure that mirrors the source.

In [10]:
def convert_one_file(
    src_path: Path,
    dest_root: Path,
    source_root: "Path | None" = None,
) -> None:
    """
    Convert a single file according to its suffix.
    The destination folder structure mirrors the source one.

    Dispatch on the lower-cased suffix:
    - ``.docx``: pandoc first, python-docx paragraph text if pandoc fails
    - ``.xlsx`` / ``.xls``: one markdown file per sheet
    - ``.pdf``: extracted plain text
    - ``.pptx``: slide headings, bullets and notes
    - ``.png`` / ``.jpg`` / ``.jpeg`` / ``.gif`` / ``.svg``: copied unchanged
    - anything else: ignored (only the mirrored directory is created)

    Args:
        src_path: File to convert; must be located under the source root.
        dest_root: Root of the output tree.
        source_root: Root that ``src_path`` is made relative to. When omitted,
            the module-level ``src_root`` is used, which keeps existing
            two-argument calls working.

    Raises:
        ValueError: If ``src_path`` is not located under the source root.
    """
    # Prefer the explicit root; the global is only looked up as a fallback so
    # the function also works when no module-level `src_root` exists.
    base_root = src_root if source_root is None else source_root
    rel_path = src_path.relative_to(base_root)
    dest_dir = (dest_root / rel_path.parent).resolve()
    dest_dir.mkdir(parents=True, exist_ok=True)

    dest_md_path = dest_dir / (src_path.stem + ".md")
    suffix = src_path.suffix.lower()

    if suffix == ".docx":
        if run_pandoc(src_path, dest_md_path):
            return
        dest_md_path.write_text(docx_to_md_fallback(src_path), encoding="utf-8")

    elif suffix in {".xlsx", ".xls"}:
        excel_to_md(src_path, dest_dir)

    elif suffix == ".pdf":
        # Extract first, then write: a failed extraction leaves no empty file.
        dest_md_path.write_text(pdf_to_md(src_path), encoding="utf-8")

    elif suffix == ".pptx":
        # PowerPoint → markdown
        dest_md_path.write_text(pptx_to_md(src_path), encoding="utf-8")

    elif suffix in {".png", ".jpg", ".jpeg", ".gif", ".svg"}:
        copy_image(src_path, dest_dir / src_path.name)

    else:
        # Unsupported – silently ignore (or log if you wish)
        pass

## Directory Walker

Recursively scans the source directory and processes all files with a progress bar.

In [12]:
def walk_and_convert(source_root: Path, destination_root: Path) -> None:
    """Convert every regular file found anywhere below `source_root`.

    The complete file list is collected first so the progress bar knows its
    total; each file is then passed to `convert_one_file` together with
    `destination_root`, and the bar advances by one per file.

    NOTE(review): `source_root` is used only to discover files here; it is
    not forwarded to `convert_one_file`.
    """
    files = []
    for entry in source_root.rglob("*"):
        if entry.is_file():
            files.append(entry)

    progress = tqdm(total=len(files), desc="Converting", unit="file")
    with progress:
        for file_path in files:
            convert_one_file(file_path, destination_root)
            progress.update(1)

## Interactive Usage

Set up source and destination paths, then run the conversion process.

In [14]:
# Define source and destination paths
# NOTE: `src_root` is also looked up as a module-level global inside
# convert_one_file, so it must be defined under exactly this name before
# the conversion runs.
src_root = Path("/Users/haekyungim/Library/CloudStorage/GoogleDrive-haky@uchicago.edu/My Drive/Work/Service/IGES/iges-documents/backup-from-intranet/Archive-2023")
dest_root = Path("/Users/haekyungim/Downloads/tempo/")

# Validate source directory exists
# (SystemExit stops the run with the message instead of a traceback-style error)
if not src_root.is_dir():
    raise SystemExit(f"❌  Source directory does not exist: {src_root}")

# Create destination directory
# (including missing parents; an already existing directory is not an error)
dest_root.mkdir(parents=True, exist_ok=True)

print(f"🔍  Scanning: {src_root}")
print(f"📁  Destination root: {dest_root}")

# Run the conversion
# (walks src_root recursively and mirrors its folder layout under dest_root)
walk_and_convert(src_root, dest_root)

print("\n✅  Done! Your markdown copy lives at:", dest_root)

🔍  Scanning: /Users/haekyungim/Library/CloudStorage/GoogleDrive-haky@uchicago.edu/My Drive/Work/Service/IGES/iges-documents/backup-from-intranet/Archive-2023
📁  Destination root: /Users/haekyungim/Downloads/tempo


  warn(msg)
  warn(msg)
  warn(msg)
Cannot set gray non-stroke color because /'P6' is an invalid float value
Converting: 100%|██████████| 152/152 [00:53<00:00,  2.82file/s]


✅  Done! Your markdown copy lives at: /Users/haekyungim/Downloads/tempo



