Extract Highlights

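This notebook tests extracting highlights from the text output of [Kobuddy](https://github.com/karlicoss/kobuddy?tab=readme-ov-file): it parses the annotations file, sorts the entries by book and date, and writes them out as CSV and Markdown.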

In [None]:
import pandas as pd
import re
from datetime import datetime
import os

In [None]:
# Step 1: Read the file
file_path = "annotations.txt"  # Replace with your file path
# Decode explicitly instead of relying on the platform default encoding, which
# varies between machines. NOTE(review): assumes the annotations file is UTF-8,
# the same encoding this notebook uses when writing the per-book files — confirm.
with open(file_path, "r", encoding="utf-8") as file:
    data = file.read()

In [None]:
# Preview only the beginning of the raw text; echoing the whole file floods the cell output
data[:500]

In [None]:
# Step 2: Cut the raw text into individual annotations at each "------" separator
raw_chunks = data.strip().split("------")
entries = []
for chunk in raw_chunks:
    cleaned = chunk.strip()
    if cleaned:  # skip chunks that are empty once surrounding whitespace is removed
        entries.append(cleaned)

In [None]:
# Preview the first few split entries rather than dumping the entire list
entries[:5]

In [None]:
# Step 3: Parse each entry into date/time, book title, and annotation
# The first line of an entry must look like "DD Mon YYYY HH:MM <book title>";
# every following non-blank line is treated as annotation text.
header_pattern = re.compile(r"^([\d]{2} [A-Za-z]{3} [\d]{4} [\d]{2}:[\d]{2}) (.+)$")

parsed_data = []
skipped_entries = []  # entries dropped because they have no body or an unrecognised header
for entry in entries:
    lines = entry.split("\n")
    # An entry needs a header line plus at least one more line to be usable
    match = header_pattern.match(lines[0].strip()) if len(lines) >= 2 else None
    if match is None:
        # Keep track of what is dropped so the loss is visible instead of silent
        skipped_entries.append(entry)
        continue

    date_time_str, book_title = match.groups()
    annotation = "\n".join(line.strip() for line in lines[1:] if line.strip())  # Combine annotation lines
    try:
        # %b (abbreviated month name) is matched against the current locale's month names
        date_time = datetime.strptime(date_time_str, "%d %b %Y %H:%M")  # Parse date/time
    except ValueError:
        print(f"Skipping invalid date format: {date_time_str}")
        continue
    parsed_data.append([date_time, book_title, annotation])

if skipped_entries:
    print(f"Skipped {len(skipped_entries)} entries with no annotation text or an unrecognised header line.")

In [None]:
# Preview the first few parsed rows rather than dumping the entire list
parsed_data[:5]

In [None]:
# Step 4: Load the parsed rows into a DataFrame with one named column per field
column_names = ["Date/Time", "Book Title", "Annotation"]
df = pd.DataFrame(data=parsed_data, columns=column_names)

In [None]:
# Inspect the first rows of the freshly built DataFrame
df.head()

In [None]:
# Step 5: Order rows by book title, then by date/time within each book
sort_keys = ["Book Title", "Date/Time"]
df = df.sort_values(by=sort_keys)

In [None]:
# Inspect the first rows again, now in sorted order
df.head()

In [None]:
# Optional: persist the sorted table as CSV, leaving out the row index
csv_path = "annotations_sorted.csv"
df.to_csv(csv_path, index=False)

In [None]:
# Step 1: Bucket the annotation rows by the book they belong to
grouped = df.groupby(by="Book Title")

In [None]:
# Inspect the leading rows of every book's group
grouped.head()

In [None]:
# Step 2: Assemble the markdown text — one "##" heading per book followed by a bullet per annotation
sections = []
for book_title, group in grouped:
    bullets = "".join(f"- {annotation}\n" for annotation in group["Annotation"])
    # Heading, blank line, the bullets, then a blank line separating this book from the next
    sections.append(f"## {book_title}\n\n{bullets}\n")
markdown_content = "".join(sections)

In [None]:
# Preview only the beginning of the generated markdown; the full text is written to disk in the next step
markdown_content[:1000]

In [None]:
# Step 3: Save the markdown content to a file
output_file = "annotations.md"
# Write as UTF-8 explicitly: without an encoding, open() falls back to the
# platform default, which can fail on characters it cannot represent and is
# inconsistent with the per-book files this notebook writes as UTF-8.
with open(output_file, "w", encoding="utf-8") as file:
    file.write(markdown_content)

print(f"Markdown file '{output_file}' created successfully.")

### Split out the annotations and save one Markdown file per book

In [None]:
# Step 1: Make sure the folder that will hold the per-book markdown files exists
output_dir = "book_annotations"
os.makedirs(name=output_dir, exist_ok=True)  # exist_ok: no error if the folder is already there

In [None]:
# Step 2: Loop through each book and create individual markdown files
used_filenames = set()  # lower-cased filenames already produced by this loop
for book_title, group in grouped:
    # Sanitize the book title to create a valid filename
    sanitized_title = re.sub(r'[<>:"/\\|?*]', '', book_title).strip().replace(' ', '_')
    if not sanitized_title:
        # A title consisting only of removed characters would otherwise produce ".md"
        sanitized_title = "untitled"

    # Distinct titles can sanitize to the same name; append a numeric suffix
    # rather than silently overwriting the file written for an earlier book.
    # Compared case-insensitively because some filesystems ignore case.
    filename = f"{sanitized_title}.md"
    suffix = 2
    while filename.lower() in used_filenames:
        filename = f"{sanitized_title}_{suffix}.md"
        suffix += 1
    used_filenames.add(filename.lower())

    # Build the markdown content for this book. Dedicated names are used so the
    # notebook-level `markdown_content` and `file_path` from earlier cells are
    # not overwritten (re-running those cells would otherwise use stale values).
    book_markdown = f"## {book_title}\n\n"
    for annotation in group["Annotation"]:
        book_markdown += f"- {annotation}\n"
    book_markdown += "\n"

    # Save the content to a markdown file
    book_file_path = os.path.join(output_dir, filename)
    with open(book_file_path, "w", encoding="utf-8") as file:
        file.write(book_markdown)

print(f"Markdown files for each book have been created in the '{output_dir}' directory.")