In [10]:
import pandas as pd
import json
import io

def robust_csv_to_json(input_file, output_file):
    """Convert a pipe-delimited quote file into a JSON array of records.

    Every physical line of ``input_file`` is treated as one record and split
    on ``|`` without any CSV quote handling. A line with at least five fields
    becomes a dict with the keys ``time``, ``timeString``, ``quote``,
    ``title`` and ``author`` (values are whitespace-stripped). Fields beyond
    the fifth (rows are expected to carry a sixth ``sfw`` column) are ignored.
    The list of dicts is written to ``output_file`` as indented UTF-8 JSON
    with non-ASCII characters left unescaped.

    Args:
        input_file: Path of the pipe-delimited text file to read.
        output_file: Path of the JSON file to create or overwrite.

    Returns:
        None. A summary line is printed, followed by a second line when
        non-blank lines had to be skipped for having fewer than five fields.
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged but also discards a leading
    # byte-order mark; str.strip() would not remove it, so with plain 'utf-8'
    # the first record's "time" would come out as '\ufeff00:00'.
    with open(input_file, 'r', encoding='utf-8-sig') as f:
        raw_content = f.read()

    cleaned_data = []
    skipped = 0
    # Split on '\n' only (not str.splitlines), so other line-break-like
    # characters inside a quote do not start a new record.
    for line in raw_content.strip().split('\n'):
        parts = line.split('|')

        if len(parts) < 5:
            # Blank lines are harmless; anything else is a record being lost.
            if line.strip():
                skipped += 1
            continue

        cleaned_data.append({
            "time": parts[0].strip(),
            "timeString": parts[1].strip(),
            "quote": parts[2].strip(),
            "title": parts[3].strip(),
            "author": parts[4].strip()
        })

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(cleaned_data, f, indent=2, ensure_ascii=False)

    print(f"Cleaned and saved {len(cleaned_data)} entries to {output_file}")
    if skipped:
        print(f"Skipped {skipped} malformed line(s) with fewer than 5 fields")

# Run the cleaning: reads 'litclock_annotated.csv' and creates or overwrites 'data.json'.
# Both paths are relative, so they resolve against the kernel's current working directory.
robust_csv_to_json('litclock_annotated.csv', 'data.json')

Cleaned and saved 3626 entries to data.json


In [11]:
import json
from datetime import datetime, timedelta

def find_missing_times(json_file):
    """Report which minutes of the day have no entry in a JSON quote file.

    The file is expected to hold a JSON array of objects, each with a
    ``time`` key. Every ``HH:MM`` value from ``00:00`` to ``23:59`` that does
    not appear as some entry's ``time`` is considered missing. Coverage
    statistics and a preview of up to 20 missing times are printed.

    Args:
        json_file: Path of the JSON file to inspect.

    Returns:
        The missing ``HH:MM`` strings in chronological order, or ``None``
        if ``json_file`` does not exist.

    Raises:
        KeyError: If an entry has no ``time`` key.
    """
    minutes_per_day = 24 * 60
    preview_limit = 20

    # Zero-padded 24-hour labels, "00:00" .. "23:59", in chronological order.
    all_times = [
        f"{minute // 60:02d}:{minute % 60:02d}"
        for minute in range(minutes_per_day)
    ]

    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print("JSON file not found.")
        return None

    times_in_json = {entry['time'] for entry in data}
    missing_times = [t for t in all_times if t not in times_in_json]

    # Derive the covered count from the missing list rather than from
    # len(times_in_json): the set may hold values that are not valid minutes
    # of the day, which would inflate the count so covered + missing > 1440.
    covered = minutes_per_day - len(missing_times)

    print("--- Statistics ---")
    print(f"Total possible minutes: {minutes_per_day}")
    print(f"Minutes covered: {covered}")
    print(f"Minutes missing: {len(missing_times)}")
    print(f"Coverage: {(covered / minutes_per_day) * 100:.2f}%")

    if missing_times:
        print("\n--- First 20 Missing Times ---")
        preview = ", ".join(missing_times[:preview_limit])
        # Only hint at more entries when the preview really is truncated.
        if len(missing_times) > preview_limit:
            preview += "..."
        print(preview)
    else:
        print("\nCongratulations! You have 100% coverage.")

    return missing_times

# Run the check against 'data.json', the file the conversion cell above writes.
# `missing` is the list of uncovered "HH:MM" strings, or None if the file was not found.
missing = find_missing_times('data.json')

--- Statistics ---
Total possible minutes: 1440
Minutes covered: 1432
Minutes missing: 8
Coverage: 99.44%

--- First 20 Missing Times ---
06:07, 06:18, 08:21, 10:28, 11:46, 12:31, 13:36, 18:44...
