In [1]:
from zipfile import ZipFile
from pathlib import Path
import json
from typing import NamedTuple

In [2]:
class Highlight(NamedTuple):
    '''One highlight entry, wrapping the dict parsed from the takeout JSON.'''

    # Parsed JSON object of the highlight; the properties below read its
    # 'excerpt' and 'note_text' keys.
    obj: dict

    @property
    def excerpt(self) -> str:
        '''Highlighted text with surrounding whitespace removed.'''
        raw_excerpt = self.obj['excerpt']
        return raw_excerpt.strip()

    @property
    def note(self) -> str:
        '''Note attached to the highlight, whitespace-stripped.

        'note_text' is itself a JSON-encoded string; the note lives under
        its 'note' key.
        '''
        note_payload = json.loads(self.obj['note_text'])
        return note_payload['note'].strip()

    @property
    def has_excerpt_or_note(self) -> bool:
        '''True when the excerpt or the note is non-empty after stripping.'''
        # Build the tuple first so both properties are always evaluated.
        return any((self.excerpt, self.note))


In [3]:
class Annotation(NamedTuple):
    '''Highlights and notes of one ebook together with the book's metadata.'''

    # Book-level JSON object (title, authors, publisher ...); `title` reads
    # metadata['volume_info']['title'].
    metadata: dict
    # Highlights (with their notes) parsed from the JSON for this book.
    highlights: list[Highlight]

    @property
    def annotated(self) -> bool:
        '''True when the book has at least one highlight.'''
        return len(self.highlights) > 0

    @property
    def title(self) -> str:
        '''Book title taken from the metadata.'''
        volume_info = self.metadata['volume_info']
        return volume_info['title']

    @property
    def highlights_with_notes(self) -> str:
        '''Render every highlight as a list item, its note as a sub-item.

        Empty excerpts and empty notes are skipped; the result is an empty
        string when nothing is left to render.
        '''
        lines = []
        for highlight in self.highlights:
            excerpt, note = highlight.excerpt, highlight.note
            if excerpt:
                lines.append(f"- {excerpt}\n")
            if note:
                lines.append(f"  +  {note}\n")
        return ''.join(lines)

    @property
    def md(self) -> str:
        '''Markdown document: the title as a level-1 heading, then the highlights.'''
        return f"# {self.title}\n\n" + self.highlights_with_notes

In [4]:
def takeout_convert(txt: str) -> Annotation:
    '''Build an Annotation from the text of one Google Takeout annotation file.

    Google Takeout writes three JSON values back to back into one file:
      1. book metadata
      2. list of highlights and notes
      3. reading status, finished or not
    Only the first two are needed, so decoding stops after them.

    The values are decoded one after another with a JSON decoder rather
    than by searching for '}[' / ']{' in the text: those character pairs
    can also appear inside highlighted text, and the values may be
    separated by whitespace.

    Args:
        txt: full text of the takeout JSON file.

    Returns:
        Annotation holding the metadata and every highlight that has a
        non-empty excerpt or note.

    Raises:
        ValueError: if txt holds fewer than two JSON values
            (json.JSONDecodeError, a ValueError subclass, if a value is
            malformed).
    '''
    decoder = json.JSONDecoder()
    values = []
    # raw_decode requires the text to start with the JSON value itself,
    # so leading whitespace is stripped before every call.
    rest = txt.lstrip()
    while rest and len(values) < 2:
        value, end = decoder.raw_decode(rest)
        values.append(value)
        rest = rest[end:].lstrip()
    if len(values) < 2:
        raise ValueError(
            f'expected book metadata followed by a list of highlights, '
            f'found {len(values)} JSON value(s)'
        )
    metadata, raw_highlights = values
    all_highlights = [Highlight(obj) for obj in raw_highlights]
    # Entries with neither highlighted text nor a note carry nothing to export.
    highlights = [h for h in all_highlights if h.has_excerpt_or_note]
    return Annotation(
        metadata=metadata,
        highlights=highlights
    )

In [5]:
def annotations_from(file_path: str) -> list[Annotation]:
    '''Convert every .json member of a zip archive into an Annotation.

    Args:
        file_path: path of the zip archive to read.

    Returns:
        One Annotation per archive member whose name ends with '.json',
        in the order the archive lists them.
    '''
    documents = []
    with ZipFile(file_path) as archive:
        for member in archive.namelist():
            if not member.endswith('.json'):
                continue
            raw_bytes = archive.read(member)
            documents.append(raw_bytes.decode('UTF-8'))
    # All members are read and decoded before any of them is converted.
    return list(map(takeout_convert, documents))

## params

In [6]:
# Input: zip archive with the takeout ebook annotations.
takeout_zip = "./takeout-ebook-annotations.zip"
# Output: folder that receives the generated markdown files.
md_folder = Path("./annotations")

## main program

In [None]:
# Write one markdown file per book that has something to export, then
# report how many files were written.
md_folder.mkdir(parents=True, exist_ok=True)

# '/' and '\' are path separators: left inside a title they would point
# the output at a (non-existent) sub-directory instead of a file directly
# in md_folder, so they are replaced in the file name only.
path_separators = str.maketrans({'/': '_', '\\': '_'})

cnt = 0
for annotation in annotations_from(takeout_zip):
    if not annotation.highlights_with_notes:
        continue  # no excerpt or note to write for this book
    cnt += 1
    safe_title = annotation.title.translate(path_separators)
    md_path = md_folder / f'{safe_title}.md'
    # Explicit UTF-8: without it write_text uses the platform's locale
    # encoding, which may be unable to encode the book text.
    md_path.write_text(annotation.md, encoding='utf-8')
    print(annotation.title)

print('cnt: ', cnt)