In [None]:
from pathlib import Path
from itertools import chain
from functools import partial
import os

In [None]:
import xmltodict
from glom import glom
import pandas as pd
import fitz # pip install PyMuPDF

In [None]:
# Papers 3 library exported as "EndNote XML Library"
path_xml = Path('~/Downloads/temp-papers-export-tidy.xml').expanduser()
# directory of the Papers 3 "PDF Files and Media" export, without annotations
path_original = Path('~/Downloads/temp-papers-export-original').expanduser()
# directory of the Papers 3 "PDF Files and Media" export, with annotations
path_annotated = Path('~/Downloads/temp-papers-export').expanduser()
# library file whose PDF paths will be rewritten.
# It can be the same file as path_xml, or another export such as RIS or BibTeX.
in_path = path_xml
# where the rewritten library file is saved; use the same extension as in_path
out_path = Path('~/Downloads/temp-papers-export-annotated.xml').expanduser()

In [None]:
def md5(fname):
    '''Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is read in 4096-byte blocks, so it is never loaded into memory
    as a whole. Adapted from https://stackoverflow.com/a/3431838
    '''
    import hashlib

    digest = hashlib.md5()
    with open(fname, "rb") as stream:
        while True:
            block = stream.read(4096)
            # an empty read marks the end of the file
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()

In [None]:
def has_annotation(path):
    '''Return True if any page of the PDF at ``path`` has an annotation.

    path: file name handed to ``fitz.open``.

    The document is closed before returning, also when reading a page
    raises. Pages are visited by iterating the document, and the first
    annotation is looked up under the snake_case name first, so the check
    works with PyMuPDF releases that only provide ``first_annot`` as well as
    with old ones that only provide ``firstAnnot``.
    '''
    doc = fitz.open(path)
    try:
        for page in doc:
            try:
                annot = page.first_annot
            except AttributeError:
                # older PyMuPDF releases only know the camelCase spelling
                annot = page.firstAnnot
            if annot:
                return True
        return False
    finally:
        doc.close()

In [None]:
def get_all_info(df):
    '''Add ``md5``, ``size``, ``stem`` and ``has_annotation`` columns to df.

    Every column is derived from ``df.path``. df is modified in place and
    nothing is returned.
    '''
    paths = df.path
    derived = (
        ('md5', md5),
        ('size', os.path.getsize),
        ('stem', lambda p: p.stem),
        ('has_annotation', has_annotation),
    )
    for column, extract in derived:
        df[column] = paths.map(extract)

In [None]:
def get_duplicate(df, col):
    '''Return the rows of df whose value in column ``col`` occurs more than once.'''
    counts = df[col].value_counts()
    repeated = counts[counts > 1].index
    is_repeated = df[col].isin(repeated)
    return df[is_repeated]

# Papers XML export

In [None]:
# keep the raw XML as one string; it is parsed below and can also be
# rewritten at the end
text = path_xml.read_text()

In [None]:
# force_list keeps `records` a list even when the export holds a single
# <record>; otherwise xmltodict returns a bare dict for it and iterating
# `records` would walk over the dict keys instead of the records
records = glom(xmltodict.parse(text, force_list=('record',)), 'xml.records.record')

In [None]:
# lazily turn the PDF url of each record into a local path; glom yields
# None when the url cannot be found in a record, and such records are skipped
paths_xml = (
    Path(url.replace('file://localhost', '', 1))
    for url in map(lambda record: glom(record, 'urls.pdf-urls.url.style.#text', default=None), records)
    if url is not None
)
# only process pdf files. Other spellings of the suffix (Pdf, pDF, etc.) are not matched.
pdf_suffixes = ('.pdf', '.PDF')
df_xml = pd.DataFrame((pdf for pdf in paths_xml if pdf.suffix in pdf_suffixes), columns=['path'])

In [None]:
# add md5, size, stem and has_annotation columns for every library PDF (in place)
get_all_info(df_xml)

In [None]:
# library entries whose md5 occurs more than once
df_dup = get_duplicate(df_xml, 'md5')

In [None]:
# show entries with the same md5 next to each other
df_dup.sort_values('md5')

In [None]:
# ensure no md5 occurs more than once among the library PDFs
assert df_dup.empty

# Papers PDF export

## Original

In [None]:
# Path.glob matches case-insensitively on Windows, so there both patterns
# return the same files; dict.fromkeys drops such repeats while keeping the
# order, which prevents every PDF from being listed twice
df_original = pd.DataFrame(
    list(dict.fromkeys(chain(path_original.glob('*.pdf'), path_original.glob('*.PDF')))),
    columns=['path'],
)

In [None]:
# add md5, size, stem and has_annotation columns for the PDFs exported without annotations (in place)
get_all_info(df_original)

In [None]:
# Duplicates are tolerated here, so this check stays disabled:
# assert get_duplicate(df_original, 'md5').size == 0
# For some reason this export may contain files with the same md5.
# They do not need to be dropped: after the merge below there will be
# multiple rows for such an md5, and in the replace step at the end
# only the first of them actually replaces something.
get_duplicate(df_original, 'md5')

## Annotated

In [None]:
# Path.glob matches case-insensitively on Windows, so there both patterns
# return the same files; dict.fromkeys drops such repeats while keeping the
# order, which prevents every PDF from being listed twice
df = pd.DataFrame(
    list(dict.fromkeys(chain(path_annotated.glob('*.pdf'), path_annotated.glob('*.PDF')))),
    columns=['path'],
)

In [None]:
# add md5, size, stem and has_annotation columns for the PDFs exported with annotations (in place)
get_all_info(df)

In [None]:
# Duplicates are tolerated here, so this check stays disabled:
# assert get_duplicate(df, 'md5').size == 0
# For some reason this export may contain files with the same md5.
# They do not need to be dropped: after the merge below there will be
# multiple rows for such an md5, and in the replace step at the end
# only the first of them actually replaces something.
get_duplicate(df, 'md5')

## Original and Annotated merged

In [None]:
# pair each original PDF with the annotated PDF of the same file stem
df_merge = df_original.merge(df, on='stem', suffixes=('_original', '_annotated'))

In [None]:
# ensure the original and annotated directories are identical in filenames:
# the merge on stem must yield exactly as many rows as there are annotated files
assert len(df_merge) == len(df)

In [None]:
# prove that every PDF was transformed by the annotated export (whether or
# not it is annotated): no pair may share the same md5
assert not (df_merge.md5_original == df_merge.md5_annotated).any()

## Show how much the file size has inflated

In [None]:
# per-file ratio of annotated size to original size
df_merge['size_annotated'].div(df_merge['size_original']).describe()

In [None]:
# overall ratio: total annotated size over total original size
df_merge['size_annotated'].sum() / df_merge['size_original'].sum()

# Merge everything

In [None]:
# link every library PDF to its exported pair through the md5 of the
# original file
df_merge_all = df_xml.merge(df_merge, left_on='md5', right_on='md5_original', suffixes=('_library', ''))

In [None]:
# number of matched PDFs whose annotated export contains annotations (True) or not (False)
df_merge_all.has_annotation_annotated.value_counts()

In [None]:
# the XML text is already loaded; read again only when another library
# file is to be rewritten
if in_path != path_xml:
    text = in_path.read_text()

In [None]:
# replace the original path in the library text with the path of the
# annotated file; only rows whose annotated PDF has annotations are used,
# and only the first occurrence of each path is replaced
annotated_rows = df_merge_all.loc[df_merge_all.has_annotation_annotated]
for path_in, path_out in zip(annotated_rows['path'], annotated_rows['path_annotated']):
    text = text.replace(str(path_in), str(path_out), 1)

In [None]:
# save the rewritten library text
with out_path.open('w') as out_file:
    out_file.write(text)