In [None]:
from pathlib import Path
from itertools import chain
import os
import shutil

In [None]:
import pandas as pd

In [None]:
from dautil.util import map_parallel

In [None]:
def md5(fname, chunk_size=65536):
    '''Return the hexadecimal MD5 digest of a file's contents.

    Adapted from https://stackoverflow.com/a/3431838

    The file is read in fixed-size chunks so that arbitrarily large files can
    be hashed without loading them into memory at once.

    Parameters
    ----------
    fname : str or path-like
        File to hash. It is opened in binary mode.
    chunk_size : int, optional
        Number of bytes requested per read. The digest does not depend on this
        value; a larger chunk only means fewer read calls.

    Returns
    -------
    str
        The digest as a string of hexadecimal digits.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    OSError
        Propagated from ``open``/``read`` (missing file, directory, ...).
    '''
    import hashlib

    # read(0) returns b"" immediately, which would end the loop at once and
    # silently yield the digest of empty input for every file.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        # iter(callable, sentinel) keeps calling read() until it returns b"" (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

In [None]:
def glob(path, patterns=('**/*.epub', '**/*.PDF', '**/*.webarchive', '**/*.pdf')):
    '''Recursively list the document files below ``path``.

    Parameters
    ----------
    path : pathlib.Path
        Directory to search.
    patterns : iterable of str, optional
        Glob patterns passed to ``path.glob``. The default matches EPUB, PDF
        (lower- and upper-case extension) and webarchive files at any depth.

    Returns
    -------
    list of pathlib.Path
        Matching regular files, grouped in pattern order, each path listed once.
    '''
    matches = chain.from_iterable(path.glob(pattern) for pattern in patterns)
    # dict.fromkeys drops repeats while keeping first-seen order: a path can be
    # yielded twice when two patterns match it (e.g. '*.PDF' and '*.pdf' where
    # matching is case-insensitive).
    # is_file() skips directories that merely carry a document extension; they
    # cannot be opened for hashing.
    return [match for match in dict.fromkeys(matches) if match.is_file()]

In [None]:
# The three locations this notebook works with, each with `~` expanded to the
# current user's home directory.
path_original, path_annotated, path_zot = (
    Path(location).expanduser()
    for location in (
        # Papers 3 export, "PDF Files and Media", exported without annotation
        '~/Downloads/temp-papers-export-original',
        # Papers 3 export, "PDF Files and Media", exported with annotation
        '~/Downloads/temp-papers-export',
        # where ZotFile keeps its files
        '~/iCloud/ZotFile',
    )
)

# Source—original

In [None]:
# Every document file (extensions as defined in `glob`) found under the
# un-annotated export directory.
paths = glob(path_original)

In [None]:
# Checksum every exported file, timing the call with %time.
# NOTE(review): `map_parallel` comes from dautil; presumably it applies `md5` to
# each path on `os.cpu_count()` threads and returns the results in input order.
# The DataFrame built from `paths` and `md5sums` relies on that ordering — confirm.
%time md5sums = map_parallel(md5, paths, mode='multithreading', processes=os.cpu_count())

In [None]:
# One row per exported file: its path relative to the export root and the md5
# of its contents.
# The frame is built column-wise so that a length mismatch between `paths` and
# `md5sums` raises immediately instead of being padded with missing values.
# dtype=object stores the Path objects and hex strings as plain Python objects.
df = pd.DataFrame(
    {
        'path': [path.relative_to(path_original) for path in paths],
        'md5': list(md5sums),
    },
    dtype=object,
)

In [None]:
# Files with identical content share one md5; a single representative path per
# checksum is enough, so keep the first row of each md5 group. The result is
# indexed by md5.
df = (
    df
    .groupby(by='md5')
    .first()
)

In [None]:
# Inspect the de-duplicated table: md5 as index, one representative `path` each.
df

In [None]:
# Ad-hoc spot check of one specific checksum (presumably a file from the
# author's own library). The lookup is guarded so that running the notebook on
# an export that does not contain this md5 shows nothing instead of raising
# KeyError and stopping a Run All.
md5_spot_check = '9ab299ca0b68b5966416b734f6defaa3'
df.loc[md5_spot_check] if md5_spot_check in df.index else None

# ZotFile

In [None]:
# Every document file (extensions as defined in `glob`) found under the ZotFile
# directory. NOTE: this rebinds `paths`, which previously held the export files.
paths = glob(path_zot)

In [None]:
# Checksum every ZotFile file, timing the call with %time. This rebinds
# `md5sums`, which previously held the checksums of the export files.
# NOTE(review): `map_parallel` comes from dautil; presumably it returns results
# in the same order as `paths`, which `df_zot` relies on — confirm.
%time md5sums = map_parallel(md5, paths, mode='multithreading', processes=os.cpu_count())

In [None]:
# One row per ZotFile file: its path exactly as returned by `glob(path_zot)`
# and the md5 of its contents.
# The frame is built column-wise so that a length mismatch between `paths` and
# `md5sums` raises immediately instead of being padded with missing values.
# dtype=object stores the Path objects and hex strings as plain Python objects.
df_zot = pd.DataFrame(
    {
        'path': list(paths),
        'md5': list(md5sums),
    },
    dtype=object,
)

In [None]:
# Sanity check: no checksum may occur more than once among the ZotFile files,
# i.e. every md5 count is at most 1.
assert df_zot['md5'].value_counts().le(1).all()

In [None]:
# For every ZotFile file, look up the export-relative path of the file with the
# same checksum (`df` is indexed by md5). how='left' keeps every ZotFile row;
# a row without a match gets a missing `path_source`.
# Selecting only the two base columns first keeps this cell re-runnable: without
# it, a second run would merge into a frame that already has `path_source` and
# collide with the suffixed column.
df_zot = df_zot[['path', 'md5']].merge(
    df,
    how='left',
    left_on='md5',
    right_index=True,
    suffixes=('', '_source'),
)

In [None]:
# Inspect the merge result; a missing `path_source` means no exported file has
# the same checksum as that ZotFile file.
df_zot

In [None]:
# Sanity check: every ZotFile file was matched to an exported file, i.e. no
# `path_source` value is missing.
assert df_zot['path_source'].notna().all()

In [None]:
%%time
for name, row in df_zot.iterrows():
    shutil.copy(path_annotated / row.path_source, row.path)