Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
issakuss committed Jun 11, 2023
0 parents commit acfcaed
Show file tree
Hide file tree
Showing 19 changed files with 680 additions and 0 deletions.
8 changes: 8 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,8 @@
include papnt/config.ini
include VERSION
include REQUIREMENTS

recursive-exclude * __pycache__
recursive-exclude * *.py[co]

# recursive-include needs at least one file pattern after the directory.
recursive-include docs *
5 changes: 5 additions & 0 deletions Makefile
@@ -0,0 +1,5 @@
VERSION := $(shell cat VERSION)

# `distribute` names an action, not a file, so it must always run.
.PHONY: distribute

# Build the source distribution and print its SHA-256 checksum.
distribute:
	conda run -n papnt python setup.py sdist
	shasum -a 256 dist/papnt-$(VERSION).tar.gz | cut -d ' ' -f 1
8 changes: 8 additions & 0 deletions REQUIREMENTS
@@ -0,0 +1,8 @@
bibtexparser >= 1.4.0
click >= 8.0.4
crossrefapi >= 1.5.0
iso4 >= 0.0.2
nltk >= 3.6.7
notion-client >= 2.0.0
pdf2doi >= 1.5
unidecode >= 1.3.6
1 change: 1 addition & 0 deletions VERSION
@@ -0,0 +1 @@
0.0.1
Binary file added add-pdf-to-notion.workflow.zip
Binary file not shown.
Binary file added dist/papnt-0.0.1.tar.gz
Binary file not shown.
1 change: 1 addition & 0 deletions docs/readme.rst
@@ -0,0 +1 @@
Academic paper management with Notion
Empty file added papnt/__init__.py
Empty file.
54 changes: 54 additions & 0 deletions papnt/abbrlister.py
@@ -0,0 +1,54 @@
import json

from bibtexparser import loads
from bibtexparser.bparser import BibTexParser
from iso4 import abbreviate
import nltk


def _remove_duplicated_space(dict_: dict):
return {k: v.replace(' ', ' ') for k, v in dict_.items()}


class AbbrLister:
def __init__(self, path_bib: str):
with open(path_bib, 'r') as f:
bibtext = f.read()
parser = BibTexParser()
bibdatabase = loads(bibtext, parser).entries_dict
names_journal = [article.get('journal')
for article in bibdatabase.values()]
self.names_journal = sorted(list(set(
[name for name in names_journal if name is not None])))

nltk.download('wordnet')

def listup(self, spec: dict | None=None):
"""
sepc: dict
Can specify abbreviation like...
{'PLOS ONE': 'PLOS ONE'}
Case insensitive.
"""
abbrs = {name: abbreviate(name) for name in self.names_journal}
self.abbrs = _remove_duplicated_space(abbrs)
if spec is None:
return self
specified_abbrs = {name: spec[name.lower()]
for name in self.names_journal
if spec.get(name.lower())}
self.abbrs = self.abbrs | specified_abbrs
return self

def save(self, save_path: str):
if not hasattr(self, 'abbrs'):
raise ValueError('Use listup() first.')

with open(save_path, 'w') as f:
json.dump(
{'default': {'container-title': self.abbrs}}, f, indent=2)


if __name__ == '__main__':
    # Ad-hoc manual run against hard-coded files in one user's Desktop
    # folder; adjust both paths before running this module directly.
    lister = AbbrLister('/Users/issakuss/Desktop/study14.bib')
    lister.listup().save('/Users/issakuss/Desktop/study14.json')
83 changes: 83 additions & 0 deletions papnt/cli.py
@@ -0,0 +1,83 @@
from pathlib import Path

import click

from .misc import load_config
from .database import Database, DatabaseInfo
from .mainfunc import (
add_records_from_local_pdfpath,
update_unchecked_records_from_doi,
update_unchecked_records_from_uploadedpdf,
make_bibfile_from_records, make_abbrjson_from_bibpath)

# Shared by every command below. Both objects are created at import time;
# `config` is read from the config.ini that sits next to this module.
# (A `global` statement at module level has no effect, so none is used.)
config = load_config(Path(__file__).parent / 'config.ini')
database = Database(DatabaseInfo())


def _config_is_ok():
    """Return True when the database settings in config.ini are filled in.

    ``tokenkey`` and ``database_id`` of the ``[database]`` section must both
    be non-empty. A value counts as empty when nothing is left after removing
    surrounding whitespace and quote characters, so the untouched template
    entries (``tokenkey = ''``) are reported as missing too. When a value is
    missing, a hint naming the config file is written to stderr.
    """
    def _is_blank(value) -> bool:
        # NOTE(review): load_config is not visible here; if it already strips
        # quotes, the extra strip is simply a no-op.
        return not str(value).strip().strip('\'"')

    section = config['database']
    if _is_blank(section['tokenkey']) or _is_blank(section['database_id']):
        click.echo('Open config.ini and edit database information: '
                   f'{Path(__file__).parent / "config.ini"}', err=True)
        return False
    return True


# @click.group(context_settings=dict(help_option_names=['-h', '--help']))
@click.group(invoke_without_command=True)
@click.pass_context
def main(ctx):
    # Root of the command group. (Kept as a comment on purpose: click would
    # display a docstring as help text.)
    # With a subcommand there is nothing to do here; a bare invocation prints
    # a usage hint and, if the settings are complete, the config location.
    if ctx.invoked_subcommand is not None:
        return
    click.echo('try `papnt --help` for help')
    if not _config_is_ok():
        return
    config_path = Path(__file__).parent / 'config.ini'
    click.echo(f'Your config file is in: {config_path}')


@main.command()
@click.argument('paths')
def paths(paths: str):
    """Add record(s) to database by local path to PDF file"""
    # `paths` is a single string; several PDFs are given comma-separated.
    # Whitespace around each entry is dropped and empty entries (for example
    # from a trailing comma) are skipped rather than passed on as ''.
    if not _config_is_ok():
        return
    SEP = ','
    pdfpaths = [part.strip() for part in paths.split(SEP)]
    for pdfpath in filter(None, pdfpaths):
        add_records_from_local_pdfpath(database, config['propnames'], pdfpath)


@main.command()
def doi():
    """Fill information in record(s) by DOI"""
    # Do nothing until the database settings are filled in.
    if not _config_is_ok():
        return
    propnames = config['propnames']
    update_unchecked_records_from_doi(database, propnames)


@main.command()
def pdf():
    """Fill information in record(s) by uploaded PDF file"""
    # Do nothing until the database settings are filled in.
    if not _config_is_ok():
        return
    propnames = config['propnames']
    update_unchecked_records_from_uploadedpdf(database, propnames)


@main.command()
@click.argument('target')
def makebib(target: str):
    """Make BIB file including reference information from database"""
    if not _config_is_ok():
        return
    dir_bib = config['misc']['dir_save_bib']
    # Write <dir>/<target>.bib first, then build the abbreviation JSON from
    # that same file.
    make_bibfile_from_records(database, target, config['propnames'], dir_bib)
    path_bib = f'{dir_bib}/{target}.bib'
    make_abbrjson_from_bibpath(path_bib, config['abbr'])


if __name__ == '__main__':
    # Running this module directly only performs the config check; the click
    # group `main` is not invoked from here.
    _config_is_ok()
    ...
32 changes: 32 additions & 0 deletions papnt/config.ini
@@ -0,0 +1,32 @@
[database]
tokenkey = ''
database_id = ''

[propnames] ; Property Names
; bib name = property name
; Check bib names: https://ja.wikipedia.org/wiki/BibTeX
; Note that bib names will be used as lower case
doi = DOI
author = Authors
title = Title
edition = Edition
year = Year
journal = Journal
volume = Volume
pages = Pages
publisher = Publisher
ID = Citekey
ENTRYTYPE = Type
howpublished = HowPublished

; Other property
output_target = Cite in
pdf = PDF

[abbr] ; Specification of abbreviation
Full Name = Abbreviated
PLOS ONE = PLOS ONE

[misc]
; Directory to save bib files
dir_save_bib = ''
33 changes: 33 additions & 0 deletions papnt/const.py
@@ -0,0 +1,33 @@
# Lower-case function words (articles, prepositions, conjunctions) in several
# languages, grouped by initial letter. Each word appears exactly once.
# NOTE(review): the consumer of this tuple is not visible here; presumably
# these words are skipped when deriving keys or abbreviations - confirm.
SKIPWORDS = (
    'a', 'ab', 'aboard', 'about', 'above', 'across', 'after', 'against',
    'al', 'along', 'amid', 'among', 'an', 'and', 'anti', 'around', 'as',
    'at',
    'before', 'behind', 'below', 'beneath', 'beside', 'besides', 'between',
    'beyond', 'but', 'by',
    'd', 'da', 'das', 'de', 'del', 'dell', 'dello', 'dei', 'degli', 'della',
    'delle', 'dem', 'den', 'der', 'des', 'despite', 'die', 'do', 'down',
    'du', 'during',
    'ein', 'eine', 'einem', 'einen', 'einer', 'eines', 'el', 'en', 'et',
    'except',
    'for', 'from',
    'gli',
    'i', 'il', 'in', 'inside', 'into', 'is',
    'l', 'la', 'las', 'le', 'les', 'like', 'lo', 'los',
    'near', 'nor',
    'of', 'off', 'on', 'onto', 'or', 'over',
    'past', 'per', 'plus',
    'round',
    'save', 'since', 'so', 'some', 'sur',
    'than', 'the', 'through', 'to', 'toward', 'towards',
    'un', 'una', 'unas', 'under', 'underneath', 'une', 'unlike', 'uno',
    'unos', 'until', 'up', 'upon',
    'versus', 'via', 'von',
    'while', 'with', 'within', 'without',
    'yet',
    'zu', 'zum')

# Maps Crossref work-type identifiers (keys) to BibTeX entry types (values).
# Several Crossref book-related types share one BibTeX type.
CROSSREF_TO_BIB = {
    # BibTeX entry types: https://ja.wikipedia.org/wiki/BibTeX
    # Crossref work types: https://api.crossref.org/v1/types
    'journal-article': 'article',
    'monograph': 'book',
    'book': 'book',
    'book-section': 'inbook',
    'book-track': 'inbook',
    'book-part': 'inbook',
    'book-chapter': 'inbook',
    'proceedings-article': 'inproceedings',
}
45 changes: 45 additions & 0 deletions papnt/database.py
@@ -0,0 +1,45 @@
from typing import Optional, Dict, List
from pathlib import Path

from notion_client import Client

from .misc import load_config


class DatabaseInfo:
    """Token and database id taken from the ``[database]`` config section."""

    def __init__(self, path_config: Optional[str | Path]=None):
        # A missing (or otherwise falsy) path selects the config.ini that
        # sits next to this module.
        if not path_config:
            path_config = Path(__file__).parent / 'config.ini'
        section = load_config(path_config)['database']
        self.tokenkey = section['tokenkey']
        self.database_id = section['database_id']


class Database:
    """Small wrapper around a Notion client bound to a single database."""

    def __init__(self, dbinfo: 'DatabaseInfo'):
        """Create the client from ``dbinfo.tokenkey`` and keep the database id."""
        self.notion = Client(auth=dbinfo.tokenkey)
        self.database_id = dbinfo.database_id

    def fetch_records(self, filter: Optional[dict]=None,
                      debugmode: bool=False) -> 'Database':
        """Query the database page by page and store the rows on ``self``.

        Args:
            filter: Optional filter object forwarded to the query.
            debugmode: When True, stop after the first page even if more
                pages are available, and print a notice.

        Returns:
            ``self``; the fetched records are in ``self.db_results``.
        """
        records = []
        # Optional arguments are only sent once they have a value, so the
        # request never carries an explicit ``None``.
        query_args = {'database_id': self.database_id}
        if filter is not None:
            query_args['filter'] = filter
        while True:
            page = self.notion.databases.query(**query_args)
            records += page['results']
            if not page['has_more']:
                break
            if debugmode:
                print('It is debugmode, records were fetched partly.')
                break
            # Continue from where the previous page ended.
            query_args['start_cursor'] = page['next_cursor']
        self.db_results = records
        return self

    def update(self, page_id: str, prop: Dict):
        """Overwrite the given properties of the page ``page_id``."""
        self.notion.pages.update(page_id=page_id, properties=prop)

    def create(self, prop: Dict):
        """Create a new page with properties ``prop`` in this database."""
        self.notion.pages.create(
            parent={'database_id': self.database_id}, properties=prop)
106 changes: 106 additions & 0 deletions papnt/mainfunc.py
@@ -0,0 +1,106 @@
import requests
from pathlib import Path

from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase

from .database import Database
from .abbrlister import AbbrLister
from .pdf2doi import pdf_to_doi
from .notionprop import NotionPropMaker
from .prop2entry import notionprop_to_entry


# Module-level debug switch. Nothing in the visible part of this module reads
# it - NOTE(review): confirm whether it is still needed.
DEBUGMODE = False


def add_records_from_local_pdfpath(
        database: Database, propnames: dict, input_pdfpath: str):
    """Create one database record from the PDF file at ``input_pdfpath``.

    The DOI is extracted from the PDF, turned into Notion properties, and the
    new record is created with its ``info`` checkbox ticked.

    Raises:
        Exception: if no DOI could be extracted from the PDF.
    """
    extracted_doi = pdf_to_doi(input_pdfpath)
    if extracted_doi is None:
        raise Exception('DOI was not extracted from PDF.')

    maker = NotionPropMaker()
    new_prop = maker.from_doi(extracted_doi, propnames)
    new_prop |= {'info': {'checkbox': True}}
    database.create(new_prop)


def _update_record_from_doi(
        database: Database, doi: str, id_record: str, propnames: dict):
    """Fill the record ``id_record`` with the properties derived from ``doi``.

    The record's ``info`` checkbox is ticked as part of the update.

    Raises:
        ValueError: if the database update fails. The original error is
            printed and attached as the cause.
    """
    prop_maker = NotionPropMaker()
    prop = prop_maker.from_doi(doi, propnames)
    prop |= {'info': {'checkbox': True}}
    try:
        database.update(id_record, prop)
    except Exception as e:
        print(str(e))
        # Building the message must not fail itself and hide the real error:
        # fall back to the DOI when the title cannot be read from `prop`.
        try:
            name = prop['Name']['title'][0]['text']['content']
        except (KeyError, IndexError, TypeError):
            name = doi
        raise ValueError(f'Error while updating record: {name}') from e


def update_unchecked_records_from_doi(database: Database, propnames: dict):
    """Update every record that has a DOI but whose ``info`` box is unticked.

    The DOI property name is taken from ``propnames['doi']``, falling back to
    ``'DOI'`` when that entry is absent, so a renamed property is respected.
    """
    propname_doi = propnames.get('doi', 'DOI')
    query_filter = {
        'and': [{'property': 'info', 'checkbox': {'equals': False}},
                {'property': propname_doi,
                 'rich_text': {'is_not_empty': True}}]}
    for record in database.fetch_records(query_filter).db_results:
        doi = record['properties'][propname_doi]['rich_text'][0]['plain_text']
        _update_record_from_doi(database, doi, record['id'], propnames)


def update_unchecked_records_from_uploadedpdf(
        database: Database, propnames: dict):
    """Update unticked records from the PDF attached to each of them.

    For every record whose ``info`` box is unticked and whose PDF property is
    not empty, the first attached file is downloaded to a temporary file in
    the working directory and its DOI is extracted. Records whose PDF yields
    no DOI are skipped. The temporary file is removed even when the
    extraction raises.
    """
    PATH_TEMP_PDF = Path('you-can-delete-this-file.pdf')
    query_filter = {
        'and': [{'property': 'info', 'checkbox': {'equals': False}},
                {'property': propnames['pdf'],
                 'files': {'is_not_empty': True}}]}
    for record in database.fetch_records(query_filter).db_results:
        fileinfo = record['properties'][propnames['pdf']]['files'][0]
        # Uploaded files carry their URL under 'file'; per the Notion file
        # object format, externally hosted ones use 'external' instead.
        location = fileinfo.get('file') or fileinfo['external']
        pdffile = requests.get(location['url']).content
        try:
            with PATH_TEMP_PDF.open(mode='wb') as f:
                f.write(pdffile)
            doi = pdf_to_doi(PATH_TEMP_PDF)
        finally:
            PATH_TEMP_PDF.unlink(missing_ok=True)
        if doi is None:
            continue
        _update_record_from_doi(database, doi, record['id'], propnames)


def make_bibfile_from_records(database: Database, target: str,
                              propnames: dict, dir_save_bib: str):
    """Write ``<dir_save_bib>/<target>.bib`` from the matching records.

    A record matches when its ``propnames['output_target']`` multi-select
    contains ``target``.
    """
    # Invert the config mapping: Notion property name -> bib field name.
    propname_to_bibname = {}
    for bibname, propname in propnames.items():
        propname_to_bibname[propname] = bibname

    selection = {'property': propnames['output_target'],
                 'multi_select': {'contains': target}}
    records = database.fetch_records(selection).db_results

    bib_db = BibDatabase()
    bib_db.entries = [
        notionprop_to_entry(record['properties'], propname_to_bibname)
        for record in records]
    writer = BibTexWriter()
    with open(f'{dir_save_bib}/{target}.bib', 'w') as bibfile:
        bibfile.write(writer.write(bib_db))


def make_abbrjson_from_bibpath(input_bibpath: str, special_abbr: dict):
    """Write the journal-abbreviation JSON that belongs to a BIB file.

    The JSON is saved next to ``input_bibpath`` under the same name with the
    suffix ``.json``. ``special_abbr`` is passed to ``AbbrLister.listup`` as
    the abbreviation overrides.
    """
    # Swap only the final suffix. A plain str.replace would also rewrite
    # '.bib' inside directory names, and would return the input path itself
    # (overwriting the BIB file) when there is no '.bib' to replace.
    output_jsonpath = str(Path(input_bibpath).with_suffix('.json'))
    lister = AbbrLister(input_bibpath)
    lister.listup(special_abbr).save(output_jsonpath)


if __name__ == '__main__':
    # Manual end-to-end run of every public function against the configured
    # database. The relative imports require running this as a module inside
    # the package (e.g. ``python -m papnt.mainfunc``).
    from .misc import load_config
    from .database import DatabaseInfo

    config = load_config(Path(__file__).parent / 'config.ini')
    database = Database(DatabaseInfo())
    dir_save_bib = config['misc']['dir_save_bib']

    add_records_from_local_pdfpath(
        database, config['propnames'], 'test/samplepdfs/sample1.pdf')
    update_unchecked_records_from_doi(database, config['propnames'])
    update_unchecked_records_from_uploadedpdf(
        database, config['propnames'])
    make_bibfile_from_records(
        database, 'test', config['propnames'], dir_save_bib)
    # Use the same '<dir>/<target>.bib' form that make_bibfile_from_records
    # writes to; plain concatenation would drop the path separator.
    make_abbrjson_from_bibpath(f'{dir_save_bib}/test.bib', config['abbr'])

0 comments on commit acfcaed

Please sign in to comment.