Skip to content

Commit

Permalink
Merge 28a5a40 into 1c4c7ad
Browse files Browse the repository at this point in the history
  • Loading branch information
dkloster committed Apr 8, 2021
2 parents 1c4c7ad + 28a5a40 commit 63df230
Show file tree
Hide file tree
Showing 16 changed files with 745 additions and 150 deletions.
39 changes: 32 additions & 7 deletions docs/source/cli.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
HTRC Workset Toolkit
======================
The HTRC Workset Toolkit provides a command line interface for interacting with
The HTRC Workset Toolkit provides a command line interface for interacting with
and analyzing volumes in the HathiTrust Digital Library:

- Volume Download (``htrc download``)
Expand All @@ -11,7 +11,7 @@ and analyzing volumes in the HathiTrust Digital Library:
Workset Path
--------------

Each of these commands takes a *workset path*. Valid types of workset paths
Each of these commands takes a *workset path*. Valid types of workset paths
and examples of each are:

================================== ==============================================================================
Expand Down Expand Up @@ -71,7 +71,7 @@ download`_, the

Topic Modeling
''''''''''''''''
There are two implementations of LDA topic modeling supported by the
There are two implementations of LDA topic modeling supported by the


Arguments
Expand Down Expand Up @@ -114,6 +114,35 @@ Following are the use cases and examples of ``htrc`` commands inside the HTRC Da

``htrc download /home/dcuser/HTRC/htrc-id -o /media/secure_volume/my-workset -c``

* Download specific pages from a single volume :

``htrc download -pg coo.31924089593846[5,10,15,20,25,30]``

* Download volumes and then extract headers/footers from the volumes :

``htrc download -hf /home/dcuser/HTRC/htrc-id``

* Download volumes, extract headers/footers from the volume pages then concatenate the pages - (This will concatenate all the pages of the volume into one txt file.) :

``htrc download -hfc /home/dcuser/HTRC/htrc-id``

* Download volumes, extract headers/footers from the volumes, skip downloading the .csv files containing removed headers and footers :

``htrc download -hf -s /home/dcuser/HTRC/htrc-id``

* Download volumes, extract headers/footers from volumes, change window of pages in extractor algorithm (the default is 6; lower numbers increase speed but are less accurate) :

``htrc download -hf -w 3 /home/dcuser/HTRC/htrc-id``

* Download volumes, extract headers/footers from volumes, change minimum similarity rate for lines on pages to be considered a header or footer (the default is .7, i.e. 70%: a line that is at least 70% similar to lines on other pages within the window of pages is labeled a header or footer and removed) :

``htrc download -hf -msr .9 /home/dcuser/HTRC/htrc-id``

* Download volumes, extract headers/footers from volumes, change the max number of concurrent tasks (note that the only options are 1 or 2):

``htrc download -hf --parallelism 2 /home/dcuser/HTRC/htrc-id``


|
+---------------------------------+-----------------------------------------------+
| command: ``htrc metadata`` | capsule mode: **secure** and **maintenance** |
Expand Down Expand Up @@ -246,7 +275,3 @@ Following are the use cases and examples of ``htrc`` commands inside the HTRC Da
* Run topicexplorer on already downloaded volumes - (Sample volumes are available in capsules created with the ubuntu-16-04-with-sample-volumes image. Those sample volumes are available as zip files. Please unzip them before use, because the metadata function gets volume ids from volume directory names).

``htrc topicexplorer /home/dcuser/unzipped_volumes -k 20``




70 changes: 54 additions & 16 deletions htrc/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
from future import standard_library
standard_library.install_aliases()

import json
import os, os.path
import os
import os.path
import shutil
import sys
from tempfile import NamedTemporaryFile
Expand All @@ -16,6 +16,7 @@
import htrc.volumes
import htrc.workset
import htrc.tools.mallet

from argparse import ArgumentParser
import htrc.tools.topicexplorer
from htrc.lib.cli import bool_prompt
Expand All @@ -25,18 +26,37 @@
def download_parser(parser=None):
if parser is None:
parser = ArgumentParser()
parser.add_argument("-u", "--username", help="HTRC username")
parser.add_argument("-p", "--password", help="HTRC password")
#parser.add_argument("-u", "--username", help="HTRC username")
#parser.add_argument("-p", "--password", help="HTRC password")
parser.add_argument("file", nargs='?', default=sys.stdin,
help="workset path[s]")
help="Workset path[s]")
parser.add_argument("-f", "--force", action='store_true',
help="remove folder if exists")
parser.add_argument("-o", "--output", help="output directory",
help="Remove folder if exists")
parser.add_argument("-o", "--output", help="Output directory",
default='/media/secure_volume/workset/')
parser.add_argument("-hf", "--remove-headers-footers", action='store_true',
help="Remove headers and footers from individual pages and save in a separate csv file for inspection")
parser.add_argument("-hfc", "--remove-headers-footers-and-concat", action='store_true',
help="Remove headers and footers from individual pages and save in a separate csv file for inspection then concatenate pages")
parser.add_argument("-w", "--window-size", required=False, type=int, metavar="N", default=6,
help="How many pages ahead does the header/footer extractor algorithm look to find potential "
"matching headers/footers (higher value gives potentially more accurate results on lower "
"quality OCR volumes at the expense of runtime)")
parser.add_argument("-msr", "--min-similarity-ratio", required=False, type=float, metavar="N", default=0.7,
help="The minimum string similarity ratio required for the Levenshtein distance fuzzy-matching "
"algorithm to declare that two headers are considered 'the same' (the higher the value, up "
"to a max of 1.0, the more strict the matching has to be; lower values allow for more "
"fuzziness to account for OCR errors)")
parser.add_argument("-s", "--skip-removed-hf", action='store_true',
help="Skip creating a saved report of the removed headers and footers for each page for inspection")
parser.add_argument("--parallelism", required=False, type=int, metavar="N", default=os.cpu_count(),
help="The max number of concurrent tasks to start when downloading or removing headers/footers")
parser.add_argument("--batch-size", required=False, type=int, metavar="N", default=250,
help="The max number of volumes to download at a time from DataAPI")
parser.add_argument("-c", "--concat", action='store_true',
help="concatenate a volume's pages in to a single file")
help="Concatenate a volume's pages in to a single file")
parser.add_argument("-m", "--mets", action='store_true',
help="add volume's METS file")
help="Add volume's METS file")
parser.add_argument("-pg", "--pages",action='store_true',
help="Download given page numbers of a volumes.")
parser.add_argument("-t", "--token", help="JWT for volumes download.")
Expand All @@ -47,17 +67,17 @@ def download_parser(parser=None):
parser.add_argument("-dk", "--datakey", help="Client key file for mutual TLS with Data API.")
return parser


def add_workset_path(parser=None):
    """Register the positional workset ``path`` argument on a parser.

    Args:
        parser: An existing ``ArgumentParser`` (or sub-parser) to extend.
            A fresh ``ArgumentParser`` is created when this is None.

    Returns:
        The parser that received the ``path`` argument.
    """
    if parser is None:
        parser = ArgumentParser()
    # Register the positional exactly once; a second registration would
    # make argparse demand an additional, separate group of paths.
    parser.add_argument("path", nargs='+', help="Workset path[s]")
    return parser



def main():
parser = ArgumentParser()
parser.add_argument('-d', '--debug', help="print long debug messages",
parser.add_argument('-d', '--debug', help="Print long debug messages",
action='store_true')
parsers = parser.add_subparsers(help="select a command")

Expand All @@ -78,10 +98,11 @@ def main():
help="Download HathiTrust volumes to disk [requires auth]")
download_parser(parser_download)
parser_download.set_defaults(func='download')



# Run helper
parser_run = parsers.add_parser('run', help="Run a built-in algorithm.")
run_parsers = parser_run.add_subparsers(help="select a command")
run_parsers = parser_run.add_subparsers(help="Select a command")

parser_mallet = run_parsers.add_parser('mallet')
htrc.tools.mallet.populate_parser(parser_mallet)
Expand Down Expand Up @@ -131,10 +152,25 @@ def main():
else:
print("Please choose another output folder and try again.")
sys.exit(1)


if args.concat and args.remove_headers_footers:
print("Cannot set both concat and remove-headers-footers")
sys.exit(1)
if args.concat and args.remove_headers_footers_and_concat:
print("Cannot set both concat and remove-headers-footers-and-concat")
sys.exit(1)
if args.remove_headers_footers and args.remove_headers_footers_and_concat:
print("Cannot set both remove_headers_footers and remove_headers_footers_and_concat")
sys.exit(1)
if args.mets and args.remove_headers_footers_and_concat:
print("Cannot set both mets and remove_headers_footers_and_concat")
sys.exit(1)
if args.pages:
if args.mets and args.concat:
print ("Cannot set both concat and mets with pages")
print("Cannot set both concat and mets with pages")
sys.exit(1)
if args.mets and args.remove_headers_footers_and_concat:
print("Cannot set both mets and remove_headers_footers_and_concat with pages")
sys.exit(1)

try:
Expand All @@ -143,6 +179,7 @@ def main():
print("Invalid identifier:", args.file)
sys.exit(1)


def resolve_and_download(args):
if args.file == sys.stdin:
# For use with UNIX pipes
Expand Down Expand Up @@ -207,6 +244,7 @@ def download(args):
else:
raise e


def download_with_tempfile(args, volumes):
f = NamedTemporaryFile()
for volume in volumes:
Expand Down
15 changes: 7 additions & 8 deletions htrc/auth.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
from base64 import b64encode
from getpass import getpass
import http.client
import ssl
import time
from getpass import getpass

import requests
import requests.auth

import htrc.config


def get_jwt_token():
# Currently we just store one common jwt token locally at .htrc file for simplicity
# Expect to add POST method to query unique jwt token with the combo of username and password
Expand All @@ -17,10 +15,10 @@ def get_jwt_token():
client_id, client_secret = htrc.config.get_credentials()

auth = requests.auth.HTTPBasicAuth(client_id, client_secret)
data = { "grant_type": "password",
"username": username,
"password": password,
"scope" : "openid"}
data = {"grant_type": "password",
"username": username,
"password": password,
"scope": "openid"}

url = htrc.config.get_idp_url()
r = requests.post(url, data=data, auth=auth)
Expand All @@ -35,6 +33,7 @@ def get_jwt_token():
else:
raise RuntimeError("JWT token retrieval failed: {}".format(data['error']))


def credential_prompt():
"""
A prompt for entering HathiTrust Research Center credentials.
Expand Down
48 changes: 39 additions & 9 deletions htrc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,14 @@
"""
from future import standard_library
standard_library.install_aliases()
from builtins import input

from typing import Optional
from configparser import RawConfigParser as ConfigParser, NoSectionError
from codecs import open
from getpass import getpass
import logging
import os.path
import shutil
import time

from htrc.lib.cli import bool_prompt

DEFAULT_PATH = os.path.expanduser('~')
DEFAULT_PATH = os.path.join(DEFAULT_PATH, '.htrc')
if not os.path.exists(DEFAULT_PATH):
Expand All @@ -26,6 +22,25 @@
logging.info("Copying default config file to home directory.")
shutil.copyfile(DEFAULT_FILE, DEFAULT_PATH)


class HtrcDataApiConfig:
    """Bundle of settings used to talk to the HTRC Data API.

    Every constructor argument is optional. Any argument that is omitted
    (or passed as a falsy value) is filled in from the corresponding
    config getter of this module.
    """

    def __init__(self,
                 token: Optional[str] = None,
                 host: Optional[str] = None,
                 port: Optional[int] = None,
                 epr: Optional[str] = None,
                 cert: Optional[str] = None,
                 key: Optional[str] = None) -> None:
        super().__init__()

        if not token:
            # Obtain a token without writing it back to the config file.
            token = get_jwt_token(save_new_token=False)
        self.token = token

        if not host:
            host = get_dataapi_host()
        self.host = host

        if not port:
            port = get_dataapi_port()
        self.port = port

        if not epr:
            epr = get_dataapi_epr()
        self.epr = epr

        if not cert:
            cert = get_dataapi_cert()
        self.cert = cert

        if not key:
            key = get_dataapi_key()
        self.key = key


def _get_value(section, key, path=None):
if path is None:
path = DEFAULT_PATH
Expand All @@ -38,36 +53,45 @@ def _get_value(section, key, path=None):
except NoSectionError:
raise EnvironmentError("Config not set for {} {} in {}".format(
section, key, path))



def get_dataapi_port(path=None):
    """Return the ``port`` value of the ``data`` config section as an int."""
    return int(_get_value('data', 'port', path))


def get_dataapi_host(path=None):
    """Return the ``host`` value of the ``data`` config section."""
    return _get_value('data', 'host', path)


def get_dataapi_epr(path=None):
    """Return the ``url`` value of the ``data`` config section."""
    epr = _get_value('data', 'url', path)
    return epr


def get_dataapi_cert(path=None):
    """Return the ``cert`` value of the ``data`` config section."""
    cert = _get_value('data', 'cert', path)
    return cert


def get_dataapi_key(path=None):
    """Return the ``key`` value of the ``data`` config section."""
    key = _get_value('data', 'key', path)
    return key


def get_dataapi_access(path=None):
    """Return the raw ``pd_only`` value of the ``data`` config section."""
    pd_only = _get_value('data', 'pd_only', path)
    return pd_only


def get_idp_host_port(path=None):
    """Return the ``(host, port)`` pair stored in the ``idp`` config section.

    Both values are returned exactly as the config helper yields them; the
    port is not converted to an int here.
    """
    return _get_value('idp', 'host', path), _get_value('idp', 'port', path)


def get_idp_path(path=None):
    """Return the ``url`` value of the ``idp`` config section.

    Args:
        path: Config file to read, forwarded to the config lookup helper.
            None selects the helper's default location.

    Returns:
        The configured value of ``idp``/``url``.
    """
    # Forward ``path`` so a caller-selected config file is honoured here,
    # just as it is for the other ``idp`` and ``data`` getters.
    return _get_value('idp', 'url', path)


def get_idp_url(path=None):
host, port = get_idp_host_port(path)
path = get_idp_path(path)
Expand All @@ -79,23 +103,26 @@ def get_idp_url(path=None):


# Add jwt credential access methods
def get_jwt_token(path=None, save_new_token=True):
    """Return a JWT token, requesting a fresh one when the stored one is unusable.

    The token and its expiration time are read from the ``jwt`` section of
    the config file at *path*. If reading fails, or the expiration time has
    passed, a new token is requested through ``htrc.auth.get_jwt_token()``.

    Args:
        path: Config file to read from and write to. None selects the
            default config location used by the config helpers.
        save_new_token: When true, a newly requested token and its
            expiration are written back to the config file.

    Returns:
        The JWT token.

    Raises:
        Whatever ``htrc.auth.get_jwt_token()`` raises when a replacement
        token cannot be obtained.
    """
    try:
        token = _get_value('jwt', 'token', path)

        # check expiration date
        expiration = int(_get_value('jwt', 'expiration', path))
        if time.time() > expiration:
            import htrc
            # Remove the stale token from the same file it was read from.
            htrc.config.remove_jwt_token(path)
            raise RuntimeError("JWT token expired.")
    except Exception:
        # This should run on either a missing or expired token. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of triggering a re-authentication.
        import htrc.auth
        token, expiration = htrc.auth.get_jwt_token()

        if save_new_token:
            htrc.config.save_jwt_token(token, expiration, path)

    return token


def save_jwt_token(token, expiration=None, path=None):
"""
Saves JWT token in the config file.
Expand Down Expand Up @@ -124,6 +151,7 @@ def save_jwt_token(token, expiration=None, path=None):

return token


def remove_jwt_token(path=None):
"""
Removes JWT token from the config file.
Expand Down Expand Up @@ -161,9 +189,11 @@ def get_credentials(path=None):

return (client_id, client_secret)


def populate_parser(parser):
    """Return *parser* unchanged; no arguments are registered on it."""
    return parser


if __name__ == '__main__':
from argparse import ArgumentParser

Expand Down

0 comments on commit 63df230

Please sign in to comment.