In [None]:
# Enable IPython's autoreload extension so edits to imported modules are picked up
# without restarting the kernel (mode 2).
%load_ext autoreload
%autoreload 2

In [24]:
import json
import logging
import os
import shutil
import tempfile
from pathlib import Path

import htrc_features
import htrc_features.resolvers
from htrc_features import Volume, resolvers, FeatureReader, caching

# Why use IDs?

In the new version, requesting objects by IDs is the preferred method.

There are a few reasons for this.

1. Compatibility among machines and groups. I've often found it hard to run other people's code because it requires downloading hundreds or thousands of books, when I *already* have a full copy of the Hathi Features on one machine. Different settings will call for different optimizations; workshop users may want to temporarily download files, while HPC environments may want a minimally zipped version.

2. Alternate storage formats. The new version includes a parquet-based way of referring to files; reading word counts from parquets is much faster than reading from bzipped json files. Referring to IDs makes it easy to silently optimize access to parquet.

## What is an ID?



In [3]:
# Set the root logger's level to INFO.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)


# Some explanations and tests for the new loading methods.

This is not a comprehensive set of tests, but should provide the basics.

## Loading from a path.

An unnamed initial arg to 'Volume' (or 'FeatureReader') is inspected to see whether it's an ID or a path. This one looks like a path, so it is read from disk.

In [4]:
# project_root is the directory that contains the htrc_features package directory.
project_root = Path(htrc_features.__file__).parent.parent
pz_root = project_root / "data" / "PZ-volumes"
# FeatureReader is handed a plain string path here.
file_path = str(pz_root / "hvd.hwrqs8.json.bz2")

FeatureReader(file_path).first()



## Loading over the web

This one loads from the web. There are probably more gentle defaults than re-pulling from online every time, and these should be implemented.

In [5]:
# Request the volume by HTID using the 'http' resolver.
remote_volume = Volume("hvd.hwrqs8", id_resolver="http")
remote_volume

## Accessing files by volume ids.

That's basically the entire old method. But you may want to access local objects by their HTIDs. The simplest way to 
do that is to use the 'local' resolver, which looks in a named directory for an appropriate file. Since the default arguments are 'json' storage with 'bz2' compression, this works with three arguments.

In [6]:
# Look the HTID up in the PZ-volumes directory via the 'local' resolver;
# format and compression are left at their defaults.
vol = Volume(id="hvd.hwrqs8", dir=str(pz_root), id_resolver="local")
vol.tokenlist().head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
page,section,token,pos,Unnamed: 4_level_1
2,body,1C,CC,1
2,body,i,NN,1
7,body,.,$.,1
7,body,CHILDREN,NE,1
7,body,MR,NE,1


In [7]:
# tests/data/partialparq under the repository root, read here in parquet format.
p = Path(htrc_features.__file__).parent.parent.joinpath('tests/data/partialparq')
Volume(id='uc2.ark:/13960/t1xd0sc6x', dir=p, format='parquet')

## Passing arguments to resolver directly.

Ordinary users will generally only interact with these classes through arguments to the Volume method. But we can also call 'LocalResolver' directly.

In this example, we instead use 'LocalResolver' directly. We say we're using json, bz2, and a folder named `../data/PZ-volumes`.

In [8]:
# Construct the resolver object explicitly instead of passing a nickname string.
fileholder = resolvers.LocalResolver(
    dir=pz_root,
    format="json",
    compression="bz2",
)

# Hand the resolver instance to Volume through id_resolver.
locally_resolved_file = Volume(
    id="hvd.hwrqs8",
    id_resolver=fileholder,
    format="json",
)

locally_resolved_file


## Passing arguments to resolver and handler directly.

While HTTP fetching is currently silent, we should probably warn when that happens without an explicit request. Here's how you'd do that. Here I invoke the JsonFileHandler directly, rather than wrapping it in volume; it's unlikely an end user will ever need to do that.

In [16]:
# URL template for the download endpoint; {id} is the placeholder for the volume ID.
ef_download_url = 'http://data.htrc.illinois.edu/htrc-ef-access/get?action=download-ids&id={id}&output=json'
webresolver = htrc_features.resolvers.HttpResolver(url=ef_download_url)

# Use the JSON file handler directly rather than going through Volume.
remote_handler = htrc_features.JsonFileHandler(
    id="hvd.hwrqs8",
    id_resolver=webresolver,
)

# _make_tokencount_df is underscore-prefixed (private) on the handler.
remote_handler._make_tokencount_df().head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
page,section,token,pos,Unnamed: 4_level_1
2,body,1C,CC,1
2,body,i,NN,1
7,body,.,$.,1
7,body,CHILDREN,NE,1
7,body,MR,NE,1


# Fancy zip storage

When working with millions of files, some systems start to run out of inodes. Here, we build a storage using the 'ziptreeresolver' method, which assigns each file to one of 4096 zip files based on its name. Here, I'll create one in a tmpdir first. This, as a file writing operation, is a little more complicated than the above.

None


In [22]:
# Recreate an empty scratch directory under the system temp dir so that a rerun
# of this cell does not see zip files left over from a previous run.
sample_dir = Path(tempfile.gettempdir(), "test_zipdir")
if sample_dir.exists():
    shutil.rmtree(sample_dir)
sample_dir.mkdir()

# The resolver is given the directory as a plain string.
zipdir_1 = str(sample_dir)

zipholder = resolvers.ZiptreeResolver(zipdir_1, format="json", compression="bz2")


Now we'll go through the PZ-volumes folder and, for every volume, 

1. Grab the ID.
2. Copy from one resolution to the other.

Note that we tell the zipholder to use 'json' storage and 'bz2' compression. The 'write' method notes the compression equivalence, and
so doesn't waste memory compressing and uncompressing.

In [25]:
# Source resolver: reads the bz2-compressed JSON files in the PZ-volumes folder.
current_resolver = htrc_features.resolvers.LocalResolver(pz_root, format="json", compression="bz2")

# HTIDs that have been copied into the ziptree.
ids = set()

# The loop variables avoid the names `id` and `file`: at notebook top level they
# would stay bound after the loop and shadow the builtin `id` for later cells.
for fname in os.listdir(pz_root):
    if not fname.endswith(".bz2"):
        continue
    htid = htrc_features.utils.extract_htid(fname)
    htrc_features.caching.copy_between_resolvers(htid, current_resolver, zipholder)
    ids.add(htid)

This new tmpdir is filled with zipfiles. There are 4096 names, built from the first three characters of sha-1 hashes of the filenames.

In [26]:
# Names of the entries in the ziptree directory that end in ".zip".
list(filter(lambda entry: entry.endswith(".zip"), os.listdir(zipdir_1)))

['e99.zip',
 'd33.zip',
 '7d2.zip',
 '173.zip',
 'e6f.zip',
 'e14.zip',
 '553.zip',
 'c5f.zip',
 'e6b.zip',
 '96c.zip',
 'a97.zip',
 '613.zip',
 '915.zip',
 '940.zip',
 'b75.zip']

## Zipdir reading

Now we can extract any individual file from these zipdirs

In [27]:
# Read one volume back out of the ziptree built above.
ziptree_volume = Volume(
    "hvd.32044010273894",
    id_resolver="ziptree",
    dir=zipdir_1,
    format="json",
    compression="bz2",
)
ziptree_volume.tokenlist()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
page,section,token,pos,Unnamed: 4_level_1
1,body,On,IN,1
1,body,Serao,NN,1
1,body,The,DT,1
1,body,and,CC,1
1,body,ballet,NN,1
...,...,...,...,...
276,body,with,IN,1
283,body,COLLEGE,NNP,1
283,body,CONSERVED,VBN,1
283,body,HARVARD,NNP,1


In [28]:
# tempfile.tempdir is None until gettempdir() has been called (or it is set by
# hand), so ask gettempdir() for the location rather than reading the attribute.
zipdir = os.path.join(tempfile.gettempdir(), "ziptree_abc")

# Start from an empty directory on every run.
if Path(zipdir).exists():
    shutil.rmtree(zipdir)

Path(zipdir).mkdir()

# Destination: a parquet ziptree in the scratch directory. Source: the HTTP resolver.
my_resolver = resolvers.ZiptreeResolver(zipdir, format='parquet')
web_resolver = resolvers.HttpResolver()

htrc_features.caching.copy_between_resolvers("innd.00000004583746", web_resolver, my_resolver)


In [29]:
# Read the volume back through the resolver it was just copied into.
copied_volume = Volume("innd.00000004583746", id_resolver=my_resolver)
copied_volume.tokenlist()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
page,section,token,pos,Unnamed: 4_level_1
7,body,"""",'',1
7,body,&,CC,1
7,body,'HE,JJ,1
7,body,'IMORE.,.,1
7,body,'agmenta,.,1
...,...,...,...,...
330,body,wit,NN,1
330,body,wrangling,VBG,1
335,body,NOTREDAME,NN,1
335,body,UNNERSITY,NNP,1


### Copying between resolvers

In [30]:
from htrc_features.caching import copy_between_resolvers

It's possible to copy from these to some other form of resolution. In general, it's fairly easy to subclass 'IdResolver' to whatever way you might like to store files.

So when you finally get fed up with pairtrees, for instance, you can use your new method here.

It's generally possible to use the returned filehandles directly. You should be aware, though,
that especially for 'write' methods on resolvers it can occasionally be tricky to handle the locations.

In [31]:
# Both resolvers point at the same directory: the ziptree one is the copy source,
# the pairtree one is the copy destination.
pairtree_resolver = resolvers.PairtreeResolver(dir=zipdir_1, format="json", compression="bz2")
ziptree_source = resolvers.ZiptreeResolver(dir=zipdir_1, compression="bz2", format="json")

copy_between_resolvers("hvd.32044010273894", ziptree_source, pairtree_resolver)

# Open the copied volume through the pairtree resolver and show the first 300 items read.
pairtree_resolver.open(id="hvd.32044010273894").read()[:300]

b'{"id":"hvd.32044010273894","metadata":{"schemaVersion":"1.3","dateCreated":"2016-06-18T21:16:55.4637562Z","volumeIdentifier":"hvd.32044010273894","accessProfile":"google","rightsAttributes":"pd","hathitrustRecordNumber":"1219987","enumerationChronology":" ","sourceInstitution":"HVD","sourceInstituti'

## Resolving Parquet

Parquet caches can be resolved in exactly the same way. The defaults may break more easily, though.

In [32]:
# Parquet files under data/parquet in the repository, looked up with the 'local' resolver.
parquet_dir = Path(project_root, "data/parquet")
v = Volume(id="mdp.39015028036104", dir=parquet_dir, format="parquet", id_resolver="local")
v

# Transferring between formats

To transfer files between formats, you simply need two resolvers. The following code shows how to do this.

This code is smart enough to know that if you are copying from one bz2 file to another, it need not decompress and recompress.

In [33]:
from htrc_features.caching import copy_between_resolvers

## 3 resolvers

Here's a sort of silly example that's part of the test suite. It's easy to transition between a variety of different implementations.

This simple read-and-copy method is the basis of the methods that allow fallback searches and automatic caching.

In [34]:
# Copy one volume through three storage layouts:
# local bz2 JSON, then gzip JSON pairtree, then local snappy parquet.
# Both temporary directories are removed when the with-block exits.
with tempfile.TemporaryDirectory() as first_new_dir, tempfile.TemporaryDirectory() as second_new_dir:
    resolver1 = htrc_features.resolvers.LocalResolver(dir = Path(project_root, "tests", "data"), format = "json", compression = "bz2")
    resolver2 = htrc_features.resolvers.PairtreeResolver(dir = first_new_dir,  format = "json", compression = "gz")
    resolver3 = htrc_features.resolvers.LocalResolver(dir = second_new_dir, format = "parquet", compression = "snappy")

    copy_between_resolvers("aeu.ark:/13960/t1rf63t52", resolver1, resolver2)
    copy_between_resolvers("aeu.ark:/13960/t1rf63t52", resolver2, resolver3)

    # Every file written under the pairtree directory. A comprehension keeps the
    # walk variables local, so nothing named `dir` or `file` stays bound at
    # notebook top level to shadow the builtin `dir`.
    all_files = [
        os.path.join(loc, fname)
        for loc, _subdirs, fnames in os.walk(first_new_dir)
        for fname in fnames
    ]

    # Exactly one file, at the expected pairtree location.
    assert len(all_files) == 1
    assert all_files[0].endswith("aeu/pairtree_root/ar/k+/=1/39/60/=t/1r/f6/3t/52/ark+=13960=t1rf63t52/aeu.ark+=13960=t1rf63t52.json.gz")

    # Our test assertion ensures that the data has made it all the way through.
    assert Volume("aeu.ark:/13960/t1rf63t52", id_resolver = resolver3).tokenlist()['count'].sum() == 97691

# Cache resolvers

This logic is extended in the core library to the methods for 'cache_local', 'cache_pairtree', and 'cache_ziptree' that first look locally and then on the internet. These are implemented using a function, `make_fallback_resolver`, that returns a new class constructor. We can make our own between any two resolvers.

In [35]:
# Scratch location under the system temp directory.
tmpdir = f"{tempfile.gettempdir()}/neopairtree"

# Build a resolver class that tries a pairtree first and falls back to HTTP,
# with cache=True. `caching` is already imported at the top of the notebook.
CacheResolver = caching.make_fallback_resolver(
    resolvers.PairtreeResolver,
    resolvers.HttpResolver(),
    cache=True,
)
my_resolver = CacheResolver(
    dir=os.path.join(tmpdir, "new_resolution"),
    format="json",
    compression="gz",
)

The type of the composite resolver, if you look at it, is pretty ugly! It's generated inside a stew of closures and functions.

In [36]:
# Show the class of the composite resolver built by make_fallback_resolver.
type(my_resolver)

htrc_features.caching.make_fallback_resolver.<locals>.FallbackResolver

In [37]:
# Check whether the composite resolver is an instance of resolvers.IdResolver.
isinstance(my_resolver, resolvers.IdResolver)

True

But its behavior is great. You can now download automatically from Hathi. The first occasion takes a while--9 seconds--to pull from the Internet.


In [38]:
%%time

# First request through the fallback resolver, timed by the cell magic above
# (which must remain the first line of the cell).
Volume(id = "mdp.39015051692625", id_resolver = my_resolver)

CPU times: user 3.34 s, sys: 136 ms, total: 3.48 s
Wall time: 13.6 s


But the next one is quick! Just 500 ms.

In [39]:
%%time

# Same request repeated and timed again for comparison with the first call.
Volume(id = "mdp.39015051692625", id_resolver = my_resolver)

CPU times: user 332 ms, sys: 20 ms, total: 352 ms
Wall time: 353 ms


In [40]:
# Start with a clean destination directory for the new ziptree.
newtreepath = os.path.join(tmpdir, "new_ziptree")
if os.path.exists(newtreepath):
    shutil.rmtree(newtreepath)

# Parquet ziptree constructed with mode='w'.
new_ziptree = resolvers.ZiptreeResolver(
    dir=newtreepath,
    format='parquet',
    mode='w',
)

copy_between_resolvers("mdp.39015051692625", my_resolver, new_ziptree)

In [41]:
# Display the format attribute of the ziptree resolver created above.
new_ziptree.format

'parquet'

In [42]:
# Read the volume back from the new ziptree, asking for parquet explicitly.
parquet_vol = Volume(
    id="mdp.39015051692625",
    id_resolver=new_ziptree,
    format="parquet",
)

parquet_vol.tokenlist()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
page,section,token,pos,Unnamed: 4_level_1
2,body,"""",``,2
2,body,',POS,2
2,body,(,-LRB-,7
2,body,),-RRB-,1
2,body,)..,CC,1
...,...,...,...,...
1394,body,४,NN,2
1394,body,६,NNP,1
1394,body,७,NN,1
1394,body,८,NNP,2


In [43]:
# Use the 'pairtree_cached_http' resolver nickname with a parquet pairtree under tmpdir.
cached_pairtree_dir = os.path.join(tmpdir, "new_pairtree")
Volume(
    id="mdp.39015051692625",
    id_resolver="pairtree_cached_http",
    dir=cached_pairtree_dir,
    format="parquet",
).tokenlist()

--- Logging error ---
Traceback (most recent call last):
  File "/home/bschmidt/miniconda3/envs/htrc/lib/python3.7/logging/__init__.py", line 1025, in emit
    msg = self.format(record)
  File "/home/bschmidt/miniconda3/envs/htrc/lib/python3.7/logging/__init__.py", line 869, in format
    return fmt.format(record)
  File "/home/bschmidt/miniconda3/envs/htrc/lib/python3.7/logging/__init__.py", line 608, in format
    record.message = record.getMessage()
  File "/home/bschmidt/miniconda3/envs/htrc/lib/python3.7/logging/__init__.py", line 369, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/home/bschmidt/miniconda3/envs/htrc/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/bschmidt/miniconda3/envs/htrc/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/bschmidt/miniconda3/envs/htrc/lib/python3.7/site-packages/ipykernel_launcher

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
page,section,token,pos,Unnamed: 4_level_1
2,body,"""",``,2
2,body,',POS,2
2,body,(,-LRB-,7
2,body,),-RRB-,1
2,body,)..,CC,1
...,...,...,...,...
1394,body,४,NN,2
1394,body,६,NNP,1
1394,body,७,NN,1
1394,body,८,NNP,2


In [44]:
# NOTE(review): `z` is not referenced again in the visible notebook and is never
# cleaned up explicitly; confirm whether this cell is still needed.
z = tempfile.TemporaryDirectory()

In [45]:
# First entry yielded when iterating over the tests/data directory.
# Path is already imported at the top of the notebook.
test_data_dir = Path(project_root, "tests", "data")
next(test_data_dir.iterdir())

PosixPath('/home/bschmidt/Dropbox/lib/python/htrc-feature-reader/tests/data/green-gables-15pages.json')

In [46]:
# Pass an explicit string path to a .json file, with compression=None.
green_gables_path = str(Path(project_root, 'tests/data/green-gables-15pages.json'))
Volume(green_gables_path, compression=None)

In [47]:
resolver2 = htrc_features.resolvers.IdResolver(
    dir=".",
    format="parquet",
    compression="snappy",
)

# The compression name must not appear in the generated parquet file name.
testname = resolver2.fname(
    "mdp.12345",
    format="parquet",
    compression="snappy",
    suffix="tokens",
)
assert testname == "mdp.12345.tokens.parquet"


# Custom configurations

You can build up resolvers of any length using this

One convenient way to do so may be to manage configurations using yaml.

For example, I have the following in ~/.ef-files.yaml.

The YAML list is, simply, a set of arguments to resolvers.

1. First I look in a gzip parquet ziptree at /drobo/hathi_ziptree
2. Then I look in the bz2, json feature counts at /drobo/feature-counts. These will be slower, but more complete.
3. Finally, if it's in neither of those places, I go to Hathi's website.

When it's found, I cache to the parquet ziptree; but **not** to the local file. 

In [48]:
import yaml

# Resolver chain as a YAML list; each item is the keyword arguments for one resolver,
# listed in lookup order.
s = """
-
  id_resolver: local
  dir: /tmp/loc
  format: parquet
  compression: gzip
  token_kwargs:
    drop_section: true
    sections: body
    pos: false
-
  id_resolver: pairtree
  dir: /drobo/feature-counts
  format: json
  compression: bz2
  cache: false
-
  format: json
  compression: null
  id_resolver: "http"
"""

config = yaml.safe_load(s)

# Recreate the first resolver's directory empty so earlier runs leave nothing behind.
p = Path(config[0]['dir'])
if p.exists():
    shutil.rmtree(p)
p.mkdir()

def resolver_factory(id_resolver, **kwargs):
    """Instantiate the resolver registered under the nickname `id_resolver`.

    The nickname is looked up in `htrc_features.resolvers.resolver_nicknames`
    and the resulting callable is invoked with the remaining keyword arguments.
    """
    resolver_cls = htrc_features.resolvers.resolver_nicknames[id_resolver]
    return resolver_cls(**kwargs)

def combine_resolvers(l):
    """Chain a list of resolver configurations into one fallback resolver.

    Parameters
    ----------
    l : list of dict
        At least two configuration dicts, in lookup order. Each holds an
        'id_resolver' nickname plus keyword arguments for that resolver. An
        optional 'cache' entry (default True) is forwarded to
        `caching.make_fallback_resolver` for that tier.

    Returns
    -------
    An instance of the class returned by `caching.make_fallback_resolver`,
    built from the first entry with everything after it as its fallback.

    Raises
    ------
    AssertionError
        If fewer than two configurations are supplied.

    The input dicts are not modified, so the same configuration list can be
    reused afterwards.
    """
    assert len(l) >= 2
    first = l[0]
    rest = l[1:]

    # Build the fallback side first: recurse while more than one entry remains,
    # otherwise instantiate the last entry directly.
    if len(rest) > 1:
        second_choice = combine_resolvers(rest)
    else:
        second_choice = resolver_factory(**rest[0])

    # Work on a copy so removing 'id_resolver' does not alter the caller's dict.
    first_kwargs = dict(first)
    nickname = first_kwargs.pop('id_resolver')

    with_fallback = caching.make_fallback_resolver(
        nickname, second_choice, cache = first_kwargs.get("cache", True)
    )
    return with_fallback(**first_kwargs)



# Build the chained resolver from the YAML config and read one volume through it.
bens_resolver = combine_resolvers(config)
bens_volume = Volume(id="mdp.39015012434786", id_resolver=bens_resolver)
bens_volume.tokenlist(pos=False, section="default")

Unnamed: 0_level_0,Unnamed: 1_level_0,count
page,token,Unnamed: 2_level_1
3,■^^w^Hm,1
5,",",1
5,16-December,1
5,1972,1
5,2,1
...,...,...
393,4786,1
393,9015,1
393,MICHIGAN,1
393,OF,1


In [49]:
def separate_resolvers(l):
    """Build one standalone resolver per configuration dict, in order.

    Each dict is printed, then passed in full (including its 'id_resolver' key)
    as keyword arguments to the class registered under that nickname in
    `htrc_features.resolvers.resolver_nicknames`. Returns the list of resolvers.
    """
    built = []
    for spec in l:
        print (spec)
        resolver_cls = htrc_features.resolvers.resolver_nicknames[spec['id_resolver']]
        built.append(resolver_cls(**spec))
    return built
# Parse the YAML again to get a fresh config list, then build each resolver separately.
config = yaml.safe_load(s)

components = separate_resolvers(config)


{'id_resolver': 'local', 'dir': '/tmp/loc', 'format': 'parquet', 'compression': 'gzip', 'token_kwargs': {'drop_section': True, 'sections': 'body', 'pos': False}}
{'id_resolver': 'pairtree', 'dir': '/drobo/feature-counts', 'format': 'json', 'compression': 'bz2', 'cache': False}
{'format': 'json', 'compression': None, 'id_resolver': 'http'}


In [50]:
# The first tier of bens_resolver was configured with token_kwargs pos: false, and
# the recorded output of this default tokenlist() call is a MissingFieldError for
# the missing 'pos' column.
# NOTE(review): if the error is the point of this cell, consider catching and
# displaying it so a full run of the notebook does not stop here.
Volume(id="mdp.39015012434786", id_resolver = bens_resolver).tokenlist()

MissingFieldError: Your internal tokenlist representation does not have enough information for the current args. Missing column: pos

In [51]:
import pandas as pd

# Copy the volume from the pairtree resolver (components[1]) into the local
# parquet resolver (components[0]).
copy_between_resolvers('mdp.39015012434786', components[1], components[0])

# Read the resulting token file directly with pandas.
tokens_parquet = p / "mdp.39015012434786.tokens.parquet"
pd.read_parquet(tokens_parquet)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
page,token,Unnamed: 2_level_1
3,■^^w^Hm,1
5,",",1
5,16-December,1
5,1972,1
5,2,1
...,...,...
393,4786,1
393,9015,1
393,MICHIGAN,1
393,OF,1


In [52]:
# Read through the pairtree resolver and request chunk-level counts without POS or section.
pairtree_volume = Volume('mdp.39015012434786', id_resolver=components[1])
pairtree_volume.tokenlist(chunk=True, pos=False, drop_section=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
chunk,token,Unnamed: 2_level_1
1,!,3
1,"""",66
1,'S,1
1,'ll,6
1,'m,1
...,...,...
16,yugas,1
16,—,1
16,—devotion,1
16,—supracausal,1
