In [2]:
%load_ext autoreload
%autoreload 2

In [81]:
import htrc_features
import htrc_features.resolvers
from htrc_features import Volume, resolvers
import tempfile
import os
import json
import logging

# Why use IDs?

In the new version, requesting objects by IDs is the preferred method.

There are a few reasons for this.

1. Compatability among machines and groups. I've often found it hard to run other people's code because it requires downloading hundreds or thousands of books, when I *already* have a full copy of the Hathi Features on one machine. Different settings will call for different optimizations; workshop users may want to temporarily download files, while HPC environments may want minimally zipped version.

2. Alternate storage formats. The new version includes a parquet-based way of referring to files; reading word counts from parquets is much faster than reading from bzipped json files. Referring to IDs makes it easy to silently optimize access to parquet.

## What is an ID?



In [82]:
logging.getLogger().setLevel(logging.INFO)


# Some explanations and tests for the new loading methods.

This is not a comprehensive set of tests, but should provide the basics.

## Loading from a path.

An unnamed initial arg to 'Volume' looks at the format to see if it's an ID or a path. This looks like an ID, so reads from disk.

In [228]:
project_root = Path(htrc_features.__file__).parent.parent
pz_root = Path(project_root, "data", "PZ-volumes")
file_path = Path(pz_root, "hvd.hwrqs8.json.bz2").joinpath()
file_path = str(file_path)

Volume(file_path)

## Loading over the web

This one loads from the web. There are probably more gentle defaults than re-pulling from online every time, and these should be implemented.

In [230]:
Volume("hvd.hwrqs8")

## Accessing files by volume ids.

That's basically the entire old method. But you may want to access local objects by their HTIDs. The simplest way to 
do that is to use the 'local' resolver, which looks in a named directory for an appropriate file. Since the default arguments are 'json' storage with 'bz2' compression, this works with three arguments.

In [231]:
vol = Volume(id = "hvd.hwrqs8", dir = os.path.join(pz_root), id_resolver = "local")
vol.tokenlist().head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
page,section,token,pos,Unnamed: 4_level_1
2,body,1C,CC,1
2,body,i,NN,1
7,body,.,$.,1
7,body,CHILDREN,NE,1
7,body,MR,NE,1


## Passing arguments to resolver directly.

Ordinary users will generally only interact with these classes through arguments to the Volume method. But we can also call 'LocalResolver' directly.

In this example, we can use instead 'localResolver'. We say we're using json, bz2, and a folder named `../data/PZ-volumes'

In [238]:
fileholder = resolvers.LocalResolver(dir = pz_root, format = "json", compression = "bz2")

locally_resolved_file = Volume(id = "hvd.hwrqs8", id_resolver = fileholder, format = "json")

locally_resolved_file


In [233]:
"""
NOT IMPLEMENTED: lambda resolution

simple_handler = lambda x: open("../data/PZ-volumes/" + x + ".json.bz2", mode = "r")

locally_resolved_file = Volume(id = "hvd.hwrqs8", id_resolver = simple_handler, format = "json")

locally_resolved_file

"""

'\nNOT IMPLEMENTED: lambda resolution\n\nsimple_handler = lambda x: open("../data/PZ-volumes/" + x + ".json.bz2", mode = "r")\n\nlocally_resolved_file = Volume(id = "hvd.hwrqs8", id_resolver = simple_handler, format = "json")\n\nlocally_resolved_file\n\n'

## Passing arguments to resolver and handler directly.

While HTTP fetching is currently silent, we should probably warn when that happens without an explicit request. Here's how you'd do that. Here I invoke the JsonFileHandler directly, rather than wrapping it in volume; it's unlikely an end user will ever need to do that.

In [234]:
webresolver = htrc_features.resolvers.HttpResolver(url = 'http://data.htrc.illinois.edu/htrc-ef-access/get?action=download-ids&id={id}&output=json')

remote_handler = htrc_features.JsonFileHandler(id = "hvd.hwrqs8", id_resolver = webresolver)

remote_handler._make_tokencount_df().head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
page,section,token,pos,Unnamed: 4_level_1
2,body,1C,CC,1
2,body,i,NN,1
7,body,.,$.,1
7,body,CHILDREN,NE,1
7,body,MR,NE,1


# Fancy zip storage

When working with millions of files, some systems start to run out of inodes. Here, we build a storage using the 'ziptreeresolver' method, which assigns each file to one of 4096 zip files based on its name. Here, I'll create one in a tmpdir first. This, as a file writing operation, is a little more complicated than the above.

In [237]:
sample_dir = tempfile.TemporaryDirectory()
zipdir = sample_dir.name

zipholder = resolvers.ZiptreeResolver(zipdir, format = "json", compression = "bz2")


Now we'll go through the PZ-volumes folder and, for every volume, 

1. Grab the ID.
2. Read the bzipped binary data into memory
3. Reinsert that binary data into the ziptree holder.

Note that we tell the zipholder to use 'json' storage' and 'bz2' compression. Note that this insertion roundtrip actually decompresses and recompresses the data because of the way that the `IdHandler.open` method works: there are faster ways to insert the binary data directly.

In [240]:
#!rm /tmp/*.zip


ids = set()

current_resolver = htrc_features.resolvers.LocalResolver(pz_root, format = "json", compression = "bz2")

for file in os.listdir(pz_root):
    if file.endswith(".bz2"):
        id = htrc_features.utils.extract_htid(file)
        with current_resolver.open(id, format = "json", compression = "bz2") as original:
            d = original.read()
            try:
                with zipholder.open(id, format = "json", compression = "bz2", mode = "wb") as fout:
                    fout.write(d)
            except KeyError:
                print("Already inserted {id}".format(id=id))        

This new tmpdir is filled with zipfiles. There are 4096 names, built from the first three characters of sha-1 hashes of the filenames.

In [241]:
[z for z in os.listdir(zipdir) if z.endswith(".zip")]

['e99.zip',
 'd33.zip',
 '7d2.zip',
 '173.zip',
 'e6f.zip',
 'e14.zip',
 '553.zip',
 'c5f.zip',
 'e6b.zip',
 '96c.zip',
 'a97.zip',
 '613.zip',
 '915.zip',
 '940.zip',
 'b75.zip']

## Zipdir reading

Now we can extract

In [242]:
Volume("hvd.32044010273894", format = "json", compression = "bz2", id_resolver = "ziptree", dir = zipdir)

In [247]:
fin = resolvers.PairtreeResolver(dir = "/home/bschmidt/hathi-ef/", format = "json", compression = "bz2").open(id = "hvd.hwrevp")

In [248]:
fin

<bz2.BZ2File at 0x7f91c3a50390>

In [249]:
print(json.loads(fin.read().decode("utf-8"))['metadata'])

{'schemaVersion': '1.3', 'dateCreated': '2016-06-20T03:50:48.3790146Z', 'volumeIdentifier': 'hvd.hwrevp', 'accessProfile': 'google', 'rightsAttributes': 'pd', 'hathitrustRecordNumber': '11592503', 'enumerationChronology': ' ', 'sourceInstitution': 'HVD', 'sourceInstitutionRecordNumber': '005701721', 'oclc': ['62415828'], 'isbn': [], 'issn': [], 'lccn': [], 'title': 'Annals of the poor. By the Rev. Legh Richmond ...', 'imprint': 'Crocker & Brewster, 1829.', 'lastUpdateDate': '2012-06-04 10:28:09', 'governmentDocument': False, 'pubDate': '1829', 'pubPlace': 'mau', 'language': 'eng', 'bibliographicFormat': 'BK', 'genre': ['not fiction'], 'issuance': 'monographic', 'typeOfResource': 'text', 'classification': {}, 'names': ['Richmond, Legh 1772-1827 ', 'Ayre, John 1801-1869 '], 'htBibUrl': 'http://catalog.hathitrust.org/api/volumes/full/htid/hvd.hwrevp.json', 'handleUrl': 'http://hdl.handle.net/2027/hvd.hwrevp'}


## Resolving Parquet

Parquet caches can be resolved in exactly the same way. The defaults may break more easily, though.

In [253]:
v = Volume(id = "mdp.39015028036104", dir = Path(project_root, "data/parquet"), format = "parquet", id_resolver = "local")
v

In [254]:
class ChunkedParquetResolver(htrc_features.parsers.ParquetFileHandler):
    def __init__(self, target, chunk_strategy, **kwargs):
        self.chunk_size = target
        self.chunk_strategy = chunk_strategy
        
        super().__init__(kwargs)
        
    def write(self, volume, meta=True, tokens=True, section_features=False, chars=False, **kwargs):
        
        if meta:
            metastring = BytesIO(json.dumps(volume.parser.meta).encode("utf-8"))
            with self.resolver.open(self.id, **kwargs) as fout:
                fout.write(metastring.read())
        
        if tokens:
            feats = volume.chunked_tokenlist(target = self.chunk_size, strategy = 'ends')
            if not feats.empty:
                with self.resolver.open(id = self.id, suffix = 'tokens') as fout:
                    feats.to_parquet(fout, compression=self.compression)                

# Transferring between formats

To transfer files between formats, you simply need two resolvers. The following code shows how to do do this.

This code is smart enough to know that if you are copying from one bz2 file to another, it need not decompress and recompress.

In [301]:
def copy_between_resolvers(id, resolver1, resolver2):
    input = Volume(id, id_resolver=resolver1)
    output = Volume(id, id_resolver=resolver2, mode = 'wb')
    output.write(input)


## 3 resolvers

We'll work with three resolvers at once. It's easy to transition between a variety of different implementations.

This simply read and copy method will be the basis of methods that allow fallback searches and automatic caching.

In [335]:
with tempfile.TemporaryDirectory() as first_new_dir:
    with tempfile.TemporaryDirectory() as second_new_dir:
        resolver1 = htrc_features.resolvers.LocalResolver(dir = Path(project_root, "tests", "data"), format = "json", compression = "bz2")
        resolver2 = htrc_features.resolvers.PairtreeResolver(dir = first_new_dir,  format = "json", compression = "gz")
        resolver3 = htrc_features.resolvers.LocalResolver(dir = second_new_dir, format = "parquet", compression = "snappy")
        
        copy_between_resolvers("aeu.ark:/13960/t1rf63t52", resolver1, resolver2)
        copy_between_resolvers("aeu.ark:/13960/t1rf63t52", resolver2, resolver3)
        
        all_files = []
        for loc, dir, files in os.walk(first_new_dir):
            for file in files:
                all_files.append(os.path.join(loc, file))
                
        assert(len(all_files) == 1)
        assert(all_files[0].endswith("aeu/pairtree_root/ar/k+/=1/39/60/=t/1r/f6/3t/52/ark+=13960=t1rf63t52/aeu.ark+=13960=t1rf63t52.json.gz"))
        
        # Our test assertion ensures that the data has made it all the way through.
        assert(Volume("aeu.ark:/13960/t1rf63t52", id_resolver = resolver3).tokenlist()['count'].sum() == 97691)

In [350]:
output = Volume("aeu.ark:/13960/t1rf63t52", id_resolver=resolver2, mode = 'rb')

FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmps6i98bpf/aeu/pairtree_root/ar/k+/=1/39/60/=t/1r/f6/3t/52/ark+=13960=t1rf63t52/aeu.ark+=13960=t1rf63t52.json.gz'

In [204]:
z = tempfile.TemporaryDirectory()

In [259]:
from pathlib import Path
Path(project_root, "tests", "data").iterdir().__next__()

PosixPath('/home/bschmidt/Dropbox/lib/python/htrc-feature-reader/tests/data/green-gables-15pages.json')

In [363]:
Volume(Path(project_root, 'tests/data/green-gables-15pages.json').__str__(), compression = None)

In [365]:
resolver2 = htrc_features.resolvers.IdResolver(dir = ".", format = "parquet", compression = "snappy")

# Don't use compression in the name
testname = resolver2.fname("mdp.12345", format = "parquet", compression = "snappy", suffix = "tokens")
assert(testname == "mdp.12345.tokens.parquet")
