In [2]:
%load_ext autoreload
%autoreload 2

In [81]:
import htrc_features
import htrc_features.resolvers
from htrc_features import Volume, resolvers
import tempfile
import os
import json
import logging

# Why use IDs?

In the new version, requesting objects by IDs is the preferred method.

There are a few reasons for this.

1. Compatability among machines and groups. I've often found it hard to run other people's code because it requires downloading hundreds or thousands of books, when I *already* have a full copy of the Hathi Features on one machine. Different settings will call for different optimizations; workshop users may want to temporarily download files, while HPC environments may want minimally zipped version.

2. Alternate storage formats. The new version includes a parquet-based way of referring to files; reading word counts from parquets is much faster than reading from bzipped json files. Referring to IDs makes it easy to silently optimize access to parquet.

## What is an ID?



In [82]:
logging.getLogger().setLevel(logging.INFO)


# Some explanations and tests for the new loading methods.

This is not a comprehensive set of tests, but should provide the basics.

## Loading from a path.

An unnamed initial arg to 'Volume' looks at the format to see if it's an ID or a path. This looks like an ID, so reads from disk.

In [83]:
project_root = os.path.dirname(htrc_features.__file__)

Volume(project_root+ "/../data/PZ-volumes/hvd.hwrqs8.json.bz2")

## Loading over the web

This one loads from the web. There are probably more gentle defaults than re-pulling from online every time, and these should be implemented.

In [84]:
Volume("hvd.hwrqs8")


TypeError: __init__() got multiple values for keyword argument 'compression'

## Accessing files by volume ids.

That's basically the entire old method. But you may want to access local objects by their HTIDs. The simplest way to 
do that is to use the 'local' resolver, which looks in a named directory for an appropriate file. Since the default arguments are 'json' storage with 'bz2' compression, this works with three arguments.

In [10]:
ls "../data/PZ-volumes"

hvd.32044010273894.json.bz2  nyp.33433075749246.json.bz2
hvd.hwquxe.json.bz2          uc2.ark+=13960=t06w96h1q.json.bz2
hvd.hwrevu.json.bz2          uc2.ark+=13960=t0tq5v13m.json.bz2
hvd.hwrqs8.json.bz2          uc2.ark+=13960=t2s46hf9h.json.bz2
mdp.39015028036104.json.bz2  uc2.ark+=13960=t6057nf2g.json.bz2
mdp.39015078572313.json.bz2  uc2.ark+=13960=t8df6p182.json.bz2
njp.32101068970662.json.bz2  uiuo.ark+=13960=t72v2t63s.json.bz2
nyp.33433074811310.json.bz2


In [11]:
vol = Volume(id = "hvd.hwrqs8", dir = "../data/PZ-volumes", id_resolver = "local")
vol.tokenlist().head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
page,section,token,pos,Unnamed: 4_level_1
2,body,1C,CC,1
2,body,i,NN,1
7,body,.,$.,1
7,body,CHILDREN,NE,1
7,body,MR,NE,1


## Passing arguments to resolver directly.

Ordinary users will generally only interact with these classes through arguments to the Volume method. But we can also call 'LocalResolver' directly.

In this example, we can use instead 'localResolver'. We say we're using json, bz2, and a folder named `../data/PZ-volumes'

In [12]:
fileholder = resolvers.LocalResolver(dir = "../data/PZ-volumes", format = "json", compression = "bz2")

locally_resolved_file = Volume(id = "hvd.hwrqs8", id_resolver = fileholder, format = "json")

locally_resolved_file


In [13]:
"""
NOT IMPLEMENTED: lambda resolution

simple_handler = lambda x: open("../data/PZ-volumes/" + x + ".json.bz2", mode = "r")

locally_resolved_file = Volume(id = "hvd.hwrqs8", id_resolver = simple_handler, format = "json")

locally_resolved_file

"""

'\nNOT IMPLEMENTED: lambda resolution\n\nsimple_handler = lambda x: open("../data/PZ-volumes/" + x + ".json.bz2", mode = "r")\n\nlocally_resolved_file = Volume(id = "hvd.hwrqs8", id_resolver = simple_handler, format = "json")\n\nlocally_resolved_file\n\n'

## Passing arguments to resolver and handler directly.

While HTTP fetching is currently silent, we should probably warn when that happens without an explicit request. Here's how you'd do that. Here I invoke the JsonFileHandler directly, rather than wrapping it in volume; it's unlikely an end user will ever need to do that.

In [16]:
webresolver = htrc_features.resolvers.HttpResolver(url = 'http://data.htrc.illinois.edu/htrc-ef-access/get?action=download-ids&id={id}&output=json')

remote_handler = htrc_features.JsonFileHandler(id = "hvd.hwrqs8", id_resolver = webresolver)

remote_handler._make_tokencount_df().head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
page,section,token,pos,Unnamed: 4_level_1
2,body,1C,CC,1
2,body,i,NN,1
7,body,.,$.,1
7,body,CHILDREN,NE,1
7,body,MR,NE,1


# Fancy zip storage

When working with millions of files, some systems start to run out of inodes. Here, we build a storage using the 'ziptreeresolver' method, which assigns each file to one of 4096 zip files based on its name. Here, I'll create one in a tmpdir first. This, as a file writing operation, is a little more complicated than the above.

In [17]:
zipdir = tempfile.gettempdir()

zipholder = resolvers.ZiptreeResolver(zipdir, format = "json", compression = "bz2")


Now we'll go through the PZ-volumes folder and, for every volume, 

1. Grab the ID.
2. Read the bzipped binary data into memory
3. Reinsert that binary data into the ziptree holder.

Note that we tell the zipholder to use 'json' storage' and 'bz2' compression. Note that this insertion roundtrip actually decompresses and recompresses the data because of the way that the `IdHandler.open` method works: there are faster ways to insert the binary data directly.

In [18]:
#!rm /tmp/*.zip

sample_dir = "../data/PZ-volumes/"

ids = set()

current_resolver = htrc_features.resolvers.LocalResolver(sample_dir, format = "json", compression = "bz2")

for file in os.listdir(sample_dir):
    if file.endswith(".bz2"):
        id = htrc_features.utils.extract_htid(file)
        with current_resolver.open(id, format = "json", compression = "bz2") as original:
            d = original.read()
            try:
                with zipholder.open(id, format = "json", compression = "bz2", mode = "wb") as fout:
                    fout.write(d)
            except KeyError:
                print("Already inserted {id}".format(id=id))        

Already inserted njp.32101068970662
Already inserted uc2.ark:/13960/t8df6p182
Already inserted uiuo.ark:/13960/t72v2t63s
Already inserted hvd.32044010273894
Already inserted uc2.ark:/13960/t2s46hf9h
Already inserted hvd.hwquxe
Already inserted uc2.ark:/13960/t6057nf2g
Already inserted mdp.39015078572313
Already inserted uc2.ark:/13960/t06w96h1q
Already inserted nyp.33433074811310
Already inserted hvd.hwrevu
Already inserted mdp.39015028036104
Already inserted hvd.hwrqs8
Already inserted uc2.ark:/13960/t0tq5v13m
Already inserted nyp.33433075749246


This new tmpdir is filled with zipfiles. There are 4096 names, built from the first three characters of sha-1 hashes of the filenames.

In [19]:
[z for z in os.listdir(zipdir) if z.endswith(".zip")]

['e99.zip',
 'd33.zip',
 '7d2.zip',
 '173.zip',
 'e6f.zip',
 'e14.zip',
 '553.zip',
 'c5f.zip',
 'e6b.zip',
 '96c.zip',
 'a97.zip',
 '613.zip',
 '915.zip',
 '940.zip',
 'b75.zip']

## `get` calls return buffers

If we use the ZiptreeResolver's get method directly, we see it returns a BZ2File.

In [20]:
Volume("hvd.32044010273894", format = "json", compression = "bz2", id_resolver = "ziptree", dir = zipdir)

In [21]:
fin = resolvers.PairtreeResolver(dir = "/home/bschmidt/hathi-ef/", format = "json", compression = "bz2").open(id = "hvd.hwrevp")

print(json.loads(fin.read().decode("utf-8"))['metadata'])

{'schemaVersion': '1.3', 'dateCreated': '2016-06-20T03:50:48.3790146Z', 'volumeIdentifier': 'hvd.hwrevp', 'accessProfile': 'google', 'rightsAttributes': 'pd', 'hathitrustRecordNumber': '11592503', 'enumerationChronology': ' ', 'sourceInstitution': 'HVD', 'sourceInstitutionRecordNumber': '005701721', 'oclc': ['62415828'], 'isbn': [], 'issn': [], 'lccn': [], 'title': 'Annals of the poor. By the Rev. Legh Richmond ...', 'imprint': 'Crocker & Brewster, 1829.', 'lastUpdateDate': '2012-06-04 10:28:09', 'governmentDocument': False, 'pubDate': '1829', 'pubPlace': 'mau', 'language': 'eng', 'bibliographicFormat': 'BK', 'genre': ['not fiction'], 'issuance': 'monographic', 'typeOfResource': 'text', 'classification': {}, 'names': ['Richmond, Legh 1772-1827 ', 'Ayre, John 1801-1869 '], 'htBibUrl': 'http://catalog.hathitrust.org/api/volumes/full/htid/hvd.hwrevp.json', 'handleUrl': 'http://hdl.handle.net/2027/hvd.hwrevp'}


## Resolving Parquet

Parquet caches can be resolved in exactly the same way. The defaults may break more easily, though.

In [22]:
v = Volume(id = "mdp.39015028036104", dir = "../data/parquet", format = "parquet", id_resolver = "local")

# Skeleton code for writin between formats.

In [47]:
resolver1 = htrc_features.resolvers.PairtreeResolver(dir = "/home/bschmidt/hathi-ef/", format = "json", compression = "bz2")

In [77]:
resolver2 = htrc_features.resolvers.LocalResolver(dir = "/home/bschmidt/hathi-ef/", format = "json", compression = "bz2")

In [78]:
input = Volume("hvd.ah5922", id_resolver=resolver1)
input

{'compression': 'bz2'} False


In [80]:
output = Volume("hvd.ah5922", id_resolver=resolver2, mode = 'wb')
output.parser.write(input)

In [65]:
output.parser.compression

'bz2'

In [52]:
output = Volume("hvd.ah5922", id_resolver=resolver2, mode = 'rb')


bz2


ValueError: Expected object or value

In [45]:
output