In [None]:
# Getting Started

# This Jupyter notebook is meant as a simple introduction to the PyGallica package. The code included here will guide 
# you through some simple queries using PyGallica, introducing you to the Search, IIIF,
# and Document APIs. The demo is self-contained, so no files will be saved to your local machine.

In [2]:
# This line installs all of the package's requirements on your machine using Python 3.

! pip3 install --user -r requirements.txt

[33mYou are using pip version 9.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [8]:
# Beautiful Soup should already have been installed by the cell above, but the import can
# occasionally still fail with an error saying that the package has not been installed.
# If you encounter that error, uncomment the line below and run it.

# %pip install BeautifulSoup4

[33mYou are using pip version 9.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
# Now that we've installed our requirements, we can start to use the wrapper.

# We'll start with the Search API. The Search API allows you to perform keyword searches
# in Gallica's holdings and retrieve the XML returned by those searches.

# Here we import the Search class from the file search_api.py

from search_api import Search

In [4]:
# This is the basic syntax for running a search with the API wrapper: each quoted string
# passed to Search.search() is one query. To run several queries, pass them as separate,
# comma-separated arguments, e.g. Search.search('this', 'is', 'a', 'test').

# Your results should appear a few seconds after running this cell. Judging by the recorded
# output, the call prints the request URL it sent to gallica.bnf.fr followed by the XML
# response.

Search.search('test')

https://gallica.bnf.fr/SRU?operation=searchRetrieve&version=1.2&query=(gallica all "test")&startRecord=1
<?xml version="1.0" encoding="utf-8"?>
<srw:searchRetrieveResponse xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:ns7="http://gallica.bnf.fr/namespaces/gallica/" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:onix="http://www.editeur.org/onix/2.1/reference/" xmlns:onix_dc="http://bibnum.bnf.fr/NS/onix_dc/" xmlns:srw="http://www.loc.gov/zing/srw/">
<srw:version>1.2</srw:version>
<srw:echoedSearchRetrieveRequest>
<srw:query>(gallica all "test")</srw:query>
<srw:version>1.2</srw:version>
</srw:echoedSearchRetrieveRequest>
<srw:numberOfRecords>58268</srw:numberOfRecords>
<srw:extraResponseData>&lt;numberOfRecordsDecollapser&gt;482438&lt;/numberOfRecordsDecollapser&gt;</srw:extraResponseData>
<srw:records>
<srw:record>
<srw:recordSchema>http://www.openarchives.org/OAI/2.0/OAIdc.xsd</srw:recordSchema>
<srw:recordPacking>xml</srw:recordPacking>
<srw:recordData>
<oai_dc

In [7]:
# Now we'll look at the IIIF API. The IIIF API allows you to retrieve images from Gallica's holdings,
# as well as the JSON metadata associated with those images. As a participant in the International
# Image Interoperability Framework (IIIF), Gallica offers access to all of the more than 100 million
# images in its digital library.

# The API takes an Ark ID, region, size, rotation, quality, and format as arguments if you are saving an image.
# In this notebook we'll simply retrieve metadata for an image, so we only need an Ark ID.

# Here we import the IIIF class from the file iiif_api.py

from iiif_api import IIIF

In [10]:
# This line fetches the IIIF metadata for a single image. The quoted argument identifies the
# image: the document's Ark ID ('12148/btv1b90017179') followed by a view segment ('f15' --
# presumably the 15th page/view of the document; confirm against Gallica's IIIF documentation).
# Judging by the recorded output, the call requests
# https://gallica.bnf.fr/iiif/ark:/<id>/info.json and displays the result as a dictionary
# (profile, width, height, @context, @id).

IIIF.metadata('12148/btv1b90017179/f15')

https://gallica.bnf.fr/iiif/ark:/12148/btv1b90017179/f15/info.json


{'profile': 'http://library.stanford.edu/iiif/image-api/1.1/compliance.html#level2',
 'width': 2400,
 'height': 3498,
 '@context': 'http://library.stanford.edu/iiif/image-api/1.1/context.json',
 '@id': 'https://gallica.bnf.fr/iiif/ark:/12148/btv1b90017179/f15'}

In [11]:
# Finally, we'll look at the Document API.

# The Document API allows you to retrieve metadata about a particular document in Gallica's holdings.
# There are a number of different methods for retrieving various types of metadata; please see the full
# code on GitHub for further usage examples and possibilities.

# Here we import the Document class from the file document_api.py

from document_api import Document

In [12]:
# This line fetches the OCR for one page of a document. The first argument is the document's
# Ark ID and the second is the page number, passed as a string. Judging by the recorded output,
# the request is sent with E=ALTO and Deb=10, and the response is an ALTO XML document.
#
# NOTE(review): the recorded output of this cell ends with
# "TypeError: write() argument must be str, not bytes", presumably raised inside document_api
# after the XML has been printed -- confirm and fix it there.

Document.ocr('bpt6k5619759j', '10')

https://gallica.bnf.fr/RequestDigitalElement?O=bpt6k5619759j&E=ALTO&Deb=10
<?xml version="1.0" encoding="utf-8"?>
<alto ID="alto.5619759" xmlns="http://bibnum.bnf.fr/ns/alto_prod" xmlns:xlink="http://www.w3.org/TR/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://bibnum.bnf.fr/ns/alto_prod http://bibnum.bnf.fr/ns/alto_prod.xsd">
<Description>
<MeasurementUnit>pixel</MeasurementUnit>
<sourceImageInformation>
<fileName>00000010.tif</fileName>
</sourceImageInformation>
<OCRProcessing ID="OCR_1">
<ocrProcessingStep>
<processingDateTime>2009-09-15</processingDateTime>
<processingStepDescription>[002_TOTAL_CHARS] TOTAL CAR. {1850}</processingStepDescription>
<processingStepDescription>[003_TOTAL_CHARS_USED] TOTAL CAR. COMPTABILISES {1744}</processingStepDescription>
<processingStepDescription>[004_TOTAL_CHARS_SUSPECTS_USED] TOTAL CAR. SUSPECTS COMPTABILISES {0}</processingStepDescription>
<processingStepDescription>[005_TOTAL_CHARS_UNUSED] TOTAL CAR. NON

TypeError: write() argument must be str, not bytes

In [None]:
# For further documentation, and to download the full PyGallica package, 
# please visit https://github.com/ian-nai/PyGallica.