In [1]:
import json
import pandas as pd
import scattertext as st
import urllib.parse
import urllib.request

from IPython.display import display, HTML
from typing import Any, Dict, List, Set, TextIO

In [2]:
# Name of the indexed corpus to query on the BlackLab Server below.
CORPUS: str = "brown_corpus"
# Base URL of the BlackLab Server REST API (no trailing slash; the search
# helper joins path segments with "/").
SERVER_URL: str = "https://blacklab.impfic-knaw.src.surf-hosted.nl/blacklab-server"

In [3]:
def search(query: str) -> Dict[str, Any]:
    """Run a Corpus Query Language search and return the decoded JSON result.

    The query is URL-encoded and sent as the ``patt`` parameter to the
    ``hits`` endpoint of the BlackLab Server identified by the module-level
    ``SERVER_URL`` and ``CORPUS`` constants. The requested URL is printed
    before the request is made.

    See https://inl.github.io/BlackLab/corpus-query-language.html

    Args:
        query: Pattern in Corpus Query Language, e.g. ``'[] "she" []'``.

    Returns:
        The JSON response body parsed into a dictionary.

    Raises:
        urllib.error.URLError: If the server cannot be reached or answers
            with an HTTP error status.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # quote_plus escapes brackets, quotes and spaces so the pattern can be
    # embedded safely in the query string.
    escaped: str = urllib.parse.quote_plus(query)
    # number=999999999 asks for all hits in a single response.
    # NOTE(review): the server may still cap the window size - confirm
    # against the "summary" part of the response.
    url: str = SERVER_URL + "/" + CORPUS + "/hits?outputformat=json&number=999999999&patt=" + escaped
    print("Getting URL: ", url)
    # The context manager closes the HTTP connection even if reading or
    # decoding the body raises.
    with urllib.request.urlopen(url) as raw_response:
        body: str = raw_response.read().decode('utf-8')
    return json.loads(body)

In [11]:
# "[]" matches any single token, so this pattern finds "she" together with
# one token of context on each side.
query_text_she: str = '[] "she" []'
result_she: Dict[str, Any] = search(query=query_text_she)

Getting URL:  https://blacklab.impfic-knaw.src.surf-hosted.nl/blacklab-server/brown_corpus/hits?outputformat=json&number=999999999&patt=%5B%5D+%22she%22+%5B%5D


**The result is a dictionary with a search summary, information about the indexed documents and the search hits.**

In [12]:
result_she.keys()

dict_keys(['summary', 'hits', 'docInfos'])

**The search summary lists search parameters, search time, number of hits and many other interesting items.**

In [13]:
result_she["summary"]

{'searchParam': {'indexname': 'brown_corpus',
  'number': '999999999',
  'patt': '[] "she" []'},
 'searchTime': 50,
 'countTime': 152,
 'windowFirstResult': 0,
 'requestedWindowSize': 3000,
 'actualWindowSize': 2849,
 'windowHasPrevious': False,
 'windowHasNext': False,
 'stillCounting': False,
 'numberOfHits': 2849,
 'numberOfHitsRetrieved': 2849,
 'stoppedCountingHits': False,
 'stoppedRetrievingHits': False,
 'numberOfDocs': 227,
 'numberOfDocsRetrieved': 227,
 'docFields': {'titleField': 'fromInputFile'},
 'metadataFieldDisplayNames': {'fromInputFile': 'From input file'}}

**`docInfos` is a dictionary keyed by document ID (`docPid`) that holds metadata for the documents in which hits were found. As an example, let's look at the entry for document "5".**

In [14]:
# Document IDs are strings (they correspond to the 'docPid' of a hit), hence "5" rather than 5.
result_she["docInfos"]["5"]

{'fromInputFile': ['/input/brownCorpus.lemmatized.xml'],
 'lengthInTokens': 1992,
 'mayView': True}

**`hits` is a list with one dictionary per match. Each dictionary gives the ID of the document the match occurs in (`docPid`), the start and end token positions, the match itself, and the "left" and "right" context around it.**

In [15]:
# Index 1 selects the second hit; the list is zero-based.
result_she["hits"][1]

{'docPid': '5',
 'start': 1982,
 'end': 1985,
 'left': {'punct': [' ', ' ', ' ', ' ', ' '],
  'lemma': ['adoption', 'of', 'the', "women's", 'suffrage'],
  'pos': ['', '', '', '', ''],
  'word': ['adoption', 'of', 'the', "women's", 'suffrage']},
 'match': {'punct': [' ', ' . ', ' '],
  'lemma': ['amendment', 'she', 'serve'],
  'pos': ['', '', ''],
  'word': ['amendment', 'She', 'served']},
 'right': {'punct': [' ', ' ', ' ', ' ', ' '],
  'lemma': ['one', 'four-year', 'term', 'on', 'the'],
  'pos': ['', '', '', '', ''],
  'word': ['one', 'four-year', 'term', 'on', 'the']}}

**The "match" part contains the search match from the query as well as the corresponding lemmas.**

In [16]:
result_she["hits"][1]["match"]

{'punct': [' ', ' . ', ' '],
 'lemma': ['amendment', 'she', 'serve'],
 'pos': ['', '', ''],
 'word': ['amendment', 'She', 'served']}