In [66]:
import logging
import requests
import itertools
from typing import List
from collections import Counter, defaultdict


# In-memory memo of lemma counts, keyed first by the sorted tuple of document
# ids and then by lemma. It lives for the lifetime of the kernel only.
BAD_CACHE = defaultdict(Counter)


def get_general_frequencies(
    texts: List[str],
    words: List[str],
    max_query: int = 100,
    # Blacklab specific system
    server: str = "http://localhost:8888/blacklab-server/latin-corpus-index/hits",
) -> Counter:
    """Count how often each lemma in ``words`` occurs in the given documents.

    Lemmas already present in ``BAD_CACHE`` for this set of documents are
    served from the cache; the remaining ones are sent to the BlackLab server
    in batches of at most ``max_query`` lemmas, grouped by lemma.

    Args:
        texts: Document ids used to build the ``docId:...`` filter (colons are
            backslash-escaped). An empty list sends an empty filter.
        words: Lemmas to look up. Duplicates are ignored and the list is
            never modified.
        max_query: Maximum number of lemmas OR-ed together in one request.
        server: URL of the BlackLab ``hits`` endpoint.

    Returns:
        A ``Counter`` mapping each group label returned by the server
        (``identityDisplay``) to its size, plus the cached lemmas.

    Raises:
        ValueError: If ``max_query`` is smaller than 1.
        requests.HTTPError: If the server answers with an error status.

    Note:
        Lemmas for which the server returns no group are not cached and are
        therefore queried again on every call. Lemmas are inserted into the
        pattern verbatim, so the caller must make sure they are valid inside
        a quoted pattern value.
    """
    if max_query < 1:
        # A negative step would make range() empty and silently skip every query.
        raise ValueError(f"max_query must be a positive integer, got {max_query!r}")

    filters = " OR ".join("docId:" + txt.replace(":", "\\:") for txt in texts)
    cache_id = tuple(sorted(texts))
    # .get() instead of indexing: a miss must not create an empty defaultdict entry.
    cached = BAD_CACHE.get(cache_id, {})

    toks = Counter()
    pending = []
    # Work on a sorted, de-duplicated copy so the caller's list is left untouched
    # and a lemma passed twice is neither queried twice nor re-queried when cached.
    for word in sorted(set(words)):
        if word in cached:
            toks[word] = cached[word]
            print("Hitting cache", word)
        else:
            pending.append(word)

    for n in range(0, len(pending), max_query):
        word_subset = pending[n:n + max_query]
        print("query", word_subset)
        response = requests.post(server, data={
            "patt": "[" + " | ".join(f'lemma="{word}"' for word in word_subset) + "]",
            "filter": filters,
            "outputformat": "json",
            "wordsaroundhit": 0,
            "group": "hit:lemma:s",
            "first": 0,
            "maxretrieve": -1,
            "waitfortotal": "yes",
            "maxcount": -1
        })
        # Fail with the HTTP error rather than a KeyError on "hitGroups" below.
        response.raise_for_status()
        payload = response.json()
        toks.update({
            tok["identityDisplay"]: tok["size"]
            for tok in payload["hitGroups"]
        })

    # Plain assignment (not Counter.update, which would add to existing counts).
    for word in toks:
        BAD_CACHE[cache_id][word] = toks[word]
    return toks

import time

# Time a single lookup of two lemmas with no document filter (empty `texts`).
start = time.time()
get_general_frequencies(texts=[], words=["mentula", "lasciuus"])
end = (time.time() - start) / 60  # elapsed wall-clock time, seconds -> minutes
print(end)

query ['lasciuus', 'mentula']
0.00019690990447998046


In [67]:
# Same timing harness with max_query=2, i.e. at most two lemmas per request.
start = time.time()
get_general_frequencies(
    texts=[],
    words=["soror", "audeo", "duco", "priapium", "sum1", "priapum", "lasciuus"],
    max_query=2,
)
end = (time.time() - start) / 60  # elapsed wall-clock time, seconds -> minutes
print(end)

Hitting cache lasciuus
query ['audeo', 'duco']
query ['priapium', 'priapum']
query ['soror', 'sum1']
0.031162325541178384


In [58]:
# Same lemma list again, this time with the default batch size.
start = time.time()
get_general_frequencies(
    texts=[],
    words=["soror", "audeo", "duco", "priapium", "sum1", "priapum", "lasciuus"],
)
end = (time.time() - start) / 60  # elapsed wall-clock time, seconds -> minutes
print(end)

Hitting cache soror
Hitting cache lasciuus
Hitting cache sum1
Hitting cache audeo
Hitting cache priapium
Hitting cache duco
0.008697144190470378


In [42]:
# --- Configuration for the cells below ---------------------------------------

# BlackLab "hits" endpoint used for the queries.
SERVER = "http://localhost:8888/blacklab-server/latin-corpus-index/hits"

# Numeric thresholds and the normalisation mode.
# NOTE(review): their exact meaning is defined by the cells that consume them,
# which are not part of this section — confirm before tuning.
GLOBAL_WINDOW = 10
MIN_FREQ = 10
COOC_MIN_FREQ = 20
NORMALIZATION: str = "ratio"

# Lemmas to leave out (presumably very frequent words — verify against usage).
IGNORE_WORDS = (
    "sum1",
    "que",
    "habeo",
    "dico2",
    "facio",
    "possum1",
    "do",
    "uideo",
    "uolo3",
    "iam",
    "sic",
    "tamen",
    "tam",
    "puto",
    "res",
    "suus",
)

# Document ids the queries are restricted to.
TEXTS = [
    "urn:cts:latinLit:phi1294.phi002.perseus-lat2",
    "urn:cts:latinLit:phi1103.phi001.lascivaroma-lat1",
]

# Lemmas whose hits are requested.
LEMMAS = [
    "soror",
    "audeo",
    "duco",
    "priapium",
    "sum1",
]

# Build the document filter: one "docId:<id>" clause per text, colons escaped,
# joined with OR.
filters = " OR ".join("docId:" + txt.replace(":", "\\:") for txt in TEXTS)

# Ask the server for all hits of the configured lemmas, grouped by lemma.
req = requests.post(
    SERVER,
    data=dict(
        # Alternative pattern that matches every token instead of the lemma list:
        # patt="[]",
        patt="[" + " | ".join(f'lemma="{word}"' for word in LEMMAS) + "]",
        filter=filters,
        outputformat="json",
        wordsaroundhit=0,
        group="hit:lemma:s",
        first=0,
        # number=-1,
        maxretrieve=-1,
        waitfortotal="yes",
        maxcount=-1,
    ),
)

# Decode the JSON body and show it as the cell output.
data = req.json()
data

{'summary': {'searchParam': {'filter': 'docId:urn\\:cts\\:latinLit\\:phi1294.phi002.perseus-lat2 OR docId:urn\\:cts\\:latinLit\\:phi1103.phi001.lascivaroma-lat1',
   'first': '0',
   'group': 'hit:lemma:s',
   'indexname': 'latin-corpus-index',
   'maxcount': '-1',
   'maxretrieve': '-1',
   'patt': '[lemma="soror" | lemma="audeo" | lemma="duco" | lemma="priapium" | lemma="sum1"]',
   'waitfortotal': 'yes',
   'wordsaroundhit': '0'},
  'searchTime': 201,
  'numberOfGroups': 5,
  'largestGroupSize': 2019,
  'windowFirstResult': 0,
  'requestedWindowSize': 50,
  'actualWindowSize': 5,
  'windowHasPrevious': False,
  'windowHasNext': False,
  'stillCounting': False,
  'numberOfHits': 2105,
  'numberOfHitsRetrieved': 2105,
  'stoppedCountingHits': False,
  'stoppedRetrievingHits': False,
  'numberOfDocs': 2,
  'numberOfDocsRetrieved': 2,
  'subcorpusSize': {'documents': 2, 'tokens': 76198}},
 'hitGroups': [{'identity': 'cws:lemma:s:sum1',
   'identityDisplay': 'sum1',
   'size': 2019,
   '

{'summary': {'searchParam': {'filter': 'docId:urn\\:cts\\:latinLit\\:phi1294.phi002.perseus-lat2 OR docId:urn\\:cts\\:latinLit\\:phi1103.phi001.lascivaroma-lat1',
   'first': '0',
   'group': 'hit:lemma:s',
   'indexname': 'latin-corpus-index',
   'maxcount': '-1',
   'maxretrieve': '-1',
   'patt': '[]',
   'waitfortotal': 'yes',
   'wordsaroundhit': '0'},
  'searchTime': 310,
  'numberOfGroups': 6990,
  'largestGroupSize': 8275,
  'windowFirstResult': 0,
  'requestedWindowSize': 50,
  'actualWindowSize': 50,
  'windowHasPrevious': False,
  'windowHasNext': True,
  'stillCounting': False,
  'numberOfHits': 76198,
  'numberOfHitsRetrieved': 76198,
  'stoppedCountingHits': False,
  'stoppedRetrievingHits': False,
  'numberOfDocs': 2,
  'numberOfDocsRetrieved': 2,
  'subcorpusSize': {'documents': 2, 'tokens': 76198}},
 'hitGroups': [{'identity': 'cws:lemma:s:$CM',
   'identityDisplay': ',',
   'size': 8275,
   'numberOfDocs': 2},
  {'identity': 'cws:lemma:s:.',
   'identityDisplay': '.',