In [1]:
# Create an ES index for entity names. This index is used in the search page of the Compact Memory entity search
# https://labs.judaicalink.org/cm_e_search/
# as a backend for the autocomplete functionality

import os, json, pprint, time, pysolr, requests

In [16]:
root_path = "/data/scripts/cm/scripts/"
SOLR_URL = "http://localhost:8983/solr"
CORE_NAME = "cm_entity_names"
CONFIG_SET = "_default"

Import the files for the uri's

In [3]:
uri_to_name = json.load(open(root_path + 'cooccurrence/output/uri_to_name.json', 'r', encoding="utf-8"))
name_to_uri = json.load(open(root_path + 'cooccurrence/output/name_to_uri.json', 'r', encoding="utf-8"))

In [4]:
data = []
i = 0
for name, uri in name_to_uri.items():
    
    header = {'_id': str(i), '_index': 'cm_entity_names'}
    body = {'name': name, 'uri': uri}
    
    
    data.append(json.dumps({"index": header}, ensure_ascii=False))
    data.append(json.dumps(body, ensure_ascii=False))
    
    i += 1

In [5]:
print(data[0])

{"index": {"_id": "0", "_index": "cm_entity_names"}}


In [6]:
mappings = {
    'properties': {
        'name': {'type': 'text'},
        'uri': {'type': 'text'}
    }
}

Create the SOLR Core

In [7]:
# --- Configuration ---

def solr_core_exists(solr_url: str, core_name: str) -> bool:
    """
    Check if a given Solr core exists using the Core Admin STATUS action.
    """
    status_url = f"{solr_url}/admin/cores"
    params = {
        "action": "STATUS",
        "core": core_name,
        "wt": "json",
    }

    response = requests.get(status_url, params=params)
    response.raise_for_status()
    data = response.json()

    # If the core exists, it should appear in the "status" dict with some content
    status = data.get("status", {})
    return core_name in status and bool(status[core_name])


def solr_delete_core(solr_url: str, core_name: str) -> None:
    """
    Delete (unload) a Solr core and remove its index, data and instance directory.
    """
    unload_url = f"{solr_url}/admin/cores"
    params = {
        "action": "UNLOAD",
        "core": core_name,
        "deleteIndex": "true",
        "deleteDataDir": "true",
        "deleteInstanceDir": "true",
        "wt": "json",
    }

    print(f"Unloading and deleting core '{core_name}' …")
    response = requests.get(unload_url, params=params)
    response.raise_for_status()
    print(f"Core '{core_name}' successfully unloaded and deleted.")
    print(response.json())


def solr_create_core(solr_url: str, core_name: str, config_set: str) -> None:
    """
    Create a Solr core using a given configSet.
    """
    create_url = f"{solr_url}/admin/cores"
    params = {
        "action": "CREATE",
        "name": core_name,
        "configSet": config_set,
        "wt": "json",
    }

    print(f"Creating core '{core_name}' with configSet '{config_set}' …")
    response = requests.get(create_url, params=params)
    response.raise_for_status()
    print(f"Core '{core_name}' successfully created.")
    print(response.json())


def recreate_solr_core(solr_url: str, core_name: str, config_set: str) -> None:
    """
    Check if the core exists, delete it if necessary, and create it again.
    """
    try:
        if solr_core_exists(solr_url, core_name):
            print(f"Core '{core_name}' already exists.")
            solr_delete_core(solr_url, core_name)
            # Small delay to give Solr time to clean up
            time.sleep(2)
        else:
            print(f"Core '{core_name}' does not exist yet.")

        solr_create_core(solr_url, core_name, config_set)
    except requests.RequestException as e:
        print("Error while communicating with Solr:")
        print(e)


# --- Run ---
recreate_solr_core(SOLR_URL, CORE_NAME, CONFIG_SET)


Core 'cm_entity_names' already exists.
Unloading and deleting core 'cm_entity_names' …
Core 'cm_entity_names' successfully unloaded and deleted.
{'responseHeader': {'status': 0, 'QTime': 127}}
Creating core 'cm_entity_names' with configSet '_default' …
Core 'cm_entity_names' successfully created.
{'responseHeader': {'status': 0, 'QTime': 177}, 'core': 'cm_entity_names'}


In [8]:
solr = pysolr.Solr(f"{SOLR_URL}/{CORE_NAME}", always_commit=False, timeout=10)

Bulk upload the data into the SOLR Core

In [9]:
docs = []
current_id = None

for line in data:
    obj = json.loads(line)  # String -> Dict

    if "index" in obj:
        current_id = obj["index"]["_id"]
    else:
        doc = {
            "id": current_id,
            "name": obj["name"],
            "uri": obj["uri"],
        }
        docs.append(doc)

len(docs), docs[:5]


(6054,
 [{'id': '0',
   'name': 'Franz Mehring',
   'uri': 'http://data.judaicalink.org/data/dbpedia/Franz_Mehring'},
  {'id': '1',
   'name': 'Hans Blüher',
   'uri': 'http://data.judaicalink.org/data/dbpedia/Hans_Blüher'},
  {'id': '2',
   'name': 'Wilhelm Börner (Philosoph)',
   'uri': 'http://data.judaicalink.org/data/dbpedia/Wilhelm_Börner_(Philosoph)'},
  {'id': '3',
   'name': 'Heinrich Mann',
   'uri': 'http://data.judaicalink.org/data/dbpedia/Heinrich_Mann'},
  {'id': '4',
   'name': 'Aschersleben',
   'uri': 'http://data.judaicalink.org/data/dbpedia/Aschersleben'}])

In [10]:
solr = pysolr.Solr(f"{SOLR_URL}/{CORE_NAME}", timeout=200)

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i+n]

# send in chunks
for chunk in chunks(docs, 1000):
    solr.add(chunk, commit=False)

# commit at the end
solr.commit()


'<?xml version="1.0" encoding="UTF-8"?>\n<response>\n\n<lst name="responseHeader">\n  <int name="status">0</int>\n  <int name="QTime">199</int>\n</lst>\n</response>\n'

Test query

In [12]:
solr = pysolr.Solr(f"{SOLR_URL}/{CORE_NAME}", always_commit=False, timeout=10)
start = time.time()
res = solr.search('*:*', index=CORE_NAME, rows=10000)
end = time.time()
print("Search took", round(end-start, 2))

Search took 0.23


In [13]:
print(res.hits)

6054


In [17]:
for doc in res:
    print('Name:', doc['name'])
    print('URI:', doc['uri'])
    print()

Name: ['Franz Mehring']
URI: ['http://data.judaicalink.org/data/dbpedia/Franz_Mehring']

Name: ['Hans Blüher']
URI: ['http://data.judaicalink.org/data/dbpedia/Hans_Blüher']

Name: ['Wilhelm Börner (Philosoph)']
URI: ['http://data.judaicalink.org/data/dbpedia/Wilhelm_Börner_(Philosoph)']

Name: ['Heinrich Mann']
URI: ['http://data.judaicalink.org/data/dbpedia/Heinrich_Mann']

Name: ['Aschersleben']
URI: ['http://data.judaicalink.org/data/dbpedia/Aschersleben']

Name: ['Dresden']
URI: ['http://data.judaicalink.org/data/dbpedia/Dresden']

Name: ['Głogów']
URI: ['http://data.judaicalink.org/data/dbpedia/Głogów']

Name: ['Jelenia Góra']
URI: ['http://data.judaicalink.org/data/dbpedia/Jelenia_Góra']

Name: ['Wladimir Iljitsch Lenin']
URI: ['http://data.judaicalink.org/data/dbpedia/Wladimir_Iljitsch_Lenin']

Name: ['Pjotr Alexejewitsch Kropotkin']
URI: ['http://data.judaicalink.org/data/dbpedia/Pjotr_Alexejewitsch_Kropotkin']

Name: ['Rheine']
URI: ['http://data.judaicalink.org/data/dbpedia/R

Name: Korbach
URI: http://data.judaicalink.org/data/dbpedia/Korbach

Name: Mutěnín
URI: http://data.judaicalink.org/data/dbpedia/Mutěnín

Name: Michael Weingarten
URI: http://data.judaicalink.org/data/dbpedia/Michael_Weingarten

Name: Richard Weiner
URI: http://data.judaicalink.org/data/dbpedia/Richard_Weiner

Name: Edmund Eysler
URI: http://data.judaicalink.org/data/dbpedia/Edmund_Eysler

Name: Georges Clemenceau
URI: http://data.judaicalink.org/data/dbpedia/Georges_Clemenceau

Name: Zwolle
URI: http://data.judaicalink.org/data/dbpedia/Zwolle

Name: Josef Jakubowicz
URI: http://data.judaicalink.org/data/dbpedia/Josef_Jakubowicz

Name: Olyka
URI: http://data.judaicalink.org/data/dbpedia/Olyka

Name: Otto Erich Deutsch
URI: http://data.judaicalink.org/data/dbpedia/Otto_Erich_Deutsch

Name: Rudolf Hans Bartsch
URI: http://data.judaicalink.org/data/dbpedia/Rudolf_Hans_Bartsch

Name: Uman
URI: http://data.judaicalink.org/data/dbpedia/Uman

Name: Frank Stella
URI: http://data.judaicalink.or

URI: http://data.judaicalink.org/data/dbpedia/Poti

Name: Sławno
URI: http://data.judaicalink.org/data/dbpedia/Sławno

Name: Jonatan (Tannaite)
URI: http://data.judaicalink.org/data/dbpedia/Jonatan_(Tannaite)

Name: Tangermünde
URI: http://data.judaicalink.org/data/dbpedia/Tangermünde

Name: Stralsund
URI: http://data.judaicalink.org/data/dbpedia/Stralsund

Name: Bückeburg
URI: http://data.judaicalink.org/data/dbpedia/Bückeburg

Name: Salomo Löwisohn
URI: http://data.judaicalink.org/data/dbpedia/Salomo_Löwisohn

Name: Lübz
URI: http://data.judaicalink.org/data/dbpedia/Lübz

Name: Johann David Michaelis
URI: http://data.judaicalink.org/data/dbpedia/Johann_David_Michaelis

Name: Pyrzyce
URI: http://data.judaicalink.org/data/dbpedia/Pyrzyce

Name: Otto Stobbe
URI: http://data.judaicalink.org/data/dbpedia/Otto_Stobbe

Name: Szczecinek
URI: http://data.judaicalink.org/data/dbpedia/Szczecinek

Name: Rabbi Tarfon
URI: http://data.judaicalink.org/data/dbpedia/Rabbi_Tarfon

Name: Salzwedel
URI:

Name: Mödling
URI: http://data.judaicalink.org/data/dbpedia/Mödling

Name: Alex Katz
URI: http://data.judaicalink.org/data/dbpedia/Alex_Katz

Name: Thomas Murner
URI: http://data.judaicalink.org/data/dbpedia/Thomas_Murner

Name: Samuel Bak
URI: http://data.judaicalink.org/data/dbpedia/Samuel_Bak

Name: Halver
URI: http://data.judaicalink.org/data/dbpedia/Halver

Name: Anatoli Wassiljewitsch Lunatscharski
URI: http://data.judaicalink.org/data/dbpedia/Anatoli_Wassiljewitsch_Lunatscharski

Name: Benno Elkan
URI: http://data.judaicalink.org/data/dbpedia/Benno_Elkan

Name: Abraham Ris
URI: http://data.judaicalink.org/data/dbpedia/Abraham_Ris

Name: Hugo Barbeck
URI: http://data.judaicalink.org/data/dbpedia/Hugo_Barbeck

Name: Friedelsheim
URI: http://data.judaicalink.org/data/dbpedia/Friedelsheim

Name: Moses Goldschmidt
URI: http://data.judaicalink.org/data/dbpedia/Moses_Goldschmidt

Name: Seirijai
URI: http://data.judaicalink.org/data/dbpedia/Seirijai

Name: Emanuel Feuermann
URI: http://

URI: http://data.judaicalink.org/data/dbpedia/Barczewo

Name: William Wolff (Rabbiner)
URI: http://data.judaicalink.org/data/dbpedia/William_Wolff_(Rabbiner)

Name: Děčín
URI: http://data.judaicalink.org/data/dbpedia/Děčín

Name: Mladkov
URI: http://data.judaicalink.org/data/dbpedia/Mladkov

Name: Adolph Wiesner
URI: http://data.judaicalink.org/data/dbpedia/Adolph_Wiesner

Name: David Cohen de Lara
URI: http://data.judaicalink.org/data/dbpedia/David_Cohen_de_Lara

Name: Salomo Adret
URI: http://data.judaicalink.org/data/dbpedia/Salomo_Adret

Name: Bernhard Kellermann
URI: http://data.judaicalink.org/data/dbpedia/Bernhard_Kellermann

Name: Tuchola
URI: http://data.judaicalink.org/data/dbpedia/Tuchola

Name: Jizchak Blaser
URI: http://data.judaicalink.org/data/dbpedia/Jizchak_Blaser

Name: Nathan Marcus Adler
URI: http://data.judaicalink.org/data/dbpedia/Nathan_Marcus_Adler

Name: Rebecca Gratz
URI: http://data.judaicalink.org/data/dbpedia/Rebecca_Gratz

Name: Bauska
URI: http://data.jud

URI: http://data.judaicalink.org/data/dbpedia/Triesen

Name: Arthur Keith
URI: http://data.judaicalink.org/data/dbpedia/Arthur_Keith

Name: Johann Caspar Ulrich
URI: http://data.judaicalink.org/data/dbpedia/Johann_Caspar_Ulrich

Name: Else Rosenfeld
URI: http://data.judaicalink.org/data/dbpedia/Else_Rosenfeld

Name: Hans Prutz
URI: http://data.judaicalink.org/data/dbpedia/Hans_Prutz

Name: Walter Baumgartner (Theologe)
URI: http://data.judaicalink.org/data/dbpedia/Walter_Baumgartner_(Theologe)

Name: Julius Perlis (Schachspieler)
URI: http://data.judaicalink.org/data/dbpedia/Julius_Perlis_(Schachspieler)

Name: Železná Ruda
URI: http://data.judaicalink.org/data/dbpedia/Železná_Ruda

Name: Christian Knorr von Rosenroth
URI: http://data.judaicalink.org/data/dbpedia/Christian_Knorr_von_Rosenroth

Name: Franz Joseph Molitor
URI: http://data.judaicalink.org/data/dbpedia/Franz_Joseph_Molitor

Name: Elisabeth Schwarz (Schauspielerin)
URI: http://data.judaicalink.org/data/dbpedia/Elisabeth_Sch

URI: http://data.judaicalink.org/data/dbpedia/Helsingborg

Name: Oskar Holtzmann
URI: http://data.judaicalink.org/data/dbpedia/Oskar_Holtzmann

Name: Michael Baumgarten
URI: http://data.judaicalink.org/data/dbpedia/Michael_Baumgarten

Name: Jakob Philipp Fallmerayer
URI: http://data.judaicalink.org/data/dbpedia/Jakob_Philipp_Fallmerayer

Name: Hans Habe
URI: http://data.judaicalink.org/data/dbpedia/Hans_Habe

Name: Thomas Ebendorfer
URI: http://data.judaicalink.org/data/dbpedia/Thomas_Ebendorfer

Name: Chaim Halberstam
URI: http://data.judaicalink.org/data/dbpedia/Chaim_Halberstam

Name: Isaac Mayer Wise
URI: http://data.judaicalink.org/data/dbpedia/Isaac_Mayer_Wise

Name: Annette Kuhn
URI: http://data.judaicalink.org/data/dbpedia/Annette_Kuhn

Name: Max Schneidewin
URI: http://data.judaicalink.org/data/dbpedia/Max_Schneidewin

Name: Rudolf Christiani
URI: http://data.judaicalink.org/data/dbpedia/Rudolf_Christiani

Name: Konrad Schwenck
URI: http://data.judaicalink.org/data/dbpedia/Kon

Name: Gregor Gog
URI: http://data.judaicalink.org/data/dbpedia/Gregor_Gog

Name: Eleonore Sterling
URI: http://data.judaicalink.org/data/dbpedia/Eleonore_Sterling

Name: Albert Memmi
URI: http://data.judaicalink.org/data/dbpedia/Albert_Memmi

Name: Jorge Luis Borges
URI: http://data.judaicalink.org/data/dbpedia/Jorge_Luis_Borges

Name: Elia Kazan
URI: http://data.judaicalink.org/data/dbpedia/Elia_Kazan

Name: Ralph Marcus
URI: http://data.judaicalink.org/data/dbpedia/Ralph_Marcus

Name: Schraga Har-Gil
URI: http://data.judaicalink.org/data/dbpedia/Schraga_Har-Gil

Name: Sophie Scholl
URI: http://data.judaicalink.org/data/dbpedia/Sophie_Scholl

Name: Michail Moissejewitsch Botwinnik
URI: http://data.judaicalink.org/data/dbpedia/Michail_Moissejewitsch_Botwinnik

Name: Harry Zohn
URI: http://data.judaicalink.org/data/dbpedia/Harry_Zohn

Name: Francis Spellman
URI: http://data.judaicalink.org/data/dbpedia/Francis_Spellman

Name: Jacob Tsur
URI: http://data.judaicalink.org/data/dbpedia/Jaco

URI: http://data.judaicalink.org/data/dbpedia/J._Duncan_M._Derrett

Name: Großbothen
URI: http://data.judaicalink.org/data/dbpedia/Großbothen

Name: James Oppenheim
URI: http://data.judaicalink.org/data/dbpedia/James_Oppenheim

Name: Diest
URI: http://data.judaicalink.org/data/dbpedia/Diest

Name: Samuel Mendelsohn
URI: http://data.judaicalink.org/data/dbpedia/Samuel_Mendelsohn

Name: Sama Maani
URI: http://data.judaicalink.org/data/dbpedia/Sama_Maani

Name: Nestorius
URI: http://data.judaicalink.org/data/dbpedia/Nestorius

Name: Darius (Medien)
URI: http://data.judaicalink.org/data/dbpedia/Darius_(Medien)

Name: Fulcher von Chartres
URI: http://data.judaicalink.org/data/dbpedia/Fulcher_von_Chartres

Name: Johann Plenge
URI: http://data.judaicalink.org/data/dbpedia/Johann_Plenge

Name: Arnold Fruchtenbaum
URI: http://data.judaicalink.org/data/dbpedia/Arnold_Fruchtenbaum

Name: Pauline Klaiber-Gottschau
URI: http://data.judaicalink.org/data/dbpedia/Pauline_Klaiber-Gottschau

Name: Jakob

Name: Zollikon
URI: http://data.judaicalink.org/data/dbpedia/Zollikon

Name: Honfleur
URI: http://data.judaicalink.org/data/dbpedia/Honfleur

Name: Hans Rodenberg
URI: http://data.judaicalink.org/data/dbpedia/Hans_Rodenberg

Name: Heinz Kindermann (Theaterforscher)
URI: http://data.judaicalink.org/data/dbpedia/Heinz_Kindermann_(Theaterforscher)

Name: Attnang-Puchheim
URI: http://data.judaicalink.org/data/dbpedia/Attnang-Puchheim

Name: Odense
URI: http://data.judaicalink.org/data/dbpedia/Odense

Name: Tuszyn
URI: http://data.judaicalink.org/data/dbpedia/Tuszyn

Name: Karol Sidon
URI: http://data.judaicalink.org/data/dbpedia/Karol_Sidon

Name: Bruno Bettelheim
URI: http://data.judaicalink.org/data/dbpedia/Bruno_Bettelheim

Name: Johann Friedrich Herbart
URI: http://data.judaicalink.org/data/dbpedia/Johann_Friedrich_Herbart

Name: Uwe Berger
URI: http://data.judaicalink.org/data/dbpedia/Uwe_Berger

Name: Hendaye
URI: http://data.judaicalink.org/data/dbpedia/Hendaye

Name: Kotlas
URI: ht