# Skript to build the SOLR Core for cm_entities

In [1]:
import os, json, pprint, pysolr, time, pickle, requests
from typing import List, Dict, Any

In [2]:
root_path = "/data/scripts/cm/scripts/"
jl_output_path = "/data/scripts/jl/data/"
SOLR_URL = "http://localhost:8983/solr"
CORE_NAME = "cm_entities_beta"
CONFIG_SET = "_default"
CHUNK_SIZE = 200

In [3]:
# 1 entity label-uri mapping
name_to_uri = json.load(open(root_path + 'cooccurrence/output/name_to_uri.json', 'r', encoding="utf-8"))
uri_to_name = json.load(open(root_path + 'cooccurrence/output/uri_to_name.json', 'r', encoding="utf-8"))
# 2 entity pages inverted index
ep_inv_index = pickle.load(open(jl_output_path + 'entity_pages/ep_inv_index.pickle', 'rb'))
# 3 occurrence by journal
occurrence_by_journal = pickle.load(open(root_path + 'cooccurrence/output/occurrence_by_journal.pickle', 'rb'))
# 4 related entities
entity_correlation = pickle.load(open(root_path + 'cooccurrence/output/entity_correlation.pickle', 'rb'))
# 5 data types 
classified_entities = json.load(open(root_path + 'jl_linking/classified_entities.json', 'r', encoding="utf-8"))

In [4]:
# utilities
id_to_journal = json.load(open(root_path + 'cooccurrence/input/id_to_journal.json', 'r', encoding="utf-8"))

In [5]:
for name, uri in name_to_uri.items():
    print(name, uri)
    print(ep_inv_index[uri])
    print(occurrence_by_journal[uri])
    print(entity_correlation[uri])
    break

Franz Mehring http://data.judaicalink.org/data/dbpedia/Franz_Mehring
http://data.judaicalink.org/data/ep/1413226
[('3315709', 4), ('2432763', 4), ('4886575', 3), ('2316602', 3), ('2979076', 3), ('2827798', 2), ('2277379', 2), ('3062972', 2), ('2710055', 1), ('7506414', 1), ('2912821', 1), ('5367336', 1), ('3224737', 1), ('2613366', 1), ('2583875', 1), ('2908689', 1), ('2641551', 1), ('9072829', 1), ('2692297', 1), ('9038025', 1), ('2651273', 1), ('2259275', 1)]
{'http://data.judaicalink.org/data/dbpedia/Hans_Blüher': 0.013513513513513514, 'http://data.judaicalink.org/data/dbpedia/Paul_Lindau': 0.021164021164021163, 'http://data.judaicalink.org/data/dbpedia/Alexander_Moszkowski': 0.022222222222222223, 'http://data.judaicalink.org/data/dbpedia/Maximilian_Harden': 0.006666666666666667, 'http://data.judaicalink.org/data/dbpedia/Max_Reinhardt': 0.0030257186081694403, 'http://data.judaicalink.org/data/dbpedia/Friedrich_Engels': 0.01652892561983471, 'http://data.judaicalink.org/data/dbpedia/A

In [6]:
data = []
i = 0
for name, uri in name_to_uri.items():
    
    try:
        if i < 10:
            print(name)

        occ_by_journal = [(id_to_journal[el[0]], el[1]) for el in occurrence_by_journal[uri]]
        related_entities = sorted([(res_uri, uri_to_name[res_uri], score) for res_uri, score in entity_correlation[uri].items()], key=lambda x:x[2], reverse=True)
        label_name = name.replace(' ', '_')
        e_type = classified_entities[label_name]
        if len(e_type) == 0:
            e_type = "OTH"
        
        
        header = {'_id': str(i), '_index': 'cm_entities'}
        body = {'name': name, 
                'uri': uri,
                'e_type': e_type,
                'journal_occs': occ_by_journal,
                'related_entities': related_entities,
                'ep': ep_inv_index[uri]
               }


        data.append(json.dumps({"index": header}, ensure_ascii=False))
        data.append(json.dumps(body, ensure_ascii=False))
        i += 1
        
    except KeyError:
        pass
        #print(name, uri)
    

Franz Mehring
Hans Blüher
Wilhelm Börner (Philosoph)
Heinrich Mann
Aschersleben
Dresden
Głogów
Jelenia Góra
Wladimir Iljitsch Lenin
Pjotr Alexejewitsch Kropotkin


In [7]:
print(len(data))
print(data[1])

10966
{"name": "Franz Mehring", "uri": "http://data.judaicalink.org/data/dbpedia/Franz_Mehring", "e_type": "PER", "journal_occs": [["Die Welt ", 4], ["Jüdisches Volksblatt", 4], ["Freies Blatt ", 3], ["Im deutschen Reich ", 3], ["Wiener Morgenzeitung", 3], ["Freie Tribüne ", 2], ["Central-Verein-Zeitung ", 2], ["Die Wahrheit ", 2], ["Der neue Anfang ", 1], ["", 1], ["Menorah ", 1], ["Das jüdische Literaturblatt", 1], ["Allgemeine Zeitung des Judenthums ", 1], ["Jüdisch-liberale Zeitung / Vereinigung für das Liberale Judentum", 1], ["Ost und West ", 1], ["Neue jüdische Monatshefte ", 1], ["Der Jüdische Arbeiter ", 1], ["Das jüdische Echo ", 1], ["Neue jüdische Presse ", 1], ["Antisemitisches Jahrbuch für ... / herausgegeben von Dr. G. Stille", 1], ["Jüdische Rundschau ", 1], ["Zeitschrift für die Geschichte der Juden in Deutschland", 1]], "related_entities": [["http://data.judaicalink.org/data/dbpedia/Alexander_Moszkowski", "Alexander Moszkowski", 0.022222222222222223], ["http://data.ju

In [8]:
mappings = {
    'properties': {
        'name': {'type': 'text'},
        'uri': {'type': 'text'},
        'ep': {'type': 'text'},
        'e_type': {'type': 'text'},
        'journal_occs': {'type': 'text'},
        'related_entities': {'type': 'text'}
    }
}

Recreate the SOLR Core

In [None]:
# --- Configuration ---

def solr_core_exists(solr_url: str, core_name: str) -> bool:
    """
    Check if a given Solr core exists using the Core Admin STATUS action.
    """
    status_url = f"{solr_url}/admin/cores"
    params = {
        "action": "STATUS",
        "core": core_name,
        "wt": "json",
    }

    response = requests.get(status_url, params=params)
    response.raise_for_status()
    data = response.json()

    # If the core exists, it should appear in the "status" dict with some content
    status = data.get("status", {})
    return core_name in status and bool(status[core_name])


def solr_delete_core(solr_url: str, core_name: str) -> None:
    """
    Delete (unload) a Solr core and remove its index, data and instance directory.
    """
    unload_url = f"{solr_url}/admin/cores"
    params = {
        "action": "UNLOAD",
        "core": core_name,
        "deleteIndex": "true",
        "deleteDataDir": "true",
        "deleteInstanceDir": "true",
        "wt": "json",
    }

    print(f"Unloading and deleting core '{core_name}' …")
    response = requests.get(unload_url, params=params)
    response.raise_for_status()
    print(f"Core '{core_name}' successfully unloaded and deleted.")
    print(response.json())


def solr_create_core(solr_url: str, core_name: str, config_set: str) -> None:
    """
    Create a Solr core using a given configSet.
    """
    create_url = f"{solr_url}/admin/cores"
    params = {
        "action": "CREATE",
        "name": core_name,
        "configSet": config_set,
        "wt": "json",
    }

    print(f"Creating core '{core_name}' with configSet '{config_set}' …")
    response = requests.get(create_url, params=params)
    response.raise_for_status()
    print(f"Core '{core_name}' successfully created.")
    print(response.json())


def recreate_solr_core(solr_url: str, core_name: str, config_set: str) -> None:
    """
    Check if the core exists, delete it if necessary, and create it again.
    """
    try:
        if solr_core_exists(solr_url, core_name):
            print(f"Core '{core_name}' already exists.")
            solr_delete_core(solr_url, core_name)
            # Small delay to give Solr time to clean up
            time.sleep(2)
        else:
            print(f"Core '{core_name}' does not exist yet.")

        solr_create_core(solr_url, core_name, config_set)
    except requests.RequestException as e:
        print("Error while communicating with Solr:")
        print(e)

In [15]:
# --- Run ---
recreate_solr_core(SOLR_URL, CORE_NAME, CONFIG_SET)

Core 'cm_entities_beta' already exists.
Unloading and deleting core 'cm_entities_beta' …
Core 'cm_entities_beta' successfully unloaded and deleted.
{'responseHeader': {'status': 0, 'QTime': 129}}
Creating core 'cm_entities_beta' with configSet '_default' …
Core 'cm_entities_beta' successfully created.
{'responseHeader': {'status': 0, 'QTime': 150}, 'core': 'cm_entities_beta'}


In [10]:
print(data[:5])

['{"index": {"_id": "0", "_index": "cm_entities"}}', '{"name": "Franz Mehring", "uri": "http://data.judaicalink.org/data/dbpedia/Franz_Mehring", "e_type": "PER", "journal_occs": [["Die Welt ", 4], ["Jüdisches Volksblatt", 4], ["Freies Blatt ", 3], ["Im deutschen Reich ", 3], ["Wiener Morgenzeitung", 3], ["Freie Tribüne ", 2], ["Central-Verein-Zeitung ", 2], ["Die Wahrheit ", 2], ["Der neue Anfang ", 1], ["", 1], ["Menorah ", 1], ["Das jüdische Literaturblatt", 1], ["Allgemeine Zeitung des Judenthums ", 1], ["Jüdisch-liberale Zeitung / Vereinigung für das Liberale Judentum", 1], ["Ost und West ", 1], ["Neue jüdische Monatshefte ", 1], ["Der Jüdische Arbeiter ", 1], ["Das jüdische Echo ", 1], ["Neue jüdische Presse ", 1], ["Antisemitisches Jahrbuch für ... / herausgegeben von Dr. G. Stille", 1], ["Jüdische Rundschau ", 1], ["Zeitschrift für die Geschichte der Juden in Deutschland", 1]], "related_entities": [["http://data.judaicalink.org/data/dbpedia/Alexander_Moszkowski", "Alexander Mosz

In [11]:
def build_nested_docs_from_bulk_lines(lines: List[str]) -> List[Dict[str, Any]]:
    """
    Transform Elasticsearch bulk-like NDJSON into Solr nested documents.

    Input (alternating):
      {"index": {"_id": "0", "_index": "cm_entities"}}
      {"name": "...", "uri": "...", "e_type": "...",
       "journal_occs": [[journal, count], ...],
       "related_entities": [[rel_uri, rel_name, score], ...],
       "ep": "..."}

    Output (example):

      {
        "id": "0",              # or entity["ep"] / entity["uri"] if you prefer
        "name": "...",
        "uri": "...",
        "e_type": "...",
        "ep": "...",
        "_childDocuments_": [
          {
            "id": "0_j_0",
            "type": "journal_occurrence",
            "journal_name": "...",
            "journal_count": 4
          },
          {
            "id": "0_r_0",
            "type": "related_entity",
            "related_uri": "...",
            "related_name": "...",
            "related_score": 0.0123
          },
          ...
        ]
      }
    """
    docs: List[Dict[str, Any]] = []
    current_id: str | None = None

    for line in lines:
        obj = json.loads(line)

        # Meta line from Elasticsearch bulk ({"index": {...}})
        if "index" in obj:
            current_id = str(obj["index"]["_id"])
            continue

        # Safety check: if somehow no index line was seen before
        if current_id is None:
            # Skip or raise – here we skip silently
            continue

        entity = obj

        # Choose your parent id – could also be entity["uri"] or entity["ep"]
        parent_id = current_id

        parent_doc: Dict[str, Any] = {
            "id": parent_id,
            "name": entity.get("name"),
            "uri": entity.get("uri"),
            "e_type": entity.get("e_type"),
            "ep": entity.get("ep"),
        }

        child_docs: List[Dict[str, Any]] = []

        # Nested docs for journal occurrences
        for j_idx, (journal_name, count) in enumerate(entity.get("journal_occs", [])):
            child_docs.append(
                {
                    # each child needs its own id because Solr's id is required
                    "id": f"{parent_id}_j_{j_idx}",
                    "type": "journal_occurrence",
                    "journal_name": journal_name,
                    "journal_count": int(count),
                }
            )

        # Nested docs for related entities
        for r_idx, (rel_uri, rel_name, score) in enumerate(entity.get("related_entities", [])):
            child_docs.append(
                {
                    "id": f"{parent_id}_r_{r_idx}",
                    "type": "related_entity",
                    "related_uri": rel_uri,
                    "related_name": rel_name,
                    "related_score": float(score),
                }
            )

        if child_docs:
            parent_doc["_childDocuments_"] = child_docs

        docs.append(parent_doc)

    return docs


def iter_chunks(items: List[Any], size: int):
    """Yield successive chunks of `size` from list `items`."""
    for i in range(0, len(items), size):
        yield items[i : i + size]


def upload_nested_docs_to_solr(
    docs: List[Dict[str, Any]],
    solr_url: str = SOLR_URL,
    core_name: str = CORE_NAME,
    chunk_size: int = CHUNK_SIZE,
):
    """
    Upload nested documents to Solr in chunks using pysolr.
    """
    solr = pysolr.Solr(f"{solr_url}/{core_name}", timeout=200)

    total = len(docs)
    print(f"Indexing {total} parent docs (with children) into '{core_name}' …")

    start = time.time()
    for idx, chunk in enumerate(iter_chunks(docs, chunk_size), start=1):
        print(f"  Sending chunk {idx} ({len(chunk)} parent docs)…")
        solr.add(chunk, commit=False)

    solr.commit()
    end = time.time()
    print(f"Finished indexing in {end - start:.2f} seconds.")


In [16]:
nested_docs = build_nested_docs_from_bulk_lines(data)

Commit the docs in chunks

In [17]:
upload_nested_docs_to_solr(nested_docs)

Indexing 5483 parent docs (with children) into 'cm_entities_beta' …
  Sending chunk 1 (200 parent docs)…
  Sending chunk 2 (200 parent docs)…
  Sending chunk 3 (200 parent docs)…
  Sending chunk 4 (200 parent docs)…
  Sending chunk 5 (200 parent docs)…
  Sending chunk 6 (200 parent docs)…
  Sending chunk 7 (200 parent docs)…
  Sending chunk 8 (200 parent docs)…
  Sending chunk 9 (200 parent docs)…
  Sending chunk 10 (200 parent docs)…
  Sending chunk 11 (200 parent docs)…
  Sending chunk 12 (200 parent docs)…
  Sending chunk 13 (200 parent docs)…
  Sending chunk 14 (200 parent docs)…
  Sending chunk 15 (200 parent docs)…
  Sending chunk 16 (200 parent docs)…
  Sending chunk 17 (200 parent docs)…
  Sending chunk 18 (200 parent docs)…
  Sending chunk 19 (200 parent docs)…
  Sending chunk 20 (200 parent docs)…
  Sending chunk 21 (200 parent docs)…
  Sending chunk 22 (200 parent docs)…
  Sending chunk 23 (200 parent docs)…
  Sending chunk 24 (200 parent docs)…
  Sending chunk 25 (200 paren

Test query

In [18]:
solr = pysolr.Solr(f"{SOLR_URL}/{CORE_NAME}", always_commit=False, timeout=10)
start = time.time()
res = solr.search('name:Franz Mehring', index=CORE_NAME, rows=10000)
end = time.time()
print("Search took", round(end-start, 2))

Search took 0.03


In [19]:
print(res.hits)

33


In [20]:
for doc in res:
    print(doc)
    #for el in doc['journal_occs']:
    #    print(el)

{'id': '0', 'name': ['Franz Mehring'], 'uri': ['http://data.judaicalink.org/data/dbpedia/Franz_Mehring'], 'e_type': ['PER'], 'ep': ['http://data.judaicalink.org/data/ep/1413226'], '_version_': 1850516184714182656}
{'id': '183', 'name': ['Franz Schubert'], 'uri': ['http://data.judaicalink.org/data/dbpedia/Franz_Schubert'], 'e_type': ['PER'], 'ep': ['http://data.judaicalink.org/data/ep/1548524'], '_version_': 1850516186656145408}
{'id': '230', 'name': ['Franz Rosenzweig'], 'uri': ['http://data.judaicalink.org/data/dbpedia/Franz_Rosenzweig'], 'e_type': ['PER'], 'ep': ['http://data.judaicalink.org/data/ep/1004530'], '_version_': 1850516187021049856}
{'id': '294', 'name': ['Franz Landsberger'], 'uri': ['http://data.judaicalink.org/data/dbpedia/Franz_Landsberger'], 'e_type': ['PER'], 'ep': ['http://data.judaicalink.org/data/ep/2491471'], '_version_': 1850516187230765056}
{'id': '313', 'name': ['Franz Kafka'], 'uri': ['http://data.judaicalink.org/data/dbpedia/Franz_Kafka'], 'e_type': ['PER'],

## Write bulk file for export

In [22]:
with open("cm_entities_bulk.jsonl", "w", encoding="utf-8") as f:
    for line in data:
        f.write(line + "\n")
