# Load all Compact Memory Text to SOLR Core "cm_beta"

Requires JSON-formatted input (`.jsonl` files, one JSON document per line), not the `.txt` files.

In [1]:
import os, json, pysolr, requests, time


In [2]:
# Base URL of the Solr server (no core name appended); the Core Admin
# endpoint and the per-core client URL are both derived from it.
SOLR_URL = "http://localhost:8983/solr"
# Name of the core that is recreated and then filled by this notebook.
CORE_NAME = "cm_beta"
# Number of documents buffered before one batch is sent to Solr.
CHUNK_SIZE = 200
# configSet handed to the Core Admin CREATE action for the new core.
CONFIG_SET = "_default"

In [3]:
def solr_core_exists(solr_url: str, core_name: str, timeout: float = 60.0) -> bool:
    """
    Check if a given Solr core exists using the Core Admin STATUS action.

    Args:
        solr_url: Base URL of the Solr server, e.g. "http://localhost:8983/solr".
        core_name: Name of the core to look up.
        timeout: Seconds to wait for Solr before giving up. Without an explicit
            timeout `requests` waits indefinitely on an unresponsive server.

    Returns:
        True if the STATUS response contains a non-empty entry for the core,
        False otherwise.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status (via `raise_for_status`).
    """
    status_url = f"{solr_url}/admin/cores"
    params = {
        "action": "STATUS",
        "core": core_name,
        "wt": "json",
    }

    response = requests.get(status_url, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    # If the core exists, it should appear in the "status" dict with some
    # content; a missing key and an empty entry both count as "not there".
    status = data.get("status", {})
    return bool(status.get(core_name))


def solr_delete_core(solr_url: str, core_name: str, timeout: float = 60.0) -> None:
    """
    Delete (unload) a Solr core and remove its index, data and instance directory.

    Args:
        solr_url: Base URL of the Solr server, e.g. "http://localhost:8983/solr".
        core_name: Name of the core to unload.
        timeout: Seconds to wait for Solr before giving up. Without an explicit
            timeout `requests` waits indefinitely on an unresponsive server.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status (via `raise_for_status`).
    """
    unload_url = f"{solr_url}/admin/cores"
    params = {
        "action": "UNLOAD",
        "core": core_name,
        # All three delete flags are set so nothing of the old core is left
        # on disk.
        "deleteIndex": "true",
        "deleteDataDir": "true",
        "deleteInstanceDir": "true",
        "wt": "json",
    }

    print(f"Unloading and deleting core '{core_name}' …")
    response = requests.get(unload_url, params=params, timeout=timeout)
    response.raise_for_status()
    print(f"Core '{core_name}' successfully unloaded and deleted.")
    print(response.json())


def solr_create_core(
    solr_url: str, core_name: str, config_set: str, timeout: float = 60.0
) -> None:
    """
    Create a Solr core using a given configSet.

    Args:
        solr_url: Base URL of the Solr server, e.g. "http://localhost:8983/solr".
        core_name: Name of the core to create.
        config_set: Name of the configSet passed to the CREATE action.
        timeout: Seconds to wait for Solr before giving up. Without an explicit
            timeout `requests` waits indefinitely on an unresponsive server.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status (via `raise_for_status`).
    """
    create_url = f"{solr_url}/admin/cores"
    params = {
        "action": "CREATE",
        "name": core_name,
        "configSet": config_set,
        "wt": "json",
    }

    print(f"Creating core '{core_name}' with configSet '{config_set}' …")
    response = requests.get(create_url, params=params, timeout=timeout)
    response.raise_for_status()
    print(f"Core '{core_name}' successfully created.")
    print(response.json())


def recreate_solr_core(solr_url: str, core_name: str, config_set: str) -> None:
    """
    Ensure a fresh, empty core: drop an existing one first, then create it.

    Any `requests.RequestException` raised along the way is reported on
    stdout and not re-raised, so the caller gets no exception on failure.
    """
    try:
        core_present = solr_core_exists(solr_url, core_name)
        if not core_present:
            print(f"Core '{core_name}' does not exist yet.")
        else:
            print(f"Core '{core_name}' already exists.")
            solr_delete_core(solr_url, core_name)
            # Brief pause so Solr can finish cleaning up before the re-create.
            time.sleep(2)

        solr_create_core(solr_url, core_name, config_set)
    except requests.RequestException as exc:
        print("Error while communicating with Solr:")
        print(exc)

In [4]:
# --- Run the logic ---
# Destructive: if the core already exists it is unloaded and its index, data
# and instance directories are deleted before the core is created again.
recreate_solr_core(SOLR_URL, CORE_NAME, CONFIG_SET)

Core 'cm_beta' already exists.
Unloading and deleting core 'cm_beta' …
Core 'cm_beta' successfully unloaded and deleted.
{'responseHeader': {'status': 0, 'QTime': 45}}
Creating core 'cm_beta' with configSet '_default' …
Core 'cm_beta' successfully created.
{'responseHeader': {'status': 0, 'QTime': 428}, 'core': 'cm_beta'}


In [5]:
# Shared pysolr client bound to the core's URL; upload_docs() and
# bulk_index_directory() below use it for add and commit.
solr = pysolr.Solr(f"{SOLR_URL}/{CORE_NAME}", timeout=200)

In [6]:
def iter_json_files(path):
    """Walk `path` recursively and yield the full path of every `.jsonl` file."""
    for dirpath, _subdirs, filenames in os.walk(path):
        yield from (
            os.path.join(dirpath, name)
            for name in filenames
            if name.endswith(".jsonl")
        )


def load_docs_from_file(path):
    """
    Load documents from a JSON file.

    Supports:
    - NDJSON: one JSON object (or array of objects) per line
    - A single JSON object
    - A JSON array of objects

    The whole file is first tried as one JSON document. If that fails, it is
    read as NDJSON: broken lines are skipped with a warning and the remaining
    documents are kept, so one bad record does not discard the whole file.

    Args:
        path: Path of the file to read (decoded as UTF-8, undecodable bytes
            are replaced).

    Returns:
        A list of parsed documents; empty if the file is empty or nothing
        usable could be parsed.
    """
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        content = f.read().strip()

    if not content:
        return []

    # A file that is one JSON document (possibly pretty-printed over many
    # lines) parses here. Multi-record NDJSON fails with "Extra data" and is
    # handled line by line below.
    try:
        data = json.loads(content)
    except json.JSONDecodeError as err:
        whole_file_error = err
    else:
        if isinstance(data, dict):
            return [data]
        if isinstance(data, list):
            return data
        print(f"[WARN] Unexpected top-level JSON type in {path}: {type(data)}")
        return []

    docs = []
    # Split on "\n" only: str.splitlines() also breaks on U+2028, U+2029 and
    # U+0085, which may legally appear unescaped inside JSON strings and would
    # tear a valid record apart. A trailing "\r" is removed by strip().
    for lineno, raw_line in enumerate(content.split("\n"), start=1):
        line = raw_line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError as err:
            print(f"[WARN] Skipping broken JSON line {lineno} in {path}: {err}")
            continue

        if isinstance(obj, dict):
            docs.append(obj)
        elif isinstance(obj, list):
            docs.extend(obj)
        # Any other top-level type (number, string, ...) is not a document
        # and is ignored.

    if not docs:
        print(f"[WARN] Could not parse JSON file {path}: {whole_file_error}")
    return docs


def upload_docs(docs):
    """Send one batch of documents to Solr without committing; empty batches are a no-op."""
    if docs:
        solr.add(docs, commit=False)


def bulk_index_directory(path):
    """
    Index every JSON file found below `path` into Solr in batches.

    Documents from all files are collected in one running buffer; whenever it
    reaches CHUNK_SIZE entries the batch is uploaded. Whatever is left at the
    end is uploaded as a final batch, followed by a single commit.
    """
    pending = []
    for json_path in iter_json_files(path):
        print(f"Reading {json_path} …")
        parsed = load_docs_from_file(json_path)
        print(f"  -> {len(parsed)} docs parsed")

        for record in parsed:
            pending.append(record)
            if len(pending) < CHUNK_SIZE:
                continue
            print(f"Sending chunk of {len(pending)} docs to Solr …")
            upload_docs(pending)
            # Start a new list instead of clearing the one just handed over.
            pending = []

    # Flush the remainder that never reached a full chunk.
    if pending:
        print(f"Sending final chunk of {len(pending)} docs to Solr …")
        upload_docs(pending)

    solr.commit()
    print("Finished import.")

# RUN

In [7]:
# Index every .jsonl file found below this directory into the Solr core.
bulk_index_directory("/data/cm/output/solr/")

Reading /data/cm/output/solr/2710055/2710056/2710059/2710055_2710056_2710059.jsonl …
  -> 18 docs parsed
Reading /data/cm/output/solr/2710055/2710056/2710060/2710055_2710056_2710060.jsonl …
  -> 24 docs parsed
Reading /data/cm/output/solr/2710055/2710056/2710057/2710055_2710056_2710057.jsonl …
  -> 20 docs parsed
Reading /data/cm/output/solr/2710055/2710056/2710058/2710055_2710056_2710058.jsonl …
  -> 27 docs parsed
Reading /data/cm/output/solr/2710055/2710056/2710062/2710055_2710056_2710062.jsonl …
  -> 21 docs parsed
Reading /data/cm/output/solr/2710055/2710056/2710061/2710055_2710056_2710061.jsonl …
  -> 20 docs parsed
Reading /data/cm/output/solr/9616701/9618237/9618238/9616701_9618237_9618238.jsonl …
  -> 140 docs parsed
Sending chunk of 200 docs to Solr …
Reading /data/cm/output/solr/9616701/9618237/9641358/9616701_9618237_9641358.jsonl …
  -> 2 docs parsed
Reading /data/cm/output/solr/9616701/9618237/9641359/9616701_9618237_9641359.jsonl …
  -> 2 docs parsed
Reading /data/cm/out