In [2]:

import bz2
import json
import os
import pickle as pkl
import re
import requests
import uuid
from collections import OrderedDict
from typing import List, Callable
from io import BytesIO

import numpy as np

from harmony.matching.negator import negate
from harmony.schemas.requests.text import Instrument, Question
from harmony_api.constants import (
    GOOGLE_GECKO_MULTILINGUAL,
    GOOGLE_GECKO_003,
    OPENAI_3_LARGE,
    OPENAI_ADA_02,
    HUGGINGFACE_MPNET_BASE_V2,
    HUGGINGFACE_MINILM_L12_V2,
    AZURE_OPENAI_ADA_02,
    AZURE_OPENAI_3_LARGE,
)
from harmony_api.core.settings import get_settings
from harmony_api.services import azure_openai_embeddings
from harmony_api.services import google_embeddings
from harmony_api.services import hugging_face_embeddings
from harmony_api.services import openai_embeddings
from harmony_api.services.azure_openai_embeddings import (
    HARMONY_API_AVAILABLE_AZURE_OPENAI_MODELS_LIST,
)
from harmony_api.services.google_embeddings import (
    HARMONY_API_AVAILABLE_GOOGLE_MODELS_LIST,
)
from harmony_api.services.openai_embeddings import (
    HARMONY_API_AVAILABLE_OPENAI_MODELS_LIST,
)
from harmony_api.services.vectors_cache import VectorsCache

# Application settings object; AZURE_STORAGE_URL is read from it below when
# catalogue files are not available locally.
settings = get_settings()

# Absolute directory containing this file (realpath resolves symlinks first).
dir_path = os.path.dirname(os.path.realpath(__file__))

# Module-level VectorsCache instance, constructed once when this module is loaded.
# NOTE(review): it is not used in the visible part of this file — presumably
# consumed by code further down or by importers; confirm.
vectors_cache = VectorsCache()

# (connect, read) timeouts in seconds for catalogue downloads, so that an
# unresponsive storage endpoint cannot block the caller indefinitely.
_CATALOGUE_DOWNLOAD_TIMEOUT = (10, 60)


def _parse_json_lines(text: str) -> list:
    """Parse JSON Lines text (one JSON document per line), skipping blank lines."""
    # Split on real newlines only. str.splitlines() would also split on
    # characters such as U+2028, which may appear unescaped inside JSON strings.
    normalised = text.replace("\r\n", "\n").replace("\r", "\n")
    return [json.loads(line) for line in normalised.split("\n") if line.strip()]


def _load_catalogue_file(filename: str, parse: Callable[[str], list]) -> list:
    """
    Load one catalogue file and return its parsed content.

    The file is read from the current working directory when it exists there.
    Otherwise, if settings.AZURE_STORAGE_URL is set, it is downloaded from
    "<AZURE_STORAGE_URL>/catalogue_data/<filename>". An empty list is returned
    when the file is not available from either place (including a non-OK
    HTTP response), so loading stays best-effort.

    :param filename: Name of the catalogue file.
    :param parse: Function turning the UTF-8 decoded file text into data.
    :return: The parsed data, or an empty list when the file is unavailable.
    """
    if os.path.isfile(filename):
        with open(filename, "r", encoding="utf-8") as file:
            return parse(file.read())

    if not settings.AZURE_STORAGE_URL:
        return []

    with requests.get(
        url=f"{settings.AZURE_STORAGE_URL}/catalogue_data/{filename}",
        timeout=_CATALOGUE_DOWNLOAD_TIMEOUT,
    ) as response:
        if not response.ok:
            return []
        return parse(response.content.decode("utf-8"))


def get_catalogue_data_default() -> dict:
    """
    Get catalogue data default.

    Check if the files are available in the current directory, if not, download them from Azure Blob Storage.

    :return: A dict with the keys "all_questions", "all_instruments" and
        "instrument_idx_to_question_idx". Each value is the parsed content of
        the corresponding catalogue file, or an empty list when that file
        could not be found locally or downloaded.
    """

    # All questions (a single JSON document)
    all_questions = _load_catalogue_file("all_questions_ever_seen.json", json.loads)

    # Instrument index to question indexes (a single JSON document)
    instrument_idx_to_question_idx = _load_catalogue_file(
        "instrument_idx_to_question_idxs.json", json.loads
    )

    # All instruments (JSON Lines: one instrument per line)
    all_instruments = _load_catalogue_file(
        "all_instruments_preprocessed.json", _parse_json_lines
    )

    return {
        "all_questions": all_questions,
        "all_instruments": all_instruments,
        "instrument_idx_to_question_idx": instrument_idx_to_question_idx,
    }

ValidationError: 104 validation errors for Instrument
file_section
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
study
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
sweep
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
metadata
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
questions.0.question_intro
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.0.instrument_id
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.0.instrument_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.0.topics_auto
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/list_type
questions.0.nearest_match_from_mhc_auto
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
questions.1.question_intro
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.1.instrument_id
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.1.instrument_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.1.topics_auto
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/list_type
questions.1.nearest_match_from_mhc_auto
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
questions.2.question_intro
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.2.instrument_id
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.2.instrument_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.2.topics_auto
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/list_type
questions.2.nearest_match_from_mhc_auto
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
questions.3.question_intro
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.3.instrument_id
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.3.instrument_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.3.topics_auto
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/list_type
questions.3.nearest_match_from_mhc_auto
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
questions.4.question_intro
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.4.instrument_id
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.4.instrument_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.4.topics_auto
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/list_type
questions.4.nearest_match_from_mhc_auto
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
questions.5.question_intro
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.5.instrument_id
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.5.instrument_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.5.topics_auto
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/list_type
questions.5.nearest_match_from_mhc_auto
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
questions.6.question_intro
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.6.instrument_id
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.6.instrument_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.6.topics_auto
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/list_type
questions.6.nearest_match_from_mhc_auto
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
questions.7.question_intro
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.7.instrument_id
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.7.instrument_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.7.topics_auto
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/list_type
questions.7.nearest_match_from_mhc_auto
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
questions.8.question_intro
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.8.instrument_id
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.8.instrument_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.8.topics_auto
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/list_type
questions.8.nearest_match_from_mhc_auto
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
questions.9.question_intro
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.9.instrument_id
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.9.instrument_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.9.topics_auto
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/list_type
questions.9.nearest_match_from_mhc_auto
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
questions.10.question_intro
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.10.instrument_id
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.10.instrument_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.10.topics_auto
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/list_type
questions.10.nearest_match_from_mhc_auto
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
questions.11.question_intro
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.11.instrument_id
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.11.instrument_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.11.topics_auto
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/list_type
questions.11.nearest_match_from_mhc_auto
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
questions.12.question_intro
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.12.instrument_id
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.12.instrument_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.12.topics_auto
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/list_type
questions.12.nearest_match_from_mhc_auto
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
questions.13.question_intro
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.13.instrument_id
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.13.instrument_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.13.topics_auto
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/list_type
questions.13.nearest_match_from_mhc_auto
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
questions.14.question_intro
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.14.instrument_id
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.14.instrument_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.14.topics_auto
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/list_type
questions.14.nearest_match_from_mhc_auto
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
questions.15.question_intro
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.15.instrument_id
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.15.instrument_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.15.topics_auto
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/list_type
questions.15.nearest_match_from_mhc_auto
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
questions.16.question_intro
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.16.instrument_id
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.16.instrument_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.16.topics_auto
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/list_type
questions.16.nearest_match_from_mhc_auto
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
questions.17.question_intro
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.17.instrument_id
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.17.instrument_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.17.topics_auto
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/list_type
questions.17.nearest_match_from_mhc_auto
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
questions.18.question_intro
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.18.instrument_id
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.18.instrument_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.18.topics_auto
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/list_type
questions.18.nearest_match_from_mhc_auto
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
questions.19.question_intro
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.19.instrument_id
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.19.instrument_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/string_type
questions.19.topics_auto
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/list_type
questions.19.nearest_match_from_mhc_auto
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type