In [None]:
import glob
import json
import os
from abc import ABC, abstractmethod
from typing import List

import dotenv
from elasticsearch import Elasticsearch
from langchain.chat_models import ChatAnthropic
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores.elasticsearch import (
    ApproxRetrievalStrategy,
    ElasticsearchStore,
)
from pydantic import BaseModel, TypeAdapter

from redbox.llm.llm_base import LLMHandler
from redbox.models import Chunk, File
from redbox.storage import ElasticsearchStorageHandler

In [None]:
# Unless the current directory is already named "10ds-ai-redbox", step up
# one level and report where we ended up.
if os.path.split(os.getcwd())[-1] != "10ds-ai-redbox":
    os.chdir(os.pardir)
    print(os.getcwd())

In [None]:
# Load the .env file, and also keep its entries in a plain dict so that
# individual values can be looked up directly below.
dotenv.load_dotenv(".env")
ENV = dotenv.dotenv_values(".env")

# Generation settings handed to the chat model.
model_params = dict(max_tokens=4096, temperature=0.2)

# Streaming Anthropic chat model; the API key comes from .env.
llm = ChatAnthropic(
    anthropic_api_key=ENV["ANTHROPIC_API_KEY"],
    streaming=True,
    **model_params,
)

# Elasticsearch client for the node at http://localhost:9200, authenticated
# with the basic-auth credentials from .env.
es = Elasticsearch(
    hosts=[dict(host="localhost", port=9200, scheme="http")],
    basic_auth=(ENV["ELASTIC_USER"], ENV["ELASTIC_PASSWORD"]),
)

# Print the server's info response, pretty-printed as JSON.
print(json.dumps(es.info().body, indent=4))

In [None]:
# List the entries of the dev file and chunk directories. os.listdir returns
# names in arbitrary order, so the listings are sorted to make the load order
# (and therefore the order of `files`) reproducible between runs.
file_names = sorted(os.listdir("data/dev/file/"))
file_paths = [os.path.join("data/dev/file/", file_name) for file_name in file_names]

chunk_names = sorted(os.listdir("data/dev/chunks/"))
chunk_paths = [
    os.path.join("data/dev/chunks/", chunk_name) for chunk_name in chunk_names
]

# Parse every listed file as JSON and build a File model from its top-level keys.
files = []


for file_path in file_paths:
    # JSON text is UTF-8; pass the encoding explicitly instead of relying on
    # the platform's locale default, which can mis-decode non-ASCII content.
    with open(file_path, "r", encoding="utf-8") as f:
        file = File(**json.load(f))
        files.append(file)

In [None]:
# embedding_function = SentenceTransformerEmbeddings()
# vector_store = ElasticsearchStore(
#     es_url="http://localhost:9200",
#     es_user=ENV["ELASTIC_USER"],
#     es_password=ENV["ELASTIC_PASSWORD"],
#     index_name="redbox-vector",
#     embedding=embedding_function,
#     strategy=ApproxRetrievalStrategy(hybrid=True),
# )

In [None]:
# llm_handler = LLMHandler(llm=llm, vector_store=vector_store, user_uuid="dev")

In [None]:
# llm_handler.add_chunks_to_vector_store(chunks)

In [None]:
# results = llm_handler.vector_store.similarity_search("Civil Service pay", k=10)

In [None]:
# for doc in results:
#     print(f'{doc.metadata["filename"]} | Characters: {len(doc.page_content)}')

In [None]:
# Storage handler that talks to Elasticsearch through the `es` client and
# uses "redbox-data" as its root index.
storage_handler = ElasticsearchStorageHandler(
    es_client=es,
    root_index="redbox-data",
)

In [None]:
# Hand the loaded File objects to the storage handler for writing.
# NOTE(review): presumably this indexes them into Elasticsearch — confirm
# against ElasticsearchStorageHandler.write_items.
storage_handler.write_items(files)