# Index segments

In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# local `models` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Load variables from a .env file into the environment; the cells below read
# OPENAI_API_KEY, PINECONE_API_KEY and PINECONE_ENV from os.environ.
%load_ext dotenv
%dotenv

In [None]:
from datetime import datetime
import os

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import pinecone

from models.load_utils import load_docs_from_jsonl
from models.index_utils import embed_documents, get_doc_id, index_documents, VoyageAIEmbedder

In [None]:
# Configuration for one indexing run.

# All split files come from the same snapshot, so the date is stated once and
# the paths are derived from it. Order is preserved: it is the indexing order.
split_date = "2023-12-19"
split_sources = (
    "bgbmm",
    "conference",
    "dc_historical_context",
    "dc_people",
    "dc_places",
    "dc_podcasts",
    "dc_verse_level",
    "encyclopedia",
    "evidence_central",
    "fair",
    "knowhys",
    "pdfs",
    "pearl_of_great_price",
    "redeemer_of_israel",
    "tnt",
)
split_paths = [
    f"../data/split/{source}/{split_date}.jsonl" for source in split_sources
]

# Name of the Pinecone index to create or reuse.
index_name = "scqa"
# Batch size passed to both embed_documents and index_documents.
batch_size = 80
# Delay passed to embed_documents (presumably seconds between batches -- confirm
# against models.index_utils).
delay = 0.2
# Metadata field handed to the LangChain Pinecone vector store as its text key.
text_field = "text"

# The exported id list is stamped with the local date of the run.
today = datetime.today().strftime('%Y-%m-%d')
ids_path = f"../data/exports/indexed-ids-{today}.txt"

# Embedding model with the vector dimension and distance metric used when the
# index is created.
embedding_model, embedding_len, embedding_metric = ("text-embedding-ada-002", 1536, "cosine")
# Alternative (pair with the commented-out VoyageAIEmbedder below):
# embedding_model, embedding_len, embedding_metric = ["voyage-01", 1024, "cosine"]

## Initialize embedder

In [None]:
# Build the embedding client. The same object embeds the documents in the
# indexing loop and the query in the test cell at the end.
# os.environ[...] raises KeyError if OPENAI_API_KEY is not set.
embedder = OpenAIEmbeddings(
    model=embedding_model,
    openai_api_key=os.environ['OPENAI_API_KEY'],
)
# Alternative embedder; switch embedding_model/embedding_len in the configure
# cell to the voyage settings before enabling it.
# embedder = VoyageAIEmbedder(
#     model=embedding_model
# )

## Initialize vector store

In [None]:
# Configure the Pinecone client from the environment. Both variables must be
# set; os.environ[...] raises KeyError otherwise.
pinecone.init(
    api_key=os.environ['PINECONE_API_KEY'],
    environment=os.environ['PINECONE_ENV'],
)

In [None]:
# Manual maintenance helpers, kept commented out; uncomment to run by hand.
# The first call presumably removes the whole index named by index_name
# (destructive -- confirm before running); the second lists existing indexes.
# pinecone.delete_index(index_name)
# pinecone.list_indexes()

In [None]:
# Reuse the index when it already exists; otherwise create it with the
# dimension and metric that match the configured embedding model.
index_exists = index_name in pinecone.list_indexes()
if not index_exists:
    print('Creating index')
    pinecone.create_index(name=index_name, dimension=embedding_len, metric=embedding_metric)

# Handle used for stats, indexing and the test query below.
index = pinecone.Index(index_name)

In [None]:
# Snapshot the vector count before indexing so the Stats cell can report how
# many vectors this run added.
index_stats = index.describe_index_stats()
total_vector_count = index_stats["total_vector_count"]
# Bare expression: shown as the cell's output.
index_stats

## Index splits

In [None]:
# Embed and index each split file in turn, collecting the id of every document
# handed to the index so the ids can be exported afterwards.
ids = set()
for split_path in split_paths:
    split_docs = load_docs_from_jsonl(split_path)
    print(split_path, len(split_docs))
    vectors = embed_documents(embedder, split_docs, batch_size=batch_size, delay=delay)
    print('  embeddings', len(vectors))
    index_documents(index, vectors, split_docs, batch_size)
    ids.update(get_doc_id(doc) for doc in split_docs)

In [None]:
# Number of distinct document ids collected by the indexing loop (cell output).
len(ids)

## Save ids

In [None]:
# Export the indexed ids, one per line, to the date-stamped ids_path.
# - encoding is explicit so the file does not depend on the platform default.
# - ids are sorted because set iteration order differs between runs; key=str
#   keeps the sort valid whatever type get_doc_id returns.
with open(ids_path, "w", encoding="utf-8") as f:
    f.writelines(f"{_id}\n" for _id in sorted(ids, key=str))

## Stats

In [None]:
# Re-read the index stats and print the change in total vector count relative
# to the snapshot taken before indexing (total_vector_count).
# NOTE(review): if index_documents overwrites vectors whose ids already exist,
# this delta can be smaller than len(ids) -- confirm against index_utils.
index_stats = index.describe_index_stats()
print("added", index_stats["total_vector_count"] - total_vector_count)
# Bare expression: shown as the cell's output.
index_stats

## Test index

In [None]:
# Smoke test: run one similarity search through the LangChain wrapper and list
# the top hits with their source URL, title and the first 80 characters of text.
query = "What do I need to do to qualify to serve a mission?"

vectorstore = Pinecone(index, embedder, text_field)
query_result = vectorstore.similarity_search(query, k=10)
for rank, hit in enumerate(query_result):
    meta = hit.metadata
    print(rank, meta["url"], meta["title"], hit.page_content[:80])