# Index segments

In [None]:
# Enable automatic reloading of imported modules so edits to the local
# `models` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Load variables from a .env file into the process environment; later cells
# read API credentials from os.environ.
%load_ext dotenv
%dotenv

In [None]:
import os

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import pinecone

from models.load_utils import load_docs_from_jsonl
from models.index_utils import embed_documents, index_documents, VoyageAIEmbedder

In [None]:
# Run configuration shared by the cells below.
split_path = "../data/split/conference/2023-11-27.jsonl"  # JSONL file of document splits to load
index_name = "scqa"  # Pinecone index to create (if absent) and write to
batch_size = 80  # batch size passed to both embed_documents and index_documents
delay = 0.1  # passed to embed_documents as `delay` (presumably seconds between batches -- confirm)
text_field = "text"  # third argument to the LangChain Pinecone vector store

# Embedding settings as (model name, vector dimension, similarity metric).
# Swap which line is active to change provider; the embedder cell has a
# matching commented-out alternative.
# embedding_model, embedding_len, embedding_metric = ("text-embedding-ada-002", 1536, "cosine")
embedding_model, embedding_len, embedding_metric = ("voyage-01", 1024, "cosine")

## Initialize embedder

In [None]:
# Build the embedding client for the model chosen in the configuration cell.
# OpenAI alternative, kept for switching back to the ada-002 configuration:
# embedder = OpenAIEmbeddings(
#     model=embedding_model,
#     openai_api_key=os.environ['OPENAI_API_KEY'],
# )
embedder = VoyageAIEmbedder(model=embedding_model)

## Initialize vector store

In [None]:
# Configure the Pinecone client from environment variables. A missing
# variable raises KeyError here rather than failing later on a request.
pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment=os.environ["PINECONE_ENV"])

# Create the index only when it is not already listed, so re-running this
# cell reuses the existing index instead of trying to create it again.
index_missing = index_name not in pinecone.list_indexes()
if index_missing:
    print('Creating index')
    pinecone.create_index(name=index_name, dimension=embedding_len, metric=embedding_metric)

# Handle used by the remaining cells for stats, upserts and queries.
index = pinecone.Index(index_name)

In [None]:
# Snapshot the index statistics before indexing. `total_vector_count` is kept
# so the post-indexing cell can report how many vectors were added.
index_stats = index.describe_index_stats()
total_vector_count = index_stats["total_vector_count"]
# Bare expression: displayed as the cell output.
index_stats

## Read splits

In [None]:
# Load the document splits from the JSONL file named in the configuration cell.
docs = load_docs_from_jsonl(split_path)
# Display how many splits were loaded.
len(docs)

In [None]:
# Display the first split as a sanity check (raises IndexError if nothing was loaded).
docs[0]

## Get embeddings

In [None]:
# Embed the loaded splits with the configured embedder, passing the configured
# batch size and delay (presumably one embedding per document, computed in
# batches with a pause between them -- confirm in models.index_utils).
embeddings = embed_documents(embedder, docs, batch_size=batch_size, delay=delay)
# Display the number of embeddings returned; compare with len(docs) above.
len(embeddings)

## Index splits

In [None]:
# Write the embeddings and their documents to the Pinecone index
# (presumably upserted in groups of `batch_size` -- confirm in models.index_utils).
index_documents(index, embeddings, docs, batch_size)

In [None]:
# Re-read the index statistics and report the growth relative to the count
# captured before indexing.
# NOTE(review): if the index stats lag behind writes, this number may be
# lower than expected right after indexing -- confirm.
index_stats = index.describe_index_stats()
added = index_stats["total_vector_count"] - total_vector_count
print("added", added)
# Bare expression: displayed as the cell output.
index_stats

## Test index

In [None]:
# Smoke-test the index: run one similarity search through the LangChain
# Pinecone wrapper and print, for each hit, its rank, source URL, title and
# the first 80 characters of its text.
query = "What do I need to do to qualify to serve a mission?"

vectorstore = Pinecone(index, embedder, text_field)
query_result = vectorstore.similarity_search(query, k=10)
for rank, hit in enumerate(query_result):
    meta = hit.metadata
    print(rank, meta["url"], meta["title"], hit.page_content[:80])