# Index segments

In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# local `models` helpers are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Load the dotenv IPython extension and run it; presumably this is where
# OPENAI_API_KEY and PINECONE_API_KEY (read further down) come from -- confirm
# a .env file is present.
%load_ext dotenv
%dotenv

In [2]:
import os

from langchain.embeddings.openai import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec
from tqdm.autonotebook import tqdm

from models.load_utils import load_docs_from_jsonl
from models.index_utils import index_documents, embed_documents

  from tqdm.autonotebook import tqdm


In [3]:
# --- Notebook configuration: everything a reader might want to tune ---

# Input: JSONL file holding the already-split documents to index.
split_path = "../data/split/conference/2025-09-17.jsonl"

# Target Pinecone index, and the text key handed to the LangChain vector store
# wrapper in the test section.
index_name = "conf-ada-002-svrless"
text_field = "text"

# Batch size passed to both the embedding helper and the indexing helper.
batch_size = 100

# Embedding model settings; the length and metric are used as the index's
# dimension and metric if the index has to be created.
embedding_model = "text-embedding-ada-002"
embedding_len = 1536
embedding_metric = "cosine"

## Initialize embedder

In [4]:
# Build the OpenAI embedding client for the configured model. Indexing
# os.environ (rather than .get) raises KeyError right here if the key is unset.
embedder = OpenAIEmbeddings(openai_api_key=os.environ['OPENAI_API_KEY'], model=embedding_model)

## Initialize vector store

In [5]:
# Read the Pinecone API key from the environment (None when it is not set),
# then build the client that the remaining cells use.
api_key = os.environ.get("PINECONE_API_KEY")
pinecone = Pinecone(api_key=api_key)

In [6]:
# Print the indexes the client reports, as a quick check that the client works
# and to see which indexes already exist before deciding whether to create one.
print(pinecone.list_indexes())

[{
    "name": "conf-ada-002-svrless",
    "metric": "cosine",
    "host": "conf-ada-002-svrless-29e6444.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1",
            "source_collection": "conf-ada-002-archive-ycjx"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 1536,
    "deletion_protection": "disabled",
    "tags": null
}]


In [8]:
# Create the index only when an index with *this name* is missing. Testing for
# "the project has no indexes at all" would skip creation whenever any other
# index exists, leaving the handle below pointing at a nonexistent index.
if index_name not in pinecone.list_indexes().names():
    pinecone.create_index(
        name=index_name,
        metric=embedding_metric,
        dimension=embedding_len,
        # create_index requires a deployment spec. These values match the
        # serverless cloud/region this notebook's index runs in (aws / us-east-1);
        # change them if the index should live elsewhere.
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Open a handle to the index and show its current stats, giving a vector-count
# baseline to compare against after indexing.
index = pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 34305}},
 'total_vector_count': 34305,
 'vector_type': 'dense'}

## Read splits

In [9]:
# Load the pre-split documents from the configured JSONL file via the project
# helper, then display how many were loaded.
docs = load_docs_from_jsonl(split_path)
len(docs)

1149

In [15]:
# Eyeball the first loaded document as a sanity check on the load step.
docs[0]

Document(page_content='My dear brothers and sisters, my thoughts today are on the gathering of Israel, what President Russell M. Nelson calls "the most important thing taking place on earth today. Nothing else compares in magnitude, nothing else compares in importance, nothing else compares in majesty."\n\nThe gathering is the ultimate recognition that "the worth of souls is great in the sight of God." It is as simple as that. We are gathering God\'s children in these last days that they might have "blessings poured out upon their heads" and the promises of "the riches of eternity." It follows that to gather Israel we need missionaries—many more than are serving. Today I am speaking to the many seasoned seniors in the Church who could serve as missionaries. The Lord needs you. We need you in New York and Chicago, Australia and Africa, Thailand and Mexico, and everywhere in between.\n\nLet me take you back to the year 2015. I was a newly called member of the Quorum of the Twelve Apostle

## Index splits

In [None]:
# Compute embeddings for the loaded documents with the project helper.
# NOTE(review): presumably this calls the embedder in groups of `batch_size`
# and returns one embedding per document, in document order -- confirm in
# models.index_utils.
embeddings = embed_documents(embedder, docs, batch_size)


  0%|          | 0/12 [00:00<?, ?it/s]

In [None]:
# Write the documents and their embeddings to the Pinecone index with the
# project helper.
# NOTE(review): whether re-running this cell overwrites or duplicates vectors
# depends on how the helper assigns vector IDs -- confirm in models.index_utils
# before re-running.
index_documents(index, embeddings, docs, batch_size)

  0%|          | 0/12 [00:00<?, ?it/s]

In [18]:
# Show the index stats again after indexing; compare the vector count with the
# value displayed when the index handle was first opened.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 35454}},
 'total_vector_count': 35454,
 'vector_type': 'dense'}

## Test index

In [None]:
from langchain.vectorstores import Pinecone as LangchainPinecone

# Wrap the Pinecone index in LangChain's vector store, using the same embedder
# that produced the indexed vectors and the configured text field.
vectorstore = LangchainPinecone(index, embedder, text_field)

# Smoke test: ask a question and fetch the three most similar documents.
query = "What does President Nelson say is the most important thing taking place on earth today?"
query_result = vectorstore.similarity_search(query, k=3)
query_result