# Index segments

In [None]:
# Load the autoreload extension and set it to mode 2.
%load_ext autoreload
%autoreload 2
# Load the dotenv extension and run it; later cells read API keys from
# os.environ (presumably populated from a local .env file — confirm).
%load_ext dotenv
%dotenv

In [1]:
import os

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from tqdm.autonotebook import tqdm

from models.load_utils import load_docs_from_jsonl
from models.index_utils import index

ModuleNotFoundError: No module named 'langchain'

In [None]:
# Notebook configuration: every tunable value used by the cells below.

# JSONL file holding the pre-split documents to be indexed.
split_path = "../data/split/output/2023-09-24.jsonl"

# Name of the Pinecone index to create or reuse.
index_name = "conf-ada-002"

# Batch size handed to the indexing helper.
batch_size = 100

# Text key handed to the LangChain Pinecone vector store.
text_field = "text"

# Embedding model plus the vector dimension and similarity metric used
# when the Pinecone index is created.
embedding_model = "text-embedding-ada-002"
embedding_len = 1536
embedding_metric = "cosine"

## Initialize embedder

In [None]:
# Build the OpenAI embedding client. The API key is read from the
# environment and a missing key raises KeyError right here.
embedder = OpenAIEmbeddings(
    openai_api_key=os.environ['OPENAI_API_KEY'],
    model=embedding_model,
)

## Initialize vector store

In [None]:
# Connect the Pinecone client using credentials from the environment.
pinecone.init(
    api_key=os.environ['PINECONE_API_KEY'],
    environment=os.environ['PINECONE_ENV'],
)

# Create the index only when it is not already listed; an existing index
# is reused as-is.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        dimension=embedding_len,
        metric=embedding_metric,
    )

# NOTE(review): this rebinds `index`, which the import cell bound to
# models.index_utils.index; from here on `index` is the Pinecone index
# handle and the helper function is no longer reachable under that name.
index = pinecone.Index(index_name)
index.describe_index_stats()

## Read splits

In [None]:
# Load the pre-split documents from the configured JSONL file and show
# how many were read.
docs = load_docs_from_jsonl(split_path)
len(docs)

In [None]:
# Display the first loaded document as a quick sanity check.
docs[0]

## Index splits

In [None]:
# index conf talks
#
# By this point `index` is the Pinecone index handle: the vector-store cell
# rebinds the name, shadowing the helper imported from models.index_utils.
# Calling `index(index, ...)` would therefore try to call the Pinecone
# object itself and fail. Re-import the helper under a distinct name so the
# call reaches the actual indexing function.
from models.index_utils import index as index_docs

# Index the loaded docs into Pinecone using the embedder and batch size
# configured above.
index_docs(index, embedder, docs, batch_size)

# Show the index statistics after the upload.
index.describe_index_stats()

## Test index

In [None]:
# Wrap the Pinecone index in a LangChain vector store, passing the embedder
# and the configured text field.
vectorstore = Pinecone(index, embedder, text_field)

# Smoke-test retrieval: fetch the three closest matches for a sample question.
query = "What are the blessings of keeping the sabbath day holy?"
query_result = vectorstore.similarity_search(query, k=3)

query_result