# Index segments

In [None]:
# Enable autoreload in mode 2 so imported project modules are re-imported
# before each cell executes (handy while editing the `models` package).
%load_ext autoreload
%autoreload 2
# Load the dotenv extension and run it; presumably this reads a local .env
# file so the API keys read from os.environ below are available -- confirm.
%load_ext dotenv
%dotenv

In [None]:
import json
import os
import pickle
import re

from IPython.display import display, Markdown
import openai
import pinecone
from sentence_transformers import SentenceTransformer
import spacy
from tqdm.autonotebook import tqdm

from models.data_utils import get_paragraph_texts_and_ids, get_segment_texts_and_ids
from models.segment_train import get_mpnet_embedder, get_openai_embedder,\
    syntactic_paragraph_features, predict_using_features_and_ensemble

In [None]:
# Notebook configuration: input paths, Pinecone settings and embedding settings.
data_dir = '../data/pre_process'
# JSON file with the segments that get embedded and indexed below
segment_path = '../data/segment/output/2023-04-21.json'

# Pinecone connection settings; the API key must be present in the environment
# (a missing PINECONE_KEY raises KeyError here).
pinecone_key = os.environ['PINECONE_KEY']
pinecone_region = 'us-west1-gcp'
pinecone_index = 'conf-ada-002'
# number of segments embedded and upserted per request
pinecone_batch_size = 32

# Embedding settings: the model name is passed to OpenAI, while the length and
# metric are passed to Pinecone as the index dimension and similarity metric.
embedding_model = 'text-embedding-ada-002'
embedding_len = 1536
embedding_metric = 'cosine'

In [None]:
# Set up the Pinecone client with the API key and environment configured above.
pinecone.init(environment=pinecone_region, api_key=pinecone_key)

In [None]:
# Configure the OpenAI client from environment variables; a missing
# OPENAI_ORG or OPENAI_KEY raises KeyError here.
openai.organization = os.environ['OPENAI_ORG']
openai.api_key = os.environ['OPENAI_KEY']
# Issue a cheap request so bad credentials surface immediately; as the last
# expression of the cell, the response is also displayed.
# NOTE(review): the Engine endpoints appear to be deprecated in favor of
# openai.Model -- confirm against the installed client version.
openai.Engine.list()

## Read segments

In [None]:
# Load the segments to index from disk.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding (which is not UTF-8 on every system).
with open(segment_path, 'r', encoding='utf-8') as f:
    segments = json.load(f)
# Show how many segments were loaded and preview the first one.
print(len(segments))
segments[0]

## Add segments to index

In [None]:
# Rebuild the Pinecone index from scratch.
# Only delete when the index already exists: an unconditional delete fails on
# the very first run (or after a manual cleanup), when there is nothing to drop.
if pinecone_index in pinecone.list_indexes():
    pinecone.delete_index(pinecone_index)

# create pinecone index sized and scored to match the embedding model
pinecone.create_index(
    pinecone_index,
    pod_type='p1.x1',
    dimension=embedding_len,
    metric=embedding_metric,
)

# connect to index; `index` is used by the upsert and query cells below
index = pinecone.Index(pinecone_index)

In [None]:
# Embed the segments in batches and upsert them into the Pinecone index.
for i in tqdm(range(0, len(segments), pinecone_batch_size)):
    # end position of this batch (the final batch may be shorter)
    i_end = min(i + pinecone_batch_size, len(segments))
    # segments in this batch, their texts, and their IDs
    index_batch = segments[i:i_end]
    lines_batch = [index_entry['text'] for index_entry in index_batch]
    # a segment's ID is its position in `segments`, as a string
    ids_batch = [str(n) for n in range(i, i_end)]
    # create embeddings for the whole batch in one request
    res = openai.Embedding.create(input=lines_batch, engine=embedding_model)
    embeds = [record['embedding'] for record in res['data']]
    # upsert (id, vector, metadata) tuples; the full segment dict is the metadata
    index.upsert(vectors=list(zip(ids_batch, embeds, index_batch)))

## Test index

In [None]:
# Embed a sample question with the same model that was used for the segments.
query = "What are the blessings of keeping the sabbath day holy?"

query_response = openai.Embedding.create(input=query, engine=embedding_model)
# a single input yields a single record; keep just its vector
query_embedding = query_response['data'][0]['embedding']

In [None]:
# Fetch the five closest matches for the query, including their stored metadata.
res = index.query(
    [query_embedding],
    include_metadata=True,
    top_k=5,
)
# display the raw response as the cell output
res