In [None]:
# Enable autoreload (mode 2) so edited project modules are re-imported
# before each cell runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Load the dotenv extension and run it; later cells read PINECONE_KEY,
# OPENAI_ORG and OPENAI_KEY from os.environ.
# NOTE(review): presumably these come from a local .env file - confirm.
%load_ext dotenv
%dotenv

In [None]:
import json
import os
import pickle
import re

from IPython.display import display, Markdown
import openai
import pinecone
from sentence_transformers import SentenceTransformer
import spacy
from tqdm import tqdm

from models.data_utils import get_paragraph_texts_and_ids, get_segment_texts_and_ids
from models.segment_train import get_mpnet_embedder, get_openai_embedder,\
    syntactic_paragraph_features, predict_using_features_and_ensemble

In [None]:
# ---------------------------------------------------------------------------
# Notebook configuration: every tunable value and shared client lives here.
# ---------------------------------------------------------------------------

# Input data and segmentation settings.
data_dir = "../data/pre_process"
segment_model_path = "../data/segment/model/2023-04-12.pkl"
segment_threshold = 0.55  # passed to the segment predictor
max_segment_len = 500     # passed to get_segment_texts_and_ids

# Embedding model used for the index, plus the matching index geometry.
embedding_model = "text-embedding-ada-002"
embedding_len = 1536         # used as the index dimension
embedding_metric = "cosine"  # used as the index metric

# Pinecone connection settings (the key must be present in the environment).
pinecone_key = os.environ["PINECONE_KEY"]
pinecone_region = "us-west1-gcp"
pinecone_index = "conf-ada-002"
pinecone_batch_size = 32  # number of entries embedded and upserted per request

# spaCy pipeline handed to the segment predictor.
parser = spacy.load("en_core_web_sm")

# MPNet sentence embedder used for segmentation.
mpnet = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
mpnet_embedder = get_mpnet_embedder(mpnet)

# OpenAI client credentials and the OpenAI embedder used for segmentation.
openai.organization = os.environ["OPENAI_ORG"]
openai.api_key = os.environ["OPENAI_KEY"]
# Result is discarded; presumably a credentials/connectivity check - confirm.
openai.Engine.list()
openai_embedder = get_openai_embedder(openai)

In [None]:
# Deserialize the trained segment classifier from disk.
# NOTE: pickle can execute arbitrary code on load; only use model files
# from a trusted source.
with open(segment_model_path, 'rb') as model_file:
    clf = pickle.load(model_file)

In [None]:
# Initialise the Pinecone client with the configured key and environment.
pinecone.init(api_key=pinecone_key, environment=pinecone_region)

# Create the index only if it does not exist yet, so re-running this cell
# leaves an existing index untouched.
existing_indexes = pinecone.list_indexes()
if pinecone_index not in existing_indexes:
    pinecone.create_index(
        pinecone_index,
        dimension=embedding_len,
        metric=embedding_metric,
    )

# Handle used by the later cells for upserts and queries.
index = pinecone.Index(pinecone_index)

## Read and prepare conference talks

In [None]:
# Build the segmenter: a callable that is later applied to each talk's list
# of paragraph texts. It is assembled from the syntactic feature extractor,
# both embedders, the spaCy parser, the trained classifier and the threshold.
predictor = predict_using_features_and_ensemble(
    syntactic_paragraph_features,
    openai_embedder,
    mpnet_embedder,
    parser,
    clf,
    segment_threshold,
)

In [None]:
# read conf talks
# Collect the talk files up front and sort them: os.listdir returns names in
# arbitrary order, and the indexing cell derives each vector ID from the
# entry's position in index_entries, so a stable order keeps IDs reproducible
# across runs. A concrete list also lets tqdm show a total and an ETA.
talk_paths = sorted(
    path
    for path in (os.path.join(data_dir, name) for name in os.listdir(data_dir))
    if os.path.isfile(path)
)

index_entries = []
for filename in tqdm(talk_paths):
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # get paragraphs: each item is indexed as [0] = text here
    paragraph_texts_and_ids = get_paragraph_texts_and_ids(data['content'])
    paragraphs = [paragraph_text_id[0] for paragraph_text_id in paragraph_texts_and_ids]

    # get segments: run the segmenter over the paragraph texts, then group
    # the paragraphs into segments limited by max_segment_len
    segmentation = predictor(paragraphs)
    segment_texts_and_ids = get_segment_texts_and_ids(
        paragraph_texts_and_ids,
        segmentation,
        max_segment_len=max_segment_len,
    )

    # create index_entry for each segment; each item is used as
    # [0] = segment text and [1] = anchor within the talk's url
    for segment_text_and_id in segment_texts_and_ids:
        index_entries.append({
            'year': data['year'],
            'month': data['month'],
            'url': data['url'],
            'anchor': segment_text_and_id[1],
            'title': data['title'],
            'author': data['author'],
            'text': segment_text_and_id[0],
        })

# show how many segments were collected
len(index_entries)

In [None]:
# Show the first collected entry as a sanity check of the entry schema
# (year, month, url, anchor, title, author, text).
index_entries[0]

## Embed and index conference talks

In [None]:
# index conf talks
# Embed the segment texts in batches and upsert them into the Pinecone index.
for i in tqdm(range(0, len(index_entries), pinecone_batch_size)):
    # set end position of batch (the final batch may be shorter)
    i_end = min(i + pinecone_batch_size, len(index_entries))
    # get batch of entries, their texts, and IDs
    index_batch = index_entries[i:i_end]
    lines_batch = [index_entry['text'] for index_entry in index_batch]
    # each ID is the entry's position in index_entries, as a string
    ids_batch = [str(n) for n in range(i, i_end)]
    # create embeddings for the whole batch in one request
    res = openai.Embedding.create(input=lines_batch, engine=embedding_model)
    embeds = [record['embedding'] for record in res['data']]
    # build (id, vector, metadata) tuples; the full entry dict is the metadata
    to_upsert = list(zip(ids_batch, embeds, index_batch))
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)

## Test index

In [None]:
# Embed a sample question with the same model that was used for the index.
query = "What are the blessings of keeping the sabbath day holy?"

query_response = openai.Embedding.create(input=query, engine=embedding_model)
# A single input was sent, so take the first (only) embedding record.
query_embedding = query_response['data'][0]['embedding']

In [None]:
# Ask Pinecone for the top 5 matches for the query embedding and include
# each match's stored metadata in the response.
res = index.query(
    [query_embedding],
    top_k=5,
    include_metadata=True,
)
res