In [None]:
# Reload imported modules automatically before each cell runs, so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Load environment variables via the dotenv IPython extension; presumably this
# is where the API keys read from os.environ come from (a local .env file).
%load_ext dotenv
%dotenv

In [None]:
import json
import os
import re

import openai
import pinecone
from tqdm.auto import tqdm

In [None]:
# configure
# Directory that is scanned for the talk JSON files.
data_dir = '../data/interim'

# Credentials are taken from the environment (a missing variable raises KeyError).
pinecone_key = os.environ['PINECONE_KEY']
openai.organization = os.environ['OPENAI_ORG']
openai.api_key = os.environ['OPENAI_KEY']

# Pinecone settings.
pinecone_region = 'us-west1-gcp'
pinecone_index = 'conf-ada-002'
# Number of paragraphs embedded and upserted per request.
pinecone_batch_size = 32

# Embedding settings: `embedding_len` is the dimension the Pinecone index is
# created with, so it has to match the size of the vectors the model returns.
embedding_model = 'text-embedding-ada-002'
embedding_len = 1536

In [None]:
# check we have authenticated with openai
# Lists the engines available to the configured organization/key; the call is
# expected to raise if the credentials are rejected.
# NOTE(review): confirm this endpoint is still supported by the installed
# openai client version.
openai.Engine.list()

In [None]:
# Open a Pinecone session using the configured API key and region.
pinecone.init(api_key=pinecone_key, environment=pinecone_region)

# Create the index on first use only; an index that already exists is reused
# as-is. Its dimension is the configured embedding length.
if pinecone_index not in pinecone.list_indexes():
    pinecone.create_index(
        pinecone_index,
        dimension=embedding_len,
    )

# Handle used by the later cells to upsert and query vectors.
index = pinecone.Index(pinecone_index)

## Read and prepare conference talks

In [None]:
def _clean(text):
    # remove headers
    text = re.sub(r'[^\n]+\n-{4,}(\n|$)', '', text)
    # remove images
    text = re.sub(r'!\[\]\(\)\s+Image[^\n]+\n', '', text)
    # remove links
    text = re.sub(r'\[[^\]]+\]\([^\)]+\)', '', text)
    # remove newlines, tabs, and extra spaces
    text = text.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').strip()
    return text


def get_paragraphs(contents):
    """Lazily yield the cleaned, non-empty paragraphs of a talk.

    The text is split on triple newlines, each piece is passed through
    ``_clean``, and pieces that come back empty are skipped.

    Args:
        contents: Full text of one talk.

    Yields:
        Cleaned paragraph strings, in document order.
    """
    cleaned_chunks = (_clean(chunk) for chunk in contents.split('\n\n\n'))
    yield from (chunk for chunk in cleaned_chunks if chunk)

In [None]:
# read conf talks
# Build one record per paragraph, carrying the talk's fields alongside the
# paragraph text.
# Files are visited in sorted order: a paragraph's position in `contents` is
# what the indexing step uses as its vector ID, and os.listdir() alone makes no
# ordering guarantee, so unsorted input could assign different IDs on each run.
contents = []
for name in sorted(os.listdir(data_dir)):
    filename = os.path.join(data_dir, name)
    # skip sub-directories and anything else that is not a regular file
    if not os.path.isfile(filename):
        continue
    # read as UTF-8 explicitly instead of relying on the platform default
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    for paragraph in get_paragraphs(data['content']):
        contents.append({
            'year': data['year'],
            'month': data['month'],
            'url': data['url'],
            'title': data['title'],
            'author': data['author'],
            'text': paragraph,
        })
len(contents)

## Get embeddings for and index conference talks

In [None]:
# index conf talks
# Paragraphs are embedded and upserted in batches of `pinecone_batch_size`.
# Each vector's ID is the paragraph's position in `contents` (as a string), and
# the whole record dict is stored as the vector's metadata.
for i in tqdm(range(0, len(contents), pinecone_batch_size)):
    # set end position of batch (the last batch may be shorter)
    i_end = min(i + pinecone_batch_size, len(contents))
    # get batch of lines and IDs
    contents_batch = contents[i:i_end]
    lines_batch = [content['text'] for content in contents_batch]
    ids_batch = [str(n) for n in range(i, i_end)]
    # create embeddings
    res = openai.Embedding.create(input=lines_batch, engine=embedding_model)
    # order the results by their `index` field so that each embedding is paired
    # with the input line (and therefore the ID and metadata) it belongs to
    records = sorted(res['data'], key=lambda record: record['index'])
    embeds = [record['embedding'] for record in records]
    # upsert batch as (id, vector, metadata) tuples
    to_upsert = list(zip(ids_batch, embeds, contents_batch))
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)

## Test index

In [None]:
# Embed a sample question with the same model that was used for the talks.
query = "What does the Lord want me to do?"

# A single input is sent, so the vector is the first (only) entry of `data`.
query_embedding = openai.Embedding.create(
    input=query,
    engine=embedding_model,
)['data'][0]['embedding']

In [None]:
# query pinecone
# Ask for the 5 closest matches to the query embedding; include_metadata=True
# requests the stored metadata (the talk fields and paragraph text upserted
# with each vector) alongside each match.
# NOTE(review): the embedding is passed wrapped in a list — confirm this is the
# shape the installed pinecone client expects for its first positional argument.
# NOTE: `res` reuses the name of the embedding response from the indexing loop.
res = index.query([query_embedding], top_k=5, include_metadata=True)
# bare last expression so the notebook displays the response
res