In [1]:
# Standard Libraries
import json
import os
import csv
import shutil
from itertools import islice
import concurrent.futures
import yaml

# Third-Party Libraries
import pandas as pd
import numpy as np
from PyPDF2 import PdfReader
import tiktoken
from dotenv import load_dotenv
import pyperclip

# OpenAI Libraries
from openai import OpenAI

# Google Cloud Identity and Credentials
from google.oauth2 import service_account
from google.cloud import bigquery
from google.cloud import functions_v1
from google.api_core.exceptions import Conflict

In [3]:
# Load the OpenAI API key from the local file "openai.json" (read from its "key" field).
# It is kept in its own variable so the function app in a later step can reference it.
# The `with` block closes the file handle as soon as the JSON has been parsed.
with open("openai.json") as key_file:
    openai_api_key = json.load(key_file)["key"]
openai_client = OpenAI(api_key=openai_api_key)
# Default embeddings model; change to "text-embedding-3-large" if desired.
embeddings_model = "text-embedding-3-small"

# Build Google Cloud credentials from the service-account key file "google.json".
# The relative path is resolved against the notebook's current working directory.
credentials = service_account.Credentials.from_service_account_file("google.json")
project_id = "cse-144-project"
region = "us-central1"  # e.g: "us-central1"

In [5]:
def batched(iterable, n):
    """Yield successive tuples of up to ``n`` items taken from ``iterable``.

    Example: batched('ABCDEFG', 3) --> ABC DEF G

    Args:
        iterable: Any iterable; it is consumed lazily, one batch at a time.
        n: Maximum number of items per tuple.

    Yields:
        Tuples of length ``n``; only the final tuple may be shorter.

    Raises:
        ValueError: If ``n`` is less than one. Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    if n < 1:
        raise ValueError("n must be at least one")
    source = iter(iterable)
    while True:
        group = tuple(islice(source, n))
        # An empty tuple means the source is exhausted.
        if not group:
            return
        yield group


def chunked_tokens(text, chunk_length, encoding_name="cl100k_base"):
    """Tokenize ``text`` and yield its tokens in chunks of at most ``chunk_length``.

    Args:
        text: The string to tokenize.
        chunk_length: Maximum number of tokens per chunk.
        encoding_name: Name of the tiktoken encoding, passed to
            ``tiktoken.get_encoding``. Defaults to "cl100k_base".

    Yields:
        Tuples of tokens as produced by ``batched``; only the last tuple may
        be shorter than ``chunk_length``.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # Encode the whole text once, then hand out fixed-size slices of the token list.
    yield from batched(encoding.encode(text), chunk_length)

In [6]:
# Default chunk size, in tokens, that len_safe_get_embedding uses when splitting text.
EMBEDDING_CTX_LENGTH: int = 8191
# Default tiktoken encoding name that len_safe_get_embedding uses for tokenizing.
EMBEDDING_ENCODING: str = "cl100k_base"


def generate_embeddings(text, model):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: The value sent as ``input``. In this notebook it is called with
            both a plain string and a chunk of tokens.
        model: Name of the embeddings model to use.

    Returns:
        The ``embedding`` attribute of the first item in the response's ``data``.
    """
    response = openai_client.embeddings.create(input=text, model=model)
    # Only one input is sent per call, so only the first result is read.
    return response.data[0].embedding


def len_safe_get_embedding(
    text,
    model=embeddings_model,
    max_tokens=EMBEDDING_CTX_LENGTH,
    encoding_name=EMBEDDING_ENCODING,
):
    """Embed ``text`` chunk by chunk so no single request exceeds ``max_tokens``.

    The text is tokenized, split into chunks of at most ``max_tokens`` tokens,
    and each chunk is embedded separately.

    Note: the defaults are bound when this cell is executed, so changing
    ``embeddings_model`` afterwards only takes effect once this cell is re-run.

    Args:
        text: The string to embed.
        model: Embeddings model name forwarded to ``generate_embeddings``.
        max_tokens: Maximum number of tokens per chunk.
        encoding_name: Name of the tiktoken encoding used to tokenize and decode.

    Returns:
        A ``(chunk_embeddings, chunk_texts)`` pair of equal-length lists:
        the embedding of each chunk and that chunk decoded back to text.
    """
    # Look the encoding up once instead of on every loop iteration.
    encoding = tiktoken.get_encoding(encoding_name)
    chunk_embeddings = []
    chunk_texts = []
    for chunk in chunked_tokens(
        text, chunk_length=max_tokens, encoding_name=encoding_name
    ):
        # The token chunk itself (not the decoded text) is sent for embedding.
        chunk_embeddings.append(generate_embeddings(chunk, model=model))
        # Keep the decoded text alongside so each embedding can be traced to its source.
        chunk_texts.append(encoding.decode(chunk))
    return chunk_embeddings, chunk_texts

In [7]:
# Category labels. NOTE(review): not referenced by any cell visible here — confirm where they are used.
categories = [
    "authentication", "models", "techniques", "tools",
    "setup", "billing_limits", "other",
]

# BigQuery client for the configured project, authenticated with the service-account credentials.
client = bigquery.Client(project=project_id, credentials=credentials)

In [11]:
# Natural-language question to search for. It has its own name so that `query`
# below refers only to the SQL text.
query_text = "What is prompt engineering?"
print(f"Query: {query_text}")
category = "models"  # NOTE(review): not used by the search below — confirm whether a category filter was intended

# Embed the question with the notebook's configured embeddings model.
embedding_query = generate_embeddings(query_text, embeddings_model)

# Vector search over oai_docs.embedded_data: take the 2 nearest rows by cosine
# distance, then join back to the table to fetch each match's text and title.
# The query vector is bound as the @query_vector parameter instead of being
# pasted into the SQL text as a long list of float literals.
query = """
WITH search_results AS (
  SELECT query.id AS query_id, base.id AS base_id, distance
  FROM VECTOR_SEARCH(
    TABLE oai_docs.embedded_data, 'content_vector',
    (SELECT @query_vector AS content_vector, 'query_vector' AS id),
    top_k => 2, distance_type => 'COSINE', options => '{"use_brute_force": true}')
)
SELECT sr.query_id, sr.base_id, sr.distance, ed.text, ed.title
FROM search_results sr
JOIN oai_docs.embedded_data ed ON sr.base_id = ed.id
ORDER BY sr.distance ASC
"""

job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter("query_vector", "FLOAT64", embedding_query)
    ]
)
query_job = client.query(query, job_config=job_config)
results = query_job.result()  # Wait for the job to complete

for row in results:
    # Guard against a NULL text column so the preview slice cannot raise.
    text_preview = (row["text"] or "")[:100]
    print(
        f"query_id: {row['query_id']}, base_id: {row['base_id']}, distance: {row['distance']}, text_truncated: {text_preview}"
    )
    print()

Query: What is prompt engineering?
query_id: query_vector, base_id: 0_0, distance: 0.67017727196806, text_truncated: 

# Latency optimization

This guide covers the core set of principles you can apply to improve late

query_id: query_vector, base_id: 36_0, distance: 0.7026942763690709, text_truncated: 

Prompt engineering

This guide shares strategies and tactics for getting better results from large

