In [1]:
import openai
import pandas as pd
import itertools

from dotenv import dotenv_values
import os

# Location of the .env file. Set the DOTENV_PATH environment variable to point
# somewhere else; the default is the original author's local path.
config = dotenv_values(
    os.environ.get("DOTENV_PATH", "/Users/julianboyce/Documents/work/gpt/.env")
)

# Prefer the key from the .env file and fall back to the process environment,
# failing with an explicit message instead of a bare KeyError when neither has it.
_api_key = config.get("OPENAI_API_KEY") or os.environ.get("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError(
        "OPENAI_API_KEY was not found in the .env file or in the environment"
    )
openai.api_key = _api_key

In [2]:
# Sanity check your api key is here
########### print(openai.api_key)

In [3]:
import mechanicalsoup

# Specify the URL you want to scrape
url = 'https://www.the-atlantic-pacific.com/2023/05/30/dresses-with-sandals/'

# Create a browser object
browser = mechanicalsoup.StatefulBrowser()

try:
    # Fetch the page and fail loudly on an HTTP error status instead of
    # silently parsing an error page.
    response = browser.open(url)
    response.raise_for_status()

    # The parsed HTML (a BeautifulSoup object) of the page that was just opened
    page = browser.page
finally:
    # Always release the underlying session, even if the request failed
    browser.close()

# Later cells call page.find(...), so stop here if nothing was parsed.
if page is None:
    raise RuntimeError(f"No HTML content could be parsed from {url}")

In [4]:
#page

In [5]:
# Pull the post title out of the parsed page: locate the <h1> element first,
# then read its text content.
heading = page.find("h1")
title = heading.get_text()
print("Title:", title)

Title: Dresses with Sandals: How to Create a Chic Summer Outfit


In [6]:
# Extract all of the text inside the page's <body> element.
# The result is one string; its newlines are used in a later cell to split
# the text into segments.
body = page.find("body").get_text()
# print("Title:", body)

In [7]:
import re
# Replace every run of one or more newlines with a single '<<>>' marker.
# removeShortTextEntries splits the text on this marker.
cleaned_body = re.sub(r'\n{1,}', '<<>>', body)

In [8]:
# cleaned_body

In [9]:
def removeShortTextEntries(cleaned_body, min_length=100, delimiter='<<>>', marker='####'):
    """Drop short segments from delimiter-separated text.

    The text is split on ``delimiter``. The first segment is always kept,
    whatever its length; every later segment is kept only if it has at least
    ``min_length`` characters. The kept segments are joined with ``marker``.

    :param cleaned_body: Text whose segments are separated by ``delimiter``.
    :param min_length: Minimum number of characters a segment (other than the
        first) must have to be kept. Defaults to 100.
    :param delimiter: Separator to split the input on. Defaults to '<<>>'.
    :param marker: Separator placed between kept segments. Defaults to '####'.
    :return: The first segment followed by each kept segment, joined by ``marker``.
    """
    first, *rest = cleaned_body.split(delimiter)

    # Keep only the segments that are long enough to be real paragraphs
    kept = [segment for segment in rest if len(segment) >= min_length]

    # A single join avoids rebuilding the string once per segment
    return marker.join([first, *kept])
    
    

In [10]:
short_body = removeShortTextEntries(cleaned_body)

In [11]:
short_body

'####An easy sundress and a chic pair of sandals will forever be a look that just screams “summer”. No matter your age nor your location, pairing dresses with sandals is simply a no-brainer. You know I love a good styling session – my posts on how to style wide-leg pants and how to style a midi skirt are evidence of this. So of course I couldn’t resist putting together some favorite dress and sandal combinations for summer. Today I’m highlighting everything from trendy platform sandals with mod dresses to minimalist chic sandals with slip dresses. There’s a little bit of everything because we all know sticking to just one style wouldn’t be any fun!####While you know I love to mix and match separates, dresses are a must-have for me, especially when the temps soar. Why? Simply put, dresses are so easy! You can quickly throw on a great statement earring or some simple layered necklaces and a great flat, and just like that, your outfit is complete. This combination also makes packing a bre

In [12]:
from f1_utilities import WebSection
from typing import Dict, List

# Split the filtered body back into its individual sections on the '####' marker.
# The resulting list can contain empty strings.
sections = short_body.split('####')

# Wrap every non-empty section in a WebSection, skipping blank entries.
webSections: List[WebSection] = [
    WebSection(text=section_text) for section_text in sections if section_text
]

In [13]:
webSections

[WebSection(text='An easy sundress and a chic pair of sandals will forever be a look that just screams “summer”. No matter your age nor your location, pairing dresses with sandals is simply a no-brainer. You know I love a good styling session – my posts on how to style wide-leg pants and how to style a midi skirt are evidence of this. So of course I couldn’t resist putting together some favorite dress and sandal combinations for summer. Today I’m highlighting everything from trendy platform sandals with mod dresses to minimalist chic sandals with slip dresses. There’s a little bit of everything because we all know sticking to just one style wouldn’t be any fun!'),
 WebSection(text='While you know I love to mix and match separates, dresses are a must-have for me, especially when the temps soar. Why? Simply put, dresses are so easy! You can quickly throw on a great statement earring or some simple layered necklaces and a great flat, and just like that, your outfit is complete. This combi

### Cost Estimation for the embeddings

In [14]:
import tiktoken

# Model used for chat completions, and the tokenizer that matches it
chat_model = "gpt-3.5-turbo"
enc = tiktoken.encoding_for_model(chat_model)

# Tokenizer matching the embedding model; used only for the cost estimate below
embedding_enc = tiktoken.encoding_for_model("text-embedding-ada-002")

# Total number of tokens across every section that would be embedded
total_tokens = sum(len(embedding_enc.encode(str(text))) for text in sections)

# $0.0004 per 1000 tokens
cost = total_tokens * (0.0004 / 1000)
print(f"Estimated Cost ${cost:.5f}")

Estimated Cost $0.00043


In [15]:
from utilities import num_tokens_from_messages, get_embedding, get_n_nearest_neighbors, memoize_to_sqlite

# Map each non-empty section to its embedding vector.
# NOTE(review): the keys come from `sections` (plain strings produced by
# str.split), not from `webSections`, even though the annotation names
# WebSection - confirm which type is intended.
embeddings: Dict[WebSection, List[float]] = dict(
    (text, get_embedding(text)) for text in sections if text
)

Cached result found for 4130c940e29bb6082e14e95cee3647875c60a06b68f393f0584d4d4def53fccf. Returning it.
Cached result found for 313efecf80bc91d8f874c1abbadac458f17e5f5ce7c4547b54a86f39a472e203. Returning it.
Cached result found for 20cc511c75e5195625cf2c7ccf40ce467c6c5104fa41b76b3ccbe5bac83e268c. Returning it.
Cached result found for d5dc51142d951f869b81cef80bf5bfe5209944ba340724038b7fc6e311eb3314. Returning it.
Cached result found for ed5dda531bad7047ff25f41c7453360f6a8367735bb01a64fecb778256271b78. Returning it.
Cached result found for 1313fe29741c73723d0032e2e3d6b2d9802e3a19d782caceb2e25adf1a244a78. Returning it.
Cached result found for 0ccd8807773543e2dee6d68b1b539ea00be7cbd503fad0cd31b10d941bdefcd1. Returning it.
Cached result found for b0f33c3d898c3aca8389172fe7082ac16eb4de77042c2d0cc36875c3f6a3f1bb. Returning it.
Cached result found for de9e3f79a86f4b7f17f404b2380fdbb919c5a4d66b7e6dfd687ec2197cb95725. Returning it.
Cached result found for cde2bc487fb3c6f592e40c4d8e17301680d907a2

In [16]:
# embeddings

In [17]:
def get_messages(context: List[WebSection], question: str) -> List[Dict[str, str]]:
    """Build the chat messages for a context-grounded question.

    :param context: Sections to offer the model as context; each one is
        rendered under its own "Body:" heading.
    :param question: The question the model should answer.
    :return: Two messages - a system message carrying the answering rules and a
        user message carrying the rendered context followed by the question.
    """
    # Render each section under a "Body:" heading, separated by blank lines
    body_blocks = []
    for section in context:
        body_blocks.append(f"Body:\n{section}")
    context_str = "\n\n".join(body_blocks)

    system_prompt = """
You will receive a question from the user and some context to help you answer the question.

Evaluate the context and provide an answer if you can confidently answer the question.

If you are unable to provide a confident response, kindly state that it is the case and explain the reason.

Prioritize offering an "I don't know" response over conveying potentially false information.

The user will only see your response and not the context you've been provided. Thus, respond in precise detail, directly repeating the information that you're referencing from the context.
""".strip()

    user_prompt = f"""
Using the following information as context, I'd like you to answer a question.

{context_str}

Please answer the following question: {question}
""".strip()

    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

In [18]:
from typing import Optional


# Token budget used by ask_embedding_store when choosing how much context to send.
# NOTE(review): 4097 is presumably the context window of `chat_model` - confirm
# whenever the model is changed.
MAX_CONTEXT_WINDOW = 4097
# Tokens kept out of the prompt budget (per the name, room for the model's reply).
MINIMUM_RESPONSE_SPACE = 1000
# Largest prompt, in tokens, that ask_embedding_store will build.
MAX_PROMPT_SIZE = MAX_CONTEXT_WINDOW - MINIMUM_RESPONSE_SPACE


def ask_embedding_store(question: str, embeddings: Dict[WebSection, List[float]], max_documents: int) -> str:
    """
    Fetch necessary context from our embedding store, striving to fit the top max_documents
    into the context window (or fewer if the total token count exceeds the limit)

    Documents are taken in the order returned by get_n_nearest_neighbors and added
    until the next one would push the prompt past MAX_PROMPT_SIZE. Only documents
    that fit the budget are sent; if none fit, the question is asked with no context.

    :param question: The question to ask
    :param embeddings: A dictionary of Section objects to their corresponding embeddings
    :param max_documents: The maximum number of documents to use as context
    :return: GPT's response to the question given context provided in our embedding store
    """
    query_embedding = get_embedding(question)
    nearest_neighbors = get_n_nearest_neighbors(query_embedding, embeddings, max_documents)

    # Tokens consumed by the prompt scaffolding alone (system message + question),
    # measured by building the messages with an empty context.
    base_token_count = num_tokens_from_messages(get_messages([], question), chat_model)
    token_budget = MAX_PROMPT_SIZE - base_token_count

    # Walk the neighbours in order, keeping documents while the running token
    # total stays within budget. The previous slice of `count + 1` always
    # admitted one document beyond the budget.
    # NOTE(review): `.replace` means each document must be string-like here,
    # although the signature annotates the keys as WebSection - confirm.
    context = []
    tokens_used = 0
    for document, _ in nearest_neighbors:
        tokens_used += len(enc.encode(document.replace("\n", " ")))
        if tokens_used > token_budget:
            break
        context.append(document)

    messages = get_messages(context, question)

    result = openai.ChatCompletion.create(model=chat_model, messages=messages)
    return result.choices[0].message["content"]

In [21]:
ask_embedding_store("Can you tell me 3 popular types of sandals and explain why they are popular", embeddings, 5)

Computing embedding for Can you tell me 3 popular types of sandals and exp


'Yes, based on the context provided, three popular types of sandals are:\n\n1. Slides: Slides are popular because they are easy to wear and can be versatile. They can be dressed up or down and come in various designs such as classic and feminine with bows. They are also available in different colors including on-trend raffia option and are budget-friendly.\n\n2. Minimalist sandals: This summer, minimalist sandals have become popular. This is because they offer simplicity and sleekness in an outfit, and can easily complement any outfit. Minimalist sandals come in various colors such as black and tan, with some having an extra detail like the black pearl sandal.\n\n3. Platforms: Platforms are another popular type of sandal because they can add height and elongate any look. They look great paired with mod dress silhouettes which are also trending at the moment. Platforms come in different styles including trendy ones and can be suitable for different age groups.'