In [153]:
# Please replace pip with your version of pip.
# Tested with pip3.10

# pip install selenium
# pip install beautifulsoup4

In [154]:
import openai
import pandas as pd
import itertools

from dotenv import dotenv_values
# Read key/value settings from a local .env file.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine without editing it.
config = dotenv_values("/Users/julianboyce/Documents/work/gpt/.env")
# Hand the API key stored under OPENAI_API_KEY to the openai module.
openai.api_key = config["OPENAI_API_KEY"]

In [155]:
# Sanity check your api key is here
########### print(openai.api_key)

In [156]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Configure the Chrome session used to render the page.
options = Options()
#options.headless = True
#options.add_argument('--headless')  # uncomment to run without a visible browser window
options.add_argument("--window-size=1920,1200")

driver = webdriver.Chrome(options=options)
try:
    driver.get("https://www.cbre.com/insights/briefs/global-real-estate-investment-continues-to-fall-in-q1")
    # Capture the rendered HTML while the browser session is still alive.
    page_source = driver.page_source
finally:
    # Always shut the browser down, even when the page load raises,
    # so a failed run does not leave a Chrome process behind.
    driver.quit()

In [157]:
from bs4 import BeautifulSoup

# Parse the HTML captured from the browser session.
soup = BeautifulSoup(page_source, 'html.parser')

# Keep the <body> element as a plain string of markup.
# To inspect it, print `soup.find('body').prettify()`.
body = str(soup.find('body'))

In [158]:
# Tag names whose elements (together with everything inside them) are removed
# by removeUnwantedTags before the page text is extracted.
blacklist_tags = [
    'script',    # JavaScript scripts
    'noscript',  # Alternate content when JavaScript is not enabled
    'style',     # CSS styles
    'link',      # External resource links, often CSS
    'meta',      # Metadata about the HTML document
    'head',      # Container for all the head elements, may contain scripts, styles, meta information
    'footer',    # Footer of the document or a section
    'header',    # Container for introductory or navigation, can contain some data
    'form',      # HTML form for user input
    'iframe',    # Inline frame used to embed another document within the current HTML document
    'svg',       # Container for SVG graphics
    'path',      # SVG path
    'nav',       # Navigation links
    'input',     # Input control
    'button',    # Clickable button
    'textarea',  # Multiline input control
    'select',    # Drop-down list
    'option',    # Option in a drop-down list
    'label'      # Caption for an item in a user interface
]

In [159]:
def removeUnwantedTags(soup):
    """
    Strip every blacklisted element out of ``soup`` and return the remaining text.

    The elements are removed from ``soup`` itself (in place), so the caller's
    object is modified.

    :param soup: Parsed document to clean
    :return: The remaining text, with each text fragment stripped of surrounding
             whitespace and the fragments joined by '####'
    """
    unwanted = soup.find_all(blacklist_tags)
    for element in unwanted:
        # decompose() discards the element together with all of its contents
        element.decompose()

    return soup.get_text(separator='####', strip=True)

In [160]:
# Strip the blacklisted tags and flatten the remaining text into one '####'-separated string.
# Note: removeUnwantedTags decomposes the tags on `soup` itself, so `soup` is modified by this cell.
parsed_content = removeUnwantedTags(soup)

In [161]:
#parsed_content

In [162]:
def removeShortTextEntries(parsed_content, length, delimiter='####'):
    """
    Drop the short segments from a delimiter-separated string.

    The text is split on ``delimiter``; every segment with at least ``length``
    characters is kept, in its original order, and each kept segment is
    prefixed with ``delimiter`` in the result. The result therefore starts
    with the delimiter whenever at least one segment is kept, and is the
    empty string when none is.

    :param parsed_content: Text made of segments separated by ``delimiter``
    :param length: Minimum number of characters a segment needs to be kept
    :param delimiter: Separator between segments (must be non-empty); defaults
                      to '####', the separator produced by removeUnwantedTags
    :return: The kept segments, each preceded by ``delimiter``
    """
    kept_segments = [
        segment for segment in parsed_content.split(delimiter)
        if len(segment) >= length
    ]
    # Build the result in one pass instead of repeated string concatenation.
    return ''.join(delimiter + segment for segment in kept_segments)
    
    

In [163]:
# Keep only the text segments that are at least 100 characters long.
short_body = removeShortTextEntries(parsed_content, 100)

In [164]:
# short_body

In [165]:
from f1_utilities import WebSection
from typing import Dict, List

### Split the filtered body back into its individual text segments
sections = short_body.split('####')

## Wrap every non-blank segment in a WebSection
webSections: List[WebSection] = [WebSection(text=segment) for segment in sections if segment]

In [166]:
webSections

[WebSection(text='Global commercial real estate investment volume fell by 55% year-over-year in Q1 2023 to US$147 billion. Volume fell by 56% in the Americas, 64% in Europe and 20% in Asia-Pacific.'),
 WebSection(text='Investment totals for all property sectors fell year-over-year in Q1. Multifamily and office fell the most (-64%) and retail the least (-32%).'),
 WebSection(text='High interest rates, tight credit conditions and the prospect of a moderate recession in the U.S. will further reduce commercial real estate investment activity in 2023. However, activity should begin to pick up later in the year as inflation falls and economic uncertainty subsides.'),
 WebSection(text='Due to high interest rates and slow economic growth, CBRE forecasts that global investment volume will decrease by 26% in 2023.'),
 WebSection(text='Global commercial real estate investment fell by 55% year-over-year in Q1 to US$147 billion. Investment fell by 56% in the Americas, 64% in Europe and 20% in Asia-

### Cost Estimation for the embeddings

In [167]:
import tiktoken

# One tokenizer for the chat model and one for the embedding model.
chat_model = "gpt-3.5-turbo"
enc = tiktoken.encoding_for_model(chat_model)
embedding_enc = tiktoken.encoding_for_model("text-embedding-ada-002")

# Count the embedding-model tokens across every text segment.
total_tokens = sum(len(embedding_enc.encode(str(section))) for section in sections)

# Priced at $0.0004 per 1000 tokens
cost = total_tokens * (0.0004 / 1000)
print(f"Estimated Cost ${cost:.5f}")

Estimated Cost $0.00048


In [168]:
from utilities import num_tokens_from_messages, get_embedding, get_n_nearest_neighbors, memoize_to_sqlite

# Map each non-empty text segment to the result of get_embedding for it.
# NOTE(review): the keys here are the plain strings from `sections`, not the
# WebSection objects in `webSections`, even though the annotation says
# WebSection — confirm which is intended.
embeddings: Dict[WebSection, List[float]] = {
    section: get_embedding(section) for section in sections if section
}

Cached result found for 980adebfda141f42bb934d540d7d7ccd57390f7e55fc3b6bb12f574b1524d1be. Returning it.
Cached result found for 107f47120ae4108a4d5c7ceae53ec828b7324c01eb26eeb7b452863d5b01c427. Returning it.
Cached result found for d33088522803d5039024a5256a7a8e1731cf3ca6fea907b324683d5087df1b98. Returning it.
Cached result found for 7f983fcb232cdbeda6488d103b9c3dccdb181013a86f08b983aae20c90470fd4. Returning it.
Cached result found for 0a5c4d3d2575f6448151c7c56fe6d340e03d7e93bef99bacc43956641b8eda7e. Returning it.
Cached result found for 9f4987d0bd8fd7f11aa29f1ed807efdc115bf47ae2d1488e94ae726f55d27b66. Returning it.
Cached result found for 1bf5dfd5a296609395793bba3689d0936e7643096759c6081cab6134c189b091. Returning it.
Cached result found for 00e1744d33e0292c6688fbf1a08ca26e630b269619ba4a29629b65237b4b7c87. Returning it.
Cached result found for cb93ebb041365862d0c61e31933a54e6efb7086c86f7af21ca7ea0c3170c0915. Returning it.
Cached result found for 13ad96bc87ab8ee60d7831ec343260bc131c3c86

In [169]:
#embeddings

In [170]:
def get_messages(context: List[WebSection], question: str) -> List[Dict[str, str]]:
    """
    Build the two-message chat prompt (system + user) for a question.

    :param context: Items to offer as context; each one is rendered under its
                    own "Body:" heading inside the user message
    :param question: The question to put to the model
    :return: A list with the system message followed by the user message
    """
    system_prompt = """
You will receive a question from the user and some context to help you answer the question.

Evaluate the context and provide an answer if you can confidently answer the question.

If you are unable to provide a confident response, kindly state that it is the case and explain the reason.

Prioritize offering an "I don't know" response over conveying potentially false information.

The user will only see your response and not the context you've been provided. Thus, respond in precise detail, directly repeating the information that you're referencing from the context.
""".strip()

    # Blank-line-separated blocks, one per context item.
    context_block = "\n\n".join(f"Body:\n{section}" for section in context)

    user_prompt = f"""
Using the following information as context, I'd like you to answer a question.

{context_block}

Please answer the following question: {question}
""".strip()

    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

In [171]:
from typing import Optional


# Token budget for a single chat request.
# NOTE(review): 4097 is presumably the context-window size of `chat_model` — confirm.
MAX_CONTEXT_WINDOW = 4097
# Tokens held back from the prompt, presumably to leave room for the model's reply.
MINIMUM_RESPONSE_SPACE = 1000
# Largest prompt size allowed once the response space is reserved.
MAX_PROMPT_SIZE = MAX_CONTEXT_WINDOW - MINIMUM_RESPONSE_SPACE


def ask_embedding_store(question: str, embeddings: Dict[WebSection, List[float]], max_documents: int) -> str:
    """
    Fetch necessary context from our embedding store, striving to fit the top max_documents
    into the context window (or fewer if the total token count exceeds the limit)

    :param question: The question to ask
    :param embeddings: A dictionary of Section objects to their corresponding embeddings
                       (each key must support ``.replace``, i.e. behave like a string)
    :param max_documents: The maximum number of documents to use as context
    :return: GPT's response to the question given context provided in our embedding store
    """
    query_embedding = get_embedding(question)

    # Sequence of (document, score) pairs; presumably ordered nearest-first — confirm
    # against get_n_nearest_neighbors.
    nearest_neighbors = get_n_nearest_neighbors(query_embedding, embeddings, max_documents)

    # Tokens used by the prompt scaffolding alone (both messages with no context),
    # and what is left of the prompt budget for the context documents.
    base_token_count = num_tokens_from_messages(get_messages([], question), chat_model)
    context_budget = MAX_PROMPT_SIZE - base_token_count

    # Running token total after adding each document in turn. The "Body:" wrapper and
    # separators added by get_messages are not counted here.
    token_counts = [len(enc.encode(document.replace("\n", " "))) for document, _ in nearest_neighbors]
    cumulative_token_counts = itertools.accumulate(token_counts)
    most_messages_we_can_fit = sum(1 for total in cumulative_token_counts if total <= context_budget)

    # Take exactly the documents that fit. The earlier slice used
    # `most_messages_we_can_fit + 1`, which pulled in one document beyond the budget.
    context = [document for document, _ in nearest_neighbors[:most_messages_we_can_fit]]

    messages = get_messages(context, question)

    result = openai.ChatCompletion.create(model=chat_model, messages=messages)
    return result.choices[0].message["content"]

In [172]:
ask_embedding_store("Can you give me a summary of the Q1 global real estate markets", embeddings, 5)

Cached result found for 590bce5bce31bf7cf431942695cc4b4ae3d6a74bbba9654f7127b71a69127694. Returning it.


'In Q1, global commercial real estate investment volume fell by 55% year-over-year to US$147 billion, with the Americas experiencing a 56% decrease, Europe experiencing a 64% decrease, and the Asia-Pacific region experiencing a 20% decrease. The decrease in investment activity is largely attributed to high interest rates, tight credit conditions, and a worsening economic outlook. All property sectors experienced declines in investment volume with multifamily and office sectors being the most affected, and retail being the least affected. CBRE forecasts a 26% reduction in global investment volume for the full year, with the Americas and Europe having larger decreases than APAC. Specifically, multifamily investment in Europe fell by 63% year-over-year in Q1 to US$8 billion, but it is expected to remain resilient due to strong fundamentals.'