## 1. Creating Embeddings

In [20]:
import csv
import umap.umap_ as umap
from scipy import spatial
from sklearn.preprocessing import StandardScaler

In [21]:
from requests.exceptions import HTTPError, ConnectionError, Timeout
from openai import OpenAI
import tiktoken
from itertools import islice
import numpy as np
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type

In [22]:
import requests
from bs4 import BeautifulSoup

# URL of the page
url = "https://www.paulgraham.com/articles.html"

# Fetch the content of the page
response = requests.get(url)
html_content = response.content

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Find all links to blog posts
# Assuming the blog post links are in <a> tags within <td> tags
links = soup.find_all('a', href=True)

# Extract and print the URLs
blog_post_urls = []
for link in links:
    href = link['href']
    # Check if the link is a relative URL, then prepend the base URL
    if not href.startswith('http'):
        href = url.rsplit('/', 1)[0] + '/' + href
    blog_post_urls.append(href)

links = []

# Save the blog post URLs
for post_url in blog_post_urls:
    links.append(post_url)


In [47]:
# EMBEDDING A LINK

import math

# Get num tokens
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

# Set up OpenAI


# Do embeddings with batching

# Vanilla embedding function
def get_embedding(text_or_tokens, model=EMBEDDING_MODEL):
    response = client.embeddings.create(input=text_or_tokens, model=model)
    return response.data[0].embedding

# # Breaks up a sequence into chunks
# def batched(iterable, n):
#     """Batch data into tuples of length n. The last batch may be shorter."""
#     # batched('ABCDEFG', 3) --> ABC DEF G
#     if n < 1:
#         raise ValueError('n must be at least one')
#     it = iter(iterable)
#     while (batch := tuple(islice(it, n))):
#         yield batch

# # Encodes string into tokens, break into tokens
# def chunked_tokens(text, encoding_name, chunk_length):
#     encoding = tiktoken.get_encoding(encoding_name)
#     tokens = encoding.encode(text)
#     chunks_iterator = batched(tokens, chunk_length)
#     yield from chunks_iterator

def chunk_tokens(text, encoding_name, chunk_length):
    chunked_tokens = []
    encoding = tiktoken.get_encoding(encoding_name)
    tokens = encoding.encode(text)
    num_chunks = math.ceil(len(tokens) / chunk_length)
    for i in range(num_chunks):
        start = i * chunk_length
        end = min((i + 1) * chunk_length, len(text))
        chunk = tokens[start:end]
        chunked_tokens.append(chunk)
    return chunked_tokens
     
# Get safe embedding
def len_safe_get_embedding(text, model=EMBEDDING_MODEL, max_tokens=EMBEDDING_CTX_LENGTH, encoding_name=EMBEDDING_ENCODING, average=True):
    
    chunked_tokens = chunk_tokens(text, encoding_name=EMBEDDING_ENCODING, chunk_length=EMBEDDING_CTX_LENGTH)
    # chunk_embeddings = client.embeddings.create(input = chunked_tokens, model=model).data[0].embedding
    chunk_embeddings = client.embeddings.create(input=chunked_tokens, model=EMBEDDING_MODEL).data
    chunk_embeddings = [chunk.embedding for chunk in chunk_embeddings]
    chunk_lens = [len(chunk) for chunk in chunked_tokens]

    # for chunk in chunked_tokens(text, encoding_name=encoding_name, chunk_length=max_tokens):
    #     chunk_embeddings.append(get_embedding(chunk, model=model))
    #     chunk_lens.append(len(chunk))

    if average:
        chunk_embeddings = np.average(chunk_embeddings, axis=0, weights=chunk_lens)
        chunk_embeddings = chunk_embeddings / np.linalg.norm(chunk_embeddings)  # normalizes length to 1
        chunk_embeddings = chunk_embeddings.tolist()
    return chunk_embeddings

In [52]:
# EMBEDDING LINKS ------------------------------------------------------

import concurrent.futures

@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), retry=retry_if_exception_type((ConnectionError, Timeout)))
def robust_get(url):
    return requests.get(url)

def process_link(url):
    print(f"Processing URL: {url}")
    try:
        link_response = robust_get(url)
        if link_response.status_code == 200:
            html = link_response.text
            soup = BeautifulSoup(html, 'html.parser')
            text = ' '.join(soup.stripped_strings)
            if text:
                return len_safe_get_embedding(text, model="text-embedding-3-small")
    except Exception as e:
        print(f"Failed to process URL {url}: {e}")
    return None

# with concurrent.futures.ProcessPoolExecutor() as executor:
#         for number, prime in zip(PRIMES, executor.map(is_prime, PRIMES)):
#             print('%d is prime: %s' % (number, prime))

# for openai
def fetch_and_process_pages(links):
    embeddings_list = []
    with concurrent.futures.ProcessPoolExecutor() as executor:
        embeddings_list = executor.map(process_link, links)
    embeddings_array = np.array(embeddings_list)
    # average_embedding = np.mean(embeddings_array, axis=0)
    # normalized_average_embedding = average_embedding / np.linalg.norm(average_embedding)
    return embeddings_array

# list of strings
def fetch_and_process_text(links):
    links_text = []
    for url in links:
        link_response = robust_get(url)
        if link_response.status_code == 200:
            html = link_response.text
            soup = BeautifulSoup(html, 'html.parser')
            text = ' '.join(soup.stripped_strings)
            if text:
                links_text.append(text)
    # average_embedding = np.mean(embeddings_array, axis=0)
    # normalized_average_embedding = average_embedding / np.linalg.norm(average_embedding)
    return links_text

In [46]:
sample_text = fetch_and_process_text(['https://www.paulgraham.com/reddits.html'])
sample_text[0] *= 6
chunks = chunked_tokens(sample_text[0], encoding_name=EMBEDDING_ENCODING, chunk_length=EMBEDDING_CTX_LENGTH)
print(len(chunks[0]))
print(len(chunks))
chunk_embeddings = client.embeddings.create(input = chunks, model=EMBEDDING_MODEL).data
chunk_embeddings = [chunk.embedding for chunk in chunk_embeddings]
print(chunk_embeddings)

8191
2
[[0.042258914560079575, -0.027704106643795967, -0.08932778239250183, 0.03520014509558678, 0.048974115401506424, -0.017849940806627274, -0.0011126931058242917, 0.015007692389190197, -0.022737981751561165, 0.0381985604763031, 0.030593205243349075, -0.022613046690821648, -0.017006635665893555, -0.007386720739305019, -0.002584571484476328, -0.013024365529417992, -0.03223296254873276, -0.0573134571313858, -0.042415082454681396, 0.007242265623062849, 0.04363318532705307, 0.03663688525557518, -0.004431251436471939, 0.029734283685684204, 0.007574121467769146, 0.049536317586898804, -0.026111198589205742, 0.03541877865791321, 0.02397170476615429, -0.039291732013225555, 0.02625175006687641, -0.039853934198617935, -0.05834416300058365, -0.0022097695618867874, 0.004204808734357357, 0.018615160137414932, 0.01173598412424326, -0.0473812073469162, 0.055002178996801376, 0.03541877865791321, 0.019474081695079803, -0.04363318532705307, 0.013071215711534023, -0.00537606468424201, -0.017959257587790

In [53]:
# Create a dictionary to store embeddings for each person
pg_embeddings = fetch_and_process_pages(links)

Process SpawnProcess-11:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/concurrent/futures/process.py", line 249, in _process_worker
    call_item = call_queue.get(block=True)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'process_link' on <module '__main__' (built-in)>
Process SpawnProcess-12:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.

OSError: handle is closed

## 2. Semantic Search

In [None]:
query_text = "Why is Sam Altman good?"

query_embeddings = np.array([len_safe_get_embedding(query_text, model="text-embedding-3-small")])

def cosine_similarity(vector1, vector2):
    """
    Compute the cosine similarity between two vectors.

    Parameters:
    vector1 (numpy array): The first vector.
    vector2 (numpy array): The second vector.

    Returns:
    float: The cosine similarity between the two vectors.
    """
    dot_product = np.dot(vector1, np.transpose(vector2))
    norm_vector1 = np.linalg.norm(vector1)
    norm_vector2 = np.linalg.norm(vector2)
    return dot_product / (norm_vector1 * norm_vector2)

similarities = cosine_similarity(query_embeddings, pg_embeddings)
retrieved_doc_id = np.argmax(similarities)
print(links[retrieved_doc_id])

https://www.paulgraham.com/5founders.html


In [None]:
from mixedbread_ai.client import MixedbreadAI

mxbai = MixedbreadAI(api_key={YOUR_API_KEY_HERE})

def get_embeddings(texts, model, prompt=None):
    res = mxbai.embeddings(
        input=texts,
        model=model,
        prompt=prompt
    )
    embeddings = [entry.embedding for entry in res.data]
    return np.array(embeddings)

model_name = "mixedbread-ai/mxbai-embed-large-v1"

corpus_texts = fetch_and_process_text(links)
print(corpus_texts)

corpus_embeddings = get_embeddings(corpus_texts, model_name)

KeyboardInterrupt: 

In [None]:
query_text = "Why is Sam Altman good?"
query_embeddings = get_embeddings([query_text], model_name)
similarities = cosine_similarity(query_embeddings, corpus_embeddings)
retrieved_doc_id = np.argmax(similarities)
print(corpus_texts[retrieved_doc_id])

Some Heroes April 2008 There are some topics I save up because they'll be so much fun to
write about.  This is one of them: a list of my heroes. I'm not claiming this is a list of the n most admirable people.
Who could make such a list, even if they wanted to? Einstein isn't on the list, for example, even though he probably
deserves to be on any shortlist of admirable people.  I once asked
a physicist friend if Einstein was really as smart as his fame
implies, and she said that yes, he was.  So why isn't he on the
list?  Because I had to ask.  This is a list of people who've
influenced me, not people who would have if I understood their work. My test was to think of someone and ask "is this person my
hero?"  It often returned surprising answers.  For example,
it returned false for Montaigne, who was arguably the inventor of
the essay.  Why?  When I thought
about what it meant to call someone a hero, it meant I'd decide what
to do by asking what they'd do in the same situation.  That's 