In [110]:
from openai import OpenAI
from datetime import datetime
import hashlib
import re
import os
from tqdm import tqdm
import numpy as np

import logging

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [111]:
pinecone_key = os.environ.get('PINECONE_API_KEY', False)
if not pinecone_key:
    print("PROBLEM: no PINECONE_API_KEY in environment")
api_key=os.environ.get("OPENAI_API_KEY", False)
if api_key:
    client = OpenAI(api_key=api_key)
else:
    print("PROBLEM: no OPENAI_API_KEY in environment")

PINECONE_INDEX = 'pinecone-ttc'
NAMESPACE = 'default'
EMBEDDING_MODEL = 'text-embedding-ada-002'

In [112]:
from pinecone import Pinecone, PodSpec

pc = Pinecone(api_key=pinecone_key, environment="gcp-starter")

In [113]:
def get_response_from_documents(documents, model=EMBEDDING_MODEL):
    return client.embeddings.create(input = documents, model = model)

def get_embeddings_from_response(resp):
    lst = list()
    for item in resp.data:
        lst.append(item.embedding)
    return lst

def get_embedding_from_document(document):
    return get_embeddings_from_response(get_response_from_documents([document]))[0]
    
embedding_dimension = len(get_embedding_from_document('these are some words to test the embedding dimension'))

In [114]:
list_indexes = pc.list_indexes()
list_names = [x['name'] for x in list_indexes.indexes]

In [115]:
if not PINECONE_INDEX in list_names:
    pc.create_index(name=PINECONE_INDEX, dimension=embedding_dimension, metric='cosine', spec=PodSpec(environment='gcp-starter', pod_type='p1.x1'))

pinecone_index = pc.Index(PINECONE_INDEX)

In [116]:
def hashlib_sha(somestring):
    sha = hashlib.sha256()
    sha.update(somestring.encode())
    return sha.hexdigest()

print(hashlib_sha('Make a hash of this string'))

768d2bb50eca2cb375e681635b7a4082b65a8868d1be9fae718d26382d5c947b


In [117]:
def upsert_documents(documents, model=EMBEDDING_MODEL, namespace=NAMESPACE):
    now = datetime.utcnow()

    embeddings = get_embeddings_from_response(get_response_from_documents(documents, model=model))

    pinecone_list = list()
    for emb, doc in zip(embeddings, documents):
        pinecone_list.append(
            {
                "id": hashlib_sha(doc),
                "values": emb,
                "metadata": {"doc": doc, "timestamp": now}
            }
        ) 

    total_upserted = pinecone_index.upsert(
        vectors=pinecone_list,
        namespace=namespace
    )['upserted_count']

    return total_upserted

In [118]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
with urlopen('https://terebess.hu/english/tao/ron.html') as response:
    soup = BeautifulSoup(response, 'html.parser')
    started = False
    this_integ = False
    ttc_lst = list()
    accumulated = ""
    for anchor in soup.find_all('p'):
        next_text = anchor.text
        if next_text == "1":
            started = True
        try:
            this_integ = int(next_text)
        except ValueError:
            try:
                this_integ = int(next_text.split(".")[0])
            except ValueError:
                this_integ = False
        if started and not this_integ:
            if accumulated == "" and not next_text.isspace():
                accumulated = next_text
            elif accumulated != "" and not next_text.isspace():
                accumulated += next_text
            else:
                if "PART" not in accumulated:
                    ttc_lst.append(accumulated)
                accumulated = ""
    for i, elt in enumerate(ttc_lst):
        print(i+1)
        print(elt)
        print("-----")

1
If you can talk 
  about it,
  it ain't Tao.
  If it has a name,
  it's just another thing.Tao doesn't 
  have a name.
  Names are for ordinary things.Stop wanting 
  stuff. It keeps you from seeing what's real.
  When you want stuff, all you see are things.These two statements 
  have the same meaning.
  Figure them out, and you've got it made.
-----
2
If something 
  is beautiful, something else must be ugly.
  If something is good, something else must be bad.You can't have 
  something without nothing.
  If no task is difficult, 
  then no task is easy.
  Things are up high 
  because other things are down low.
  You know when you're listening to music 
  because you don't hear noise.
  And something else came first, so this must be next.The Masters 
  get the job done without moving a muscle
  and signify without saying a word.
  When things around them fall apart, they stay cool.
  They don't own much, 
  but they use whatever's at hand.
  They do the work without expecting any 

In [119]:
upsert_documents(ttc_lst)

  now = datetime.utcnow()


81

In [121]:
def choose_document_from_pinecone(query, top_k=3):
    # get embedding from THE SAME embedder as the documents
    query_embedding = get_embedding_from_document(query)

    return pinecone_index.query(
      vector=query_embedding,
      top_k=top_k,
      namespace=NAMESPACE,
      include_metadata=True
    ).get('matches')

# This may require a slight wait, while pinecone does the upsert
choose_document_from_pinecone('What is the Tao?', top_k = 1)

[{'id': '0d6b85428119191c5a2c9118d144306c61defa5ae88345536a62b050bf67f727',
  'metadata': {'doc': 'Tao is the source \r\n'
                      '  of all living things,\r\n'
                      '  and they are nourished \r\n'
                      "  by Tao's power.\r\n"
                      '  They are influenced \r\n'
                      '  by the other living things around them,\r\n'
                      '  and they are shaped \r\n'
                      '  by their circumstances.Everything respects \r\n'
                      '  Tao\r\n'
                      '  and honors its power.\r\n'
                      "  That's just the way it is.Tao gives life \r\n"
                      '  to all things, \r\n'
                      '  and its power watches out for them,\r\n'
                      '  cares for them, helps them grow, \r\n'
                      '  protects them, and comforts them.Create something \r\n'
                      '  \r\n'
                      '  without 

In [123]:
print("Enter your question:")
x = input()
pinecone_response = choose_document_from_pinecone(x)
text = pinecone_response[0]["metadata"]["doc"]
query = f"""
I have a question as well as a quote from the Tao Te Ching.  Please answer the question using the quote, and explain how the quote
suggests your answer.  The question is: {x}
And the quote from the Tao Te Ching is:
-----------
{text}
-----------
Now please answer the question with reference to the information in the quote."""
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": query,
        }
    ],
    model="gpt-3.5-turbo",
)
print("ANSWER TO YOUR QUESTION")
print(chat_completion)
print("I USED THE FOLLOWING QUOTE FROM THE TAO TE CHING")
print(text)

Enter your question:


 What is the ideal way to teach a student about machine learning?


ANSWER TO YOUR QUESTION
ChatCompletion(id='chatcmpl-8iQK12ZkDaep55R71RJKWKIRHbOfT', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='The ideal way to teach a student about machine learning is by following the principles outlined in the quote from the Tao Te Ching. The quote highlights the importance of practice, mastery, resourcefulness, and respect for both teachers and students.\n\nMachine learning requires a significant amount of practice and hands-on experience. By emphasizing the need for practice, the quote suggests that the ideal way to teach a student about machine learning is by providing them with ample opportunities to work on real-world problems, experiment with different algorithms, and gain practical experience. This hands-on approach will allow the student to develop the skills necessary to understand and apply machine learning concepts effectively.\n\nAdditionally, the quote emphasizes the importance of mastery and res

In [125]:
import hashlib

def delete_documents(documents, namespace=NAMESPACE):
    hashes = [hashlib_sha(doc) for doc in documents]
    return pinecone_index.delete(ids=hashes, namespace=namespace)

delete_documents(ttc_lst)

In [126]:
choose_document_from_pinecone('this should return nothing, because all documents have been deleted')

[]