In [1]:
from openai import OpenAI
from datetime import datetime
import hashlib
import re
import os
from tqdm import tqdm
import numpy as np

import logging

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [2]:
pinecone_key = os.environ.get('PINECONE_API_KEY', False)
if not pinecone_key:
    print("PROBLEM: no PINECONE_API_KEY in environment")
api_key=os.environ.get("OPENAI_API_KEY", False)
if api_key:
    client = OpenAI(api_key=api_key)
else:
    print("PROBLEM: no OPENAI_API_KEY in environment")

PINECONE_INDEX = 'pinecone-ttc'
NAMESPACE = 'default'
EMBEDDING_MODEL = 'text-embedding-ada-002'

In [3]:
from pinecone import Pinecone, PodSpec

pc = Pinecone(api_key=pinecone_key, environment="gcp-starter")

In [4]:
def get_response_from_documents(documents, model=EMBEDDING_MODEL):
    return client.embeddings.create(input = documents, model = model)

def get_embeddings_from_response(resp):
    lst = list()
    for item in resp.data:
        lst.append(item.embedding)
    return lst

def get_embedding_from_document(document):
    return get_embeddings_from_response(get_response_from_documents([document]))[0]
    
embedding_dimension = len(get_embedding_from_document('these are some words to test the embedding dimension'))

In [5]:
list_indexes = pc.list_indexes()
list_names = [x['name'] for x in list_indexes.indexes]

In [6]:
if not PINECONE_INDEX in list_names:
    pc.create_index(name=PINECONE_INDEX, dimension=embedding_dimension, metric='cosine', spec=PodSpec(environment='gcp-starter', pod_type='p1.x1'))

pinecone_index = pc.Index(PINECONE_INDEX)

In [7]:
def hashlib_sha(somestring):
    sha = hashlib.sha256()
    sha.update(somestring.encode())
    return sha.hexdigest()

print(hashlib_sha('Make a hash of this string'))

768d2bb50eca2cb375e681635b7a4082b65a8868d1be9fae718d26382d5c947b


In [8]:
def upsert_documents(documents, model=EMBEDDING_MODEL, namespace=NAMESPACE):
    now = datetime.utcnow()

    embeddings = get_embeddings_from_response(get_response_from_documents(documents, model=model))

    pinecone_list = list()
    for emb, doc in zip(embeddings, documents):
        pinecone_list.append(
            {
                "id": hashlib_sha(doc),
                "values": emb,
                "metadata": {"doc": doc, "timestamp": now}
            }
        ) 

    total_upserted = pinecone_index.upsert(
        vectors=pinecone_list,
        namespace=namespace
    )['upserted_count']

    return total_upserted

In [9]:
import re
from bs4 import BeautifulSoup
from urllib.request import urlopen

ttc_lst = list()
with urlopen('https://en.wikisource.org/wiki/Translation:Tao_Te_Ching') as response:
    soup = BeautifulSoup(response, 'html.parser')
    num = 1
    for anchor in soup.find_all('p'):
        if(re.match("[a-zA-Z]", anchor.text[0]) and "Note:" not in anchor.text):
            print(num)
            print("---------------")
            print(anchor.text)
            num += 1
            ttc_lst.append(str(num) + ". " + anchor.text)
            print("---------------")
        if("Truthful words are not pleasant," in anchor.text):
            break

1
---------------
The Dao that can be stated, is not the eternal Dao;
The name that can be named is not the eternal name.
The unnamed is the origin of heaven and earth;
The named is the mother of the myriad things.
Therefore,
Constantly having no desire in order to view its commencement;
Constantly having desire in order to view its termination.
These two have the same origin, but they differ in name;
Both are called Mystery.
Mystery after Mystery, is the gate to all wonders.

---------------
2
---------------
Everyone knows what beauty is;
   That is because there is ugliness;
Everyone knows what goodness is;
   That is because there is evil.
Therefore, 
   Being and nothing give birth to one another,
   Hard and easy are mutually formed,
   Long and short shape each other,
   High and low complement each other,
   Music and voice are harmonized with each other,
   Front and back follow one another.
Hence,
   The sage focuses on non-action in his works,
   Practices not-saying in his 

In [10]:
upsert_documents(ttc_lst)

  now = datetime.utcnow()


81

In [13]:
def choose_document_from_pinecone(query, top_k=3):
    # get embedding from THE SAME embedder as the documents
    query_embedding = get_embedding_from_document(query)

    return pinecone_index.query(
      vector=query_embedding,
      top_k=top_k,
      namespace=NAMESPACE,
      include_metadata=True
    ).get('matches')

# This may require a slight wait, while pinecone does the upsert
choose_document_from_pinecone('What is the Tao?', top_k = 1)

[{'id': '4d75c2066d99d1a3fae1b9a40098de023b8bcee195616c18e1bf1f5f72267d72',
  'metadata': {'doc': '5. The Dao is empty, \n'
                      'But when using it, it is impossible to use it up.\n'
                      'It is profound, seems like the root of the myriad '
                      'things. \n'
                      'Blunts its own sharpness.\n'
                      'Unravels its own fetters.\n'
                      'Harmonises its own light.\n'
                      'Mixes with its own dust.\n'
                      'It is unclear, but seems to have existed there. \n'
                      'I do not know whose son it is,\n'
                      'Maybe it was already created before the creator.\n',
               'timestamp': '2024-01-18T20:59:11.469012'},
  'score': 0.865768135,
  'values': []}]

In [14]:
print("Enter your question:")
x = input()
pinecone_response = choose_document_from_pinecone(x)
text = pinecone_response[0]["metadata"]["doc"]
query = f"""
I have a question as well as a quote from the Tao Te Ching.  Please answer the question using the quote, and explain how the quote
suggests your answer.  The question is: {x}
And the quote from the Tao Te Ching is:
-----------
{text}
-----------
Now please answer the question with reference to the information in the quote."""
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": query,
        }
    ],
    model="gpt-3.5-turbo",
)
print("ANSWER TO YOUR QUESTION")
print(chat_completion)
print("I USED THE FOLLOWING QUOTE FROM THE TAO TE CHING")
print(text)

Enter your question:


 Do students learn better by reading books or by performing homework exercises?


ANSWER TO YOUR QUESTION
ChatCompletion(id='chatcmpl-8iTmXFC0ZgYg7flYjPAG4sclKzLoa', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='According to the quote from the Tao Te Ching, students may learn better by performing homework exercises rather than just reading books. The quote emphasizes the importance of practice and taking action. It suggests that through practice (in this case, doing homework exercises), there is always something to gain, implying that active engagement with the material leads to growth and learning. On the other hand, the quote implies that those who are preoccupied (potentially with just reading books) cannot master the universe (representing knowledge or understanding). \n\nIn applying this to the question at hand, students may benefit more from performing homework exercises rather than solely relying on reading books. While reading books can provide valuable information, it is through practical application, s

In [125]:
import hashlib

def delete_documents(documents, namespace=NAMESPACE):
    hashes = [hashlib_sha(doc) for doc in documents]
    return pinecone_index.delete(ids=hashes, namespace=namespace)

delete_documents(ttc_lst)

In [None]:
choose_document_from_pinecone("This should produce no output, because