### SAP Machine Learning Embedding in OpenAI - step 00
##### Author: Sergiu Iatco. May, 2023
https://people.sap.com/iatco.sergiu <br>
https://www.linkedin.com/in/sergiuiatco/ <br>

#### Resources:
https://pypi.org/project/gpt-index/ <br>
https://github.com/jerryjliu/llama_index/blob/main/examples/langchain_demo/LangchainDemo.ipynb <br>
https://github.com/jerryjliu/llama_index/tree/main/examples <br>
https://github.com/jerryjliu/llama_index/blob/main/examples/vector_indices/SimpleIndexDemo-ChatGPT.ipynb <br>
https://gpt-index.readthedocs.io/en/stable/reference/service_context.html <br>
https://gpt-index.readthedocs.io/en/stable/reference/service_context/embeddings.html <br>
https://gpt-index.readthedocs.io/en/stable/getting_started/starter_example.html (storing and loading the index) <br>
https://gpt-index.readthedocs.io/en/latest/guides/primer/usage_pattern.html <br>

In [1]:
# !pip install llama-index

In [2]:
import os
from IPython.core.debugger import set_trace
# os.environ["OPENAI_API_KEY"] = '<OPENAI_API_KEY>'

In [3]:
import os
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader
from llama_index import StorageContext, load_index_from_storage
import shutil
import pathlib

import logging
import sys

# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Ask for CRITICAL as the root logger level and send records to stdout.
# basicConfig() does nothing when the root logger already has handlers.
logging.basicConfig(stream=sys.stdout, level=logging.CRITICAL)
# NOTE(review): this attaches a second stdout handler to the root logger, in
# addition to the one basicConfig() may have installed. That looks like the
# reason every record appears twice in the cell output (once with the
# "LEVEL:logger:" prefix, once as the bare message) - confirm it is intended.
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# There are five standard levels for logging in Python, listed here in increasing order of severity:
# DEBUG: Detailed information, typically of interest only when diagnosing problems.
# INFO: Confirmation that things are working as expected.
# WARNING: An indication that something unexpected happened or indicative of some problem in the near future (e.g., ‘disk space low’). The software is still working as expected.
# ERROR: Due to a more serious problem, the software has not been able to perform some function.
# CRITICAL: A very serious error, indicating that the program itself may be unable to continue running.

class llama_context():
    """Convenience wrapper around llama_index for one project folder.

    Directory layout (relative to ``path``):

    * ``data``    - documents read by :meth:`load_data`; filled by the
      ``copy_*`` methods and removed by :meth:`del_data_dir`.
    * ``storage`` - persisted index written by :meth:`save_index` and read by
      :meth:`load_index`; created when the object is constructed.

    Typical flow: copy files -> ``load_data`` -> ``estimate_cost`` ->
    ``create_vector_store`` -> ``save_index`` (or ``load_index`` instead of the
    previous steps) -> ``start_query_engine`` -> ``post_question``.
    """

    def __init__(self, path=None):
        """Set up directory names and pricing constants.

        Args:
            path: Base folder of the project. ``None`` means an empty base
                path, i.e. ``data`` and ``storage`` are relative to the
                current working directory.
        """
        self.path = path if path is not None else ''

        perisit_sub_dir = "storage"
        # The attribute keeps its historical spelling because callers read it.
        self.perisit_dir = os.path.join(self.path, perisit_sub_dir)
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.perisit_dir, exist_ok=True)

        data_sub_dir = "data"
        self.data_dir = os.path.join(self.path, data_sub_dir)
        # Next numeric prefix handed out by copy_path_from_to_data_dir().
        self.data_dir_counter = 0

        # Labels and prices used only by the cost estimate
        # (price per 1k tokens; see https://openai.com/pricing).
        self.cost_model_ada = "ada"
        self.cost_model_davinci = "davinci"
        self.price_ada_1k_tokens = 0.0004
        self.price_davinci_1k_tokens = 0.03

    def load_data(self):
        """Read every document in ``data_dir`` into ``self.documents``."""
        self.documents = SimpleDirectoryReader(self.data_dir).load_data()
        print(f"Documents loaded: {len(self.documents)}.")

    def create_vector_store(self):
        """Build ``self.index`` from ``self.documents``."""
        self.index = GPTVectorStoreIndex.from_documents(self.documents)
        print("GPTVectorStoreIndex complete.")

    def save_index(self):
        """Persist ``self.index`` into the storage directory."""
        self.index.storage_context.persist(persist_dir=self.perisit_dir)
        print(f"Index saved in path {self.perisit_dir}.")

    def load_index(self):
        """Restore ``self.index`` from the storage directory."""
        storage_context = StorageContext.from_defaults(persist_dir=self.perisit_dir)
        self.index = load_index_from_storage(storage_context)

    def start_query_engine(self):
        """Create ``self.query_engine`` from ``self.index``."""
        self.query_engine = self.index.as_query_engine()
        print("Query_engine started.")

    def post_question(self, question, sleep=None):
        """Send ``question`` to the query engine and keep the answer.

        The raw response object is stored in ``self.response_cls`` and its
        text in ``self.response``.

        Args:
            question: Text passed to ``self.query_engine.query``.
            sleep: Value recorded in ``self.sleep`` (``None`` -> ``0``).
                This method only stores it; no delay is performed here.
        """
        # Previously a caller-supplied value was dropped and self.sleep was
        # only ever assigned when sleep was None.
        self.sleep = 0 if sleep is None else sleep
        self.response_cls = self.query_engine.query(question)
        self.response = self.response_cls.response

    def del_data_dir(self):
        """Delete ``data_dir`` with everything in it; report errors, do not raise."""
        path = self.data_dir
        try:
            shutil.rmtree(path)
        except OSError as error:
            print(f"Error deleting {path}: {error}")
        else:
            print(f"{path} deleted successfully!")

    def copy_file_to_data_dir(self, file_extension='.txt', verbose=0):
        """Copy files with ``file_extension`` from ``path`` (non-recursive) into ``data_dir``.

        Args:
            file_extension: File name suffix to select, e.g. ``'.txt'``.
            verbose: ``1`` prints one line per copied file.
        """
        path_from = self.path
        path_to = self.data_dir

        os.makedirs(path_to, exist_ok=True)

        # os.listdir('') raises FileNotFoundError, so an empty base path
        # (the default) has to be listed as the current directory.
        for filename in os.listdir(path_from or os.curdir):
            if not filename.endswith(file_extension):
                continue
            source_path = os.path.join(path_from, filename)
            # A directory that happens to end with the extension cannot be
            # copied with shutil.copy; skip it instead of failing.
            if not os.path.isfile(source_path):
                continue
            shutil.copy(source_path, os.path.join(path_to, filename))
            if verbose == 1:
                print(f"File {filename} copied successfully!")

        # The reported number is the count of matching files now present in
        # the data directory.
        files_in_data_dir = sum(1 for _ in pathlib.Path(path_to).glob(f"*{file_extension}"))
        print(f"Files {files_in_data_dir} copied in {path_to}.")

    def copy_path_from_to_data_dir(self, path_from, file_extension='.txt', verbose=0):
        """Recursively copy matching files from ``path_from`` into ``data_dir``.

        Each copy is named ``<NNNNN>_<original name>`` where ``NNNNN`` is a
        zero-padded running number, so equally named files from different
        sub-folders do not overwrite each other. The numbering continues
        across calls through ``self.data_dir_counter``.

        Args:
            path_from: Root folder that is searched recursively.
            file_extension: File name suffix to select, e.g. ``'.txt'``.
            verbose: ``1`` prints one line per processed file.
        """
        path_to = self.data_dir  # default data folder for llama
        start_counter = self.data_dir_counter
        padding_n = 5

        os.makedirs(path_to, exist_ok=True)

        # Materialise and sort the matches before copying: the numbering
        # becomes reproducible and the lazy glob is no longer walked while
        # new files are being written. Directories matching the pattern are
        # dropped because shutil.copy cannot copy them.
        source_files = sorted(
            candidate
            for candidate in pathlib.Path(path_from).glob(f"**/*{file_extension}")
            if candidate.is_file()
        )

        files_copied_n = 0
        for counter, file in enumerate(source_files, start_counter):
            filename = file.name
            filename_with_index = f'{str(counter).zfill(padding_n)}_{filename}'
            file_to_data_dir = os.path.join(path_to, filename_with_index)
            shutil.copy(file, file_to_data_dir)

            if os.path.exists(file_to_data_dir):
                files_copied_n += 1
                if verbose == 1:
                    print(f"File {filename} -> copied successfully!")
            elif verbose == 1:
                print(f"File {filename} was not copied!")

        # Continue after the last number used; unchanged when nothing matched.
        self.data_dir_counter = start_counter + len(source_files)

        print(f"Files: {files_copied_n} copied to folder: {path_to}!")

    def estimate_tokens(self, text):
        """Estimate token count and cost of ``text``.

        Tokens are approximated as ``words / 0.75`` (truncated to int), where
        words are whitespace-separated chunks of ``text``.

        Returns:
            Tuple ``(tokens, cost_ada, cost_davinci)``; the costs are
            ``tokens / 1000`` times the respective per-1k price.
        """
        num_words = len(text.split())
        tokens = int(num_words / 0.75)
        tokens_1k = tokens / 1000
        cost_ada = tokens_1k * self.price_ada_1k_tokens
        cost_davinci = tokens_1k * self.price_davinci_1k_tokens
        return tokens, cost_ada, cost_davinci

    def estimate_cost(self):
        """Sum the estimates of all loaded documents, store and print them.

        Results are kept in ``self.total_tokens``, ``self.total_cost_ada``
        and ``self.total_cost_davinci``. Requires ``load_data`` to have run.
        """
        total_tokens = 0
        total_cost_ada = 0
        total_cost_davinci = 0
        costs_rounding = 8

        for doc in self.documents:
            tokens, cost_ada, cost_davinci = self.estimate_tokens(doc.get_text())
            total_tokens += tokens
            # Rounding the running totals keeps float noise out of the output.
            total_cost_ada = round(total_cost_ada + cost_ada, costs_rounding)
            total_cost_davinci = round(total_cost_davinci + cost_davinci, costs_rounding)

        self.total_tokens = total_tokens
        self.total_cost_ada = total_cost_ada
        self.total_cost_davinci = total_cost_davinci
        print(f"Total tokens: {self.total_tokens}")
        print(f"Total estimated costs with model {self.cost_model_ada}: ${self.total_cost_ada}")
        print(f"Total estimated costs with model {self.cost_model_davinci}: ${self.total_cost_davinci}")
        

In [4]:
import datetime

def time_now():
    """Print the current local date and time as ``YYYY-MM-DD HH:MM:SS``."""
    timestamp = f"{datetime.datetime.now():%Y-%m-%d %H:%M:%S}"
    print(timestamp)

time_now()

2023-05-16 16:02:37


In [5]:
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

path_llama = "llama_mvp"
lct = llama_context(path=path_llama)

display(lct.path)
display(lct.data_dir)
display(lct.perisit_dir)

'llama_mvp'

'llama_mvp\\data'

'llama_mvp\\storage'

In [6]:
%time
# Delete data directory
time_now()
run_create_save = True
if run_create_save:
    lct.del_data_dir()

CPU times: total: 0 ns
Wall time: 0 ns
2023-05-16 16:02:37
llama_mvp\data deleted successfully!


In [7]:
%time
time_now()
# Copy files from source to data directory
run_create_save = True
if run_create_save:
    path_from = "llama_mvp/source"
    
    lct.copy_path_from_to_data_dir(path_from) # default extension *.txt

CPU times: total: 0 ns
Wall time: 0 ns
2023-05-16 16:02:37
Files: 1 copied to folder: llama_mvp\data!


In [8]:
vars(lct).keys()

dict_keys(['path', 'perisit_dir', 'data_dir', 'data_dir_counter', 'cost_model_ada', 'cost_model_davinci', 'price_ada_1k_tokens', 'price_davinci_1k_tokens'])

In [9]:
%time
time_now()
# Load documents
run_create_save = True
if run_create_save:
    lct.load_data()

CPU times: total: 0 ns
Wall time: 0 ns
2023-05-16 16:02:37
Documents loaded: 1.


In [10]:
%time
time_now()
# Estimate costs
run_create_save = True
if run_create_save:
    lct.estimate_cost()

CPU times: total: 0 ns
Wall time: 0 ns
2023-05-16 16:02:37
Total tokens: 6
Total estimated costs with model ada: $2.4e-06
Total estimated costs with model davinci: $0.00018


In [11]:
# https://platform.openai.com/account/api-keys
%time
time_now()
# Creating the vector store embeds the documents, which consumes embedding tokens (billed by OpenAI)
run_create_save = True
if run_create_save:
    lct.create_vector_store()

CPU times: total: 0 ns
Wall time: 0 ns
2023-05-16 16:02:37


INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens


> [build_index_from_nodes] Total LLM token usage: 0 tokens


INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 7 tokens


> [build_index_from_nodes] Total embedding token usage: 7 tokens
GPTVectorStoreIndex complete.


In [12]:
%time
time_now()
# Save index
run_create_save = True
if run_create_save:
    lct.save_index()

CPU times: total: 0 ns
Wall time: 0 ns
2023-05-16 16:02:45
Index saved in path llama_mvp\storage.


In [13]:
%time
time_now()
# load_index() restores the index persisted by save_index() from the storage folder, so the documents do not have to be loaded and embedded again
run_load = True
if run_load:
    lct.load_index()

INFO:llama_index.indices.loading:Loading all indices.


CPU times: total: 0 ns
Wall time: 0 ns
2023-05-16 16:02:45
Loading all indices.


In [14]:
# help(lct.index.vector_store)

In [15]:
# dir(lct)

In [16]:
help(lct)

Help on llama_context in module __main__ object:

class llama_context(builtins.object)
 |  llama_context(path=None)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, path=None)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  copy_file_to_data_dir(self, file_extension='.txt', verbose=0)
 |  
 |  copy_path_from_to_data_dir(self, path_from, file_extension='.txt', verbose=0)
 |  
 |  create_vector_store(self)
 |  
 |  del_data_dir(self)
 |  
 |  estimate_cost(self)
 |  
 |  estimate_tokens(self, text)
 |  
 |  load_data(self)
 |  
 |  load_index(self)
 |  
 |  post_question(self, question, sleep=None)
 |  
 |  save_index(self)
 |  
 |  start_query_engine(self)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



In [17]:
lct.__dict__

{'path': 'llama_mvp',
 'perisit_dir': 'llama_mvp\\storage',
 'data_dir': 'llama_mvp\\data',
 'data_dir_counter': 1,
 'cost_model_ada': 'ada',
 'cost_model_davinci': 'davinci',
 'price_ada_1k_tokens': 0.0004,
 'price_davinci_1k_tokens': 0.03,
 'documents': [Document(text='Bogdan was born in 1990', doc_id='474e23e8-b69e-4649-ab16-707e3ed776bc', embedding=None, doc_hash='a472dc9c2915860c7858effa860ef5151281645eec14ecbea825b3ea608e1327', extra_info=None)],
 'total_tokens': 6,
 'total_cost_ada': 2.4e-06,
 'total_cost_davinci': 0.00018,
 'index': <llama_index.indices.vector_store.base.GPTVectorStoreIndex at 0x21f34d511c0>}

In [18]:
%time
time_now()
# Start query engine
lct.start_query_engine()

CPU times: total: 0 ns
Wall time: 0 ns
2023-05-16 16:02:45
Query_engine started.


In [19]:
len(lct.documents)

1

In [20]:
%time
time_now()
question = "What is content about?"
lct.post_question(question)
print(lct.response)

CPU times: total: 0 ns
Wall time: 0 ns
2023-05-16 16:02:46


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens


> [retrieve] Total LLM token usage: 0 tokens


INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 5 tokens


> [retrieve] Total embedding token usage: 5 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 54 tokens


> [get_response] Total LLM token usage: 54 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens


> [get_response] Total embedding token usage: 0 tokens

The content is about Bogdan and the year he was born.


In [21]:
question = "How old is he?"
lct.post_question(question)
print(lct.response)

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens


> [retrieve] Total LLM token usage: 0 tokens


INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 5 tokens


> [retrieve] Total embedding token usage: 5 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 49 tokens


> [get_response] Total LLM token usage: 49 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens


> [get_response] Total embedding token usage: 0 tokens

Bogdan is 30 years old.


In [22]:
question = "What date is today?"
lct.post_question(question)
print(lct.response)

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens


> [retrieve] Total LLM token usage: 0 tokens


INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 5 tokens


> [retrieve] Total embedding token usage: 5 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 50 tokens


> [get_response] Total LLM token usage: 50 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens


> [get_response] Total embedding token usage: 0 tokens

Today's date is August 8, 2020.


In [23]:
from datetime import date
today = date.today()

question = f"Consider current date {today}"
print(question)
lct.post_question(question)
print(lct.response)

Consider current date 2023-05-16


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens


> [retrieve] Total LLM token usage: 0 tokens


INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 9 tokens


> [retrieve] Total embedding token usage: 9 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 53 tokens


> [get_response] Total LLM token usage: 53 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens


> [get_response] Total embedding token usage: 0 tokens

Bogdan is 33 years old.


In [24]:
question = "Where is the name commonly used as a given name?"
lct.post_question(question)
print(lct.response)

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens


> [retrieve] Total LLM token usage: 0 tokens


INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 11 tokens


> [retrieve] Total embedding token usage: 11 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 71 tokens


> [get_response] Total LLM token usage: 71 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens


> [get_response] Total embedding token usage: 0 tokens

The name Bogdan is commonly used as a given name in Eastern European countries such as Romania, Bulgaria, and Ukraine.
