### SAP Machine Learning Embedding in OpenAI - step 05
##### Author: Sergiu Iatco. May, 2023
https://people.sap.com/iatco.sergiu <br>
https://www.linkedin.com/in/sergiuiatco/ <br>

#### Collected content for embedding.
Blogs:<br>
https://blogs.sap.com/2022/11/07/sap-community-call-sap-hana-cloud-machine-learning-challenge-i-quit-how-to-prevent-employee-churn/
https://blogs.sap.com/2022/11/28/i-quit-how-to-predict-employee-churn-sap-hana-cloud-machine-learning-challenge/
https://blogs.sap.com/2022/12/22/sap-hana-cloud-machine-learning-challenge-2022-the-winners-are/

https://blogs.sap.com/2023/01/09/sap-hana-cloud-machine-learning-challenge-i-quit-understanding-metrics/

Documentation:<br>
https://help.sap.com/doc/1d0ebfe5e8dd44d09606814d83308d4b/2.0.04/en-US/hana_ml.dataframe.html
https://help.sap.com/doc/1d0ebfe5e8dd44d09606814d83308d4b/2.0.07/en-US/pal/algorithms/hana_ml.algorithms.pal.trees.HybridGradientBoostingClassifier.html

GitHub Notebooks:<br>
https://github.com/SAP-samples/hana-ml-samples/tree/main/Python-API/usecase-examples/sapcommunity-hanaml-challenge<br>
https://github.com/itsergiu/sapcommunity-hanaml-challenge<br>

In [1]:
# !pip install llama-index

In [2]:
import os
from IPython.core.debugger import set_trace
# os.environ["OPENAI_API_KEY"] = '<OPENAI_API_KEY>'

In [3]:
import os
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader
from llama_index import StorageContext, load_index_from_storage
import shutil
import pathlib

import logging
import sys

# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)

logging.basicConfig(stream=sys.stdout, level=logging.CRITICAL)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# There are five standard levels for logging in Python, listed here in increasing order of severity:
# DEBUG: Detailed information, typically of interest only when diagnosing problems.
# INFO: Confirmation that things are working as expected.
# WARNING: An indication that something unexpected happened or indicative of some problem in the near future (e.g., ‘disk space low’). The software is still working as expected.
# ERROR: Due to a more serious problem, the software has not been able to perform some function.
# CRITICAL: A very serious error, indicating that the program itself may be unable to continue running.

class llama_context():
    def __init__(self, path=None):
        
        if path!=None:
            self.path = path
        else:
            self.path = ''
        
        perisit_sub_dir = "storage"
        self.perisit_dir = os.path.join(self.path, perisit_sub_dir)
        if not os.path.exists(self.perisit_dir):
            os.makedirs(self.perisit_dir)
        data_sub_dir = "data"
        self.data_dir = os.path.join(self.path, data_sub_dir)
        self.data_dir_counter = 0
        
        self.cost_model_ada = "ada" # https://openai.com/pricing
        self.cost_model_davinci = "davinci" # https://openai.com/pricing
        self.price_ada_1k_tokens = 0.0004
        self.price_davinci_1k_tokens = 0.03 

        
    def load_data(self):
        self.documents = SimpleDirectoryReader(self.data_dir).load_data()
        print(f"Documents loaded: {len(self.documents)}.")
    def create_vector_store(self):
        self.index = GPTVectorStoreIndex.from_documents(self.documents)
        print("GPTVectorStoreIndex complete.")
    def save_index(self):
        self.index.storage_context.persist(persist_dir=self.perisit_dir)
        print(f"Index saved in path {self.perisit_dir}.")
    def load_index(self):
        storage_context = StorageContext.from_defaults(persist_dir=self.perisit_dir)
        self.index = load_index_from_storage(storage_context)
    def start_query_engine(self):
        self.query_engine = self.index.as_query_engine()
        print("Query_engine started.")
    def post_question(self, question, sleep = None):
        if sleep == None:
            self.sleep = 0 # trial 20s
        self.response_cls = self.query_engine.query(question)
        self.response = self.response_cls.response

    def del_data_dir(self):
        path = self.data_dir
        try:
            shutil.rmtree(path)
            print(f"{path} deleted successfully!")
        except OSError as error:
            print(f"Error deleting {path}: {error}")

    def copy_file_to_data_dir(self, file_extension ='.txt', verbose = 0):

        path_from = self.path
        path_to = self.data_dir

        if not os.path.exists(path_to):
            os.makedirs(path_to)

        for filename in os.listdir(path_from):
            if filename.endswith(file_extension):
                source_path = os.path.join(path_from, filename)
                dest_path = os.path.join(path_to, filename)
                shutil.copy(source_path, dest_path)
                if verbose == 1:
                    print(f"File {filename} copied successfully!")
    
        path_to_lib = pathlib.Path(path_to)
        path_to_lib_files = path_to_lib.glob(f"*{file_extension}")
        print(f"Files {len(list(path_to_lib_files))} copied in {path_to}.")
 
    def copy_path_from_to_data_dir(self, path_from, file_extension ='.txt', verbose = 0):

        path_to = self.data_dir # default data folder for llama
        start_counter = self.data_dir_counter
        
        if not os.path.exists(path_to):
            os.makedirs(path_to)

        padding_n = 5
        path_from_lib = pathlib.Path(path_from)
        path_from_lib_files = path_from_lib.glob(f"**/*{file_extension}")

        files_copied_n = 0
        counter = None
        for counter, file in enumerate(path_from_lib_files, start_counter):
            filename_path = os.path.split(file)[0] # path only
            filename = os.path.split(file)[1] # filename only
            filename_with_index = f'{str(counter).zfill(padding_n)}_{filename}'
            file_to_data_dir = os.path.join(path_to, filename_with_index)
            shutil.copy(file, file_to_data_dir)
            
            if os.path.exists(file_to_data_dir):
                files_copied_n += 1
                if verbose == 1:
                    print(f"File {filename} -> copied successfully!")
            else:
                if verbose == 1:
                    print(f"File {filename} was not copied!")
        
#         if 'counter' in locals(): 
        if counter != None: 
            self.data_dir_counter = counter + 1 # start from last
        
        print(f"Files: {files_copied_n} copied to folder: {path_to}!")

    def estimate_tokens(self, text):
        words = text.split()

        num_words = int(len(words))
        tokens = int(( num_words / 0.75 ))
        tokens_1k = tokens / 1000
        cost_ada = tokens_1k * self.price_ada_1k_tokens
        cost_davinci = tokens_1k * self.price_davinci_1k_tokens
        return tokens, cost_ada, cost_davinci
    
    def estimate_cost(self):
        total_tokens = 0
        total_cost_ada = 0
        total_cost_davinci = 0
        costs_rounding = 8
        
        for doc in self.documents:
            text = doc.get_text()
            tokens, cost_ada, cost_davinci = self.estimate_tokens(text)
            total_tokens += tokens
            
            total_cost_ada += cost_ada
            total_cost_ada = round(total_cost_ada, costs_rounding)
            
            total_cost_davinci += cost_davinci
            total_cost_davinci = round(total_cost_davinci, costs_rounding)
            
        self.total_tokens = total_tokens
        self.total_cost_ada = total_cost_ada
        self.total_cost_davinci = total_cost_davinci
        print(f"Total tokens: {self.total_tokens}")
        print(f"Total estimated costs with model {self.cost_model_ada }: ${self.total_cost_ada}")
        print(f"Total estimated costs with model {self.cost_model_davinci }: ${self.total_cost_davinci}")
        

In [4]:
import datetime

def time_now():
    now = datetime.datetime.now()
    formatted = now.strftime('%Y-%m-%d %H:%M:%S')
    print(formatted)

# time_now()

In [5]:
path_llama = "llama_challenge"
lct = llama_context(path=path_llama)

display(lct.path)
display(lct.data_dir)
display(lct.perisit_dir)

'llama_challenge'

'llama_challenge\\data'

'llama_challenge\\storage'

In [6]:
%time
# time_now()
run_load_create_save = True
if run_load_create_save:
    lct.del_data_dir()

CPU times: total: 0 ns
Wall time: 0 ns
Error deleting llama_challenge\data: [WinError 3] The system cannot find the path specified: 'llama_challenge\\data'


In [7]:
%time
# time_now()
# run_load_create_save = False
if run_load_create_save:

    path_from1 = "llama_challenge//html_challenge"
    path_from2 = "llama_challenge//ipynb_blog"
    path_from3 = "llama_challenge//ipynb_hana_ml_samples//Python-API//usecase-examples//sapcommunity-hanaml-challenge"

    lct.copy_path_from_to_data_dir(path_from1) # default extension *.txt
    lct.copy_path_from_to_data_dir(path_from2) # default extension *.txt
    lct.copy_path_from_to_data_dir(path_from3) # default extension *.txt

CPU times: total: 0 ns
Wall time: 0 ns
Files: 6 copied to folder: llama_challenge\data!
Files: 1 copied to folder: llama_challenge\data!
Files: 5 copied to folder: llama_challenge\data!


In [8]:
vars(lct).keys()

dict_keys(['path', 'perisit_dir', 'data_dir', 'data_dir_counter', 'cost_model_ada', 'cost_model_davinci', 'price_ada_1k_tokens', 'price_davinci_1k_tokens'])

In [9]:
%time
# time_now()
run_load_create_save = True
if run_load_create_save:
    lct.load_data()

CPU times: total: 0 ns
Wall time: 0 ns
Documents loaded: 12.


In [10]:
%time
# time_now()
run_load_create_save = True
if run_load_create_save:
    lct.estimate_cost()

CPU times: total: 0 ns
Wall time: 0 ns
Total tokens: 43819
Total estimated costs with model ada: $0.0175276
Total estimated costs with model davinci: $1.31457


In [11]:
# https://platform.openai.com/account/api-keys
%time
# time_now()
# run_load_create_save = False
if run_load_create_save:
    lct.create_vector_store()

CPU times: total: 0 ns
Wall time: 0 ns


Token indices sequence length is longer than the specified maximum sequence length for this model (1025 > 1024). Running this sequence through the model will result in indexing errors
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens


> [build_index_from_nodes] Total LLM token usage: 0 tokens


INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 147741 tokens


> [build_index_from_nodes] Total embedding token usage: 147741 tokens
GPTVectorStoreIndex complete.


In [16]:
%time
# time_now()
# run_load_create_save = False
if run_load_create_save:
    lct.save_index()

CPU times: total: 0 ns
Wall time: 0 ns
Index saved in path llama_challenge\storage.


In [17]:
%time
# time_now()
lct.load_index()

CPU times: total: 0 ns
Wall time: 0 ns


INFO:llama_index.indices.loading:Loading all indices.


Loading all indices.


In [18]:
# lct.index.vector_store.__dict__

In [19]:
# help(lct.index.vector_store)

In [20]:
help(lct.index.vector_store)

Help on SimpleVectorStore in module llama_index.vector_stores.simple object:

class SimpleVectorStore(llama_index.vector_stores.types.VectorStore)
 |  SimpleVectorStore(*args, **kwds)
 |  
 |  Simple Vector Store.
 |  
 |  In this vector store, embeddings are stored within a simple, in-memory dictionary.
 |  
 |  Args:
 |      simple_vector_store_data_dict (Optional[dict]): data dict
 |          containing the embeddings and doc_ids. See SimpleVectorStoreData
 |          for more details.
 |  
 |  Method resolution order:
 |      SimpleVectorStore
 |      llama_index.vector_stores.types.VectorStore
 |      typing.Protocol
 |      typing.Generic
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, data: Union[llama_index.vector_stores.simple.SimpleVectorStoreData, NoneType] = None, **kwargs: Any) -> None
 |      Initialize params.
 |  
 |  __subclasshook__ = _proto_hook(other)
 |      # Set (or override) the protocol subclass hook.
 |  
 |  add(self, embedding_

In [21]:
dir(lct)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'copy_file_to_data_dir',
 'copy_path_from_to_data_dir',
 'cost_model_ada',
 'cost_model_davinci',
 'create_vector_store',
 'data_dir',
 'data_dir_counter',
 'del_data_dir',
 'documents',
 'estimate_cost',
 'estimate_tokens',
 'index',
 'load_data',
 'load_index',
 'path',
 'perisit_dir',
 'post_question',
 'price_ada_1k_tokens',
 'price_davinci_1k_tokens',
 'save_index',
 'start_query_engine',
 'total_cost_ada',
 'total_cost_davinci',
 'total_tokens']

In [22]:
help(lct)

Help on llama_context in module __main__ object:

class llama_context(builtins.object)
 |  llama_context(path=None)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, path=None)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  copy_file_to_data_dir(self, file_extension='.txt', verbose=0)
 |  
 |  copy_path_from_to_data_dir(self, path_from, file_extension='.txt', verbose=0)
 |  
 |  create_vector_store(self)
 |  
 |  del_data_dir(self)
 |  
 |  estimate_cost(self)
 |  
 |  estimate_tokens(self, text)
 |  
 |  load_data(self)
 |  
 |  load_index(self)
 |  
 |  post_question(self, question, sleep=None)
 |  
 |  save_index(self)
 |  
 |  start_query_engine(self)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



In [23]:
# lct.vector_store.to_dict()
%time
# time_now()
lct.start_query_engine()

CPU times: total: 0 ns
Wall time: 0 ns
Query_engine started.


In [24]:
len(lct.documents)

12

In [25]:
# inspecting the object lct
lct.__dict__

{'path': 'llama_challenge',
 'perisit_dir': 'llama_challenge\\storage',
 'data_dir': 'llama_challenge\\data',
 'data_dir_counter': 12,
 'cost_model_ada': 'ada',
 'cost_model_davinci': 'davinci',
 'price_ada_1k_tokens': 0.0004,
 'price_davinci_1k_tokens': 0.03,
 'documents': [Document(text='\nSAP HANA Cloud Machine Learning Challenge “I quit!” – understanding metrics | SAP Blogs\n \nSkip to Content\nSAP Community Log-in UpdateIn a few months, SAP Community will switch to SAP Universal ID as the only option to login. Don’t wait, create your SAP Universal ID now! If you have multiple accounts, use the Consolidation Tool to merge your content.Get started with SAP Universal ID\nHome\nCommunity\nAsk a Question\nWrite a Blog Post\nLogin / Sign-up\n \nTechnical Articles\n \nSergiu Iatco\nJanuary 9, 2023\n15 minute read\nSAP HANA Cloud Machine Learning Challenge “I quit!” – understanding metrics\n4        \n13        \n1,402        \nAuthor: Sergiu Iatco\xa0\nI participated in the SAP HANA ML C

In [26]:
# What embedding stores - vector_store.json
dir(lct.index.vector_store)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_data',
 '_is_protocol',
 '_is_runtime_protocol',
 'add',
 'client',
 'delete',
 'from_persist_dir',
 'from_persist_path',
 'get',
 'is_embedding_query',
 'persist',
 'query',
 'stores_text']

In [27]:
dir(lct.index)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_add_nodes_to_index',
 '_aget_node_embedding_results',
 '_async_add_nodes_to_index',
 '_build_index_from_nodes',
 '_delete',
 '_docstore',
 '_get_node_embedding_results',
 '_index_struct',
 '_insert',
 '_is_protocol',
 '_service_context',
 '_storage_context',
 '_use_async',
 '_vector_store',
 'as_query_engine',
 'as_retriever',
 'build_index_from_nodes',
 'delete',
 'docstore',
 'from_documents',
 'index_id',
 'index_struct',
 'index_struct_cls',
 'insert',
 'insert_nodes',
 'refresh',
 'ser

In [28]:
%time
# time_now()
question = "What is content about?"
lct.post_question(question)
print(lct.response)

CPU times: total: 0 ns
Wall time: 0 ns


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens


> [retrieve] Total LLM token usage: 0 tokens


INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 5 tokens


> [retrieve] Total embedding token usage: 5 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 958 tokens


> [get_response] Total LLM token usage: 958 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens


> [get_response] Total embedding token usage: 0 tokens

The content is about SAP HANA and its related technologies, such as SAP HANA Cloud's Auto ML capabilities, SAP HANA Python Client API for Machine Learning Algorithms, and SAP HANA Predictive Analysis Library (PAL). It also includes information about a book related to SAP HANA and a blog post about SAP HANA Machine Learning with ABAP Managed Database Procedures in SAP BW/4HANA.


In [29]:
question = "Who organized the Community Call?"
lct.post_question(question)
print(lct.response)

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens


> [retrieve] Total LLM token usage: 0 tokens


INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 6 tokens


> [retrieve] Total embedding token usage: 6 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1252 tokens


> [get_response] Total LLM token usage: 1252 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens


> [get_response] Total embedding token usage: 0 tokens

The SAP HANA Cloud Machine Learning Challenge team organized the Community Call.


In [30]:
question = "What problem participants must solve?"
lct.post_question(question)
print(lct.response)

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens


> [retrieve] Total LLM token usage: 0 tokens


INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 6 tokens


> [retrieve] Total embedding token usage: 6 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1840 tokens


> [get_response] Total LLM token usage: 1840 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens


> [get_response] Total embedding token usage: 0 tokens

Participants must solve the problem of predicting employee churn.


In [31]:
question = "Explain data for predicting employee churn"
lct.post_question(question)
print(lct.response)

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens


> [retrieve] Total LLM token usage: 0 tokens


INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 7 tokens


> [retrieve] Total embedding token usage: 7 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1936 tokens


> [get_response] Total LLM token usage: 1936 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens


> [get_response] Total embedding token usage: 0 tokens

Data for predicting employee churn can include information about the employee such as their job title, years of experience, salary, performance reviews, and other factors that may influence their decision to stay or leave the company. Additionally, data can be collected from the company itself, such as the onboarding process, company culture, learning opportunities, and other factors that may influence employee churn. By analyzing this data, patterns can be identified that can help predict employee churn and inform decisions about how to improve the company's retention rate.


In [32]:
question = "Can you tell me which machine learning models were utilized by the participants?"
lct.post_question(question)
print(lct.response)

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens


> [retrieve] Total LLM token usage: 0 tokens


INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 14 tokens


> [retrieve] Total embedding token usage: 14 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1787 tokens


> [get_response] Total LLM token usage: 1787 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens


> [get_response] Total embedding token usage: 0 tokens

The participants utilized the HybridGradientBoostingTree model for their machine learning.


In [33]:
question = "Which are the top 5 important features discoverd by the model?"
lct.post_question(question)
print(lct.response)

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens


> [retrieve] Total LLM token usage: 0 tokens


INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 13 tokens


> [retrieve] Total embedding token usage: 13 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 2085 tokens


> [get_response] Total LLM token usage: 2085 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens


> [get_response] Total embedding token usage: 0 tokens

The top 5 important features discovered by the model are: SICKDAYS, HRTRAINING, PREVIOUS_CAREER_PATH, LINKEDIN, and FUNCTIONALAREACHANGETYPE.


In [34]:
question = "Python full code SAP HANA Machine learning HGBT example"
lct.post_question(question)
print(lct.response)

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens


> [retrieve] Total LLM token usage: 0 tokens


INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 11 tokens


> [retrieve] Total embedding token usage: 11 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1925 tokens


> [get_response] Total LLM token usage: 1925 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens


> [get_response] Total embedding token usage: 0 tokens

The following code is an example of using the SAP HANA Python Client API for Machine Learning Algorithms to implement a HGBT (Hierarchical Gradient Boosting Tree) model. 

# Import the necessary libraries
import hana_ml
from hana_ml.algorithms.apl.hgbt import HGBT

# Create a connection to the SAP HANA system
connection_context = hana_ml.dataframe.ConnectionContext(address='<hostname>:<port>',
                                                        user='<username>',
                                                        password='<password>')

# Load the data into a dataframe
df = connection_context.table('<schema>.<table>')

# Create the HGBT model
hgbt = HGBT(conn_context=connection_context)

# Fit the model
hgbt.fit(data=df, key='<key_column>', label='<label_column>')

# Make predictions
predictions = hgbt.predict(data=df)

# Evaluate the model
hgbt.evaluate(data=df, label='


In [35]:
question = "Python full code hana_ml dataframe example"
lct.post_question(question)
print(lct.response)

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens


> [retrieve] Total LLM token usage: 0 tokens


INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 10 tokens


> [retrieve] Total embedding token usage: 10 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1913 tokens


> [get_response] Total LLM token usage: 1913 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens


> [get_response] Total embedding token usage: 0 tokens

In [1]:
# Import the necessary libraries
import hana_ml
import pandas as pd

# Load the CSV file into a Python object (Pandas DataFrame)
df_data = pd.read_csv(r'Emp_Churn_Train.csv', sep = ',')

# Create a connection to the HANA system
connection_context = hana_ml.dataframe.ConnectionContext(address='<HANA_SYSTEM_ADDRESS>', port=<HANA_SYSTEM_PORT>, user='<HANA_SYSTEM_USER>', password='<HANA_SYSTEM_PASSWORD>')

# Create a dataframe object from the Pandas DataFrame
df_remote = connection_context.table('EMP_CHURN_TRAIN', schema='<HANA_SYSTEM_SCHEMA>', data=df_data)

# Create training and testing set
from hana_ml.algorithms.pal import partition
hdf_train, hdf_test, hdf_val = partition.train_test_val_split( random_seed = 1017
