In [1]:
import gradio as gr
import json
import openai
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
import configparser
import time
import pandas as pd
import datetime as dt
import numpy as np
from sklearn.linear_model import LinearRegression
from datasets import load_dataset






This program uses OpenAI via its API.  It requires an API key to be saved in the config file (`config.ini`), and the GPT version can be changed through the `llmModel` variable.

In [2]:
# Name of the chat model sent with every completion request; edit to switch GPT versions.
llmModel = "gpt-3.5-turbo"

# ConfigParser.read() silently skips files it cannot open, so a missing
# config.ini would otherwise only show up as an unexplained KeyError below.
config = configparser.ConfigParser()
config.read('config.ini')

if 'openai_api_key' not in config['DEFAULT']:
    raise KeyError(
        "openai_api_key not found: create config.ini next to this notebook with a "
        "[DEFAULT] section containing 'openai_api_key = <your key>'"
    )

openai.api_key =config['DEFAULT']['openai_api_key']

'''
Save your openAI key in config.ini in the below format : 

[DEFAULT]
openai_api_key = .....

'''


'\nSave your openAI key in config.ini in the below format : \n\n[DEFAULT]\nopenai_api_key = .....\n\n'

Load a dataset for the analysis.  Here we'll use a classic dataset from the machine learning literature.  Its Hugging Face metadata does not include a description, so a brief description is stored in the local variable `dsInfo`.

In [3]:
# Fetch the "train" split of the auto-mpg dataset and expose it as a pandas
# DataFrame named `df`, which the analysis functions below read as a global.
dataset = load_dataset("scikit-learn/auto-mpg", split="train")
df = pd.DataFrame(dataset)

# Short human-readable description of the dataset; it is inserted into the
# system prompt and into the UI text.
dsInfo = "car mpg statistics"

Definitions of the analysis functions that the application exposes to the LLM.  They are written in Python and then described with a JSON schema for the LLM.  A future improvement would be to define them with pydantic, which is a more concise way of generating the JSON schema directly from the function definition.

In [6]:
def filterDataset(target,targetOperator,targetValue):
    """Return the rows of the global ``df`` whose ``target`` column passes a comparison.

    The previous stub only printed its arguments and returned None, which left
    the function message sent back to the LLM without any content.

    Parameters
    ----------
    target : str
        Column of ``df`` to compare.
    targetOperator : str
        One of GT, GTE, LT or LTE (case-insensitive).
    targetValue : str or number
        Value to compare against; converted to float when possible, otherwise
        used as given.

    Returns
    -------
    str
        CSV text of at most 20 matching rows, or a plain-language error message
        when the operator, the column or the comparison is invalid.  ``df``
        itself is not modified.
    """
    print(f"At filter database, got {target} and {targetOperator} and {targetValue}!!!")

    # Map the operator codes advertised to the LLM onto pandas Series methods.
    comparisonMethods = {'GT': 'gt', 'GTE': 'ge', 'LT': 'lt', 'LTE': 'le'}
    operatorKey = str(targetOperator).strip().upper()
    if operatorKey not in comparisonMethods:
        return f"Unknown operator '{targetOperator}'. Valid operators are GT, GTE, LT, LTE."

    if target not in df.columns:
        validNames = ', '.join(str(name) for name in df.columns)
        return f"Unknown variable '{target}'. Valid variables are: {validNames}"

    # Arguments arrive as JSON strings, so numeric thresholds need converting.
    try:
        threshold = float(targetValue)
    except (TypeError, ValueError):
        threshold = targetValue

    try:
        mask = getattr(df[target], comparisonMethods[operatorKey])(threshold)
    except TypeError:
        return f"Cannot compare '{target}' with value '{targetValue}'."
    return df[mask].head(20).to_csv()


def aggGroupBy(target,groupList=None,functionList=['mean'],filterConditions=None,precision=2):
    """Aggregate one column of the global ``df``, optionally per group.

    Parameters
    ----------
    target : str
        Column to aggregate.
    groupList : str, list or None
        Column(s) to group by, as a list or a comma-separated string.  When
        omitted or empty the aggregation runs over the whole frame.
    functionList : str or list
        Aggregation function name(s), as a list or a comma-separated string.
    filterConditions : str or None
        Printed for inspection only; it is NOT applied to the data.
    precision : int
        Number of decimals the result is rounded to.

    Returns
    -------
    str
        CSV text of the first 20 rows of the aggregated report.
    """
    def _toList(value):
        # Accept None, a comma-separated string, a list/tuple, or a single value.
        if value is None:
            return []
        if isinstance(value, str):
            return [item.strip() for item in value.split(',') if item.strip()]
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    aggFunctions = _toList(functionList) or ['mean']
    # Previously None was wrapped into [None], which made the ungrouped branch
    # unreachable and sent [None] to groupby.
    groupColumns = _toList(groupList)

    if filterConditions is not None:
        # NOTE(review): the condition is only echoed, never applied. Evaluating
        # model-generated pandas expressions needs validation before enabling.
        print(filterConditions)

    if groupColumns:
        report = df.groupby(groupColumns).agg({target: aggFunctions})
    else:
        report = df.agg({target: aggFunctions})
    return report.round(precision).head(20).to_csv()


def minOrMaxCase(target,aggFunction):
    """Return, as a JSON string, the single row of the global ``df`` that is extreme on ``target``.

    ``aggFunction`` equal to 'max' selects the largest value; any other value
    selects the smallest.  For the 'acceleration' column the direction is
    flipped, so 'max' returns the row with the smallest value (presumably
    because a lower figure means a quicker car; confirm against the data).
    """
    wantsMax = aggFunction == 'max'
    flippedScale = target == 'acceleration'
    # Ascending order puts the smallest value first; that is the row wanted for
    # a plain 'min' request or for a 'max' request on the flipped column.
    sortAscending = wantsMax == flippedScale
    extremeRow = df.sort_values(target, ascending=sortAscending).iloc[:1]
    return json.dumps(extremeRow.to_dict(orient='records')[0])


def multipleRegression(indVar=['displacement'],depVar='mpg'):
    """Fit an ordinary least squares regression on the global ``df`` and describe it.

    Parameters
    ----------
    indVar : str or list
        Independent variable name(s), as a list or a comma-separated string.
    depVar : str
        Dependent variable name.

    Returns
    -------
    str
        A sentence giving the R-squared score (rounded to 3 decimals) and the
        fitted formula with intercept and coefficients rounded to 3 decimals.
    """
    # Normalise the predictors into a fresh list without touching the default.
    if isinstance(indVar, str):
        predictors = [name.strip() for name in indVar.split(',') if name.strip()]
    elif isinstance(indVar, (list, tuple)):
        predictors = list(indVar)
    else:
        predictors = [indVar]

    y=df[depVar].values.reshape(-1,1)
    X=df[predictors]
    reg = LinearRegression().fit(X, y)
    score = round(reg.score(X, y),3)
    intercept = round(reg.intercept_[0],3)

    formula = depVar+" = "+str(intercept)+' '
    for coefficient, name in zip(reg.coef_[0], predictors):
        # Negative numbers carry their own sign; everything else needs a "+".
        if coefficient >= 0:
            formula = formula+"+"
        formula = formula+str(coefficient.round(3))+"*"+name+' '

    # The R-squared sentence previously interpolated depVar instead of the score.
    report = f"""The independent variable(s) explain {score} of the variance in {depVar}. The R2 or R-squared value is {score}. The regression formula is {formula}"""
    return report


# JSON Schema for above functions


# Function descriptions sent to the LLM. Each entry has exactly three keys:
# "name", "description" and "parameters". "parameters" is a JSON Schema object,
# so "required" must sit INSIDE it, next to "properties", and every argument
# must be declared inside "properties" under the same name as the Python
# parameter it maps to.
functions = [
    {
        "name": "aggGroupBy",
        "description": '''Makes an aggregated report from the dataset.
          Lets the user specify a variable to be analyzed, one or more grouping variables, 
          and one or more aggregation functions from (min,max,mean,std).  You can also use pandas to filter the dataframe
          by specifying filterConditions.''',
        "parameters": {
            "type": "object",
            "properties": {
                "groupList": {
                    "type": "string",
                    "description": "List of variable names to group by",
                },
                "target": {
                    "type": "string",
                    "description": "the variable name to be analyzed and aggregated",
                },
                "functionList": {
                    "type": "string",
                    "description": "list of pandas aggregation functions, chosen from min, max, mean, std",
                },
                "filterConditions": {
                    "type": "string",
                    "description": "pandas where conditions to filter the dataset with",
                },
            },
            "required": ["target", "functionList"],
        },
    },
    {
        "name": "minOrMaxCase",
        "description": '''Finds one case from the dataframe with a max or min value on the specified target variable.''',
        "parameters": {
            "type": "object",
            # Only the two arguments minOrMaxCase actually accepts are declared.
            "properties": {
                "aggFunction": {
                    "type": "string",
                    "enum": ["min", "max"],
                    "description": "Specifies if the user wants the argmax or argmin.  return either 'min' or 'max'",
                },
                "target": {
                    "type": "string",
                    "description": "the variable name to be analyzed and aggregated",
                },
            },
            # aggFunction has no default in the Python signature, so it is required.
            "required": ["target", "aggFunction"],
        },
    },
    {
        "name": "multipleRegression",
        "description": '''runs a linear multiple regression to determine the linear relationship between the dependent \n
                            target variable and one or more independent explanatory variables. \n
                            Returns the R2 (R-squared) value and the linear formula.''',
        "parameters": {
            "type": "object",
            "properties": {
                "indVar": {
                    "type": "string",
                    "description": "one or more independent variables to explain the dependent variable via linear multiple regression",
                },
                "depVar": {
                    "type": "string",
                    "description": "one variable to be explained by regression by the independent variable(s)",
                },
            },
            "required": ["indVar", "depVar"],
        },
    },
    {
        "name": "filterDataset",
        "description": '''filters the dataset using conditional logic of GT, GTE, LT, LTE against one variable with one parameter value''',
        "parameters": {
            "type": "object",
            "properties": {
                "target": {
                    "type": "string",
                    "description": "one valid variable name to filter the dataset by",
                },
                "targetOperator": {
                    "type": "string",
                    "enum": ["GT", "GTE", "LT", "LTE"],
                    "description": "valid values are GT, GTE, LT or LTE, and represent the condition to use against the target",
                },
                # Named targetValue (singular) to match the Python parameter.
                "targetValue": {
                    "type": "string",
                    "description": "value to use in the conditional with the target and targetOperator",
                },
            },
            "required": ["target", "targetOperator", "targetValue"],
        },
    },
]



The chat stream needs to be initialized with the LLM.  Here the initial system instructions are defined, using some metadata from the dataframe we are loading.

In [7]:
# Comma-separated column names, shown to the LLM and in the UI.
varnames = ', '.join(list(df.columns))


# Template conversation: the system instructions every chat starts from.
initialMessages = [
        {  
        "role": "system",
        "content": """You are a helpful assistant that analyzes a dataset about """+dsInfo+""" for the user. 
        If the user's question is not directly related to the dataset, politely reject it.
        You can use function calls to get data, or respond in sentences when proovided the data.
        If you don't know how to apply to a function, apoligize and say you don't know how to do that.
          You can answer questions about these variable names :"""+varnames+"""
        All parameter values must be one of the specified variable names.
        Do not make up information, and keep your responses concise.
        If you identify the need to reply with a function call, but some required parameter are missing,
          reply with a follow up question asking for missing parameters.
          You can also answer questions about the dataset metadata, and explain statistical analyses."""
    },
]

# Work on a copy: binding `messages` to the template list itself would make
# every appended chat message part of the template as well.
messages = list(initialMessages)


Defining functions that the chatbot will use to control flow and direct logic to and from the LLM

In [8]:
def clearItOut():
    ''' Clear the message history and reinitialize with the system instructions.

    Rebinds the global ``messages`` to a NEW list holding only the system-role
    entries of ``initialMessages``.  Building a fresh, filtered list (instead of
    re-using ``initialMessages`` itself) keeps the reset effective even when chat
    turns were appended to the template list through a shared reference.
    Returns None.
    '''
    global messages
    messages = [entry for entry in initialMessages if entry.get("role") == "system"]


def classifyReponse(response):
    ''' Label an LLM response as text to display or as a request to run a function.

    Returns 'content' when the first choice's message has non-None content,
    'function_call' when the content is None and the message carries a
    'function_call' entry, and None when neither applies.
    '''
    reply = response['choices'][0]['message']
    if reply['content'] is None:
        return 'function_call' if 'function_call' in reply else None
    return 'content'
    

def prepResponse(response):
    ''' Either extracts content for the bot message, or runs a function and provides that output as the bot message

    Returns a tuple ``(bot_message, message, action)``:
      * function call: ``bot_message`` is None, ``message`` is a "function" role
        entry holding the function output as text, ``action`` is 'llm' (the
        result must be sent back to the LLM).
      * content: ``bot_message`` is the reply text, ``message`` is the matching
        "assistant" role entry, ``action`` is 'ui'.

    Only functions named in the global ``functions`` schema may be run.  Any
    failure (unknown name, malformed JSON arguments, an exception inside the
    function) is reported to the LLM as the function output instead of raising.
    '''
    reply = response['choices'][0]['message']
    cr = classifyReponse(response)
    if cr=='function_call':
        f = reply['function_call']['name']
        # The name comes from the model, so restrict it to the declared
        # functions rather than dispatching to any global callable.
        allowedNames = {spec.get('name') for spec in functions}
        try:
            if f not in allowedNames:
                raise ValueError("it is not one of the available functions")
            kwargs = json.loads(reply['function_call']['arguments'])
            function_response = globals()[f](**kwargs)
        except Exception as err:
            # Hand the problem to the LLM so it can apologise or ask again.
            function_response = f"Error running {f}: {err}"
        #return formatted message and logic to call llm again
        # Message content must be a string.
        if type(function_response) is list:
            function_response = ','.join(str(item) for item in function_response)
        elif function_response is None:
            function_response = "The function returned no result."
        elif not isinstance(function_response, str):
            function_response = str(function_response)
        message = {
            "role": "function",
            "name": f,
            "content": function_response,
        }
        action = 'llm'
        bot_message = None
    else:
        if cr=='content':
            newContent = reply['content']
        else:
            # Neither content nor a function call: previously this path left
            # the return values unassigned.
            newContent = "Sorry, I don't know how to do that."
        #return formatted message and logic to send to ui
        message = {
            "role": "assistant",
            "content": newContent,
        }
        action = 'ui'
        bot_message = newContent
    return bot_message, message,action

def callLLM(messages,functions):
    ''' Send the conversation and the function schema to the chat model and return the raw response.

    The model name is read from the global ``llmModel``; temperature is fixed at 0.
    '''
    requestOptions = {
        "model": llmModel,
        "messages": messages,
        "functions": functions,
        "temperature": 0.,
    }
    return openai.ChatCompletion.create(**requestOptions)





In [9]:

#Keep track of the time the history was last reset, so it can be occasionally cleared out.
startUpTime = dt.datetime.now()

# Minutes of conversation to keep before the message history is reset.
historyMaxMinutes = 10
# Upper bound on consecutive function-call rounds for a single user question.
maxLlmRounds = 5


# Main chatbot loop

with gr.Blocks(title="Conversational BI Demo",theme='YenLai/Superhuman') as demo:
    gr.Markdown("Use the textbox below to ask questions about the "+dsInfo+" dataset.")
    gr.Markdown("It contains information on these fields:"+varnames)
    chatbot = gr.Chatbot()
    #msg = gr.Textbox("What is the car with the best mileage?") #Pre load a question in the user input.
    msg = gr.Textbox("Run a regression of mpg on cylinders and displacement")
    clear = gr.Button("Clear")
    gr.Markdown("To learn about this demo, go to my Github at https://github.com/jonathanmanly/runGearGuruconvBI")

    def user(user_message, history):
        """Move the submitted text into the chat window and empty the textbox.

        Also resets the LLM message history once it is older than
        ``historyMaxMinutes`` and restarts the timer, so the reset happens once
        per period rather than on every later message.
        """
        global startUpTime
        # total_seconds() is the whole elapsed time; .seconds is only the
        # sub-day seconds component of the timedelta.
        elapsedMinutes = (dt.datetime.now()-startUpTime).total_seconds()/60
        if elapsedMinutes > historyMaxMinutes:
            clearItOut()
            startUpTime = dt.datetime.now()
        return "", history + [[user_message, None]]

    def bot(history):
        """Answer the latest user message, running requested functions, and stream the reply."""
        messages.append({'role':'user',"content":history[-1][0]})
        bot_message = None
        # The model may ask for several functions in a row; keep feeding the
        # results back until it produces text or the round limit is reached.
        for _ in range(maxLlmRounds):
            raw_message = callLLM(messages,functions)
            bot_message, history_message,action = prepResponse(raw_message)
            messages.append(history_message)
            if action != 'llm':
                break
        if bot_message is None:
            # Still no text after the allowed rounds: show something rather
            # than iterating over None below.
            bot_message = "Sorry, I could not complete that request."

        history[-1][1] = ""
        for character in bot_message:
            history[-1][1] += character
            time.sleep(0.001)
            yield history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    clear.click(clearItOut, None, chatbot, queue=False)
    
demo.queue()
demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "/home/jon/.local/lib/python3.10/site-packages/gradio/queueing.py", line 388, in call_prediction
    output = await route_utils.call_process_api(
  File "/home/jon/.local/lib/python3.10/site-packages/gradio/route_utils.py", line 219, in call_process_api
    output = await app.get_blocks().process_api(
  File "/home/jon/.local/lib/python3.10/site-packages/gradio/blocks.py", line 1437, in process_api
    result = await self.call_function(
  File "/home/jon/.local/lib/python3.10/site-packages/gradio/blocks.py", line 1123, in call_function
    prediction = await utils.async_iteration(iterator)
  File "/home/jon/.local/lib/python3.10/site-packages/gradio/utils.py", line 512, in async_iteration
    return await iterator.__anext__()
  File "/home/jon/.local/lib/python3.10/site-packages/gradio/utils.py", line 505, in __anext__
    return await anyio.to_thread.run_sync(
  File "/home/jon/.local/lib/python3.10/site-packages/anyio/to_thread.py", line 33, 

In [10]:
# Display the conversation history the chatbot has accumulated so far.
messages

[{'role': 'system',
  'content': "You are a helpful assistant that analyzes a dataset about car mpg statisticsfor the user. \n        If the user's question is not directly related to the dataset, politely reject it.\n        You can use function calls to get data, or respond in sentences when proovided the data.\n        If you don't know how to apply to a function, apoligize and say you don't know how to do that.\n          You can answer questions about these variable names :mpg, cylinders, displacement, horsepower, weight, acceleration, model year, origin, car name\n        All parameter values must be one of the specified variable names.\n        Do not make up information, and keep your responses concise.\n        If you identify the need to reply with a function call, but some required parameter are missing,\n          reply with a follow up question asking for missing parameters.\n          You can also answer questions about the dataset metadata, and explain statistical analys

In [12]:
# `kwargs` is a local variable of prepResponse and never exists at notebook
# scope, so evaluating it here raised NameError. Show the most recent function
# result recorded in the conversation history instead.
[entry for entry in messages if entry.get("role") == "function"][-1:]