In [1]:
import dspy
from dotenv import load_dotenv
# Load environment variables from a .env file before anything else runs.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook will not find the file on another machine. Presumably the file
# holds the LLM API credentials -- confirm.
load_dotenv("D:\\gitFolders\\python_de_learners_data\\.env")

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# starting with the analytical planner signature
# Signature for the planning step: from the dataset description, the list of
# available agents and the user's goal, produce an ordered agent pipeline
# (`plan`) and the reasoning behind it (`plan_desc`).
# The docstring below is the prompt text, so it is kept verbatim.
# NOTE(review): the format example in the prompt writes "plan:" with a colon
# but "plan_desc =" with an equals sign -- confirm whether that is intended.
class analytical_planner(dspy.Signature):
    """ You are data analytics planner agent. You have access to three inputs
    1. Datasets
    2. Data Agent descriptions
    3. User-defined Goal
    You take these three inputs to develop a comprehensive plan to achieve the user-defined goal from the data & Agents available.
    In case you think the user-defined goal is infeasible you can ask the user to redefine or add more description to the goal.

    Give your output in this format:
    plan: Agent1->Agent2->Agent3
    plan_desc = Use Agent 1 for this reason, then agent2 for this reason and lastly agent3 for this reason.

    You don't have to use all the agents in response of the query 
    """
    # Input fields and their descriptions
    # `dataset`: textual description of the loaded dataframe (name and columns).
    dataset = dspy.InputField(desc="Available datasets loaded in the system, use this df_name,columns  set df as copy of df_name")
    # `Agent_desc`: description of the agents the planner may choose from.
    Agent_desc = dspy.InputField(desc= "The agents available in the system")
    # `goal`: the user's request.
    goal = dspy.InputField(desc="The user defined goal ")
    # Output fields and their description
    # `plan`: agent names joined by "->", as requested by the prompt above.
    plan = dspy.OutputField(desc="The plan that would achieve the user defined goal")
    # `plan_desc`: free-text justification for the chosen plan.
    plan_desc= dspy.OutputField(desc="The reasoning behind the chosen plan")

In [3]:
# Signature for the pre-processing agent: given a dataset description and a
# goal, return commentary plus Python code restricted (by the prompt) to
# numpy and pandas. The docstring is the prompt text and is kept verbatim.
class preprocessing_agent(dspy.Signature):
    """ You are a data pre-processing agent, your job is to take a user-defined goal and available dataset,
    to build an exploratory analytics pipeline. You do this by outputing the required Python code. 
    You will only use numpy and pandas, to perform pre-processing and introductory analysis
    """
    # Input fields
    dataset = dspy.InputField(desc="Available datasets loaded in the system, use this df_name,columns  set df as copy of df_name")
    goal = dspy.InputField(desc="The user defined goal ")
    # Output fields
    # `commentary`: explanation of the analysis; `code`: the generated script.
    commentary = dspy.OutputField(desc="The comments about what analysis is being performed")
    code = dspy.OutputField(desc ="The code that does the data preprocessing and introductory analysis")

# Signature for the statistics agent: given a dataset description and a goal,
# return commentary plus Python code for the statistical analysis.
# The docstring is the prompt text and is kept verbatim.
# NOTE(review): the prompt and the `code` description say "statsmodel"; the
# library's package name is `statsmodels` -- consider correcting the prompt.
class statistical_analytics_agent(dspy.Signature):
    """ You are a statistical analytics agent. 
    Your task is to take a dataset and a user-defined goal, and output 
    Python code that performs the appropriate statistical analysis to achieve that goal.
    You should use the Python statsmodel library"""
    # Input fields
    dataset = dspy.InputField(desc="Available datasets loaded in the system, use this df_name,columns  set df as copy of df_name")
    goal = dspy.InputField(desc="The user defined goal for the analysis to be performed")
    # Output fields
    # `commentary`: explanation of the analysis; `code`: the generated script.
    commentary = dspy.OutputField(desc="The comments about what analysis is being performed")
    code = dspy.OutputField(desc ="The code that does the statistical analysis using statsmodel")

# Signature for the machine-learning agent: given a dataset description and a
# goal, return commentary plus Python code that uses scikit-learn.
class sk_learn_agent(dspy.Signature):
    # Prompt (kept verbatim: the docstring is the instruction text)
    """You are a machine learning agent. 
    Your task is to take a dataset and a user-defined goal, and output Python code that performs the appropriate machine learning analysis to achieve that goal. 
    You should use the scikit-learn library."""
    # Input Fields
    dataset = dspy.InputField(desc="Available datasets loaded in the system, use this df_name,columns. set df as copy of df_name")
    goal = dspy.InputField(desc="The user defined goal ")
    # Output Fields
    commentary = dspy.OutputField(desc="The comments about what analysis is being performed")
    # The description previously said "Exploratory data analysis", which
    # contradicted the machine-learning role stated in the prompt above.
    code = dspy.OutputField(desc="The code that does the machine learning analysis using scikit-learn")

## I worked on the data-viz agent and already optimized it using DSPy.
## The only big difference is that this agent takes an additional input: the styling index.

In [4]:
# Signature for the final step: merge the code produced by the individual
# agents into one script. The docstring is the prompt text and is kept
# verbatim.
class code_combiner_agent(dspy.Signature):
    """ You are a code combine agent, taking Python code output from many agents and combining the operations into 1 output
    You also fix any errors in the code"""
    # Input: the per-agent code snippets (the module in this file passes the
    # list converted to a single string).
    agent_code_list =dspy.InputField(desc="A list of code given by each agent")
    # Output: the merged script.
    refined_complete_code = dspy.OutputField(desc="Refined complete code base")

In [5]:
# The same signature used in Data Viz agent post
# Signature for the visualization agent: given a goal plus dataframe and
# styling context, return Plotly code. The docstring is the prompt text and
# is kept verbatim.
# NOTE(review): `{dataframe_index}` and `{styling_index}` sit inside a plain
# (non-f) docstring and no interpolation is visible in this file, so they
# appear to reach the model as literal brace text -- confirm intent.
# NOTE(review): the input names here (`dataframe_context`, `styling_context`)
# differ from the `dataset` input used by the other agent signatures.
class Data_Viz(dspy.Signature):
    """
    You are AI agent who uses the goal to generate data visualizations in Plotly.
    You have to use the tools available to your disposal
    {dataframe_index}
    {styling_index}

    You must give an output as code, in case there is no relevant columns, just state that you don't have the relevant information
    """
    # Input fields
    goal = dspy.InputField(desc="user defined goal which includes information about data and chart they want to plot")
    dataframe_context = dspy.InputField(desc=" Provides information about the data in the data frame. Only use column names and dataframe_name as in this context")
    styling_context = dspy.InputField(desc='Provides instructions on how to style your Plotly plots')
    # Output field: the generated Plotly script.
    code= dspy.OutputField(desc="Plotly code that visualizes what the user needs according to the query & dataframe_index & styling_context")

# An optional agent that checks if the user-defined goal works well
# Signature for the optional goal-refinement step: rewrite the user's goal
# into a more detailed one using the dataset and agent descriptions.
# The docstring is the prompt text and is kept verbatim.
class goal_refiner_agent(dspy.Signature):
    """You take a user-defined goal given to a AI data analyst planner agent, 
    you make the goal more elaborate using the datasets available and agent_desc"""
    # Input fields (same three inputs as the planner signature)
    dataset = dspy.InputField(desc="Available datasets loaded in the system, use this df_name,columns  set df as copy of df_name")
    Agent_desc = dspy.InputField(desc= "The agents available in the system")
    goal = dspy.InputField(desc="The user defined goal ")
    # Output field: the elaborated goal text.
    refined_goal = dspy.OutputField(desc='Refined goal that helps the planner agent plan better')

In [None]:
# This module takes only one input on initiation
class auto_analyst(dspy.Module):
    """Plan an analysis, run the planned agents, and combine their code.

    The module wraps every supplied agent signature in a chain-of-thought
    predictor, asks a planner which agents to run (and in what order), runs
    them, and finally hands all generated code to a combiner agent.

    Requires two module-level retriever sources, `dataframe_index` and
    `style_index`, to exist before instantiation; they are not defined in
    this class.
    """

    def __init__(self, agents):
        """Register the agents and build the helper predictors.

        Args:
            agents: iterable of `dspy.Signature` classes. Each one is keyed by
                its class name, which is also the name the planner is
                expected to emit in its plan.
        """
        super().__init__()
        # Defines the available agents, their inputs, and description
        self.agents = {}
        self.agent_inputs = {}
        self.agent_desc = []
        for a in agents:
            name = a.__name__
            # Using CoT prompting as from experience it helps generate better responses
            self.agents[name] = dspy.ChainOfThought(a)
            # Names of the input fields this agent must be called with.
            self.agent_inputs[name] = set(a.input_fields)
            # The signature's string form is what the planner gets to read.
            self.agent_desc.append(str(a))
        # Defining the planner, refine_goal & code combiner agents separately
        # as they don't generate the code & analysis; they help in planning,
        # getting better goals & combining the code
        self.planner = dspy.ChainOfThought(analytical_planner)
        self.refine_goal = dspy.ChainOfThought(goal_refiner_agent)
        self.code_combiner_agent = dspy.ChainOfThought(code_combiner_agent)
        # these two retrievers are defined using llama-index retrievers
        # you can customize this depending on how you want your agents
        self.dataset = dataframe_index.as_retriever(similarity_top_k=1)
        self.styling_index = style_index.as_retriever(similarity_top_k=1)

    def _known_steps(self, plan_text):
        """Return the '->'-separated plan steps that name a registered agent, in order."""
        steps = (step.strip() for step in str(plan_text).split('->'))
        return [step for step in steps if step in self.agents]

    def _make_plan(self, context):
        """Run the planner on `context` and return (prediction, usable steps)."""
        plan = self.planner(
            goal=context['goal'],
            dataset=context['dataset'],
            Agent_desc=context['Agent_desc'],
        )
        return plan, self._known_steps(plan.plan)

    def forward(self, query):
        """Answer `query` by planning, running the planned agents and merging their code.

        Args:
            query: the user's goal as text.

        Returns:
            dict mapping 'analytical_planner', each executed agent's name and
            'code_combiner_agent' to the corresponding prediction. When the
            goal had to be refined, 'goal_refiner_agent' is included as well.

        Raises:
            KeyError: if an agent declares an input field for which no
                context value is available.
        """
        # This dict is used to quickly pass arguments for agent inputs
        context = {}
        # retrieves the relevant context to the query
        context['dataset'] = self.dataset.retrieve(query)[0].text
        context['styling_index'] = self.styling_index.retrieve(query)[0].text
        context['goal'] = query
        context['Agent_desc'] = str(self.agent_desc)
        # The Data_Viz signature names its inputs differently from the other
        # agents; expose the same retrieved text under those names too.
        context['dataframe_context'] = context['dataset']
        context['styling_context'] = context['styling_index']

        # output_dictionary that stores all agent outputs
        output_dict = {}
        # this comes up with the plan
        plan, plan_list = self._make_plan(context)
        if not plan_list:
            # The plan names no registered agent, so the goal is treated as
            # unclear: refine it once and plan again. A single retry keeps
            # the number of model calls bounded.
            refined = self.refine_goal(
                dataset=context['dataset'],
                goal=context['goal'],
                Agent_desc=context['Agent_desc'],
            )
            output_dict['goal_refiner_agent'] = refined
            context['goal'] = refined.refined_goal
            plan, plan_list = self._make_plan(context)
        output_dict['analytical_planner'] = plan

        # passes the goal and other inputs to all respective agents in the plan
        code_list = []
        for step in plan_list:
            inputs = {field: context[field] for field in self.agent_inputs[step]}
            output_dict[step] = self.agents[step](**inputs)
            # creates a list of all the generated code, to be combined as 1 script
            code_list.append(output_dict[step].code)
        # Stores the last output
        output_dict['code_combiner_agent'] = self.code_combiner_agent(agent_code_list=str(code_list))

        return output_dict
# you can store all available agent signatures as a list
# (the visualization signature defined in this file is named `Data_Viz`;
# the previously used name `data_viz_agent` is not defined anywhere here)
agents = [preprocessing_agent, statistical_analytics_agent, sk_learn_agent, Data_Viz]

# Define the agentic system
auto_analyst_system = auto_analyst(agents)

# the system is preloaded with Chicago crime data
goal = "What is the cause of crime in Chicago?"

# Asking the agentic system to perform analysis for this query
output = auto_analyst_system(query=goal)