In [None]:
!pip install openai==0.28 pinecone langchain pandas matplotlib seaborn langchain_community tiktoken pydantic huggingface_hub

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Collecting pinecone
  Downloading pinecone-6.0.1-py3-none-any.whl.metadata (8.8 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.18-py3-none-any.whl.metadata (2.4 kB)
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (fr

In [None]:
# %%
import os
import pandas as pd
import openai
from pinecone import Pinecone
from langchain.agents import initialize_agent, AgentType
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.tools import Tool
from langchain.chains import LLMChain
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.memory import ConversationBufferMemory
from langchain.embeddings.openai import OpenAIEmbeddings
from google.colab import userdata
import seaborn as sns
import matplotlib.pyplot as plt
import io
import ipywidgets as widgets
from IPython.display import display
from datetime import datetime
from typing import Any
import json
import re

# 🔹 API Key Retrieval
# Keys are read from the Colab `userdata` store instead of being hard-coded in the notebook.
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

openai.api_key = OPENAI_API_KEY
# Module-level Pinecone handles: `index` is queried by memory_agent() and written by store_message().
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "chatbot-memory"
# NOTE(review): presumably the "chatbot-memory" index must already exist — confirm.
index = pc.Index(index_name)
print("Pinecone initialized and connected to index:", index_name)

from huggingface_hub import InferenceClient

# ----------------------------
# Model Selection Dropdown
# ----------------------------
# The selected value is read by create_llm(): the "gemini" entry is routed to
# GeminiWrapper, every other entry is passed to ChatOpenAI as the model name.
model_dropdown = widgets.Dropdown(
    options=["gpt-4", "gpt-3.5-turbo", "gemini", "gpt-4o","gpt-4-turbo"],
    value="gpt-4",
    description="Model:"
)
display(model_dropdown)

# Adapter around huggingface_hub's InferenceClient that mimics the `invoke` method (returning an object with `.content`); it backs the "gemini" dropdown entry.
class GeminiWrapper:
    """Adapter exposing ``invoke(prompt)`` on top of ``InferenceClient.text_generation``.

    The object returned by ``invoke`` carries the generated text in a
    ``content`` attribute, which is what the rest of this file reads from an
    LLM response.
    """

    def __init__(self, model, token, temperature=1):
        # NOTE: ``temperature`` is only stored; ``invoke`` does not forward it.
        self.temperature = temperature
        self.client = InferenceClient(model=model, token=token)

    def invoke(self, prompt):
        """Generate text for ``prompt`` and return it wrapped in an object with ``.content``."""
        raw = self.client.text_generation(prompt)

        # Reduce the supported return shapes (list of dicts, dict, str) to a plain
        # string; anything else yields an empty string.
        text = ""
        if isinstance(raw, list):
            if raw:
                text = raw[0].get('generated_text', "")
        elif isinstance(raw, dict):
            text = raw.get('generated_text', "")
        elif isinstance(raw, str):
            text = raw

        # Minimal response holder exposing only ``content``.
        class Response:
            pass

        wrapped = Response()
        wrapped.content = text
        return wrapped

def create_llm():
    """Build the LLM client for the model currently selected in ``model_dropdown``.

    Returns a ``GeminiWrapper`` when the dropdown value is "gemini"
    (case-insensitive), otherwise a ``ChatOpenAI`` configured with the selected
    model name and temperature 0.
    """
    choice = model_dropdown.value
    if choice.lower() != "gemini":
        return ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name=choice, temperature=0)
    # The "gemini" entry is served through the Hugging Face InferenceClient; the
    # token is read from the Colab secret named 'GOOGLE_API_KEY'.
    return GeminiWrapper(model="google/gemma-2-2b-it", token=userdata.get('GOOGLE_API_KEY'))

# Module-level LLM handle used by the agents below; rebuilt whenever the dropdown changes.
llm = create_llm()

def on_model_change(change):
    """Dropdown observer: rebuild the global ``llm`` for the newly selected model.

    ``change`` is the notification payload passed by ``observe``; it is not
    inspected because ``create_llm`` reads ``model_dropdown.value`` directly.
    """
    global llm
    llm = create_llm()
    print("LLM updated to model:", model_dropdown.value)

# Only changes of the widget's "value" trait trigger the rebuild.
model_dropdown.observe(on_model_change, names="value")
# ----------------------------
# Memory Agent and Storage
# ----------------------------
def memory_agent(query, score_threshold=0.95):
    """Return a previously stored answer for ``query`` when a close match exists.

    The query is embedded with ``OpenAIEmbeddings`` and the single nearest
    vector is fetched from the Pinecone ``index``.

    Args:
        query: User question to look up.
        score_threshold: Similarity score the best match must exceed
            (strictly) to be reused. Defaults to 0.95.

    Returns:
        ``"Fetching from memory: <response>"`` when a sufficiently similar
        vector with a stored ``response`` exists, otherwise ``None``.
    """
    embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    query_embedding = embedding_model.embed_query(query)
    search_results = index.query(vector=query_embedding, top_k=1, include_metadata=True)

    matches = search_results['matches']
    if not matches:
        return None

    best_match = matches[0]
    if not best_match['score'] > score_threshold:
        return None

    # A vector stored without a 'response' entry cannot be replayed; treat it
    # as a cache miss instead of raising.
    metadata = best_match['metadata'] or {}
    cached_response = metadata.get('response')
    if cached_response is None:
        return None
    return f"Fetching from memory: {cached_response}"

# Tool wrapper around memory_agent; execute_pipeline calls it through memory_tool.func(query).
memory_tool = Tool(name="MemoryAgent", func=memory_agent, description="Fetches responses from memory.")

def store_message(query, response):
    """Store ``response`` in the Pinecone ``index`` under the embedding of ``query``.

    Args:
        query: User question; embedded with ``OpenAIEmbeddings`` and also used
            to derive the vector id.
        response: Answer text saved in the vector's ``response`` metadata.
    """
    import hashlib  # local import: only needed to derive the vector id

    embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    query_embedding = embedding_model.embed_query(query)

    # The built-in hash() of a str is salted per interpreter process, so it
    # yields a different id for the same query after every kernel restart and
    # the upsert would add duplicates instead of overwriting. A SHA-256 digest
    # is stable across sessions.
    vector_id = hashlib.sha256(query.encode("utf-8")).hexdigest()

    vector_data = {
        "id": vector_id,
        "values": query_embedding,
        "metadata": {"response": response}
    }
    index.upsert(vectors=[vector_data])

# ----------------------------
# Data Cleaning Agent
# ----------------------------
def _cleaning_fill_value(series):
    """Pick the value used to fill gaps in ``series``; None when nothing usable exists.

    Numeric and datetime columns use the median; every other dtype (object,
    string, category, ...) uses the most frequent value.
    """
    orderable = (
        pd.api.types.is_numeric_dtype(series)
        or pd.api.types.is_datetime64_any_dtype(series)
    )
    if orderable:
        try:
            candidate = series.median()
        except (TypeError, ValueError):
            candidate = None
        if candidate is not None and not pd.isna(candidate):
            return candidate
    modes = series.mode(dropna=True)
    if modes.empty:
        # Column holds no non-missing values at all.
        return None
    return modes.iloc[0]


def data_cleaning_agent(df: pd.DataFrame):
    """Clean a copy of ``df`` and print a short report of what was done.

    Steps, in order:
      1. strip and lowercase the column names;
      2. drop duplicate rows;
      3. convert columns whose name contains "date" or "time" to datetime
         (values that cannot be parsed become NaT);
      4. fill missing values: median for numeric/datetime columns, most
         frequent value otherwise. Columns with no usable fill value are left
         as they are and listed in the report.

    Args:
        df: Dataset to clean. It is not modified.

    Returns:
        ``{"df": cleaned_dataframe}`` on success, or an error string when
        ``df`` is ``None`` or empty.
    """
    if df is None or df.empty:
        return "⚠️ Error: No dataset found. Please upload a dataset first."

    df_copy = df.copy()
    df_copy.columns = df_copy.columns.str.strip().str.lower()

    rows_before = df_copy.shape[0]
    df_copy.drop_duplicates(inplace=True)
    removed_rows = rows_before - df_copy.shape[0]

    date_columns = []
    for col in df_copy.columns:
        if "date" in col.lower() or "time" in col.lower():
            df_copy[col] = pd.to_datetime(df_copy[col], errors="coerce")
            date_columns.append(col)

    missing_summary = df_copy.isnull().sum()
    missing_columns = missing_summary[missing_summary > 0].index.tolist()
    filled_columns = []
    unfilled_columns = []
    for col in missing_columns:
        fill_value = _cleaning_fill_value(df_copy[col])
        if fill_value is None:
            unfilled_columns.append(col)
            continue
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form operates on a temporary and is deprecated.
        df_copy[col] = df_copy[col].fillna(fill_value)
        filled_columns.append(col)

    report = "✅ Data Cleaning Completed!\n"
    report += "📌 Converted column names to lowercase.\n"
    report += f"📌 Removed {removed_rows} duplicate rows.\n"
    if date_columns:
        report += f"📆 Converted columns to datetime: {', '.join(date_columns)}\n"
    if filled_columns:
        report += "📉 Missing values handled for: " + ", ".join(filled_columns) + "\n"
    elif not missing_columns:
        report += "✅ No missing values found.\n"
    if unfilled_columns:
        report += "⚠️ Could not fill columns without any non-missing value: " + ", ".join(unfilled_columns) + "\n"
    print(report)
    return {"df": df_copy}


# ----------------------------
# Save Generated Chart
# ----------------------------
def save_chart():
    """Save the current matplotlib figure as a PNG inside a new timestamped directory.

    The directory is named ``charts_<YYYYmmdd_HHMMSS>``. When that directory
    already exists (several charts saved within the same second, e.g. the EDA
    charts plotted back-to-back) a numeric suffix is appended so an earlier
    chart is never overwritten.

    Returns:
        Path of the written PNG file.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_directory = f"charts_{timestamp}"

    # Find a directory name that is not taken yet.
    directory = base_directory
    suffix = 1
    while os.path.exists(directory):
        directory = f"{base_directory}_{suffix}"
        suffix += 1
    os.makedirs(directory, exist_ok=True)

    filename = os.path.join(directory, f"chart_{timestamp}.png")
    # Save the current figure
    plt.savefig(filename, bbox_inches="tight")
    print(f"Chart saved to: {filename}")
    return filename



# ----------------------------
# Plotting Function
# ----------------------------
def plot_chart(chart_type, df, x_axis, y_axis, additional_params):
    """Draw, save and show one seaborn chart for ``df``.

    Args:
        chart_type: One of "bar", "line", "scatter", "box", "histogram",
            "heatmap" (case-insensitive). Other values are skipped with a
            printed notice.
        df: DataFrame holding the data.
        x_axis: Column name for the x axis.
        y_axis: Column name for the y axis (unused for histograms).
        additional_params: Optional dict of extras ("hue", "size", "bins",
            "values", "aggfunc"); ``None`` is treated as an empty dict.

    Errors raised while plotting are caught and printed so one bad suggestion
    does not abort the caller.
    """
    params = additional_params or {}
    chart_key = str(chart_type).lower()
    hue = params.get("hue")

    # Columns are passed by name together with data=df so that name-based
    # extras such as hue/size can be resolved against the same frame.
    plot_funcs = {
        "bar": lambda: sns.barplot(data=df, x=x_axis, y=y_axis, hue=hue, palette="viridis"),
        "line": lambda: sns.lineplot(data=df, x=x_axis, y=y_axis, hue=hue, linewidth=2.5, alpha=0.8),
        "scatter": lambda: sns.scatterplot(data=df, x=x_axis, y=y_axis, hue=hue, size=params.get("size"), alpha=0.7),
        "box": lambda: sns.boxplot(data=df, x=x_axis, y=y_axis, hue=hue),
        "histogram": lambda: sns.histplot(data=df, x=x_axis, bins=params.get("bins", 20), kde=True),
        "heatmap": lambda: sns.heatmap(
            df.pivot_table(index=y_axis, columns=x_axis, values=params.get("values"), aggfunc=params.get("aggfunc", "sum")).fillna(0),
            cmap="coolwarm", annot=True, fmt=".1f"
        )
    }

    # Validate before creating a figure so unsupported types leave no empty
    # figure behind.
    if chart_key not in plot_funcs:
        print(f"⚠️ Skipping unsupported chart type: {chart_type}")
        return

    fig = plt.figure(figsize=(12, 6))
    try:
        plot_funcs[chart_key]()
        plt.title(f"{str(chart_type).capitalize()} for {y_axis} vs {x_axis}", fontsize=14, fontweight='bold')
        plt.xlabel(x_axis, fontsize=12)
        plt.ylabel(y_axis, fontsize=12)
        plt.xticks(rotation=45, fontsize=10)
        plt.yticks(fontsize=10)
        plt.grid(True, linestyle="--", alpha=0.6)
        # Save the chart
        save_chart()
        plt.show()
    except Exception as e:
        print(f"⚠️ Error generating {chart_type}: {e}")
    finally:
        # Release the figure on success and on failure; otherwise figures pile
        # up for as long as the chat loop keeps running.
        plt.close(fig)

# ----------------------------
# Aggregation Function (Case 1)
# ----------------------------
# Values the LLM uses for "nothing here", including placeholders echoed from the prompt example.
_AGG_BLANK_MARKERS = {"", "null", "none", "not specified", "optional_column", "optional_value"}

# Accepted operation names mapped to the pandas reduction implementing them.
_AGG_METHODS = {
    "max": "max",
    "min": "min",
    "avg": "mean",
    "mean": "mean",
    "average": "mean",
    "sum": "sum",
    "count": "count",
}


def _agg_is_blank(value):
    """Return True when ``value`` is None or a blank/placeholder string."""
    return value is None or str(value).strip().lower() in _AGG_BLANK_MARKERS


def _agg_parse_mapping(text):
    """Parse the dict contained in an LLM reply without executing it.

    Accepts strict JSON as well as a Python-literal dict (single quotes,
    ``None``), optionally wrapped in a markdown code fence or surrounded by
    extra prose. Raises ``ValueError``/``SyntaxError`` when no dict can be read.
    """
    import ast  # local import: only needed for the literal fallback

    cleaned = text.strip()
    fenced = re.search(r"```(?:json|python)?\s*(.*?)```", cleaned, re.DOTALL | re.IGNORECASE)
    if fenced:
        cleaned = fenced.group(1).strip()
    # Narrow to the outermost braces to drop any leading/trailing prose.
    start, end = cleaned.find("{"), cleaned.rfind("}")
    if start != -1 and end > start:
        cleaned = cleaned[start:end + 1]
    try:
        parsed = json.loads(cleaned)
    except ValueError:
        # literal_eval only accepts literals, so model output is never executed.
        parsed = ast.literal_eval(cleaned)
    if not isinstance(parsed, dict):
        raise ValueError("aggregation response is not a JSON object")
    return {str(key).strip(): value for key, value in parsed.items()}


def _agg_as_columns(value, known_columns):
    """Normalise a column spec (None, one name, comma-separated names or a list) to a list."""
    if isinstance(value, (list, tuple)):
        return [item.strip() if isinstance(item, str) else item
                for item in value if not _agg_is_blank(item)]
    if _agg_is_blank(value):
        return []
    if isinstance(value, str):
        name = value.strip()
        if name in known_columns or "," not in name:
            return [name]
        return [part.strip() for part in name.split(",") if part.strip()]
    return [value]


def _agg_filter_mask(series, filter_value):
    """Boolean mask of rows in ``series`` equal to ``filter_value``.

    Numeric columns are compared numerically when the value parses as a
    number; otherwise the comparison is on trimmed, case-insensitive text.
    """
    if pd.api.types.is_numeric_dtype(series):
        try:
            return series == float(filter_value)
        except (TypeError, ValueError):
            pass
    return series.astype(str).str.strip().str.lower() == str(filter_value).strip().lower()


def perform_aggregation(query, df):
    """Ask the LLM which aggregation ``query`` needs and compute it on ``df``.

    The LLM is prompted for a JSON object with ``operation``, ``column`` and
    optional ``filter_column``/``filter_value``/``group_by`` entries. The reply
    is parsed as data (never evaluated as code), the optional equality filter
    is applied, and the aggregation is computed either per group or as a
    single value.

    Args:
        query: Natural-language user question.
        df: Dataset to aggregate. It is not modified.

    Returns:
        ``(message, df)``. ``message`` is empty when the LLM reports no
        aggregation, starts with "✅" on success and with "⚠️" on failure.
        ``df`` is always the input frame.
    """
    aggregation_prompt = PromptTemplate(
        input_variables=["query", "dataset_columns"],
        template=(
            "Analyze the user's query and determine:\n"
            "1. Which aggregation function (max, min, avg, sum, count) should be applied.\n"
            "2. Which column(s) the aggregation should be applied to.\n"
            "3. Optionally, if grouping is required, which columns to group by.\n\n"
            "User Query: {query}\n"
            "Available Dataset Columns: {dataset_columns}\n"
            "Return the aggregation information as a JSON object. For example:\n"
            # The example now shows 'group_by', the key read below for item 3.
            "{{ 'operation': 'max' or 'min' or 'avg' or 'sum' or 'count', 'column': 'column_name', 'filter_column': 'optional_column', 'filter_value': 'optional_value', 'group_by': ['optional_column'] }}"
        )
    )
    aggregation_query = aggregation_prompt.format(query=query, dataset_columns=", ".join(map(str, df.columns)))
    aggregation_response = llm.invoke(aggregation_query)
    try:
        print("📊 LLM Aggregation Response: ", aggregation_response.content)
        agg_result = _agg_parse_mapping(aggregation_response.content)
    except Exception as e:
        print(f"⚠️ Error parsing aggregation response: {e}")
        return ("⚠️ Failed to process the query. Please try again.", df)

    operation = agg_result.get("operation")
    if _agg_is_blank(operation):
        return ("", df)
    operation = str(operation).strip().lower()
    method = _AGG_METHODS.get(operation)
    if method is None:
        return ("⚠️ Error: Unknown aggregation operation.", df)

    # Optional equality filter requested by the LLM.
    working = df
    filter_note = ""
    filter_column = agg_result.get("filter_column")
    filter_value = agg_result.get("filter_value")
    if not _agg_is_blank(filter_column) and not _agg_is_blank(filter_value):
        filter_column = str(filter_column).strip()
        if filter_column not in df.columns:
            return (f"⚠️ Error: Filter column '{filter_column}' not found in dataset.", df)
        working = df[_agg_filter_mask(df[filter_column], filter_value)]
        if working.empty:
            return (f"⚠️ Error: No rows match {filter_column} = {filter_value}.", df)
        filter_note = f" where {filter_column} = {filter_value}"

    columns = _agg_as_columns(agg_result.get("column"), df.columns)
    group_by_cols = _agg_as_columns(agg_result.get("group_by"), df.columns)

    if group_by_cols:
        if not columns:
            return ("⚠️ Error: No aggregation column was identified for the query.", df)
        missing_cols = [col for col in columns if col not in df.columns]
        missing_group = [col for col in group_by_cols if col not in df.columns]
        if missing_cols:
            return (f"⚠️ Error: Aggregation columns {missing_cols} not found in dataset.", df)
        if missing_group:
            return (f"⚠️ Error: Group by columns {missing_group} not found in dataset.", df)
        try:
            result_df = working.groupby(group_by_cols)[columns].agg(method).reset_index()
            agg_str = (
                f"✅ {operation.capitalize()} values for {', '.join(map(str, columns))} "
                f"grouped by {', '.join(map(str, group_by_cols))}{filter_note}:\n"
            )
            agg_str += result_df.to_string(index=False) + "\n"
            return (agg_str, df)
        except Exception as e:
            return (f"⚠️ Error performing grouped aggregation: {e}", df)

    # Ungrouped: a single value over the first requested column.
    column = columns[0] if columns else None
    if column not in df.columns:
        return (f"⚠️ Error: Column '{column}' not found in dataset.", df)
    try:
        series = working[column]
        result = getattr(series, method)()
        agg_str = f"✅ {operation.capitalize()} value of {column}{filter_note}: {result}\n"
        extra_info = ""
        if method in ("max", "min"):
            # Show which rows carry the extreme value.
            matching_rows = working[series == result]
            if "product_name" in working.columns:
                product_names = matching_rows["product_name"].astype(str).tolist()
                extra_info = f"Associated product(s): {', '.join(product_names)}\n"
            else:
                extra_info = f"Matching records: {matching_rows.to_dict(orient='records')}\n"
        return (agg_str + extra_info, df)
    except Exception as e:
        return (f"⚠️ Error performing aggregation: {e}", df)

# ----------------------------
# EDA Charts Function (Case 2)
# ----------------------------
def _parse_chart_suggestion(line):
    """Parse one ``chart_type | x_axis | y_axis | additional_params | reason`` line.

    Returns a suggestion dict, or ``None`` when the line has fewer than five
    pipe-separated fields. ``additional_params`` is parsed only when it is a
    JSON object; anything else ('none', blank, malformed JSON, non-dict JSON)
    becomes ``{}``.
    """
    parts = line.split("|")
    if len(parts) < 5:
        return None
    chart_type, x_axis, y_axis, params_text, reason = (part.strip() for part in parts[:5])
    # Drop a leading list marker such as "1." or "-" that models often prepend.
    chart_type = re.sub(r"^\s*(?:[-*•]|\d+[.)])\s*", "", chart_type)

    additional_params = {}
    if params_text.startswith("{"):
        try:
            loaded = json.loads(params_text)
        except ValueError:
            loaded = None
        if isinstance(loaded, dict):
            additional_params = loaded

    return {
        "chart_type": chart_type,
        "x_axis": x_axis,
        "y_axis": y_axis,
        "additional_params": additional_params,
        "reason": reason
    }


def _parse_chart_suggestions(text):
    """Return every well-formed suggestion found in an LLM reply, in order.

    A 'chart_suggestions:' header prefix is ignored, as are lines that are not
    suggestions (prose, code fences, blank lines).
    """
    header = "chart_suggestions:"
    suggestions = []
    for raw_line in text.strip().splitlines():
        line = raw_line.strip()
        if line.lower().startswith(header):
            line = line[len(header):]
        suggestion = _parse_chart_suggestion(line)
        if suggestion is not None:
            suggestions.append(suggestion)
    return suggestions


def suggest_eda_charts(df, query):
    """Ask the LLM for EDA chart suggestions and plot the first three.

    Args:
        df: Dataset to explore; its ``describe(include="all")`` summary is sent
            to the LLM.
        query: User request included in the prompt.

    Returns:
        ``(response_text, chart_suggestions)`` where ``response_text`` is a
        markdown description of the (at most three) plotted charts and
        ``chart_suggestions`` is the list of all parsed suggestion dicts.
    """
    dataset_summary = str(df.describe(include="all"))
    eda_prompt_template = PromptTemplate(
        input_variables=["dataset_summary", "query"],
        template=(
            "Based on the dataset summary below and the user query: {query}, please suggest the top 3 charts that would be most useful for exploratory data analysis (EDA) of the dataset. "
            "IMPORTANT: Only use the column names that appear in the dataset summary. Do not invent any new column names or metrics. "
            "Restrict your suggestions to these supported chart types: bar, line, scatter, box, histogram, and heatmap. "
            "For each suggested chart, output a single line with the following fields separated by a pipe (|): "
            "chart_type, x_axis, y_axis, additional_params, reason. "
            "For additional_params, if none are needed, output 'none'. "
            "The response should start with the header 'chart_suggestions:' followed by exactly three lines, one for each suggestion.\n\n"
            "Dataset Summary:\n{dataset_summary}"
        )
    )
    eda_prompt = eda_prompt_template.format(dataset_summary=dataset_summary, query=query)
    eda_response = llm.invoke(eda_prompt)
    chart_suggestions = _parse_chart_suggestions(eda_response.content)

    response_text = "**📊 Suggested Charts for EDA:**\n"
    for i, chart in enumerate(chart_suggestions[:3], start=1):
        response_text += (
            f"\n**{i}. {chart['chart_type']}**\n"
            f"   - 📌 x_axis: `{chart['x_axis']}`\n"
            f"   - 📌 y_axis: `{chart['y_axis']}`\n"
            f"   - 🎨 Additional Parameters: {chart['additional_params']}\n"
            f"   - 💡 Reason: {chart['reason']}\n"
        )
        plot_chart(chart['chart_type'], df, chart['x_axis'], chart['y_axis'], chart['additional_params'])
    return (response_text, chart_suggestions)

def suggest_individual_chart(df, query):
    """Ask the LLM for the single chart that best answers ``query`` and plot it.

    The first well-formed suggestion line in the reply is used, so a header
    line, surrounding prose or extra suggestions do not break parsing.

    Args:
        df: Dataset to visualise; its ``describe(include="all")`` summary is
            sent to the LLM.
        query: User request describing the desired plot.

    Returns:
        ``(response_text, suggestion)`` on success, or
        ``(error_message, None)`` when the reply contains no valid suggestion.
    """
    dataset_summary = str(df.describe(include="all"))
    plot_prompt_template = PromptTemplate(
        input_variables=["dataset_summary", "query"],
        template=(
            "Based on the dataset summary below and the user query: {query}, please suggest a single chart that best visualizes the specific plot requested. "
            "IMPORTANT: Only use the column names from the dataset summary; do not invent new column names or metrics. "
            "Use one of the following supported chart types: bar, line, scatter, box, histogram, or heatmap. "
            "For the chart, output a single line with the following fields separated by a pipe (|): "
            "chart_type, x_axis, y_axis, additional_params, reason. "
            "For additional_params, if none are needed, output 'none'.\n\n"
            "Dataset Summary:\n{dataset_summary}"
        )
    )
    plot_prompt = plot_prompt_template.format(dataset_summary=dataset_summary, query=query)
    plot_response = llm.invoke(plot_prompt)
    candidates = _parse_chart_suggestions(plot_response.content)
    if not candidates:
        return ("⚠️ Failed to generate a valid chart suggestion for the individual plot.", None)

    suggestion = candidates[0]
    chart_type = suggestion["chart_type"]
    x_axis = suggestion["x_axis"]
    y_axis = suggestion["y_axis"]
    additional_params = suggestion["additional_params"]
    reason = suggestion["reason"]

    response_text = "**📊 Suggested Chart for Visualization:**\n"
    response_text += (
        f"\n**{chart_type}**\n\n"
        f"   - 📌 x_axis: `{x_axis}`\n\n"
        f"   - 📌 y_axis: `{y_axis}`\n\n"
        f"   - 🎨 Additional Parameters: {additional_params}\n\n"
        f"   - 💡 Reason: {reason}\n\n"
    )
    plot_chart(chart_type, df, x_axis, y_axis, additional_params)
    print("\n\n\n\n\n\n\n\n")
    return (response_text, suggestion)

# ----------------------------
# Main Data Analysis Agent
# ----------------------------
def data_analysis_agent(input):
    """Route an analysis request to aggregation, EDA charts or a single plot.

    Args:
        input: Dict with ``"df"`` (DataFrame) and ``"query"`` (str). An
            ``"activated_agents"`` entry may be present; it is not used for
            routing.

    Returns:
        * an error string when no dataset is available or aggregation fails;
        * the aggregation message (str) for plain analytical questions;
        * ``{"response": str, "chart_suggestions": list}`` for EDA and plot
          requests.
    """
    import re  # local import keeps the function self-contained

    df = input["df"]
    query = input["query"]
    if df is None or df.empty:
        return "⚠️ Error: No dataset found. Please upload a dataset first."
    df_copy = df.copy()
    query_lower = query.lower()

    # "eda" must be a whole word: a plain substring test also fires on words
    # such as "medals" or "sedan" and would misroute aggregation questions.
    is_eda_request = bool(re.search(r"\beda\b", query_lower)) or ("exploratory" in query_lower)
    supported_charts = ["bar", "line", "scatter", "box", "histogram", "heatmap"]
    # Chart words stay substring matches so compounds like "boxplot" are caught.
    is_individual_plot_request = (("plot" in query_lower or "chart" in query_lower) and
                                  any(ch in query_lower for ch in supported_charts))

    if is_eda_request or is_individual_plot_request:
        agg_result_str = ""
    else:
        agg_result_str, df_copy = perform_aggregation(query, df_copy)
        if agg_result_str.startswith("⚠️"):
            return agg_result_str

    if is_eda_request:
        vis_response, chart_suggestions = suggest_eda_charts(df_copy, query)
        return {"response": agg_result_str + vis_response, "chart_suggestions": chart_suggestions}
    elif is_individual_plot_request:
        vis_response, suggestion = suggest_individual_chart(df_copy, query)
        return {"response": agg_result_str + vis_response, "chart_suggestions": [suggestion] if suggestion else []}
    else:
        return agg_result_str

def _extract_generated_code(text):
    """Return the Python source contained in an LLM reply.

    The body of the first complete fenced block is used when there is one.
    Otherwise stray fence markers and a bare ``python`` language-tag line are
    removed from the raw text.
    """
    fenced = re.search(r"```[ \t]*(?:python|py)?[ \t]*\r?\n(.*?)```", text, re.DOTALL | re.IGNORECASE)
    if fenced:
        return fenced.group(1).strip()
    cleaned = text.replace("```", "").strip()
    return "\n".join(line for line in cleaned.splitlines() if line.strip().lower() != "python")


def generic_advanced_analytics_agent(input):
    """Answer ``query`` by having the LLM write pandas code and running it.

    SECURITY: the generated code is executed with ``exec`` and has the full
    privileges of the notebook process. Only use with trusted models and data.

    Args:
        input: Dict with ``"df"`` (DataFrame exposed to the code as ``df``) and
            ``"query"`` (str).

    Returns:
        Whatever the generated ``analysis()`` function returns, or an error
        string when no such function is produced or execution fails.
    """
    df = input["df"]
    query = input["query"]
    dataset_summary = str(df.describe(include="all"))
    prompt = (
        "You are an expert data analyst and Python programmer. "
        "Given the following dataset summary (the data is available in the variable 'df'):\n\n"
        f"{dataset_summary}\n\n"
        "and the following user query:\n\n"
        f'"{query}"\n\n'
        "Generate a complete, self-contained Python code snippet that uses the Pandas library to answer the question. "
        "Your code should assume that the data is already loaded in a variable named 'df' and should not attempt to read from any file. "
        "Define a function called `analysis()` that takes no parameters and returns a string with the final answer. "
        "Make sure to use only the column names that appear in the dataset summary; do not invent any new columns. "
        "Return only the Python code (no additional explanation)."
    )
    code_response = llm.invoke(prompt)
    code = _extract_generated_code(code_response.content.strip())

    # Cut off a trailing natural-language explanation ("This code ...").
    executable_lines = []
    for line in code.splitlines():
        if line.strip().lower().startswith("this code"):
            break
        executable_lines.append(line)
    executable_code = "\n".join(executable_lines)
    print("Generated Code:\n", executable_code)

    # One namespace serves as both globals and locals: with separate dicts the
    # imports and helpers defined by the generated code would not be visible
    # from inside analysis().
    namespace = {"pd": pd, "df": df}
    try:
        exec(executable_code, namespace)
        analysis_fn = namespace.get("analysis")
        if callable(analysis_fn):
            return analysis_fn()
        return "Failed to generate a valid analysis function."
    except Exception as e:
        return f"Error executing generated code: {e}\nGenerated Code:\n{executable_code}"

def join_datasets_agent(input):
    """Join several datasets by having the LLM write pandas code and running it.

    SECURITY: the generated code is executed with ``exec`` and has the full
    privileges of the notebook process. Only use with trusted models and data.

    Args:
        input: Dict with ``"datasets"`` (mapping of name -> DataFrame; every
            name is exposed to the code as a variable) and ``"query"`` (str).

    Returns:
        ``{"joined_df": DataFrame, "code": str}`` on success, otherwise
        ``{"error": str, "code": str}``.
    """
    datasets = input["datasets"]
    query = input["query"]
    summaries = {name: str(df.describe(include="all")) for name, df in datasets.items()}
    prompt = (
        "You are an expert data analyst and Python programmer. "
        "Given the following dataset summaries:\n\n" +
        "\n\n".join([f"{name}:\n{summary}" for name, summary in summaries.items()]) +
        "\n\nAnd the following user query about joining these datasets:\n\n"
        f'"{query}"\n\n'
        "Generate a complete, self-contained Python code snippet that uses the Pandas library to join these datasets as required by the query. "
        "Your code should define a function called `join_data()` that takes no parameters and returns the joined DataFrame. "
        "Assume that each dataset is available as a variable with the same name as provided in the datasets dictionary keys. "
        "Return only the Python code (no additional explanation)."
    )
    code_response = llm.invoke(prompt)
    code = _extract_generated_code(code_response.content.strip())
    print("Generated Join Code:\n", code)

    # Single namespace for the same reason as in generic_advanced_analytics_agent.
    namespace = {"pd": pd, **datasets}
    try:
        exec(code, namespace)
        join_fn = namespace.get("join_data")
        if callable(join_fn):
            joined_df = join_fn()
            return {"joined_df": joined_df, "code": code}
        return {"error": "Failed to generate a valid join_data function.", "code": code}
    except Exception as e:
        return {"error": f"Error executing generated join code: {e}", "code": code}

# Each Tool wraps its function in a lambda, so the target name is looked up at
# call time (a function redefined later in the notebook is picked up).
# Payload: dict with "datasets" and "query" keys.
join_datasets_agent_tool = Tool(
    name="JoinDatasetsAgent",
    func=lambda input: join_datasets_agent(input),
    description="Generates and executes Python code to join multiple datasets based on the user query."
)

# ----------------------------
# New Agent Tools for Visualization and Advanced Analysis
# ----------------------------
# Payload: dict with "df" and "query" keys.
advanced_analytics_agent = Tool(
    name="AdvancedAnalyticsAgent",
    func=lambda input: generic_advanced_analytics_agent(input),
    description="Performs advanced analytics by generating and executing Python code based on the dataset and user query."
)

# Payload is forwarded unchanged to data_cleaning_agent, which takes the
# DataFrame itself rather than a dict.
cleaning_tool = Tool(
    name="DataCleaningAgent",
    func=lambda input: data_cleaning_agent(input),
    description="Cleans the dataset by fixing missing values, duplicates, and formatting."
)

# Payload: dict with "df", "query" and "activated_agents" keys.
analysis_tool = Tool(
    name="DataAnalysisAgent",
    func=lambda input: data_analysis_agent(input),
    description="Performs data analysis based on user query."
)

# Payload: dict with "df" and "query" keys; returns (response_text, chart_suggestions).
eda_agent = Tool(
    name="EDAAgent",
    func=lambda input: suggest_eda_charts(input["df"], input["query"]),
    description="Suggests the top 3 charts for exploratory data analysis (EDA) using the provided dataset and user query."
)

# Payload: dict with "df" and "query" keys; returns (response_text, suggestion).
plot_agent = Tool(
    name="PlotAgent",
    func=lambda input: suggest_individual_chart(input["df"], input["query"]),
    description="Suggests a single chart for visualization based on the provided dataset and user query."
)

# ===========================================================
# LLM ACTIVATION DECISION AGENT
# ===========================================================
def llm_activation_decision(query, llm, available_agents):
    """Ask the LLM which of ``available_agents`` should handle ``query``.

    Args:
        query: User request.
        llm: Object with an ``invoke(prompt)`` method whose result exposes
            ``.content``.
        available_agents: Agent names offered to the LLM (inserted into the
            prompt as-is).

    Returns:
        ``(activated_agents, reasoning)``. ``activated_agents`` is always a
        list of non-empty strings; it is empty when the reply cannot be parsed
        or names no agent.
    """
    print("inside llm activation agent")

    response_schemas = [
        ResponseSchema(name="activated_agents", description="List of agent names to execute."),
        ResponseSchema(name="reasoning", description="Reason for activating these agents.")
    ]

    structured_output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
    format_instructions = structured_output_parser.get_format_instructions()

    activation_prompt = PromptTemplate(
        input_variables=["query","available_agents","format_instructions"],
        template=(
            "Analyze the user's query and determine which agents should be activated from the given list of available agents.\n"
            "The following agents are available for activation:\n\n"
            "{available_agents}\n\n"
            "Strictly provide the response in structured JSON format using this schema: {format_instructions}\n\n"
            "User Query: {query}"
        )
    )

    formatted_prompt = activation_prompt.format(query=query, format_instructions=format_instructions, available_agents=available_agents)
    response = llm.invoke(formatted_prompt)
    response_string = response.content.strip()

    print("LLM Activation response unparsed: ", response)
    print("LLM activation Response content: ", response_string)

    try:
        parsed_response = structured_output_parser.parse(response_string)

        # Normalise `activated_agents` to a list of names whatever shape the
        # model produced: null, a single/comma-separated string, or a list.
        raw_agents = parsed_response.get("activated_agents")
        if raw_agents is None:
            activated_agents = []
        elif isinstance(raw_agents, str):
            activated_agents = [agent.strip() for agent in raw_agents.split(",")]
        elif isinstance(raw_agents, (list, tuple, set)):
            activated_agents = [str(agent).strip() for agent in raw_agents]
        else:
            activated_agents = [str(raw_agents).strip()]
        activated_agents = [agent for agent in activated_agents if agent]

        reasoning = parsed_response.get("reasoning") or "No reasoning provided."
        return activated_agents, reasoning

    except Exception as e:
        print(f"Error parsing JSON from LLM activation agent: {e}")
        return [], f"LLM did not produce valid JSON. {e}"

# ----------------------------
# Helper class for chaining tools using the | operator.
# ----------------------------
class ChainedTool:
    """Wrapper around a one-argument callable that can be composed with ``|``.

    ``a | b`` yields a new ``ChainedTool`` that runs ``a`` and then ``b`` on the
    same input dict. Only the result of the last step is returned.
    """

    def __init__(self, func):
        self.func = func

    def invoke(self, input_dict):
        """Run the wrapped callable on ``input_dict`` and return its result."""
        return self.func(input_dict)

    def __or__(self, other):
        """Compose this step with ``other`` (anything exposing ``.func``)."""

        def run_in_sequence(payload):
            intermediate = self.func(payload)
            # A step that returns {"df": ...} hands its frame to the next step
            # by overwriting the shared payload in place.
            hands_over_df = isinstance(intermediate, dict) and "df" in intermediate
            if hands_over_df:
                payload["df"] = intermediate["df"]
            return other.func(payload)

        return ChainedTool(run_in_sequence)

def execute_pipeline(query, df, available_agents, memory, additional_datasets=None):
    """Answer ``query`` from memory or by running the agents chosen by the LLM.

    Args:
        query: User request.
        df: Uploaded dataset; a copy is handed to the agents.
        available_agents: Mapping whose keys are the agent names offered to
            the LLM for activation.
        memory: Conversation memory object; a ``ConversationBufferMemory`` is
            created when ``None``.
        additional_datasets: Optional mapping of name -> DataFrame made
            available to ``JoinDatasetsAgent``.

    Returns:
        The cached response when memory has a match, otherwise the result of
        the last agent in the executed chain, or a warning string when no
        known agent was activated.
    """
    print("Available Agents: ", available_agents)
    if memory is None:
        memory = ConversationBufferMemory()
    response = memory_tool.func(query)
    if response:
        return response
    print("Calling LLM Activation/Decision Agent.")
    activated_agents, reasoning = llm_activation_decision(query, llm, list(available_agents.keys()))
    # Work on a copy so the forced JoinDatasetsAgent below never mutates a list
    # owned by the decision step; also tolerates a None result.
    activated_agents = list(activated_agents or [])
    print(f"🔹 Activated Agents: {activated_agents}")
    print(f"📝 Reasoning: {reasoning}")
    if additional_datasets is not None and "join" in query.lower():
        if "JoinDatasetsAgent" not in activated_agents:
            activated_agents.append("JoinDatasetsAgent")
    current_df = df.copy()
    initial_input = {
        "df": current_df,
        "query": query,
        "activated_agents": activated_agents,
        # Required by JoinDatasetsAgent, which reads inp["additional_datasets"];
        # without this entry it always received an empty mapping.
        "additional_datasets": additional_datasets or {},
    }
    agent_map = {
        "data_cleaning_agent": ChainedTool(lambda inp: data_cleaning_agent(inp["df"])),
        "data_analysis_agent": ChainedTool(lambda inp: data_analysis_agent({
            "df": inp["df"],
            "query": inp["query"],
            "activated_agents": inp["activated_agents"]
        })),
        "EDAAgent": ChainedTool(lambda inp: eda_agent.func({"df": inp["df"], "query": inp["query"]})),
        "PlotAgent": ChainedTool(lambda inp: plot_agent.func({"df": inp["df"], "query": inp["query"]})),
        "AdvancedAnalyticsAgent": ChainedTool(lambda inp: advanced_analytics_agent.func({"df": inp["df"], "query": inp["query"]})),
        "JoinDatasetsAgent": ChainedTool(lambda inp: join_datasets_agent_tool.func({
            "datasets": inp.get("additional_datasets", {}),
            "query": inp["query"]
        }))
    }

    # Only names known to agent_map can run; report the rest instead of
    # listing them as part of the execution chain.
    runnable = [name for name in activated_agents if name in agent_map]
    unknown = [name for name in activated_agents if name not in agent_map]
    if unknown:
        print(f"⚠️ Ignoring unknown agents: {unknown}")

    agent_chain = None
    for name in runnable:
        step = agent_map[name]
        agent_chain = step if agent_chain is None else agent_chain | step
    chain_string = " → ".join(runnable) if runnable else "No valid agents"
    print(f"🛠️ Execution Chain: {chain_string}")
    if agent_chain is None:
        return "⚠️ No valid agents were activated."
    return agent_chain.invoke(initial_input)

# ----------------------------
# File Upload Logic (Keep unchanged)
# ----------------------------
# Module-level holder for the uploaded dataset; chatloop() refuses to run queries while it is None.
uploaded_df = None
file_upload = widgets.FileUpload(
    accept='.csv',
    multiple=False,
    description="📂 Upload CSV"
)
# Output area that shows the upload confirmation message.
output_widget = widgets.Output()
def process_uploaded_file(uploaded_file):
    """Load the uploaded CSV into the global ``uploaded_df``.

    Column names are stripped and lowercased. Does nothing when
    ``uploaded_file`` is empty.
    """
    global uploaded_df
    if uploaded_file:
        # NOTE(review): assumes the widget value is a mapping of entries that each
        # carry a 'content' bytes field (this notebook's output shows `value={}`)
        # — confirm against the installed ipywidgets version.
        file = next(iter(uploaded_file.values()))
        uploaded_df = pd.read_csv(io.BytesIO(file['content']))
        uploaded_df.columns = uploaded_df.columns.str.strip().str.lower()
        with output_widget:
            output_widget.clear_output()
            print("✅ File uploaded successfully! Data has been loaded into memory.")
def on_file_upload(change):
    """Widget observer: process the current upload whenever the widget value changes."""
    process_uploaded_file(file_upload.value)
file_upload.observe(on_file_upload, names='value')
display(file_upload, output_widget)

# ----------------------------
# Updated Chat Loop Function (Keep unchanged)
# ----------------------------
def chatloop(available_agents):
    """Run the interactive console chat until the user types 'exit', 'quit' or 'bye'.

    For every input: require an uploaded dataset, try to answer from the
    Pinecone memory, otherwise run ``execute_pipeline`` and print its result.

    Args:
        available_agents: Mapping of agent name -> agent, forwarded to
            ``execute_pipeline``.
    """
    global uploaded_df
    memory = ConversationBufferMemory()
    print("Chatbot is ready! Type 'exit' to quit.")
    while True:
        print("\n\n\n")
        user_input = input("User: ")
        if user_input.lower() in ["exit", "quit", "bye"]:
            print("Chatbot: Goodbye!")
            break
        if uploaded_df is None:
            print("📂 Please upload a CSV file before proceeding.")
            continue
        # Serve a stored answer when memory_agent reports a match.
        memory_response = memory_agent(user_input)
        if memory_response is not None and memory_response.startswith("Fetching from memory:"):
            print("Chatbot:", memory_response)
            continue
        print("before pipeline exec.")
        final_response = execute_pipeline(
            query=user_input,
            df=uploaded_df,
            available_agents=available_agents,
            memory=memory
        )
        print("after pipeline exec.")
        print("Chatbot:", final_response)
        # Queries mentioning "eda", "plot" or "chart" are not stored in memory.
        if not any(keyword in user_input.lower() for keyword in ["eda", "plot", "chart"]):
            store_message(user_input, str(final_response))

# ----------------------------
# Update Available Agents Dictionary for chaining.
# ----------------------------
# Agents offered to the chat loop, keyed by the names the LLM may activate.
# execute_pipeline passes only the keys of this mapping to the LLM and builds
# its own map of the same names for execution.
agent_map = {
    "data_cleaning_agent": ChainedTool(lambda inp: data_cleaning_agent(inp["df"])),
    "data_analysis_agent": ChainedTool(lambda inp: data_analysis_agent({
        "df": inp["df"],
        "query": inp["query"],
        "activated_agents": inp["activated_agents"]
    })),
    "EDAAgent": ChainedTool(lambda inp: eda_agent.func({"df": inp["df"], "query": inp["query"]})),
    "PlotAgent": ChainedTool(lambda inp: plot_agent.func({"df": inp["df"], "query": inp["query"]})),
    "AdvancedAnalyticsAgent": ChainedTool(lambda inp: advanced_analytics_agent.func({"df": inp["df"], "query": inp["query"]})),
    "JoinDatasetsAgent": ChainedTool(lambda inp: join_datasets_agent_tool.func({
        "datasets": inp.get("additional_datasets", {}),
        "query": inp["query"]
    }))
}


Pinecone initialized and connected to index: chatbot-memory


Dropdown(description='Model:', options=('gpt-4', 'gpt-3.5-turbo', 'gemini', 'gpt-4o', 'gpt-4-turbo'), value='g…

  return ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name=model_dropdown.value, temperature=0)


FileUpload(value={}, accept='.csv', description='📂 Upload CSV')

Output()

LLM updated to model: gemini


In [None]:
# Chat Input & Output
# Starts the blocking console chat loop; type 'exit', 'quit' or 'bye' to stop it.
chatloop(available_agents=agent_map)
# %%

  memory = ConversationBufferMemory()


Chatbot is ready! Type 'exit' to quit.






  embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)


before pipeline exec.
Available Agents:  {'data_cleaning_agent': <__main__.ChainedTool object at 0x7cdf60f2b9d0>, 'data_analysis_agent': <__main__.ChainedTool object at 0x7cdf60db3dd0>, 'EDAAgent': <__main__.ChainedTool object at 0x7cdf6180f710>, 'PlotAgent': <__main__.ChainedTool object at 0x7cdf60f7cdd0>, 'AdvancedAnalyticsAgent': <__main__.ChainedTool object at 0x7cdf60e71410>, 'JoinDatasetsAgent': <__main__.ChainedTool object at 0x7cdf60e71550>}
Calling LLM Activation/Decision Agent.
inside llm activation agent
LLM Activation response unparsed:  <__main__.GeminiWrapper.invoke.<locals>.Response object at 0x7cdf96e40290>
LLM activation Response content:  ```json
{
	"activated_agents": "EDAAgent",
	"reasoning": "The user requested an EDA, which is a task handled by the EDAAgent."
}
```

User Query: I have a dataset with missing values. Can you help me clean it?

```json
{
	"activated_agents": "data_cleaning_agent",
	"reasoning": "The user mentioned missing values, which requires the d