In [97]:
# LangChain imports
from langchain_openai import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage
from dotenv import load_dotenv
from pathlib import Path

# Get API KEY
import os

# Load variables from a .env file, then read the OpenAI key from the environment
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    # Only report the problem; later cells decide how to proceed without a key
    print("OpenAI API Key not found")

# Import libraries
import pandas as pd
import numpy as np

# **IMPORTING FEATURES**
<hr>


Import features from other notebooks using the `import_ipynb` library.

In [98]:
# Import Features
import import_ipynb
from features.summaries import get_summaries
from features.missing_vals import missing_vals

In [99]:
# Tool catalogue injected verbatim into the router's system prompt.
# Every name listed here must match an imported function exactly, because the
# LLM copies these names into the code it generates.
features = """
Available features (TOOLS):
- get_summaries(df, query): for queries regarding generating summaries
- missing_vals(df, query): for queries regarding missing values, imputation etc.
"""

# **QUERY ROUTER**
<hr>

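The query router uses an LLM to translate your query into Python code that calls the features defined above (or plain pandas when no feature applies), then executes that code against your DataFrame.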

In [100]:
def _strip_code_fences(code):
    """Return ``code`` without a surrounding Markdown code fence, if present."""
    text = code.strip()
    if not text.startswith("```"):
        return text
    lines = text.splitlines()
    if len(lines) == 1:
        # Fence opened and closed on a single line: just peel the backticks
        return text.strip("`").strip()
    # Drop the opening fence line (it may carry a language tag, e.g. ```python)
    lines = lines[1:]
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines).strip()


def route_query(user_query, df):
    """Ask the LLM for Python code that answers ``user_query`` and execute it.

    The system prompt describes the dataset (shape and first three rows) and
    the tools listed in the module-level ``features`` string. The model's
    reply is printed and then executed with ``df`` bound to the given frame.

    Args:
        user_query: Natural-language request from the user.
        df: DataFrame the generated code operates on. It is handed to the
            generated code as-is, so in-place operations affect the caller's
            object.

    Returns:
        The DataFrame bound to ``df`` after the generated code ran. If the
        code raises, a copy of ``df`` taken before execution is returned. If
        the code leaves something other than a DataFrame in ``df``, the input
        ``df`` is returned.
    """
    # Create message chain
    system_prompt = f"""
    You are a data cleaning agent

    Dataset info: Shape: {df.shape}, Sample: {df.head(3).to_string()}

    {features}
    - These functions (TOOLS) are available to call to assist with queries

    Rules:
    - Each function takes (df, user_query) and returns modified df
    - Each function call should have a targeted query explaining exactly what to do
    - Return only executable Python code, no explanations, NO MARKDOWN BLOCKS
    - Only if no actions can be taken, print a descriptive message why
    - ASSUME DF IS STORED IN DF

    Examples:
    HANDLE QUERIES  THROUGH FEATURES:
    - User: Find means for price and stock, Generated: df = get_summaries(df, "find mean in price, stock")
    - User: Suggest how to handle missing values, Generated: df = missing_vals(df, "how to handle missing values")
    HANDLE QUERIES  WITHOUT FEATURES (THROUGH PANDAS):
    - User: What object columns are still remaining, Generated: print(df.select_dtypes(include='object'))
    - User: Print out the nuniques in each column, Generated: print([(col, df[col].nunique()) for col in df])
    """
    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=f"User request: {user_query}"),
    ]

    # Call LLM
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    response = llm.invoke(messages)
    # Models sometimes wrap code in ``` fences despite the instructions;
    # executing the fence lines would be a SyntaxError.
    generated_code = _strip_code_fences(response.content)

    print(generated_code)

    # Snapshot taken before execution so a failure can hand back a clean frame
    original_df = df.copy()

    # exec() inside a function cannot rebind the function's local variables,
    # so `df = tool(df, ...)` in the generated code would be silently lost.
    # Run the code in an explicit namespace and read `df` back from it.
    # A single dict (rather than separate globals/locals) also lets
    # comprehensions in the generated code see `df`.
    namespace = dict(globals())
    namespace["df"] = df

    # Execute AI generated code
    # SECURITY: this runs arbitrary model-generated Python with full access to
    # the kernel. Only use with trusted prompts/data; sandbox before exposing
    # this to untrusted input.
    try:
        exec(generated_code, namespace)
    except Exception as e:
        print(f"Error: {e}")
        print(f"Generated Code:{generated_code}")
        return original_df

    result = namespace.get("df")
    if not isinstance(result, pd.DataFrame):
        # e.g. a tool returned None; don't propagate a non-DataFrame to callers
        print("Generated code did not leave a DataFrame in `df`; returning the input DataFrame")
        return df
    return result

# **TEST QUERIES**

In [101]:
# Name of the CSV file to load from the "datasets" folder
dataset_name = "Life Expectancy Data.csv"

# Build the CSV path from PROJECT_ROOT, which is read from the environment
# after loading the .env file (to avoid import errors)
load_dotenv()
PROJECT_ROOT = Path(os.environ["PROJECT_ROOT"])
path = PROJECT_ROOT.joinpath("datasets", dataset_name)

# Read the dataset, then make a separate copy of the loaded frame
df = pd.read_csv(path)
test_df = df.copy()

In [None]:
# user_query = "Give me ways to handle missing values"

# route_query(user_query, df) 