In [1]:
import os
import uuid
import random
import numpy as np
import pandas as pd
from typing import Optional, Union, List, Dict, Tuple

# import torch
import asyncio
import logging
from functools import wraps, partial


logger = logging.getLogger(__name__)


def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    # torch.manual_seed(seed)
    # torch.cuda.manual_seed_all(seed)
    return seed


def prompts(name: str, description: str):
    def decorator(func):
        func.name = name
        func.description = description
        return func

    return decorator


def extract_code(text):
    texts = text.split("```")
    code = None
    if len(texts) > 1:
        code = texts[1]
        code = code.strip("python").strip("Python")
    return code

In [2]:
from typing import Optional, Union, List, Dict, Tuple
from pydantic import Field
import ast
import astunparse
import logging
import pandas as pd


def prompts(name: str, description: str):
    def decorator(func):
        func.name = name
        func.description = description
        return func

    return decorator


def execute(cmd: str, glo, loc):
    tree = ast.parse(cmd)
    module = ast.Module(tree.body[:-1], type_ignores=[])
    exec(astunparse.unparse(module), self.globals, self.locals)  # type: ignore
    module_end = ast.Module(tree.body[-1:], type_ignores=[])
    module_end_str = astunparse.unparse(module_end)  # type: ignore
    try:
        return eval(module_end_str, self.globals, self.locals)
    except Exception:
        exec(module_end_str, self.globals, self.locals)
        return ""

In [3]:
os.environ['OPENAI_API_KEY'] = ""

In [4]:
import re
import json
import logging
import threading
from typing import Union
from abc import ABC, abstractmethod

from langchain.chains.conversation.memory import (
    ConversationBufferMemory,
    # ConversationBufferWindowMemory,
    # ConversationSummaryBufferMemory
)
from langchain.chat_models import ChatOpenAI
from langchain.agents.tools import Tool
from langchain.agents.conversational.output_parser import ConvoOutputParser
from langchain.agents.conversational.prompt import FORMAT_INSTRUCTIONS
from langchain.agents.conversational_chat.prompt import (
    FORMAT_INSTRUCTIONS as CHAT_FORMAT_INSTRUCTIONS,
)
from langchain.schema import AgentAction, AgentFinish
from langchain.agents import AgentOutputParser

# from api.services.utils import prompts, seed_everything, to_async
# from api.conversation.callback import AsyncLoggerCallbackHandler, LoggerCallbackHandler



class retryActionTool:
    """Retry when chatgpt response parsing failed."""

    @prompts(
        name="RR",
        description="Do not use it.",
    )
    def inference(self, query):
        return f"Modify the following text in ``` such that it will follow RESPONSE FORMAT INSTRUCTIONS:```{query}```\nDo not use retry tool again."


class RetryConvoOutputParser(ConvoOutputParser):
    ai_prefix: str = "AI"

    def get_format_instructions(self) -> str:
        return FORMAT_INSTRUCTIONS

    def parse(self, text: str) -> Union[AgentAction, AgentFinish]:
        if f"{self.ai_prefix}:" in text:
            return AgentFinish(
                {"output": text.split(f"{self.ai_prefix}:")[-1].strip()}, text
            )
        regex = r"Action: (.*?)[\n]*Action Input: (.*)"
        match = re.search(regex, text)
        if not match:
            return AgentAction("retry", "", text)
            # raise OutputParserException(f"Could not parse LLM output: `{text}`")
        action = match.group(1)
        action_input = match.group(2)
        return AgentAction(action.strip(), action_input.strip(" ").strip('"'), text)


class RobustRetryConvoOutputParser(AgentOutputParser):
    def get_format_instructions(self) -> str:
        return CHAT_FORMAT_INSTRUCTIONS

    def parse_json(self, text: str):
        try:
            return [
                i
                for i in re.findall(r"\s([{\[].*?[}\]])$", f" {text}", flags=re.DOTALL)
                if i
            ][0]
        except Exception:
            return

    def parse_markdown_code(self, text: str):
        try:
            for i in re.findall(
                r"`{3}([\w]*)\n([\S\s]+?)\n`{3}", text, flags=re.DOTALL
            )[0]:
                if i:
                    tmp = self.parse_json(i)
                    if tmp:
                        return tmp
        except Exception:
            return

    def parse_origin(self, text: str):
        try:
            cleaned_output = text.strip()
            if "```json" in cleaned_output:
                _, cleaned_output = cleaned_output.split("```json")
            if "```" in cleaned_output:
                cleaned_output, _ = cleaned_output.split("```")
            if cleaned_output.startswith("```json"):
                cleaned_output = cleaned_output[len("```json"):]
            if cleaned_output.startswith("```"):
                cleaned_output = cleaned_output[len("```"):]
            if cleaned_output.endswith("```"):
                cleaned_output = cleaned_output[: -len("```")]
            return cleaned_output.strip()
        except Exception:
            return

    def parse(self, text: str) -> Union[AgentAction, AgentFinish]:
        try:
            json_str = self.parse_json(text)
            markdown_str = self.parse_markdown_code(text)
            origin_str = self.parse_origin(text)

            if json_str:
                cleaned_output = json_str
            elif markdown_str:
                cleaned_output = markdown_str
                if cleaned_output.startswith("```json"):
                    cleaned_output = cleaned_output[len("```json"):]
                if cleaned_output.startswith("```"):
                    cleaned_output = cleaned_output[len("```"):]
                if cleaned_output.endswith("```"):
                    cleaned_output = cleaned_output[: -len("```")]
            elif origin_str:
                cleaned_output = origin_str
            else:
                cleaned_output = text.strip()

            response = json.loads(cleaned_output)
            action, action_input = response["action"], response["action_input"]
            if action == "Final Answer":
                return AgentFinish({"output": action_input}, text)
            else:
                return AgentAction(action, action_input, text)
        except Exception:
            if "action" not in text and "action_input" not in text:
                return AgentFinish({"output": text}, text)
            else:
                logger.warning(f"Not follow format instruction\n{cleaned_output}")
                return AgentAction("RR", "", text)

        
class pythonShellTool:

    @prompts(
        name="python_repl_ast",
        description="This is a python shell. Use this to execute python commands. "
        "Input should be a valid python command. "
        "The output of the command should not be too long."
        "The dataset is a dataframe assign to a variable df"
    )
    def inference(self, query: str):
        try:
            tree = ast.parse(query)
            module = ast.Module(tree.body[:-1], type_ignores=[])
            exec(astunparse.unparse(module), self.globals, self.locals)  # type: ignore
            module_end = ast.Module(tree.body[-1:], type_ignores=[])
            module_end_str = astunparse.unparse(module_end)  # type: ignore
            try:
                return eval(module_end_str, self.globals, self.locals)[:1024]
            except Exception:
                exec(module_end_str, self.globals, self.locals)
                return ""
        except Exception as e:
            return str(e)

In [5]:
class LoadDataTool:
    globals: Optional[Dict] = dict()
    locals: Optional[Dict] = dict()

    def __init__(self, status):
        self.status = status
        
    @prompts(
        name="Load",
        description="This is a tool to load dataset."
        "Input should be a dataset name."
    )
    def inference(self, query: str):
        
        if self.status.dev:
            name = "WineQT"
            data = pd.read_csv("WineQT.csv")
            X = data.drop('quality', axis=1)
            y = data['quality']
            self.status.init_data(name, X, y, None, None)
        else:
            dataset = data_loader.select(query)
            name = dataset['name']
            label = dataset['label']
            data = dataset['train']
            X_train = data.drop(label, axis=1)
            y_train = data[label]
            data = dataset['test']
            if label in data:
                data = data.drop(label, axis=1)
            X_test = data
            y_test = pd.DataFrame(data={label: []*len(X_test)})
            self.status.init_data(name, X, y)
        
        return f"Load dataset {name} success !"

In [216]:
class ShowStatusTool:
    globals: Optional[Dict] = dict()
    locals: Optional[Dict] = dict()

    def __init__(self, status):
        self.status = status
        
    @prompts(
        name="show Status",
        description="This is a tool to show current status."
        "Input should be an empty string."
    )
    def inference(self, query: str):
        return str({
            'dataset': status.dataset,
            'explore': status.explore,
            'preprocess': status.preprocess,
            'model_select': status.model_select,
            'hyperopt': status.hyperopt,
        })

In [243]:
class ModelSelectTool:
    globals: Optional[Dict] = dict()
    locals: Optional[Dict] = dict()

    def __init__(self, status):
        self.status = status
        self.X_train = status.X_train
        self.y_train = status.y_train
        self.locals = {"X_train": self.X_train, 'y_train': self.y_train}
        self.globals = {"np": np, 'pd': pd}
        self.model_list = []
        
        from sklearn import ensemble, linear_model
        import lightgbm
        for module in [ensemble, linear_model, lightgbm]:
            for name, func in vars(module).items():
                if name[0].isupper():
                    self.globals[name] = func
                    self.model_list.append(name)
        
    @prompts(
        name="Pick Model",
        description="This is a tool for deciding which model to use for a machine learning task."
        "Output is description about how to pick a model."
        "Input should be an empty string."
    )
    def inference(self, query: str):
        
        if not self.status.dataset:
            return "Pick a dataset first."
        
        try:
        
            know_how_prompt = coding_agent.run("How to decide which machine learning model to use ?")
            self.know_how_prompt = know_how_prompt

            how_examine_dataset_thought = coding_agent.run(know_how_prompt + f"""
            Here is some information about the feature dataframe `X`:
            \"\"\"
            - First five rows:
            {self.X_train.head(5)}

            - Column types:
            {self.X_train.dtypes}
            \"\"\"

            Here is some information about the label series `y`:
            \"\"\"
            - First five rows:
            {self.y_train.head(5)}

            - Column types:
            {self.y_train.dtypes}
            \"\"\"

            Tell me how to examine the dataset that can help you decide which model to use.
            """)
            self.how_examine_dataset_thought = how_examine_dataset_thought
            # print(how_examine_dataset_thought)

            examine_dataset_prompt = f"""Thoughts:\n{how_examine_dataset_thought}\n
            Write a Python function called `examine_dataset` to examine the dataset.
            Import any package you need beforehand.
            Do not include visualization and plotting.
            Make it as simple as possible.
            The function will be called using `result = examine_dataset(X, y)`, and all findings must be stored in a string variable `result`.
            Write the function as simple and short as possible.
            Show results in markdown.
            """


            examine_dataset_thought = coding_agent.run(examine_dataset_prompt)
            print(examine_dataset_thought)

            examine_dataset_code = extract_code(examine_dataset_thought)
            self.examine_dataset_thought = examine_dataset_thought
            self.examine_dataset_code = examine_dataset_code
            # print(examine_dataset_code)

            exec(examine_dataset_code, self.globals, self.locals)
            examine_dataset_result = self.locals["examine_dataset"](self.X_train, self.y_train)
            self.examine_dataset_result = examine_dataset_result

            final_prompt = f"""
            Human:\n{examine_dataset_prompt}
            AI:\n{examine_dataset_thought}
            Human:\nExecute result: {examine_dataset_result}

            Which model will you use to achieve the best prediction results according to the examine result?
            Also consider those models that are fast such as lightgbm.
            These are some potential model you can choose:
            {self.model_list}

            Write a Python function called `construct_model` to create the model instance.
            Initiate the model with no parameters.
            Import any package you need beforehand.
            Do not include visualization and plotting.
            The function will be called using `clf = construct_model()`, and the constructed model will be stored in a string variable `clf`.
            Show results in markdown.
            """
            model_selection_thought = coding_agent.run(final_prompt)
            self.model_selection_thought = model_selection_thought
            # print(model_selection_thought)
            model_selection_code = extract_code(model_selection_thought)
            self.model_selection_code = model_selection_code
            exec(model_selection_code, self.globals, self.locals)
            self.clf = self.locals['construct_model']()

            self.update_status()
            return model_selection_thought
        except:
            return """
            
            """
    
    def update_status(self):
        self.status.know_how_prompt = self.know_how_prompt
        self.status.how_examine_dataset_thought = self.how_examine_dataset_thought
        self.status.examine_dataset_thought = self.examine_dataset_thought
        self.status.examine_dataset_code = self.examine_dataset_code
        self.status.model_selection_thought = self.model_selection_thought
        self.status.model_selection_code = self.model_selection_code
        self.status.model = self.clf.__class__.__name__
        self.status.model_selection = True

        
ModelSelectTool(status).inference("")

```python
import pandas as pd
import numpy as np

def examine_dataset(X, y):
    # Check for missing values
    if X.isnull().sum().sum() > 0:
        result = "There are missing values in the dataset. You need to decide how to handle them before choosing a model."
    else:
        result = "There are no missing values in the dataset."
    
    # Check for outliers
    if (X.dtypes == np.number).any():
        num_cols = X.select_dtypes(include=np.number).columns
        if (X[num_cols].apply(lambda x: np.abs(x - x.mean()) / x.std() > 3)).sum().sum() > 0:
            result += "\nThere are outliers in the dataset. You need to decide how to handle them before choosing a model."
        else:
            result += "\nThere are no outliers in the dataset."
    
    # Check for class imbalance
    if y.nunique() > 2:
        class_counts = y.value_counts()
        class_ratio = class_counts / class_counts.sum()
        if (class_ratio < 0.05).any():
            result += "\nThere is class

"AI:\n\n```python\nfrom lightgbm import LGBMClassifier\n\ndef construct_model():\n    clf = LGBMClassifier()\n    return clf\n```\n\nExample usage:\n\n```python\nclf = construct_model()\nprint(clf)\n```\n\nOutput:\n\n```\nLGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,\n               importance_type='split', learning_rate=0.1, max_depth=-1,\n               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,\n               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,\n               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,\n               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)\n```\n\n"

In [235]:
class ExploreDataTool:
    globals: Optional[Dict] = dict()
    locals: Optional[Dict] = dict()

    def __init__(self, status):
        self.status = status
        self.X = status.X
        self.y = status.y
        self.locals = {"X": self.X, 'y': self.y, 'np': np}
        self.globals = {"np": np, 'pd': pd}
        
    @prompts(
        name="Explore",
        description="This is a tool for dataset exploring."
        "Useful when the user ask for you to explore the dataset."
        "Output is the thought from exploring the dataset."
        "Input should be an empty string."
    )
    def inference(self, query: str):
        
        if not self.status.dataset:
            return "Pick a dataset first."
        
        start_explore_prompt = f"""
You are a data scientist. You are now performing data analysis before training a machine learning model.
Your task is to tell me how the data could be preprocessed and to give me the corresponding Python code.
The preprocessing includes, but is not limited to, missing value handling, scaling, normalization, or encoding.
The task will be performed in two stages.
In the first stage, you will decide which techniques are required to preprocess the dataset by investigating the dataset.
In the second stage, you will preprocess the dataset by implementing the functions.

Here is some information about the feature dataframe `X`:
\"\"\"
- First five rows:
{self.X.head(5)}

- Column types:
{self.X.dtypes}
\"\"\"

Here is some information about the label series `y`:
\"\"\"
- First five rows:
{self.y.head(5)}

- Column types:
{self.y.dtypes}
\"\"\"
"""

        how_to_explore_prompt = """
Now, let's move on to the first stage:
1. Explain how you will explore the dataset in order to select appropriate preprocessing techniques.
2. Write a Python function called `explore` to explore the dataset.
   Import any package you need beforehand.
   The function will be called using `result = explore(X, y)`, and all findings must be stored in a string variable `result`.

Here are some handy packages in the Python environment:
```
numpy==1.24.3
pandas==2.0.1
```
"""

        prompt = "\n\n".join([start_explore_prompt, how_to_explore_prompt])
        self.start_explore_prompt = start_explore_prompt
        self.how_to_explore_prompt = how_to_explore_prompt
        # print(prompt)

        explore_data_thought = coding_agent.run(prompt)
        self.explore_data_thought = explore_data_thought
        # print(explore_data_thought)
        
        explore_code = extract_code(explore_data_thought)
        self.explore_code = explore_code
        # print(explore_code + '\n')
        self.retry_execute(explore_data_thought, explore_code)
        
        explore_data_result = self.locals["explore"](self.X, self.y)
        self.explore_data_result = explore_data_result
        # print(explore_data_result)
        
        self.update_status()
        return explore_data_result
    
    def update_status(self):
        self.status.start_explore_prompt = self.start_explore_prompt
        self.status.how_to_explore_prompt = self.how_to_explore_prompt
        self.status.explore_code = self.explore_code
        self.status.explore_data_result = self.explore_data_result
        self.status.explore_data_thought = self.explore_data_thought
        self.status.explore = True
    
    def retry_execute(self, prompt: str, code: str, retry: int = 3):
        for _ in range(retry):
            try:
                exec(code, self.globals, self.locals)
            except Exception as e:
                import traceback
                fix_code = coding_agent.run(f"{prompt}\n\nExecute result:\n{traceback.format_exc()}\n\nModify the code and fix the error.")
                code = extract_code(fix_code)
                print(f"Fix{_}:\n{code}\n")
    

    
    
class ProcessDataTool:
    globals: Optional[Dict] = dict()
    locals: Optional[Dict] = dict()

    def __init__(self, status):
        self.status = status
        self.X = status.X
        self.y = status.y
        self.locals = {"X": self.X, 'y': self.y, 'np': np}
        self.globals = {
            "np": np,
            'pd': pd,
        }
        
        from imblearn import over_sampling, under_sampling
        from sklearn import preprocessing, impute, model_selection
        for module in [preprocessing, impute, over_sampling, under_sampling, model_selection]:
            for name, func in vars(module).items():
                if name[0].isupper():
                    self.globals[name] = func

    @prompts(
        name="Preprocess",
        description="This is a tool for preprocess the dataset."
        "Useful when you want to process the dataset."
        "Useful when you want to preprocess the dataset."
    )
    def inference(self, query: str):
        if not self.status.dataset:
            return "Pick a dataset first."
        print()
        
        explore_data_result = self.status.explore_data_result
        start_explore_prompt = self.status.start_explore_prompt
        how_to_explore_prompt = self.status.how_to_explore_prompt
        explore_data_thought = self.status.explore_data_thought

        preprocess_prompt = """
        The exploration results from the first stage are shown above.
        Now, let's move on to the second stage:
        1. Provide a conclusion based on the exploration results.
        2. Based on the conclusion, specify the preprocessing techniques to be employed for the dataset and briefly describe their implementations.
        3. Write a Python function called `preprocess` that will assist you in preprocessing the data.
           Import any package you need beforehand.
           The function will be called using `proc_X_train, proc_y_train, proc_X_test, proc_y_test = preprocess(X_train, y_train, X_test, y_test)`, 
           where both the parameters and return values are eithor Pandas dataframes or series.
           Be aware of potential data leakage issues.

        Here are some handy packages in the Python environment:
        ```
        numpy==1.24.3
        pandas==2.0.1
        ```
        No additional packages can be used. Return results about the second stage.
        """
        

        prompt = "\n\n".join([
            start_explore_prompt,
            how_to_explore_prompt,
            f"Your Response:\n{explore_data_thought}",
            f"Code Execution Results:\n{explore_data_result}",
            preprocess_prompt]
        )
        preprocess_data_thought = coding_agent.run(prompt)
        self.preprocess_data_thought = preprocess_data_thought
        # print(preprocess_data_thought)
        preprocess_code = extract_code(preprocess_data_thought)
        self.preprocess_code = preprocess_code
        # print(preprocess_code)
        
        best_result = {
            'score': 100,
            'code': preprocess_code,
            'msg': ''
        }
        
        for _ in range(5):
            try:
                print(preprocess_code)
                exec(preprocess_code, self.globals, self.locals)
                result = self.evaluate()
                
                if result['llm'] < best_result['score']:
                    best_result['score'] = result['llm']
                    best_result['msg'] = result['msg']
                    best_result['code'] = preprocess_code
                    
                    self.status.X_train = result['processed_data'][0]
                    self.status.X_test = result['processed_data'][2]
                    self.status.y_train = result['processed_data'][1]
                    self.status.y_test = result['processed_data'][3]
                
                fix_code = coding_agent.run(f"{prompt}\n\nExecute result:\n{result['msg']}\n\nYour result is LLm result. Try to improve the preprocess function.")
                preprocess_code = extract_code(fix_code)
                # print(f"Revise{_}:\n{preprocess_code}\n")
            except Exception as e:
                import traceback
                trace = traceback.format_exc()
                print(trace)
                fix_code = coding_agent.run(f"{prompt}\n\nExecute result:\n{str(e)}\n\nModify the code and fix the error. Try not to use the module that causes error. Try to simpify the function.")
                preprocess_code = extract_code(fix_code)
                # print(f"Fix{_}:\n{preprocess_code}\n")
                
        print(best_result)
        self.update_status()
        return f"{preprocess_data_thought}\n\nProcessing....\n\nProcess Completed !\n{best_result['msg']}"
    
    def update_status(self):
        self.status.preprocess_data_thought = self.preprocess_data_thought
        self.status.preprocess_code = self.preprocess_code
        self.status.X_train = self.locals['X_train']
        self.status.X_test = self.locals['X_test']
        self.status.y_train = self.locals['y_train']
        self.status.y_test = self.locals['y_test']
        self.status.preprocess = True
        
    def evaluate(self):

        def common_preprocess(X_train, y_train, X_test, y_test):
            # TODO: Missing value dropping

            # TODO: Categorical feature encoding

            # Scale numerical variables using StandardScaler
            scaler = StandardScaler()
            X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
            X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

            return X_train, y_train, X_test, y_test
        
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import mean_squared_error
        from lightgbm import LGBMRegressor, early_stopping

        seed = 42

        def evaluate(X_train, y_train, X_test, y_test, prep_func=None):
            processed_data = (X_train, y_train, X_test, y_test)
            if prep_func:
                X_train, y_train, X_test, y_test = prep_func(X_train, y_train, X_test, y_test)
                processed_data = (X_train, y_train, X_test, y_test)

            X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=seed)
            clf = LGBMRegressor(random_state=seed, **self.status.best_param)
            clf.fit(X_train, y_train, eval_set=(X_valid, y_valid), callbacks=[early_stopping(stopping_rounds=10)])
            y_pred = clf.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            return mse, processed_data

        from copy import deepcopy
        X_train = deepcopy(self.status.X_train)
        X_test = deepcopy(self.status.X_test)
        y_train = deepcopy(self.status.y_train)
        y_test = deepcopy(self.status.y_test)
        self.locals["X_train"] = X_train
        self.locals["X_test"] = X_test
        self.locals["y_train"] = y_train
        self.locals["y_test"] = y_test
        
        # LLM
        mse_llm, processed_data = evaluate(X_train, y_train, X_test, y_test, self.locals['preprocess'])

        # Common
        mse_com, _ = evaluate(X_train, y_train, X_test, y_test, common_preprocess)

        # Raw
        mse_raw, _ = evaluate(X_train, y_train, X_test, y_test)

        msg = f"No preprocess: {mse_raw}, Common preprocess: {mse_com}, LLM preprocess: {mse_llm}"
        print(msg)
        return {
            'msg': msg,
            'no': mse_raw,
            'common': mse_com,
            'llm': mse_llm,
            'processed_data': processed_data
        }

In [7]:
import functools
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.prompts.chat import (
    PromptTemplate,
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)
human_message_prompt = HumanMessagePromptTemplate(
        prompt=PromptTemplate(
            template="{text}",
            input_variables=["text"],
        )
    )
chat_prompt_template = ChatPromptTemplate.from_messages([human_message_prompt])
chat = ChatOpenAI(temperature=0)
chain = LLMChain(llm=chat, prompt=chat_prompt_template)#, memory=ConversationBufferMemory())



class CodingAgent():
    def __init__(self, chain):
        self.chain = chain
    @functools.lru_cache(100)
    def run(self, text):
        return self.chain.run(text)
    
    
coding_agent = CodingAgent(chain)

In [50]:
import json
import pdb
from sklearn.model_selection import train_test_split


class E2EHyperOptTool:
    globals: Optional[Dict] = dict()
    locals: Optional[Dict] = dict()

    def __init__(self, status):
        self.status = status
        self.X_train = status.X_train
        self.y_train = status.y_train
        self.X_test = status.X_test
        self.y_test = status.y_test
        # self.locals = {"X": X, 'Y': Y,}

    @prompts(
        name="hyperopt",
        description="This is a tool for hyper parameter tuning."
        "Input should be an empty string."
    )
    def inference(self, query: str):
        if not self.status.dataset:
            return "Pick a dataset first."
        if not self.status.model:
            msg = "You have not select the model. Use LGBMRegressor for this task."
            self.status.model = "LGBMRegressor"
        
        clf = self.construct_estimator(self.status.model)
        param_grid = self.construct_param_grid(clf)
        search_algo = self.get_search_algo(clf, param_grid)
        clff = self.construct_search_algo(search_algo)
        self.clff = clff
        search = clff.fit(self.X_train, self.y_train)
        self.search = search
        
        print(f"Best search score: {search.best_score_}")
        score = self.evaluate(clf, search)

        self.update_status()
        return f"{self.reason}\n\nFinal score after hyper parameter tuning is {score}"
    
    def update_status(self):
        self.status.hyperopt = True
        self.status.hyper_result = self.search
        self.status.hyperopt_param_thought = self.reason
        
    def evaluate(self, clf, search):
        from sklearn.metrics import mean_squared_error
        clff = clf.__class__(**search.best_params_)
        clff.fit(self.X_train, self.y_train)
        y_pred = clff.predict(self.X_test)
        score = mean_squared_error(self.y_test, y_pred)
        print(f"Testing score: {score}")
        return score
    
    def construct_search_algo(self, search_algo):
        # construct search instance
        construct_search_prompt = f"""
        Give me two lines of python code.
        Frist, from lib import {search_algo.strip()}.
        Second, instatiate an instance with {search_algo.strip()}(clf, param_grid, random_state=0, n_iter=10) and assign to hyper_algo variable.
        """
        codes = coding_agent.run(construct_search_prompt)
        print(f"\nConstructing search algo:\n```\n{codes}\n```\n")
        res = self.execute(codes)
        if 'hyper_algo' not in self.locals:
            raise Exception(res)
        return self.locals['hyper_algo']

    def get_search_algo(self, clf, param_grid):
        search_algo_reason_prompt = f"""
        You are now performing a hyper parameter search on a machine learning model.
        Your goal is to decide the search algorithm base on model type and parameter space.


        The given model is a {clf.__class__.__name__}.
        The paramters grid is {param_grid}.
        The avaiable search algorithm is as follow:

        1. GridSearchCV: Exhaustive search over specified parameter values for an estimator.
        2. HalvingGridSearchCV: Search over specified parameter values with successive halving.
        3. ParameterGrid: Grid of parameters with a discrete number of values for each.
        4. ParameterSampler: Generator on parameters sampled from given distributions.
        5. RandomizedSearchCV: Randomized search on hyper parameters.
        6. HalvingRandomSearchCV: Randomized search on hyper parameters with successive halving.

        Think about which algorithm to use base on the model, parameter space and algorithm description.
        Tell me which algorithm you have choosen and why.
        List all the other algorithms you have choosen not to use and why.
        """
        
        reason = coding_agent.run(search_algo_reason_prompt)
        self.reason = reason
        print(reason)

        search_algo = coding_agent.run(f"{reason}\n\nAccording to above, give me the algorithm name of the choosen algorithm in a single token.")
        print(f"\nDeciding search algo:\n{search_algo}\n")
        return search_algo

    def construct_param_grid(self, clf):
        param_grid_prompt = f"""
        You are now performing a hyper parameter search on a machine learning model.
        Your goal is to construct the paramter space to search.

        For example:
        The model is a LinearSVR.
        The parameters are {LinearSVR().get_params()}
        The parameter grid can be defined as:
        ```
        param_grid = {{'C': [1, 10, 100, 1000], 'tol': [0.001, 0.0001]}}
        ```
        Remeber to define the space only on important features. 


        Now.
        The given model is a {clf.__class__.__name__}.
        The paramters are {clf.get_params()}.
        The detail description about the parameters is as follow:
        {clf.__doc__}

        Construct the parameter grid you will use for hyper parameter in json format:
        """

        param_grid = coding_agent.run(param_grid_prompt)
        param_grid = json.loads(param_grid)
        print(f"\nConstruct params grid:\n{param_grid}\n")
        self.locals['param_grid'] = param_grid
        return param_grid
        
    def construct_estimator(self, estimator_name: str):
        # construct clf instance
        construct_estimator_prompt = f"Give me two lines of python code. Construct a {estimator_name.strip()} estimator and assign the estimator clf variable."
        codes = coding_agent.run(construct_estimator_prompt)
        print(f"\nConstruct estimator with following:\n```\n{codes}\n```\n")
        res = self.execute(codes)
        if 'clf' not in self.locals:
            raise Exception(res)
        return self.locals['clf']
        
    def execute(self, cmd: str):
        try:
            tree = ast.parse(cmd)
            module = ast.Module(tree.body[:-1], type_ignores=[])
            exec(astunparse.unparse(module), self.globals, self.locals)  # type: ignore
            module_end = ast.Module(tree.body[-1:], type_ignores=[])
            module_end_str = astunparse.unparse(module_end)  # type: ignore
            try:
                return eval(module_end_str, self.globals, self.locals)
            except Exception:
                exec(module_end_str, self.globals, self.locals)
                return ""
        except Exception as e:
            return str(e)

In [261]:
class ConversationBot(ABC):

    def __init__(self):
        seed_everything(0)
        # self.handler = LoggerCallbackHandler()
        # self.async_handler = AsyncLoggerCallbackHandler()
        self.llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
        self.memory = ConversationBufferMemory(
            memory_key="chat_history", output_key="output", return_messages=True
        )
        self.state = []
        self.tools = self.load_tools()

        # retry when llm output parsing error
        retrytool = retryActionTool()
        func = retrytool.inference
        self.tools.append(
            Tool(
                name=func.name,
                description=func.description,
                # coroutine=to_async(func),
                func=func,
            )
        )
        self.output_parser = RobustRetryConvoOutputParser()
        self.agent = self.init_agent()

    def run_text(self, text: str, state):
        logger.info(f"User: {text}")
        res = self.agent({"input": text})
        response = res["output"]
        self.state += [(text, response)]
        logger.info(
            f"\nProcessed run_text, Input text: {text}\nCurrent state: {self.state}\n"
        )
        return self.state, self.state

    async def arun_text(self, text: str):
        logger.info(f"User: {text}")
        res = await self.agent.acall({"input": text})
        response = res["output"]
        self.state += [(text, response)]
        logger.info(
            f"\nProcessed run_text, Input text: {text}\nCurrent state: {self.state}\n"
        )
        return self.state, self.state

    def _clear(self):
        self.memory.clear()
        self.state = []

    def init_agent(self):
        input_variables = ["input", "agent_scratchpad", "chat_history"]
        return initialize_agent(
            self.tools,
            self.llm,
            agent="chat-conversational-react-description",
            verbose=True,
            memory=self.memory,
            return_intermediate_steps=True,
            max_iterations=5,
            # max_execution_time=60,
            agent_kwargs={
                "prefix": AUTOML_CHATGPT_PREFIX,
                "format_instructions": AUTOML_CHATGPT_FORMAT_INSTRUCTIONS,
                "suffix": AUTOML_CHATGPT_SUFFIX,
                "input_variables": input_variables,
                "output_parser": self.output_parser,
            },
        )

    def load_tools(self):
        tools = []
        X, X_train, X_test, y, y_train, y_test = self.load_artifacts()

        for tool in [
            # pythonShellTool(),
            E2EHyperOptTool(status),
            # HyperOptTool(),
            # ConstructSearchTool(),
            # ConstructParamGridTool(),
            # ConstructEstimatorTool(),
            # FinalTool(X, Y),
            ShowStatusTool(status),
            ModelSelectTool(status),
            ExploreDataTool(status),
            ProcessDataTool(status),
            LoadDataTool(status),
        ]:
            func = tool.inference
            tools.append(
                Tool(
                    name=func.name,
                    description=func.description,
                    # coroutine=to_async(func),
                    func=func,
                )
            )
        logger.info(f"tools: {[i.name for i in tools]}")
        return tools

    def load_artifacts(self):
        data = pd.read_csv("WineQT.csv")
        X = data.drop('quality', axis=1)
        y = data['quality']
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        # X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
        return X, X_train, X_test, y, y_train, y_test

In [10]:
from prompt import AUTOML_CHATGPT_PREFIX, AUTOML_CHATGPT_FORMAT_INSTRUCTIONS, AUTOML_CHATGPT_SUFFIX
from langchain.agents.initialize import initialize_agent
from langchain.agents.tools import Tool
from sklearn.svm import LinearSVR
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from scipy.stats import boxcox
from imblearn.over_sampling import SMOTE

In [96]:
from sklearn.model_selection import train_test_split

from sklearn.model_selection import train_test_split

class Status():
    
    def __init__(self, dev=True):
        self.init()
        self.dev = dev
        
    def init_data(self, dataset, X_train, X_test, y_train, y_test):
        self.init()
        self.dataset = dataset
        self.X = X
        self.y = y
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.1, random_state=42)
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
    
    def init(self):
        
        self.dataset = ""
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        
        self.start_explore_prompt = ""
        self.how_to_explore_prompt = ""
        self.explore_data_thought = ""
        self.hyperopt_param_thought = ""
        
        self.explore_result = "Nothing special."
        
        self.explore = False
        self.preprocess = False
        self.model_select = False
        self.hyperopt = False
        
        self.explore_code = ""
        self.preprocess_code = ""
        self.model_select_code = ""
        self.hyperopt_code = ""
        
        self.model = ""
        self.best_param = {}

status = Status()

In [74]:
class DataLoader():
    
    def __init__(self, chain):
        self.datasets = {}
        self.chain = chain
        
    def select(self, name: str):
        if name not in self.datasets:
            prompt = f"""
            These are the datasets we have:
            {self.datasets.keys()}
            If someone ask for {name}, what is the most possible dataset that is referred to ?
            Give me the response in one single word:
            """
            print(prompt)
            name = self.chain.run(prompt)
            return self.datasets[name]
        else:
            return self.datasets[name]
        
    def show(self):
        print(self.datasets.keys())
    
    def load(self):
        
        for dirn in os.listdir('kaggle'):
            
            if not os.path.isdir(os.path.join('kaggle', dirn)):
                continue

            try:
                train_path = os.path.join('kaggle', dirn, 'train.csv')
                train_df = pd.read_csv(train_path)

                test_path = os.path.join('kaggle', dirn, 'test.csv')
                test_df = pd.read_csv(test_path)

                label = list(set(train_df.columns) - set(test_df.columns))[0]

                self.datasets[dirn] = {
                    'train': train_df,
                    'test': test_df,
                    'label': label,
                    'name': dirn
                }
            except Exception as e:
                print(e)
                
                
data_loader = DataLoader(chain)
data_loader.load()
data_loader.select('allstate')

KeyboardInterrupt: 

In [None]:
class E2EPreprocessTool:
    globals: Optional[Dict] = dict()
    locals: Optional[Dict] = dict()

    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.locals = {"X": X, 'y': y, 'np': np}
        self.globals = {
            "np": np,
            'pd': pd,
            # 'StandardScaler': StandardScaler,
            # 'PowerTransformer': PowerTransformer,
            # 'boxcox': boxcox,
            # 'SMOTE': SMOTE,
            # 'RandomOverSampler': RandomOverSampler
        }

    @prompts(
        name="preprocess",
        description="This is a tool for dataset preprocessing."
        "Input should be a dataset name."
    )
    def inference(self, query: str):
        explore_prompts = self.explore_dataset(query)
        self.preprocess_dataset(explore_prompts)
        # self.evaluate()
        
    def retry_execute(self, prompt: str, code: str, retry: int = 3):
        for _ in range(retry):
            try:
                exec(code, self.globals, self.locals)
            except Exception as e:
                import traceback
                fix_code = coding_agent.run(f"{prompt}\n\nExecute result:\n{traceback.format_exc()}\n\nModify the code and fix the error.")
                code = extract_code(fix_code)
                print(f"Fix{_}:\n{code}\n")
        
    def evaluate(self):

        def common_preprocess(X_train, y_train, X_test, y_test):
            # TODO: Missing value dropping

            # TODO: Categorical feature encoding

            # Scale numerical variables using StandardScaler
            scaler = StandardScaler()
            X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
            X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

            return X_train, y_train, X_test, y_test
        
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import mean_squared_error
        from lightgbm import LGBMRegressor, early_stopping

        seed = 42

        def evaluate(X_train, y_train, X_test, y_test, prep_func=None):
            if prep_func:
                X_train, y_train, X_test, y_test = prep_func(X_train, y_train, X_test, y_test)

            X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=seed)
            clf = LGBMRegressor(random_state=seed)
            clf.fit(X_train, y_train, eval_set=(X_valid, y_valid), callbacks=[early_stopping(stopping_rounds=10)])
            y_pred = clf.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)

            return mse

        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=seed)


        # LLM
        mse_llm = evaluate(X_train, y_train, X_test, y_test, self.locals['preprocess'])

        # Common
        mse_com = evaluate(X_train, y_train, X_test, y_test, common_preprocess)

        # Raw
        mse_raw = evaluate(X_train, y_train, X_test, y_test)

        msg = f"No preprocess: {mse_raw}, Common preprocess: {mse_com}, LLM preprocess: {mse_llm}"
        print(msg)
        return {
            'msg': msg,
            'no': mse_raw,
            'common': mse_com,
            'llm': mse_llm
        }
        
    def preprocess_dataset(self, explore_prompts: str):
        start_explore_prompt, how_to_explore_prompt, explore_data_thought, explore_data_result = explore_prompts
        
        preprocess_prompt = """
        The exploration results from the first stage are shown above.
        Now, let's move on to the second stage:
        1. Provide a conclusion based on the exploration results.
        2. Based on the conclusion, specify the preprocessing techniques to be employed for the dataset and briefly describe their implementations.
        3. Write a Python function called `preprocess` that will assist you in preprocessing the data.
           Import any package you need beforehand.
           The function will be called using `proc_X_train, proc_y_train, proc_X_test, proc_y_test = preprocess(X_train, y_train, X_test, y_test)`, 
           where both the parameters and return values are eithor Pandas dataframes or series.
           Be aware of potential data leakage issues.

        Here are some handy packages in the Python environment:
        ```
        numpy==1.24.3
        pandas==2.0.1
        ```
        No additional packages can be used.
        """
        
        """
        scikit-learn==1.2.2
        scipy==1.10.1
        imbalanced-learn==0.10.1
        """

        prompt = "\n\n".join([
            start_explore_prompt,
            how_to_explore_prompt,
            f"Your Response:\n{explore_data_thought}",
            f"Code Execution Results:\n{explore_data_result}",
            preprocess_prompt]
        )
        preprocess_data_thought = coding_agent.run(prompt)
        print(preprocess_data_thought)
        preprocess_code = extract_code(preprocess_data_thought)
        self.preprocess_code = preprocess_code
        
        best_result = {
            'score': 100,
            'code': preprocess_code
        }
        
        for _ in range(5):
            try:
                exec(preprocess_code, self.globals, self.locals)
                result = self.evaluate()
                
                if result['llm'] < best_result['score']:
                    best_result['score'] = result['llm']
                    best_result['code'] = preprocess_code
                
                fix_code = coding_agent.run(f"{prompt}\n\nExecute result:\n{result['msg']}\n\nYour result is LLm result. Try to improve the preprocess function.")
                preprocess_code = extract_code(fix_code)
                print(f"Revise{_}:\n{preprocess_code}\n")
            except Exception as e:
                import traceback
                trace = traceback.format_exc()
                print(trace)
                fix_code = coding_agent.run(f"{prompt}\n\nExecute result:\n{str(e)}\n\nModify the code and fix the error. Try not to use the module that causes error. Try to simpify the function.")
                preprocess_code = extract_code(fix_code)
                print(f"Fix{_}:\n{preprocess_code}\n")
        
        # self.retry_execute(preprocess_data_thought, preprocess_code)

        
    def explore_dataset(self, query: str):
        start_explore_prompt = f"""
You are a data scientist. You are now performing data analysis before training a machine learning model.
Your task is to tell me how the data could be preprocessed and to give me the corresponding Python code.
The preprocessing includes, but is not limited to, missing value handling, scaling, normalization, or encoding.
The task will be performed in two stages.
In the first stage, you will decide which techniques are required to preprocess the dataset by investigating the dataset.
In the second stage, you will preprocess the dataset by implementing the functions.

Here is some information about the feature dataframe `X`:
\"\"\"
- First five rows:
{self.X.head(5)}

- Column types:
{self.X.dtypes}
\"\"\"

Here is some information about the label series `y`:
\"\"\"
- First five rows:
{self.y.head(5)}

- Column types:
{self.y.dtypes}
\"\"\"
"""

        how_to_explore_prompt = """
Now, let's move on to the first stage:
1. Explain how you will explore the dataset in order to select appropriate preprocessing techniques.
2. Write a Python function called `explore` to explore the dataset.
   Import any package you need beforehand.
   The function will be called using `result = explore(X, y)`, and all findings must be stored in a string variable `result`.

Here are some handy packages in the Python environment:
```
numpy==1.24.3
pandas==2.0.1
```
"""

        prompt = "\n\n".join([start_explore_prompt, how_to_explore_prompt])
        self.prompt = prompt
        print(prompt)

        explore_data_thought = coding_agent.run(prompt)
        self.explore_data_thought = explore_data_thought
        print(explore_data_thought)
        
        explore_code = extract_code(explore_data_thought)
        self.explore_code = explore_code
        print(explore_code + '\n')
        self.retry_execute(explore_data_thought, explore_code)
        
        explore_data_result = self.locals["explore"](self.X, self.y)
        self.explore_data_result = explore_data_result
        print(explore_data_result)
        
        return (start_explore_prompt, how_to_explore_prompt, explore_data_thought, explore_data_result)
    
    
preprocess_tool = E2EPreprocessTool(X, y)
preprocess_tool.inference("")

# Demo start

In [218]:
chat_agent = ConversationBot()
chat_agent.run_text("Can you load the dataset ?", "")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Load",
    "action_input": "dataset_name"
}[0m
Observation: [33;1m[1;3mLoad dataset WineQT success ![0m
Thought:[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "The dataset WineQT has been successfully loaded."
}[0m

[1m> Finished chain.[0m


('The dataset WineQT has been successfully loaded.',
 [('Can you load the dataset ?',
   'The dataset WineQT has been successfully loaded.')])

# Demo 1: Can you explore the dataset ?

In [205]:
chat_agent = ConversationBot()
chat_agent.run_text("Can you explore the dataset ?", "")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Explore",
    "action_input": ""
}[0m
Observation: [38;5;200m[1;3mThere are no missing values in the dataset.
There are 415 outliers in the dataset. You may need to handle them before preprocessing.
Some features are highly skewed. You may need to apply scaling or normalization.
All features are numerical.
There is no high correlation between features.[0m
Thought:[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "The dataset has no missing values, 415 outliers, some highly skewed features that may require scaling or normalization, all features are numerical, and there is no high correlation between features."
}[0m

[1m> Finished chain.[0m


('The dataset has no missing values, 415 outliers, some highly skewed features that may require scaling or normalization, all features are numerical, and there is no high correlation between features.',
 [('Can you explore the dataset ?',
   'The dataset has no missing values, 415 outliers, some highly skewed features that may require scaling or normalization, all features are numerical, and there is no high correlation between features.')])

# Demo 2: Can you preprocess the dataset ?

In [101]:
chat_agent = ConversationBot()
chat_agent.run_text("Can you process the dataset ?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Preprocess",
    "action_input": ""
}[0m

import numpy as np
import pandas as pd

def preprocess(X_train, y_train, X_test, y_test):
    # Remove outliers
    quantiles = X_train.quantile([0.25, 0.75])
    IQR = quantiles.loc[0.75] - quantiles.loc[0.25]
    lower_bound = quantiles.loc[0.25] - 1.5 * IQR
    upper_bound = quantiles.loc[0.75] + 1.5 * IQR
    X_train = X_train[(X_train >= lower_bound) & (X_train <= upper_bound)].dropna()
    y_train = y_train[X_train.index]
    X_test = X_test[(X_test >= lower_bound) & (X_test <= upper_bound)].dropna()
    y_test = y_test[X_test.index]
    
    # Apply Box-Cox transformation to skewed features
    skewness = X_train.skew()
    skewed_features = skewness[abs(skewness) > 1].index
    for feature in skewed_features:
        X_train[feature], _ = pd.Series(np.boxcox(X_train[feature] + 1))
        X_test[feature], _ = pd.Series(np.boxcox(X_test[feature] + 1))
    
   

'The dataset requires preprocessing techniques to handle outliers and skewed features. The preprocessing techniques that will be employed are outlier handling using the interquartile range (IQR) method and skewed feature handling using the Box-Cox transformation. The Python function `preprocess` is provided to preprocess the dataset. The function takes in the feature dataframe `X_train`, the label series `y_train`, the feature dataframe `X_test`, and the label series `y_test` as inputs and returns the preprocessed feature dataframe `proc_X_train`, the preprocessed label series `proc_y_train`, the preprocessed feature dataframe `proc_X_test`, and the preprocessed label series `proc_y_test`. Note that there is a potential data leakage issue in the outlier handling step, as we are using the quantiles of the training set to remove outliers from both the training and test sets. To avoid this issue, we should calculate the quantiles separately for the training and test sets. However, for sim

# Demo 3: Can you select the model ?

In [244]:
chat_agent = ConversationBot()
chat_agent.run_text("Can you select the estimator ?", [])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Pick Model",
    "action_input": ""
}[0m```python
import pandas as pd
import numpy as np

def examine_dataset(X, y):
    # Check for missing values
    if X.isnull().sum().sum() > 0:
        result = "There are missing values in the dataset. You need to decide how to handle them before choosing a model."
    else:
        result = "There are no missing values in the dataset."
    
    # Check for outliers
    if (X.dtypes == np.number).any():
        num_cols = X.select_dtypes(include=np.number).columns
        if (X[num_cols].apply(lambda x: np.abs(x - x.mean()) / x.std() > 3)).sum().sum() > 0:
            result += "\nThere are outliers in the dataset. You need to decide how to handle them before choosing a model."
        else:
            result += "\nThere are no outliers in the dataset."
    
    # Check for class imbalance
    if y.nunique() > 2:
        class_counts = y.value_counts()
        class_r

([('Can you select the estimator ?',
   "Here is an example of a model you can use: \n\n```python\nfrom lightgbm import LGBMClassifier\n\ndef construct_model():\n    clf = LGBMClassifier()\n    return clf\n\nclf = construct_model()\nprint(clf)\n```\n\nThis code will output the following:\n\n```\nLGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,\n               importance_type='split', learning_rate=0.1, max_depth=-1,\n               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,\n               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,\n               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,\n               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)\n```")],
 [('Can you select the estimator ?',
   "Here is an example of a model you can use: \n\n```python\nfrom lightgbm import LGBMClassifier\n\ndef construct_model():\n    clf = LGBMClassifier()\n    return clf\n\nclf = construct_model(

# Demo 4: Can you run hyper parameter tuning ?

In [246]:
chat_agent = ConversationBot()
chat_agent.run_text("Can you run hyper parameter tuning ?", [])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "hyperopt",
    "action_input": ""
}[0m
Construct estimator with following:
```
from lightgbm import LGBMClassifier

clf = LGBMClassifier()
```


Construct params grid:
{'boosting_type': ['gbdt', 'dart', 'goss'], 'learning_rate': [0.01, 0.05, 0.1, 0.5], 'max_depth': [3, 5, 7, 9], 'num_leaves': [31, 50, 100, 200], 'n_estimators': [50, 100, 200, 500], 'min_child_samples': [10, 20, 30, 50], 'reg_alpha': [0.0, 0.1, 0.5, 1.0], 'reg_lambda': [0.0, 0.1, 0.5, 1.0], 'colsample_bytree': [0.5, 0.7, 1.0], 'subsample': [0.5, 0.7, 1.0]}

I have chosen to use RandomizedSearchCV for the hyper parameter search on the LGBMClassifier model. This is because RandomizedSearchCV is a good choice when the parameter space is large and it can sample a wide range of values for each hyper parameter. Additionally, it is faster than GridSearchCV and can often find good hyper parameter values with fewer iterations.

I have chosen not to us

KeyboardInterrupt: 

# Demo 5: Can you run the ML process end to end ?

In [219]:
chat_agent = ConversationBot()
chat_agent.run_text("Can you show the current status ?", "")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "show Status",
    "action_input": ""
}[0m
Observation: [33;1m[1;3m{'dataset': 'WineQT', 'explore': False, 'preprocess': False, 'model_select': False, 'hyperopt': False}[0m
Thought:[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "The current status is that no dataset has been loaded, no dataset has been explored, no dataset has been preprocessed, no model has been selected, and no hyperparameter tuning has been performed."
}[0m

[1m> Finished chain.[0m


('The current status is that no dataset has been loaded, no dataset has been explored, no dataset has been preprocessed, no model has been selected, and no hyperparameter tuning has been performed.',
 [('Can you show the current status ?',
   'The current status is that no dataset has been loaded, no dataset has been explored, no dataset has been preprocessed, no model has been selected, and no hyperparameter tuning has been performed.')])

# Gradio

In [262]:
demo.close()

Closing server running on port: 7867
