In [1]:
import os
import uuid
import random
import numpy as np
import pandas as pd
from typing import Optional, Union, List, Dict, Tuple

# import torch
import asyncio
import logging
from functools import wraps, partial


logger = logging.getLogger(__name__)


def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    # torch.manual_seed(seed)
    # torch.cuda.manual_seed_all(seed)
    return seed


def prompts(name: str, description: str):
    def decorator(func):
        func.name = name
        func.description = description
        return func

    return decorator


def extract_code(text):
    texts = text.split("```")
    code = None
    if len(texts) > 1:
        code = texts[1]
        code = code.strip("python").strip("Python")
    return code

In [2]:
from typing import Optional, Union, List, Dict, Tuple
from pydantic import Field
import ast
import astunparse
import logging
import pandas as pd


def prompts(name: str, description: str):
    def decorator(func):
        func.name = name
        func.description = description
        return func

    return decorator


def execute(cmd: str, glo, loc):
    tree = ast.parse(cmd)
    module = ast.Module(tree.body[:-1], type_ignores=[])
    exec(astunparse.unparse(module), self.globals, self.locals)  # type: ignore
    module_end = ast.Module(tree.body[-1:], type_ignores=[])
    module_end_str = astunparse.unparse(module_end)  # type: ignore
    try:
        return eval(module_end_str, self.globals, self.locals)
    except Exception:
        exec(module_end_str, self.globals, self.locals)
        return ""

In [3]:
# os.environ['OPENAI_API_KEY'] = ""
os.environ['OPENAI_API_KEY'] = ""

In [4]:
import re
import json
import logging
import threading
from typing import Union
from abc import ABC, abstractmethod

from langchain.chains.conversation.memory import (
    ConversationBufferMemory,
    # ConversationBufferWindowMemory,
    # ConversationSummaryBufferMemory
)
from langchain.chat_models import ChatOpenAI
from langchain.agents.tools import Tool
from langchain.agents.conversational.output_parser import ConvoOutputParser
from langchain.agents.conversational.prompt import FORMAT_INSTRUCTIONS
from langchain.agents.conversational_chat.prompt import (
    FORMAT_INSTRUCTIONS as CHAT_FORMAT_INSTRUCTIONS,
)
from langchain.schema import AgentAction, AgentFinish
from langchain.agents import AgentOutputParser

# from api.services.utils import prompts, seed_everything, to_async
# from api.conversation.callback import AsyncLoggerCallbackHandler, LoggerCallbackHandler



class retryActionTool:
    """Retry when chatgpt response parsing failed."""

    @prompts(
        name="RR",
        description="Do not use it.",
    )
    def inference(self, query):
        return f"Modify the following text in ``` such that it will follow RESPONSE FORMAT INSTRUCTIONS:```{query}```\nDo not use retry tool again."


class RetryConvoOutputParser(ConvoOutputParser):
    ai_prefix: str = "AI"

    def get_format_instructions(self) -> str:
        return FORMAT_INSTRUCTIONS

    def parse(self, text: str) -> Union[AgentAction, AgentFinish]:
        if f"{self.ai_prefix}:" in text:
            return AgentFinish(
                {"output": text.split(f"{self.ai_prefix}:")[-1].strip()}, text
            )
        regex = r"Action: (.*?)[\n]*Action Input: (.*)"
        match = re.search(regex, text)
        if not match:
            return AgentAction("retry", "", text)
            # raise OutputParserException(f"Could not parse LLM output: `{text}`")
        action = match.group(1)
        action_input = match.group(2)
        return AgentAction(action.strip(), action_input.strip(" ").strip('"'), text)


class RobustRetryConvoOutputParser(AgentOutputParser):
    def get_format_instructions(self) -> str:
        return CHAT_FORMAT_INSTRUCTIONS

    def parse_json(self, text: str):
        try:
            return [
                i
                for i in re.findall(r"\s([{\[].*?[}\]])$", f" {text}", flags=re.DOTALL)
                if i
            ][0]
        except Exception:
            return

    def parse_markdown_code(self, text: str):
        try:
            for i in re.findall(
                r"`{3}([\w]*)\n([\S\s]+?)\n`{3}", text, flags=re.DOTALL
            )[0]:
                if i:
                    tmp = self.parse_json(i)
                    if tmp:
                        return tmp
        except Exception:
            return

    def parse_origin(self, text: str):
        try:
            cleaned_output = text.strip()
            if "```json" in cleaned_output:
                _, cleaned_output = cleaned_output.split("```json")
            if "```" in cleaned_output:
                cleaned_output, _ = cleaned_output.split("```")
            if cleaned_output.startswith("```json"):
                cleaned_output = cleaned_output[len("```json"):]
            if cleaned_output.startswith("```"):
                cleaned_output = cleaned_output[len("```"):]
            if cleaned_output.endswith("```"):
                cleaned_output = cleaned_output[: -len("```")]
            return cleaned_output.strip()
        except Exception:
            return

    def parse(self, text: str) -> Union[AgentAction, AgentFinish]:
        try:
            json_str = self.parse_json(text)
            markdown_str = self.parse_markdown_code(text)
            origin_str = self.parse_origin(text)

            if json_str:
                cleaned_output = json_str
            elif markdown_str:
                cleaned_output = markdown_str
                if cleaned_output.startswith("```json"):
                    cleaned_output = cleaned_output[len("```json"):]
                if cleaned_output.startswith("```"):
                    cleaned_output = cleaned_output[len("```"):]
                if cleaned_output.endswith("```"):
                    cleaned_output = cleaned_output[: -len("```")]
            elif origin_str:
                cleaned_output = origin_str
            else:
                cleaned_output = text.strip()

            response = json.loads(cleaned_output)
            action, action_input = response["action"], response["action_input"]
            if action == "Final Answer":
                return AgentFinish({"output": action_input}, text)
            else:
                return AgentAction(action, action_input, text)
        except Exception:
            if "action" not in text and "action_input" not in text:
                return AgentFinish({"output": text}, text)
            else:
                logger.warning(f"Not follow format instruction\n{cleaned_output}")
                return AgentAction("RR", "", text)

        
class pythonShellTool:

    @prompts(
        name="python_repl_ast",
        description="This is a python shell. Use this to execute python commands. "
        "Input should be a valid python command. "
        "The output of the command should not be too long."
        "The dataset is a dataframe assign to a variable df"
    )
    def inference(self, query: str):
        try:
            tree = ast.parse(query)
            module = ast.Module(tree.body[:-1], type_ignores=[])
            exec(astunparse.unparse(module), self.globals, self.locals)  # type: ignore
            module_end = ast.Module(tree.body[-1:], type_ignores=[])
            module_end_str = astunparse.unparse(module_end)  # type: ignore
            try:
                return eval(module_end_str, self.globals, self.locals)[:1024]
            except Exception:
                exec(module_end_str, self.globals, self.locals)
                return ""
        except Exception as e:
            return str(e)

In [5]:
import functools
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.prompts.chat import (
    PromptTemplate,
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)
human_message_prompt = HumanMessagePromptTemplate(
        prompt=PromptTemplate(
            template="{text}",
            input_variables=["text"],
        )
    )
chat_prompt_template = ChatPromptTemplate.from_messages([human_message_prompt])
chat = ChatOpenAI(temperature=0)
chain = LLMChain(llm=chat, prompt=chat_prompt_template)#, memory=ConversationBufferMemory())



class CodingAgent():
    def __init__(self, chain):
        self.chain = chain
    @functools.lru_cache(100)
    def run(self, text):
        return self.chain.run(text)
    
    
coding_agent = CodingAgent(chain)

In [6]:
from prompt import AUTOML_CHATGPT_PREFIX, AUTOML_CHATGPT_FORMAT_INSTRUCTIONS, AUTOML_CHATGPT_SUFFIX
from langchain.agents.initialize import initialize_agent
from langchain.agents.tools import Tool
from sklearn.svm import LinearSVR
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from scipy.stats import boxcox
from imblearn.over_sampling import SMOTE

In [7]:
from sklearn.model_selection import train_test_split

from sklearn.model_selection import train_test_split

class Status():
    
    def __init__(self, dev=True):
        self.init()
        self.dev = dev
        
    def init_data(self, dataset, X_train, X_test, y_train, y_test):
        self.init()
        self.dataset = dataset
        self.X = X_train
        self.y = y_train
        
        if self.dev:
            X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.1, random_state=42)
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
    
    def init(self):
        
        self.dataset = ""
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.y_pred = None
        
        self.start_explore_prompt = ""
        self.how_to_explore_prompt = ""
        self.explore_data_thought = ""
        self.hyperopt_param_thought = ""
        
        self.explore_result = "Nothing special."
        
        self.explore = False
        self.preprocess = False
        self.model_select = False
        self.hyperopt = False
        
        self.explore_code = ""
        self.preprocess_code = ""
        self.model_select_code = ""
        self.hyperopt_code = ""
        
        self.model = ""
        self.best_param = {}

status = Status(dev=False)

In [9]:
from concurrent.futures import ThreadPoolExecutor, as_completed

class DataLoader():
    
    def __init__(self, chain):
        self.datasets = {}
        self.chain = chain
        
    def select(self, name: str):
        if name not in self.datasets:
            prompt = f"""
            These are the datasets we have:
            {self.datasets.keys()}
            If someone ask for {name}, what is the most possible dataset that is referred to ?
            Give me the response in one single word:
            """
            print(prompt)
            name = self.chain.run(prompt)
            return self.datasets[name]
        else:
            return self.datasets[name]
        
    def show(self):
        print(self.datasets.keys())
        
    def load_doc(self, dirn: str):
        try:
            train_path = os.path.join('kaggle', dirn, 'train.csv')
            train_df = pd.read_csv(train_path)

            test_path = os.path.join('kaggle', dirn, 'test.csv')
            test_df = pd.read_csv(test_path)

            label = list(set(train_df.columns) - set(test_df.columns))[0]

            self.datasets[dirn] = {
                'train': train_df,
                'test': test_df,
                'label': label,
                'name': dirn
            }
            return dirn
        except Exception as e:
            print(e)
    
    def load(self):
        dir_list = []
        for dirn in os.listdir('kaggle'):
            if not os.path.isdir(os.path.join('kaggle', dirn)):
                continue
            dir_list.append(dirn)
    
        print(dir_list)
        with ThreadPoolExecutor(max_workers=8) as executor:
            future_to_url = [executor.submit(self.load_doc, dirn) for dirn in dir_list]
            for future in as_completed(future_to_url):
                dirn = future.result()
                print(f"{dirn} complete.")
                
                
data_loader = DataLoader(chain)
data_loader.load()

['santander-customer-satisfaction', 'porto-seguro-safe-driver-prediction', 'santander-customer-transaction-prediction', 'house-prices-advanced-regression-techniques', 'santander-value-prediction-challenge', 'bnp-paribas-cardif-claims-management', 'ieee-fraud-detection', 'allstate-claims-severity', 'otto-group-product-classification-challenge', 'mercedes-benz-greener-manufacturing', '.ipynb_checkpoints']
[Errno 2] No such file or directory: 'kaggle/ieee-fraud-detection/train.csv'
None complete.
house-prices-advanced-regression-techniques complete.
mercedes-benz-greener-manufacturing complete.[Errno 2] No such file or directory: 'kaggle/.ipynb_checkpoints/train.csv'

None complete.
otto-group-product-classification-challenge complete.
allstate-claims-severity complete.
bnp-paribas-cardif-claims-management complete.
porto-seguro-safe-driver-prediction complete.
santander-customer-satisfaction complete.
santander-customer-transaction-prediction complete.
santander-value-prediction-challeng

In [10]:
class LoadDataTool:
    globals: Optional[Dict] = dict()
    locals: Optional[Dict] = dict()

    def __init__(self, status):
        self.status = status
        
    @prompts(
        name="Load",
        description="This is a tool to load dataset."
        "Input should be a dataset name."
    )
    def inference(self, query: str):
        
        if self.status.dev:
            name = "WineQT"
            data = pd.read_csv("WineQT.csv")
            X = data.drop('quality', axis=1)
            y = data['quality']
            self.status.init_data(name, X, y, None, None)
        else:
            dataset = data_loader.select(query)
            name = dataset['name']
            label = dataset['label']
            data = dataset['train']
            X_train = data.drop(label, axis=1)
            y_train = data[label]
            data = dataset['test']
            if label in data:
                data = data.drop(label, axis=1)
            X_test = data
            y_test = pd.DataFrame(data={label: [0]*len(X_test)})[label]
            self.status.init_data(name, X_train, X_test, y_train, y_test)
            print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
        
        return f"Load dataset {name} success !"

In [26]:
class ExploreDataTool:
    globals: Optional[Dict] = dict()
    locals: Optional[Dict] = dict()

    def __init__(self, status):
        self.status = status
        self.X = status.X
        self.y = status.y
        self.locals = {"X": self.X, 'y': self.y, 'np': np}
        self.globals = {"np": np, 'pd': pd}
        
    @prompts(
        name="Explore",
        description="This is a tool for dataset exploring."
        "Useful when the user ask for you to explore the dataset."
        "Output is the thought from exploring the dataset."
        "Input should be an empty string."
    )
    def inference(self, query: str):
        
        if not self.status.dataset:
            return "Pick a dataset first."
        
        start_explore_prompt = f"""
You are a data scientist. You are now performing data analysis before training a machine learning model.
Your task is to tell me how the data could be preprocessed and to give me the corresponding Python code.
The preprocessing includes, but is not limited to, missing value handling, scaling, normalization, or encoding.
The task will be performed in two stages.
In the first stage, you will decide which techniques are required to preprocess the dataset by investigating the dataset.
In the second stage, you will preprocess the dataset by implementing the functions.

Here is some information about the feature dataframe `X`:
\"\"\"
- First five rows:
{self.X.head(5)}

- Column types:
{self.X.dtypes}
\"\"\"

Here is some information about the label series `y`:
\"\"\"
- First five rows:
{self.y.head(5)}

- Column types:
{self.y.dtypes}
\"\"\"
"""

        how_to_explore_prompt = """
Now, let's move on to the first stage:
1. Explain how you will explore the dataset in order to select appropriate preprocessing techniques.
2. Write a Python function called `explore` to explore the dataset.
   Import any package you need beforehand.
   Do not print a long list.
   The function will be called using `result = explore(X, y)`, and all findings must be stored in a string variable `result`.

Here are some handy packages in the Python environment:
```
numpy==1.24.3
pandas==2.0.1
```
"""

        prompt = "\n\n".join([start_explore_prompt, how_to_explore_prompt])
        self.start_explore_prompt = start_explore_prompt
        self.how_to_explore_prompt = how_to_explore_prompt
        # print(prompt)

        explore_data_thought = coding_agent.run(prompt)
        self.explore_data_thought = explore_data_thought
        # print(explore_data_thought)
        
        explore_code = extract_code(explore_data_thought)
        self.explore_code = explore_code
        # print(explore_code + '\n')
        self.retry_execute(explore_data_thought, explore_code)
        
        explore_data_result = self.locals["explore"](self.X, self.y)
        self.explore_data_result = explore_data_result
        # print(explore_data_result)
        
        self.update_status()
        return explore_data_result
    
    def update_status(self):
        self.status.start_explore_prompt = self.start_explore_prompt
        self.status.how_to_explore_prompt = self.how_to_explore_prompt
        self.status.explore_code = self.explore_code
        self.status.explore_data_result = self.explore_data_result
        self.status.explore_data_thought = self.explore_data_thought
        self.status.explore = True
    
    def retry_execute(self, prompt: str, code: str, retry: int = 3):
        for _ in range(retry):
            try:
                exec(code, self.globals, self.locals)
            except Exception as e:
                import traceback
                fix_code = coding_agent.run(f"{prompt}\n\nExecute result:\n{traceback.format_exc()}\n\nModify the code and fix the error.")
                code = extract_code(fix_code)
                print(f"Fix{_}:\n{code}\n")
    

    
    
class ProcessDataTool:
    globals: Optional[Dict] = dict()
    locals: Optional[Dict] = dict()

    def __init__(self, status):
        self.status = status
        self.X = status.X
        self.y = status.y
        self.locals = {"X": self.X, 'y': self.y, 'np': np}
        self.globals = {
            "np": np,
            'pd': pd,
        }
        
        from scipy import stats
        from imblearn import over_sampling, under_sampling
        from sklearn import preprocessing, impute, model_selection, feature_selection, compose
        for module in [preprocessing, impute, over_sampling, under_sampling, model_selection, feature_selection, compose, stats]:
            for name, func in vars(module).items():
                if name[0].isupper():
                    self.globals[name] = func

    @prompts(
        name="Preprocess",
        description="This is a tool for preprocess the dataset."
        "Useful when you want to process the dataset."
        "Useful when you want to preprocess the dataset."
    )
    def inference(self, query: str):
        if not self.status.dataset:
            return "Pick a dataset first."
        print()
        
        explore_data_result = self.status.explore_data_result
        start_explore_prompt = self.status.start_explore_prompt
        how_to_explore_prompt = self.status.how_to_explore_prompt
        explore_data_thought = self.status.explore_data_thought

        preprocess_prompt = """
        The exploration results from the first stage are shown above.
        Now, let's move on to the second stage:
        1. Provide a conclusion based on the exploration results.
        2. Based on the conclusion, specify the preprocessing techniques to be employed for the dataset and briefly describe their implementations.
        3. Write a Python function called `preprocess` that will assist you in preprocessing the data.
           Keep the function as simple as possible.
           Import any package you need beforehand.
           The function will be called using `proc_X_train, proc_y_train, proc_X_test, proc_y_test = preprocess(X_train, y_train, X_test, y_test)`, 
           where both the parameters and return values are eithor Pandas dataframes or series.
           Be aware of potential data leakage issues.

        These are the only packages in the Python environment:
        ```
        numpy==1.24.3
        pandas==2.0.1
        ```
        No additional packages can be used. Return results about the second stage.
        """
        

        prompt = "\n\n".join([
            # start_explore_prompt,
            # how_to_explore_prompt,
            f"Your Response:\n{explore_data_thought}",
            f"Code Execution Results:\n{explore_data_result}",
            preprocess_prompt]
        )
        preprocess_data_thought = coding_agent.run(prompt)
        self.preprocess_data_thought = preprocess_data_thought
        # print(preprocess_data_thought)
        preprocess_code = extract_code(preprocess_data_thought)
        self.preprocess_code = preprocess_code
        # print(preprocess_code)
        
        best_result = {
            'score': 100,
            'code': preprocess_code
        }
        
        for _ in range(5):
            try:
                print(preprocess_code)
                exec(preprocess_code, self.globals, self.locals)
                result = self.evaluate()
                
                if result['llm'] < best_result['score']:
                    best_result['score'] = result['llm']
                    best_result['code'] = preprocess_code
                    
                    self.status.X_train = result['processed_data'][0]
                    self.status.X_test = result['processed_data'][2]
                    self.status.y_train = result['processed_data'][1]
                    self.status.y_test = result['processed_data'][3]
                
                fix_code = coding_agent.run(f"{prompt}\n\nExecute result:\n{result['msg']}\n\nYour result is LLm result. Try to improve the preprocess function.")
                preprocess_code = extract_code(fix_code)
                # print(f"Revise{_}:\n{preprocess_code}\n")
            except Exception as e:
                import traceback
                trace = traceback.format_exc()
                print(trace)
                fix_code = coding_agent.run(f"{prompt}\n\nExecute result:\n{str(trace)[:512]}\n\nModify the code and fix the error. Try not to use the module that causes error. Simplify the preprocess function as much as posiible.")
                preprocess_code = extract_code(fix_code)
                # print(f"Fix{_}:\n{preprocess_code}\n")
                
        print(best_result)
        self.update_status()
        return f"{preprocess_data_thought}\n\nProcessing....\n\nProcess Completed !"
    
    def update_status(self):
        self.status.preprocess_data_thought = self.preprocess_data_thought
        self.status.preprocess_code = self.preprocess_code
        self.status.X_train = self.locals['X_train']
        self.status.X_test = self.locals['X_test']
        self.status.y_train = self.locals['y_train']
        self.status.y_test = self.locals['y_test']
        self.status.preprocess = True
        
    def evaluate(self):
        
        print('start evaluate')

        def common_preprocess(X_train, y_train, X_test, y_test):
            # TODO: Missing value dropping

            # TODO: Categorical feature encoding

            # Scale numerical variables using StandardScaler
            scaler = StandardScaler()
            X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
            X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

            return X_train, y_train, X_test, y_test
        
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import mean_squared_error
        from lightgbm import LGBMRegressor, early_stopping

        seed = 42

        def evaluate(X_train, y_train, X_test, y_test, prep_func=None):
            processed_data = (X_train, y_train, X_test, y_test)
            if prep_func:
                try:
                    X_train, y_train, X_test, y_test = prep_func(X_train, y_train, X_test, y_test)
                except Exception as e:
                    import traceback
                    trace = traceback.format_exc()
                    print(trace)
                    raise Exception(trace)
                processed_data = (X_train, y_train, X_test, y_test)

            X_train = X_train.select_dtypes(exclude=['object'])
            X_test = X_test.select_dtypes(exclude=['object'])
            print(X_train.shape. X_test.shape, y_train.shape, y_test.shape)

            X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=seed)
            clf = LGBMRegressor(random_state=seed, **self.status.best_param)
            clf.fit(X_train, y_train, eval_set=(X_valid, y_valid), callbacks=[early_stopping(stopping_rounds=10)])
            y_pred = clf.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            return mse, processed_data

        from copy import deepcopy
        X_train = deepcopy(self.status.X_train)
        X_test = deepcopy(self.status.X_test)
        y_train = deepcopy(self.status.y_train)
        y_test = deepcopy(self.status.y_test)
        self.locals["X_train"] = X_train
        self.locals["X_test"] = X_test
        self.locals["y_train"] = y_train
        self.locals["y_test"] = y_test
        
        # LLM
        mse_llm, processed_data = evaluate(X_train, y_train, X_test, y_test, self.locals['preprocess'])

        # Common
        # mse_com, _ = evaluate(X_train, y_train, X_test, y_test, common_preprocess)

        # Raw
        mse_raw, _ = evaluate(X_train, y_train, X_test, y_test)
        mse_com = mse_raw

        msg = f"No preprocess: {mse_raw}, Common preprocess: {mse_com}, LLM preprocess: {mse_llm}"
        print(msg)
        return {
            'msg': msg,
            'no': mse_raw,
            'common': mse_com,
            'llm': mse_llm,
            'processed_data': processed_data
        }

In [52]:
import json
import pdb
from sklearn.model_selection import train_test_split


class E2EHyperOptTool:
    globals: Optional[Dict] = dict()
    locals: Optional[Dict] = dict()

    def __init__(self, status):
        self.status = status
        self.X_train = status.X_train
        self.y_train = status.y_train
        self.X_test = status.X_test
        self.y_test = status.y_test
        # self.locals = {"X": X, 'Y': Y,}

    @prompts(
        name="hyperopt",
        description="This is a tool for hyper parameter tuning."
        "Input should be an empty string."
    )
    def inference(self, query: str):
        if not self.status.dataset:
            return "Pick a dataset first."
        if not self.status.model:
            msg = "You have not select the model. Use LGBMRegressor for this task."
            self.status.model = "LGBMRegressor"
        
        clf = self.construct_estimator(self.status.model)
        param_grid = self.construct_param_grid(clf)
        search_algo = self.get_search_algo(clf, param_grid)
        clff = self.construct_search_algo(search_algo)
        self.clff = clff
        
        X_train = self.X_train.select_dtypes(exclude=['object'])
        search = clff.fit(X_train, self.y_train)
        self.search = search
        
        print(f"Best search score: {search.best_score_}")
        score = self.evaluate(clf, search)

        self.update_status()
        return f"{self.reason}\n\nFinal score after hyper parameter tuning is {score}"
    
    def update_status(self):
        self.status.hyperopt = True
        self.status.hyper_result = self.search
        self.status.hyperopt_param_thought = self.reason
        self.status.y_pred = self.y_pred
        
    def evaluate(self, clf, search):
        from sklearn.metrics import mean_squared_error
        
        X_train = self.X_train.select_dtypes(exclude=['object'])
        X_test = self.X_test.select_dtypes(exclude=['object'])
        
        clff = clf.__class__(**search.best_params_)
        clff.fit(X_train, self.y_train)
        y_pred = clff.predict(X_test)
        self.y_pred = y_pred
        score = mean_squared_error(self.y_test, y_pred)
        import pdb
        pdb.set_trace()
        print(f"Testing score: {score}")
        return score
    
    def construct_search_algo(self, search_algo):
        # construct search instance
        construct_search_prompt = f"""
        Give me two lines of python code.
        Frist, from lib import {search_algo.strip()}.
        Second, instatiate an instance with {search_algo.strip()}(clf, param_grid, random_state=0, n_iter=15) and assign to hyper_algo variable.
        """
        codes = coding_agent.run(construct_search_prompt)
        print(f"\nConstructing search algo:\n```\n{codes}\n```\n")
        res = self.execute(codes)
        if 'hyper_algo' not in self.locals:
            raise Exception(res)
        return self.locals['hyper_algo']

    def get_search_algo(self, clf, param_grid):
        search_algo_reason_prompt = f"""
        You are now performing a hyper parameter search on a machine learning model.
        Your goal is to decide the search algorithm base on model type and parameter space.


        The given model is a {clf.__class__.__name__}.
        The paramters grid is {param_grid}.
        The avaiable search algorithm is as follow:

        1. GridSearchCV: Exhaustive search over specified parameter values for an estimator.
        2. HalvingGridSearchCV: Search over specified parameter values with successive halving.
        3. ParameterGrid: Grid of parameters with a discrete number of values for each.
        4. ParameterSampler: Generator on parameters sampled from given distributions.
        5. RandomizedSearchCV: Randomized search on hyper parameters.
        6. HalvingRandomSearchCV: Randomized search on hyper parameters with successive halving.

        Think about which algorithm to use base on the model, parameter space and algorithm description.
        Tell me which algorithm you have choosen and why.
        List all the other algorithms you have choosen not to use and why.
        """
        
        reason = coding_agent.run(search_algo_reason_prompt)
        self.reason = reason
        print(reason)

        search_algo = coding_agent.run(f"{reason}\n\nAccording to above, give me the algorithm name of the choosen algorithm in a single token.")
        print(f"\nDeciding search algo:\n{search_algo}\n")
        return search_algo

    def construct_param_grid(self, clf):
        param_grid_prompt = f"""
        You are now performing a hyper parameter search on a machine learning model.
        Your goal is to construct the paramter space to search.

        For example:
        The model is a LinearSVR.
        The parameters are {LinearSVR().get_params()}
        The parameter grid can be defined as:
        ```
        param_grid = {{'C': [1, 10, 100, 1000], 'tol': [0.001, 0.0001]}}
        ```
        Remeber to define the space only on important features. 


        Now.
        The given model is a {clf.__class__.__name__}.
        The paramters are {clf.get_params()}.
        The detail description about the parameters is as follow:
        {clf.__doc__}

        Construct the parameter grid you will use for hyper parameter in json format:
        """

        param_grid = coding_agent.run(param_grid_prompt)
        param_grid = json.loads(param_grid)
        print(f"\nConstruct params grid:\n{param_grid}\n")
        self.locals['param_grid'] = param_grid
        return param_grid
        
    def construct_estimator(self, estimator_name: str):
        # construct clf instance
        construct_estimator_prompt = f"Give me two lines of python code. Construct a {estimator_name.strip()} estimator and assign the estimator clf variable."
        codes = coding_agent.run(construct_estimator_prompt)
        print(f"\nConstruct estimator with following:\n```\n{codes}\n```\n")
        res = self.execute(codes)
        if 'clf' not in self.locals:
            raise Exception(res)
        return self.locals['clf']
        
    def execute(self, cmd: str):
        try:
            tree = ast.parse(cmd)
            module = ast.Module(tree.body[:-1], type_ignores=[])
            exec(astunparse.unparse(module), self.globals, self.locals)  # type: ignore
            module_end = ast.Module(tree.body[-1:], type_ignores=[])
            module_end_str = astunparse.unparse(module_end)  # type: ignore
            try:
                return eval(module_end_str, self.globals, self.locals)
            except Exception:
                exec(module_end_str, self.globals, self.locals)
                return ""
        except Exception as e:
            return str(e)

In [53]:
class ConversationBot(ABC):

    def __init__(self):
        seed_everything(0)
        # self.handler = LoggerCallbackHandler()
        # self.async_handler = AsyncLoggerCallbackHandler()
        self.llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
        self.memory = ConversationBufferMemory(
            memory_key="chat_history", output_key="output", return_messages=True
        )
        self.state = []
        self.tools = self.load_tools()

        # retry when llm output parsing error
        retrytool = retryActionTool()
        func = retrytool.inference
        self.tools.append(
            Tool(
                name=func.name,
                description=func.description,
                # coroutine=to_async(func),
                func=func,
            )
        )
        self.output_parser = RobustRetryConvoOutputParser()
        self.agent = self.init_agent()

    def run_text(self, text: str):
        logger.info(f"User: {text}")
        res = self.agent({"input": text})
        response = res["output"]
        self.state += [(text, response)]
        logger.info(
            f"\nProcessed run_text, Input text: {text}\nCurrent state: {self.state}\n"
        )
        return response

    async def arun_text(self, text: str):
        logger.info(f"User: {text}")
        res = await self.agent.acall({"input": text})
        response = res["output"]
        self.state += [(text, response)]
        logger.info(
            f"\nProcessed run_text, Input text: {text}\nCurrent state: {self.state}\n"
        )
        return response

    def _clear(self):
        self.memory.clear

    def init_agent(self):
        input_variables = ["input", "agent_scratchpad", "chat_history"]
        return initialize_agent(
            self.tools,
            self.llm,
            agent="chat-conversational-react-description",
            verbose=True,
            memory=self.memory,
            return_intermediate_steps=True,
            max_iterations=5,
            # max_execution_time=60,
            agent_kwargs={
                "prefix": AUTOML_CHATGPT_PREFIX,
                "format_instructions": AUTOML_CHATGPT_FORMAT_INSTRUCTIONS,
                "suffix": AUTOML_CHATGPT_SUFFIX,
                "input_variables": input_variables,
                "output_parser": self.output_parser,
            },
        )

    def load_tools(self):
        tools = []
        X, X_train, X_test, y, y_train, y_test = self.load_artifacts()

        for tool in [
            # pythonShellTool(),
            E2EHyperOptTool(status),
            # HyperOptTool(),
            # ConstructSearchTool(),
            # ConstructParamGridTool(),
            # ConstructEstimatorTool(),
            # FinalTool(X, Y),
            ExploreDataTool(status),
            ProcessDataTool(status),
            LoadDataTool(status),
        ]:
            func = tool.inference
            tools.append(
                Tool(
                    name=func.name,
                    description=func.description,
                    # coroutine=to_async(func),
                    func=func,
                )
            )
        logger.info(f"tools: {[i.name for i in tools]}")
        return tools

    def load_artifacts(self):
        data = pd.read_csv("WineQT.csv")
        X = data.drop('quality', axis=1)
        y = data['quality']
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        # X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
        return X, X_train, X_test, y, y_train, y_test

In [None]:
class E2EPreprocessTool:
    globals: Optional[Dict] = dict()
    locals: Optional[Dict] = dict()

    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.locals = {"X": X, 'y': y, 'np': np}
        self.globals = {
            "np": np,
            'pd': pd,
            # 'StandardScaler': StandardScaler,
            # 'PowerTransformer': PowerTransformer,
            # 'boxcox': boxcox,
            # 'SMOTE': SMOTE,
            # 'RandomOverSampler': RandomOverSampler
        }

    @prompts(
        name="preprocess",
        description="This is a tool for dataset preprocessing."
        "Input should be a dataset name."
    )
    def inference(self, query: str):
        explore_prompts = self.explore_dataset(query)
        self.preprocess_dataset(explore_prompts)
        # self.evaluate()
        
    def retry_execute(self, prompt: str, code: str, retry: int = 3):
        for _ in range(retry):
            try:
                exec(code, self.globals, self.locals)
            except Exception as e:
                import traceback
                fix_code = coding_agent.run(f"{prompt}\n\nExecute result:\n{traceback.format_exc()}\n\nModify the code and fix the error.")
                code = extract_code(fix_code)
                print(f"Fix{_}:\n{code}\n")
        
    def evaluate(self):

        def common_preprocess(X_train, y_train, X_test, y_test):
            # TODO: Missing value dropping

            # TODO: Categorical feature encoding

            # Scale numerical variables using StandardScaler
            scaler = StandardScaler()
            X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
            X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

            return X_train, y_train, X_test, y_test
        
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import mean_squared_error
        from lightgbm import LGBMRegressor, early_stopping

        seed = 42

        def evaluate(X_train, y_train, X_test, y_test, prep_func=None):
            if prep_func:
                X_train, y_train, X_test, y_test = prep_func(X_train, y_train, X_test, y_test)

            X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=seed)
            clf = LGBMRegressor(random_state=seed)
            clf.fit(X_train, y_train, eval_set=(X_valid, y_valid), callbacks=[early_stopping(stopping_rounds=10)])
            y_pred = clf.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)

            return mse

        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=seed)


        # LLM
        mse_llm = evaluate(X_train, y_train, X_test, y_test, self.locals['preprocess'])

        # Common
        mse_com = evaluate(X_train, y_train, X_test, y_test, common_preprocess)

        # Raw
        mse_raw = evaluate(X_train, y_train, X_test, y_test)

        msg = f"No preprocess: {mse_raw}, Common preprocess: {mse_com}, LLM preprocess: {mse_llm}"
        print(msg)
        return {
            'msg': msg,
            'no': mse_raw,
            'common': mse_com,
            'llm': mse_llm
        }
        
    def preprocess_dataset(self, explore_prompts: str):
        start_explore_prompt, how_to_explore_prompt, explore_data_thought, explore_data_result = explore_prompts
        
        preprocess_prompt = """
        The exploration results from the first stage are shown above.
        Now, let's move on to the second stage:
        1. Provide a conclusion based on the exploration results.
        2. Based on the conclusion, specify the preprocessing techniques to be employed for the dataset and briefly describe their implementations.
        3. Write a Python function called `preprocess` that will assist you in preprocessing the data.
           Import any package you need beforehand.
           The function will be called using `proc_X_train, proc_y_train, proc_X_test, proc_y_test = preprocess(X_train, y_train, X_test, y_test)`, 
           where both the parameters and return values are eithor Pandas dataframes or series.
           Be aware of potential data leakage issues.

        Here are some handy packages in the Python environment:
        ```
        numpy==1.24.3
        pandas==2.0.1
        ```
        No additional packages can be used.
        """
        
        """
        scikit-learn==1.2.2
        scipy==1.10.1
        imbalanced-learn==0.10.1
        """

        prompt = "\n\n".join([
            start_explore_prompt,
            how_to_explore_prompt,
            f"Your Response:\n{explore_data_thought}",
            f"Code Execution Results:\n{explore_data_result}",
            preprocess_prompt]
        )
        preprocess_data_thought = coding_agent.run(prompt)
        print(preprocess_data_thought)
        preprocess_code = extract_code(preprocess_data_thought)
        self.preprocess_code = preprocess_code
        
        best_result = {
            'score': 100,
            'code': preprocess_code
        }
        
        for _ in range(5):
            try:
                exec(preprocess_code, self.globals, self.locals)
                result = self.evaluate()
                
                if result['llm'] < best_result['score']:
                    best_result['score'] = result['llm']
                    best_result['code'] = preprocess_code
                
                fix_code = coding_agent.run(f"{prompt}\n\nExecute result:\n{result['msg']}\n\nYour result is LLm result. Try to improve the preprocess function.")
                preprocess_code = extract_code(fix_code)
                print(f"Revise{_}:\n{preprocess_code}\n")
            except Exception as e:
                import traceback
                trace = traceback.format_exc()
                print(trace)
                fix_code = coding_agent.run(f"{prompt}\n\nExecute result:\n{str(e)}\n\nModify the code and fix the error. Try not to use the module that causes error. Try to simpify the function.")
                preprocess_code = extract_code(fix_code)
                print(f"Fix{_}:\n{preprocess_code}\n")
        
        # self.retry_execute(preprocess_data_thought, preprocess_code)

        
    def explore_dataset(self, query: str):
        start_explore_prompt = f"""
You are a data scientist. You are now performing data analysis before training a machine learning model.
Your task is to tell me how the data could be preprocessed and to give me the corresponding Python code.
The preprocessing includes, but is not limited to, missing value handling, scaling, normalization, or encoding.
The task will be performed in two stages.
In the first stage, you will decide which techniques are required to preprocess the dataset by investigating the dataset.
In the second stage, you will preprocess the dataset by implementing the functions.

Here is some information about the feature dataframe `X`:
\"\"\"
- First five rows:
{self.X.head(5)}

- Column types:
{self.X.dtypes}
\"\"\"

Here is some information about the label series `y`:
\"\"\"
- First five rows:
{self.y.head(5)}

- Column types:
{self.y.dtypes}
\"\"\"
"""

        how_to_explore_prompt = """
Now, let's move on to the first stage:
1. Explain how you will explore the dataset in order to select appropriate preprocessing techniques.
2. Write a Python function called `explore` to explore the dataset.
   Import any package you need beforehand.
   The function will be called using `result = explore(X, y)`, and all findings must be stored in a string variable `result`.

Here are some handy packages in the Python environment:
```
numpy==1.24.3
pandas==2.0.1
```
"""

        prompt = "\n\n".join([start_explore_prompt, how_to_explore_prompt])
        self.prompt = prompt
        print(prompt)

        explore_data_thought = coding_agent.run(prompt)
        self.explore_data_thought = explore_data_thought
        print(explore_data_thought)
        
        explore_code = extract_code(explore_data_thought)
        self.explore_code = explore_code
        print(explore_code + '\n')
        self.retry_execute(explore_data_thought, explore_code)
        
        explore_data_result = self.locals["explore"](self.X, self.y)
        self.explore_data_result = explore_data_result
        print(explore_data_result)
        
        return (start_explore_prompt, how_to_explore_prompt, explore_data_thought, explore_data_result)
    
    
preprocess_tool = E2EPreprocessTool(X, y)
preprocess_tool.inference("")

# Demo start

In [29]:
chat_agent = ConversationBot()
chat_agent.run_text("Can you load the mercedes-benz-greener-manufacturing dataset ?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Load",
    "action_input": "mercedes-benz-greener-manufacturing"
}
```[0m(4209, 377) (4209, 377) (4209,) (4209,)

Observation: [31;1m[1;3mLoad dataset mercedes-benz-greener-manufacturing success ![0m
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "The mercedes-benz-greener-manufacturing dataset has been successfully loaded."
}
```[0m

[1m> Finished chain.[0m


'The mercedes-benz-greener-manufacturing dataset has been successfully loaded.'

# Demo 1: Can you explore the dataset ?

In [30]:
chat_agent = ConversationBot()
chat_agent.run_text("Can you explore the dataset ?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Explore",
    "action_input": ""
}[0m




Observation: [33;1m[1;3mThere are no missing values in the dataset.
There are outliers in the following columns: [('X10', 56), ('X12', 316), ('X13', 244), ('X15', 2), ('X16', 11), ('X17', 32), ('X18', 33), ('X19', 419), ('X20', 601), ('X21', 11), ('X22', 366), ('X23', 87), ('X24', 8), ('X26', 21), ('X28', 137), ('X29', 181), ('X30', 19), ('X31', 978), ('X32', 47), ('X33', 1), ('X34', 23), ('X35', 978), ('X36', 19), ('X37', 978), ('X38', 140), ('X39', 1), ('X40', 3), ('X41', 48), ('X42', 1), ('X43', 304), ('X44', 48), ('X47', 54), ('X48', 94), ('X49', 514), ('X50', 901), ('X52', 178), ('X53', 29), ('X54', 183), ('X55', 22), ('X56', 89), ('X57', 56), ('X59', 3), ('X60', 6), ('X61', 194), ('X62', 25), ('X63', 48), ('X65', 9), ('X66', 114), ('X67', 8), ('X68', 309), ('X69', 126), ('X70', 337), ('X71', 436), ('X73', 84), ('X74', 3), ('X75', 152), ('X76', 183), ('X77', 53), ('X78', 24), ('X79', 106), ('X80', 223), ('X81', 964), ('X82', 72), ('X83', 5), ('X84', 436), ('X86', 6), ('X87', 4)

'The dataset has no missing values, but there are outliers in some columns and some features are highly skewed. There are also 8 categorical features that may need to be encoded using one-hot or label encoding. Additionally, there are highly correlated features that may need to be removed. These findings were obtained from exploring the dataset.'

In [None]:
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer("Hello world")["input_ids"]

# Demo 2: Can you preprocess the dataset ?

In [42]:
chat_agent = ConversationBot()
chat_agent.run_text("Can you process the dataset ?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Preprocess",
    "action_input": ""
}[0m

import numpy as np
import pandas as pd
from scipy.stats import boxcox
from sklearn.preprocessing import OneHotEncoder

def preprocess(X_train, y_train, X_test, y_test):
    # Winsorization to remove outliers
    X_train = X_train.clip(lower=X_train.quantile(0.01), upper=X_train.quantile(0.99), axis=1)
    X_test = X_test.clip(lower=X_test.quantile(0.01), upper=X_test.quantile(0.99), axis=1)
    
    # Box-Cox transformation to reduce skewness
    numerical_cols = X_train.select_dtypes(include=np.number).columns
    for col in numerical_cols:
        X_train[col], _ = boxcox(X_train[col] + 1)
        X_test[col], _ = boxcox(X_test[col] + 1)
    
    # One-hot encoding for categorical features
    categorical_cols = X_train.select_dtypes(include='object').columns
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    encoder.fit(X_train[categorical_cols



Traceback (most recent call last):
  File "/tmp/ipykernel_2935595/1357834626.py", line 250, in evaluate
    X_train, y_train, X_test, y_test = prep_func(X_train, y_train, X_test, y_test)
  File "<string>", line 15, in preprocess
NameError: name 'boxcox' is not defined

Traceback (most recent call last):
  File "/tmp/ipykernel_2935595/1357834626.py", line 250, in evaluate
    X_train, y_train, X_test, y_test = prep_func(X_train, y_train, X_test, y_test)
  File "<string>", line 15, in preprocess
NameError: name 'boxcox' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipykernel_2935595/1357834626.py", line 189, in inference
    result = self.evaluate()
  File "/tmp/ipykernel_2935595/1357834626.py", line 280, in evaluate
    mse_llm, processed_data = evaluate(X_train, y_train, X_test, y_test, self.locals['preprocess'])
  File "/tmp/ipykernel_2935595/1357834626.py", line 255, in evaluate
    raise Exception



Traceback (most recent call last):
  File "/tmp/ipykernel_2935595/1357834626.py", line 250, in evaluate
    X_train, y_train, X_test, y_test = prep_func(X_train, y_train, X_test, y_test)
TypeError: cannot unpack non-iterable NoneType object

Traceback (most recent call last):
  File "/tmp/ipykernel_2935595/1357834626.py", line 250, in evaluate
    X_train, y_train, X_test, y_test = prep_func(X_train, y_train, X_test, y_test)
TypeError: cannot unpack non-iterable NoneType object

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipykernel_2935595/1357834626.py", line 189, in inference
    result = self.evaluate()
  File "/tmp/ipykernel_2935595/1357834626.py", line 280, in evaluate
    mse_llm, processed_data = evaluate(X_train, y_train, X_test, y_test, self.locals['preprocess'])
  File "/tmp/ipykernel_2935595/1357834626.py", line 255, in evaluate
    raise Exception(trace)
Exception: Traceback (most recent call last):
  

'The dataset contains no missing values. There are outliers in many columns, which may need to be removed or transformed. Some features are highly skewed, which may require a transformation or normalization. There are 8 categorical features that may need to be encoded using one-hot or label encoding. There are many highly correlated features that may need to be removed. The recommended preprocessing techniques are Winsorization to remove outliers, Box-Cox transformation to reduce skewness, one-hot encoding for categorical features, and removing one of the highly correlated features to avoid multicollinearity. A Python function has been provided for you to use.'

# Demo 3: Can you select the model ?

# Demo 4: Can you run hyper parameter tuning ?

In [35]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV
clf = LGBMRegressor()
param_grid = {'learning_rate': [0.01, 0.05, 0.1, 0.5], 'n_estimators': [50, 100, 200, 500], 'max_depth': [3, 5, 7, 9], 'num_leaves': [15, 31, 63, 127], 'min_child_samples': [10, 20, 30, 50], 'reg_alpha': [0.0, 0.1, 0.5, 1.0], 'reg_lambda': [0.0, 0.1, 0.5, 1.0], 'colsample_bytree': [0.5, 0.7, 1.0], 'subsample': [0.5, 0.7, 1.0]}
hyper_algo = RandomizedSearchCV(clf, param_grid, random_state=0, n_iter=50)
search = hyper_algo.fit(status.X_train, status.y_train)


ValueError: 
All the 250 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
250 fits failed with the following error:
Traceback (most recent call last):
  File "/home/jason-tsai/.cache/pypoetry/virtualenvs/insight-explorer-JqXJ1KnR-py3.8/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/jason-tsai/.cache/pypoetry/virtualenvs/insight-explorer-JqXJ1KnR-py3.8/lib/python3.8/site-packages/lightgbm/sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "/home/jason-tsai/.cache/pypoetry/virtualenvs/insight-explorer-JqXJ1KnR-py3.8/lib/python3.8/site-packages/lightgbm/sklearn.py", line 748, in fit
    self._Booster = train(
  File "/home/jason-tsai/.cache/pypoetry/virtualenvs/insight-explorer-JqXJ1KnR-py3.8/lib/python3.8/site-packages/lightgbm/engine.py", line 271, in train
    booster = Booster(params=params, train_set=train_set)
  File "/home/jason-tsai/.cache/pypoetry/virtualenvs/insight-explorer-JqXJ1KnR-py3.8/lib/python3.8/site-packages/lightgbm/basic.py", line 2605, in __init__
    train_set.construct()
  File "/home/jason-tsai/.cache/pypoetry/virtualenvs/insight-explorer-JqXJ1KnR-py3.8/lib/python3.8/site-packages/lightgbm/basic.py", line 1815, in construct
    self._lazy_init(self.data, label=self.label,
  File "/home/jason-tsai/.cache/pypoetry/virtualenvs/insight-explorer-JqXJ1KnR-py3.8/lib/python3.8/site-packages/lightgbm/basic.py", line 1474, in _lazy_init
    data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data,
  File "/home/jason-tsai/.cache/pypoetry/virtualenvs/insight-explorer-JqXJ1KnR-py3.8/lib/python3.8/site-packages/lightgbm/basic.py", line 594, in _data_from_pandas
    raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n"
ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in the following fields: X0, X1, X2, X3, X4, X5, X6, X8


In [54]:
chat_agent = ConversationBot()
chat_agent.run_text("Can you run hyper parameter tuning ?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "hyperopt",
    "action_input": ""
}[0m
Construct estimator with following:
```
import lightgbm as lgb

clf = lgb.LGBMRegressor()
```


Construct params grid:
{'learning_rate': [0.01, 0.05, 0.1, 0.5], 'n_estimators': [50, 100, 200, 500], 'max_depth': [3, 5, 7, 9], 'num_leaves': [15, 31, 63, 127], 'min_child_samples': [10, 20, 30, 50], 'reg_alpha': [0.0, 0.1, 0.5, 1.0], 'reg_lambda': [0.0, 0.1, 0.5, 1.0], 'colsample_bytree': [0.5, 0.7, 1.0], 'subsample': [0.5, 0.7, 1.0]}

I have chosen to use RandomizedSearchCV for the hyper parameter search on the LGBMRegressor model. This is because RandomizedSearchCV is a good choice when the parameter space is large and it is not feasible to exhaustively search all possible combinations. Additionally, RandomizedSearchCV can be more efficient than GridSearchCV as it randomly samples a subset of the parameter space, which can lead to finding better hyperparameters in less ti

ipdb>   score


10289.612378331778


ipdb>  l


[1;32m     60 [0m        [0my_pred[0m [0;34m=[0m [0mclff[0m[0;34m.[0m[0mpredict[0m[0;34m([0m[0mX_test[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m     61 [0m        [0mself[0m[0;34m.[0m[0my_pred[0m [0;34m=[0m [0my_pred[0m[0;34m[0m[0;34m[0m[0m
[1;32m     62 [0m        [0mscore[0m [0;34m=[0m [0mmean_squared_error[0m[0;34m([0m[0mself[0m[0;34m.[0m[0my_test[0m[0;34m,[0m [0my_pred[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m     63 [0m        [0;32mimport[0m [0mpdb[0m[0;34m[0m[0;34m[0m[0m
[1;32m     64 [0m        [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m---> 65 [0;31m        [0mprint[0m[0;34m([0m[0;34mf"Testing score: {score}"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m     66 [0m        [0;32mreturn[0m [0mscore[0m[0;34m[0m[0;34m[0m[0m
[1;32m     67 [0m[0;34m[0m[0m
[1;32m     68 [0m    [0;32mdef[0m [0mconstruct_search_algo[0m[0

ipdb>  y_pred


array([ 79.01196233,  94.57370525,  79.2718237 , ...,  91.96851864,
       110.22102444,  91.88256186])


ipdb>  y_test


*** NameError: name 'y_test' is not defined


ipdb>  self.y_test


0       0
1       0
2       0
3       0
4       0
       ..
4204    0
4205    0
4206    0
4207    0
4208    0
Name: y, Length: 4209, dtype: int64


ipdb>  q


# Demo 5: Can you run the ML process end to end ?