In [2]:
import copy
import numpy as np
import pandas as pd
from openai import OpenAI
from load_api_key import load_openai_api_key
import ast
from caafe.run_llm_code import check_ast
from typing import Any, Dict, Optional

# Obtain the OpenAI key through the project's loader helper (nothing is hardcoded here)
# and build the API client that later cells pass to generate_code.
key = load_openai_api_key()
client = OpenAI(api_key=key)

# Load the synthetic life-expectancy table; the path is relative to the notebook's
# working directory, so the notebook must be run from its own folder.
df = pd.read_csv("../../SyntheticData/synthetic_life_expectancy.csv")
# Free-text dataset description wrapped in a list; build_prompt_from_df reads ds[-1].
ds = ["A medical dataset containing patient information on life expectancy."]

API Key loaded successfully.


In [4]:
# Changed ds[4][-1] to df.columns[-1], both being the target column name

def get_prompt(
    df, iterative=1, data_description_unparsed=None, samples=None, **kwargs
):
    """
    Builds the feature-engineering prompt that is sent to the LLM.

    Parameters:
    df (pandas.DataFrame): The training dataframe. Its last column is used as the
        prediction target, and its first two columns are used for the inline
        "Input samples" example, so it needs at least two columns.
    iterative (int, optional): 1 asks for up to 10 new columns per codeblock; any
        other value asks for exactly one. Defaults to 1.
    data_description_unparsed (str, optional): Free-text description of the dataset,
        inserted verbatim into the prompt.
    samples (str, optional): Pre-formatted per-column summary (dtype, NaN frequency,
        sample values), inserted verbatim into the prompt.
    **kwargs: Accepted for call-site compatibility and ignored.

    Returns:
    str: The complete prompt text.

    NOTE(review): the wording talks about a downstream *classifier* and *accuracy*;
    confirm this is intended when the target column is continuous.
    """
    how_many = (
        "up to 10 useful columns. Generate as many features as useful for downstream classifier, but as few as necessary to reach good performance."
        if iterative == 1
        else "exactly one useful column"
    )

    target = df.columns[-1]
    first_col, second_col = df.columns[0], df.columns[1]
    # .tolist() yields plain Python scalars; list(series.values) keeps NumPy scalars,
    # which render as e.g. `np.int64(1)` on NumPy >= 2 and clutter the prompt.
    first_samples = df.iloc[:3, 0].tolist()
    second_samples = df.iloc[:3, 1].tolist()

    return f"""
    The dataframe `df` is loaded and in memory. Columns are also named attributes.
    Description of the dataset in `df` (column dtypes might be inaccurate):
    "{data_description_unparsed}"

    Columns in `df` (true feature dtypes listed here, categoricals encoded as int):
    {samples}

    This code was written by an expert datascientist working to improve predictions. It is a snippet of code that adds new columns to the dataset.
    Number of samples (rows) in training dataset: {int(len(df))}

    This code generates additional columns that are useful for a downstream classification algorithm (such as XGBoost) predicting \"{target}\".
    Additional columns add new semantic information, that is they use real world knowledge on the dataset. They can e.g. be feature combinations, transformations, aggregations where the new column is a function of the existing columns.
    The scale of columns and offset does not matter. Make sure all used columns exist. Follow the above description of columns closely and consider the datatypes and meanings of classes.
    This code also drops columns, if these may be redundant and hurt the predictive performance of the downstream classifier (Feature selection). Dropping columns may help as the chance of overfitting is lower, especially if the dataset is small.
    The classifier will be trained on the dataset with the generated columns and evaluated on a holdout set. The evaluation metric is accuracy. The best performing code will be selected.
    Added columns can be used in other codeblocks, dropped columns are not available anymore.

    Code formatting for each added column:
    ```python
    # (Feature name and description)
    # Usefulness: (Description why this adds useful real world knowledge to classify \"{target}\" according to dataset description and attributes.)
    # Input samples: (Three samples of the columns used in the following code, e.g. '{first_col}': {first_samples}, '{second_col}': {second_samples}, ...)
    (Some pandas code using '{first_col}', '{second_col}', ... to add a new column for each row in df)
    ```end

    Code formatting for dropping columns:
    ```python
    # Explanation why the column XX is dropped
    df.drop(columns=['XX'], inplace=True)
    ```end

    Each codeblock generates {how_many} and can drop unused columns (Feature selection).
    Each codeblock ends with ```end and starts with "```python"
    Codeblock:
    """

In [5]:
def build_prompt_from_df(ds, df, iterative=1):
    """
    Summarises every column of `df` and assembles the LLM prompt via `get_prompt`.

    Parameters:
    ds (sequence): Dataset metadata; only the last element is read and it is used
        as the free-text dataset description.
    df (pandas.DataFrame): The dataframe to describe. NaN frequency and dtype are
        computed on the full frame; sample values come from the first 10 rows.
    iterative (int, optional): Forwarded to `get_prompt`. Defaults to 1.

    Returns:
    str: The prompt text produced by `get_prompt`.
    """
    data_description_unparsed = ds[-1]
    preview = df.head(10)

    column_lines = []
    for col in preview.columns:
        column = df[col]
        # Percentage of missing values, reduced to two significant digits.
        nan_freq = "%s" % float("%.2g" % (column.isna().mean() * 100))
        values = preview[col].tolist()
        # Round every NumPy float column (not only float64) so that float32/float16
        # samples do not show up with long binary-representation tails.
        if isinstance(column.dtype, np.dtype) and column.dtype.kind == "f":
            values = [round(value, 2) for value in values]
        column_lines.append(
            f"{col} ({column.dtype}): NaN-freq [{nan_freq}%], Samples {values}\n"
        )
    samples = "".join(column_lines)

    return get_prompt(
        df,
        data_description_unparsed=data_description_unparsed,
        iterative=iterative,
        samples=samples,
    )

In [6]:
# Build the feature-engineering prompt with the default iterative=1 and print it
# so the exact text sent to the model can be inspected.
prompt = build_prompt_from_df(ds, df)
print(prompt)


    The dataframe `df` is loaded and in memory. Columns are also named attributes.
    Description of the dataset in `df` (column dtypes might be inaccurate):
    "A medical dataset containing patient information on life expectancy."

    Columns in `df` (true feature dtypes listed here, categoricals encoded as int):
    Cigarettes per Day (int64): NaN-freq [0.0%], Samples [1, 7, 6, 1, 2, 17, 11, 0, 11, 4]
Age (int64): NaN-freq [0.0%], Samples [58, 71, 48, 34, 62, 27, 40, 58, 77, 38]
Diet Quality (object): NaN-freq [0.0%], Samples ['Poor', 'Poor', 'Average', 'Average', 'Average', 'Average', 'Average', 'Average', 'Average', 'Average']
Life Expectancy (float64): NaN-freq [0.0%], Samples [70.6, 72.7, 81.6, 80.8, 73.4, 98.9, 89.0, 72.6, 77.9, 82.6]


    This code was written by an expert datascientist working to improve predictions. It is a snippet of code that adds new columns to the dataset.
    Number of samples (rows) in training dataset: 1000

    This code generates additional colu

In [7]:
# Chat conversation passed to generate_code: a system message that fixes the
# assistant's role and asks for code-only, concise answers, followed by the
# dataset-specific prompt as the single user turn.
messages = [
    {
        "role": "system",
        "content": "You are an expert datascientist assistant solving Kaggle problems. You answer only by generating code. Answer as concisely as possible.",
    },
    {
        "role": "user",
        "content": prompt,
    },
]

In [8]:
def generate_code(messages, client, model="gpt-4o-mini", temperature=0.5, max_tokens=500):
    """
    Requests one chat completion and returns the generated code as plain text.

    Parameters:
    messages (list): Chat messages (dicts with "role" and "content") to send.
    client: Object exposing `chat.completions.create(...)`, e.g. an `openai.OpenAI`
        client.
    model (str, optional): Model name to query. Defaults to "gpt-4o-mini".
    temperature (float, optional): Sampling temperature. Defaults to 0.5.
    max_tokens (int, optional): Upper bound on generated tokens. Defaults to 500.

    Returns:
    str: The completion text with the "```python", "```" and "<end>" markers
    removed. Empty string when the response carries no text content.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        # Generation stops at the block terminator requested in the prompt.
        stop=["```end"],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    # `content` may be None (e.g. a refusal or filtered response); treat that as
    # "no code" instead of failing with AttributeError on .replace().
    code = completion.choices[0].message.content or ""
    code = code.replace("```python", "").replace("```", "").replace("<end>", "")
    return code

In [9]:
# Ask the model for a feature-engineering snippet and print it so it can be
# reviewed before it is executed.
code = generate_code(messages, client)
print(code)


# (Cigarettes per Day to Age Ratio)
# Usefulness: This feature represents the ratio of smoking to age, which may correlate with life expectancy as smoking is a known risk factor.
# Input samples: 'Cigarettes per Day': [1, 7, 6], 'Age': [58, 71, 48]
df['Cigarettes_Age_Ratio'] = df['Cigarettes per Day'] / (df['Age'] + 1)  # Adding 1 to avoid division by zero



In [10]:
def run_llm_code(code: str, df: pd.DataFrame, convert_categorical_to_integer: Optional[bool] = True, fill_na: Optional[bool] = True) -> pd.DataFrame:
    """
    Executes the given code on a copy of the given dataframe and returns the result.

    The code runs with the names `df`, `pd` and `np` available. The caller's
    dataframe is never modified. Both in-place edits (`df['x'] = ...`,
    `df.drop(..., inplace=True)`) and rebinding (`df = df.drop(...)`) are reflected
    in the returned frame.

    Parameters:
    code (str): The code to execute.
    df (pandas.DataFrame): The dataframe to execute the code on.
    convert_categorical_to_integer (bool, optional): Currently ignored; kept for
        interface compatibility. Defaults to True.
    fill_na (bool, optional): Currently ignored; kept for interface compatibility.
        Defaults to True.

    Returns:
    pandas.DataFrame: The resulting dataframe after executing the code.

    Raises:
    Exception: Any error raised while parsing, validating or executing the code is
        printed and re-raised unchanged.
    TypeError: If the code leaves `df` bound to something that is not a DataFrame.
    """
    try:
        # A single namespace serves as both globals and locals. With two separate
        # dicts, exec behaves like a class body: `df = ...` would rebind only in the
        # locals dict (and be lost), and lambdas/functions defined by the snippet
        # could not see variables the snippet assigned.
        namespace = {"df": copy.deepcopy(df), "pd": pd, "np": np}
        parsed = ast.parse(code)
        # Validate the tree before anything is executed (presumably rejects
        # disallowed syntax -- confirm against caafe.run_llm_code.check_ast).
        check_ast(parsed)
        exec(compile(parsed, filename="<ast>", mode="exec"), namespace)

        result = namespace["df"]
        if not isinstance(result, pd.DataFrame):
            raise TypeError(
                f"Executed code left `df` bound to {type(result).__name__}, expected a DataFrame"
            )

    except Exception as e:
        print("Code could not be executed", e)
        raise

    return result

In [12]:
# Apply the generated snippet; run_llm_code works on a copy, so `df` keeps its
# original columns and the augmented frame lives in `df_new`.
df_new = run_llm_code(code, df)
# Bare last expression so the notebook renders the augmented frame.
df_new

Unnamed: 0,Cigarettes per Day,Age,Diet Quality,Life Expectancy,Cigarettes_Age_Ratio
0,1,58,Poor,70.6,0.016949
1,7,71,Poor,72.7,0.097222
2,6,48,Average,81.6,0.122449
3,1,34,Average,80.8,0.028571
4,2,62,Average,73.4,0.031746
...,...,...,...,...,...
995,19,23,Average,102.1,0.791667
996,15,20,Poor,96.0,0.714286
997,3,68,Average,72.6,0.043478
998,5,59,Poor,74.3,0.083333
