# AI Agent: Data Science Workflow Automation

<p style="font-size: 1.2em; line-height: 1.3;">
In this project, we build an AI Agent to handle a data science task: creating a working machine learning model from the given data.<br>
The agent is asked to: load the dataset, preprocess it, choose a target variable, create and tune a model using three hyperparameter sets, evaluate its performance, and return the best model with its metrics and optimal hyperparameters.
</p>

Links to the AI Agent library and dataset:
- smolagents: https://huggingface.co/docs/smolagents/en/index 
- US Health Insurance Dataset: https://www.kaggle.com/datasets/teertha/ushealthinsurancedataset  or  https://www.kaggle.com/datasets/mirichoi0218/insurance

### Connecting to Hugging Face

In [1]:
from huggingface_hub import notebook_login

# Log this notebook session in to the Hugging Face Hub; in a notebook this
# renders an interactive login widget as the cell output.
# NOTE(review): presumably needed so the hosted model used by the agent
# further down can be called with the user's credentials — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Importing Libraries

In [2]:
from smolagents import CodeAgent, tool, HfApiModel
import xgboost as xgb
import pandas as pd

### Creating the AI's Toolset

In [3]:
@tool
def read_dataset(file_name: str) -> pd.DataFrame:
    """A tool to read a dataset file in either CSV or Excel format.
    Args:
        file_name: The name of the file to be read. The format is chosen from its extension (.csv, .xls or .xlsx), ignoring case.
    Returns:
        df: A pandas.DataFrame.
    Raises:
        ValueError: If the file format is unsupported or the file cannot be read.
    """
    lowered = file_name.lower()
    if lowered.endswith('.csv'):
        reader = pd.read_csv
    elif lowered.endswith(('.xls', '.xlsx')):
        reader = pd.read_excel
    else:
        raise ValueError("Unsupported file format. Please provide a CSV or Excel file.")

    try:
        return reader(file_name)
    except Exception as e:
        # Raise rather than return the message: the declared return type is a
        # DataFrame, and handing back a str would only fail later, far from
        # the real cause, when the next tool tries to use it as a frame.
        raise ValueError(f"Failed to read the file: {e}") from e
    
@tool
def preview_dataset(df: pd.DataFrame) -> str:
    """A tool to preview the first rows of a DataFrame.
    Args:
        df: The DataFrame to preview.
    Returns:
        str: A header line stating how many rows are shown, followed by those first rows as text.
    """
    head_rows = df.head()
    row_count = len(head_rows)
    return f"Previewing the dataset with {row_count} rows:\n{head_rows}"

@tool
def dataset_info(df: pd.DataFrame) -> str:
    """A tool to display the DataFrame's information.
    Args:
        df: The DataFrame to describe.
    Returns:
        str: The text produced by DataFrame.info(), preceded by a header line.
    """
    import io

    # DataFrame.info() writes to a stream instead of returning text, so the
    # report is captured in an in-memory buffer.
    with io.StringIO() as sink:
        df.info(buf=sink)
        info_text = sink.getvalue()
    return f"Dataset information:\n{info_text}"

@tool
def describe_numerical(df: pd.DataFrame) -> str:
    """A tool to display summary statistics for numerical columns in the DataFrame.
    Args:
        df: The DataFrame to summarize.
    Returns:
        str: The output of DataFrame.describe() as text, preceded by a header line.
    """
    return f"Summary statistics for numerical columns:\n{df.describe()}"

@tool
def describe_categorical(df: pd.DataFrame) -> str:
    """A tool to display summary statistics for categorical columns in the DataFrame.
    Args:
        df: The DataFrame to summarize.
    Returns:
        str: The output of DataFrame.describe(include='object') as text, preceded by a header line.
    """
    object_stats = df.describe(include='object')
    return f"Summary statistics for categorical columns:\n{object_stats}"

@tool
def clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """A tool to clean the DataFrame by removing rows with null values and duplicates.
    Args:
        df: The DataFrame to clean.
    Returns:
        df: The cleaned DataFrame, with rows containing nulls dropped first and duplicate rows dropped second.
    """
    without_nulls = df.dropna()
    deduplicated = without_nulls.drop_duplicates()
    return deduplicated

@tool
def delete_columns(df: pd.DataFrame, columns_to_delete: list[str]) -> pd.DataFrame:
    """A tool to delete specified columns from a DataFrame.
    Args:
        df: The DataFrame from which columns will be deleted.
        columns_to_delete: A list of column names to delete.
    Returns:
        df: A DataFrame without the listed columns.
    Raises:
        ValueError: If any of the columns to delete are not found in the DataFrame, or if the deletion fails.
    """
    try:
        absent = [name for name in columns_to_delete if name not in df.columns]
        if absent:
            listing = ', '.join(absent)
            raise ValueError(f"The following columns are not found in the DataFrame: {listing}")
        return df.drop(columns=columns_to_delete)

    except Exception as e:
        # Every failure, including the check above, is reported through this
        # one prefixed message.
        raise ValueError(f"Failed to delete columns: {e}")

@tool
def convert_to_float(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """A tool to convert specified columns to float type.
    Args:
        df: The DataFrame containing the columns. It is left unmodified.
        columns: A list of column names to convert.
    Returns:
        df: A new DataFrame in which the listed columns have float type.
    Raises:
        KeyError: If a listed column is not in the DataFrame.
        ValueError: If a value in a listed column cannot be converted to float.
    """
    # astype with a column-to-dtype mapping returns a converted copy, so the
    # caller's DataFrame is not changed behind its back.
    return df.astype(dict.fromkeys(columns, float))

@tool
def convert_to_int(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """A tool to convert specified columns to integer type.
    Args:
        df: The DataFrame containing the columns. It is left unmodified.
        columns: A list of column names to convert.
    Returns:
        df: A new DataFrame in which the listed columns have integer type.
    Raises:
        KeyError: If a listed column is not in the DataFrame.
        ValueError: If a value in a listed column cannot be converted to an integer.
    """
    # astype with a column-to-dtype mapping returns a converted copy, so the
    # caller's DataFrame is not changed behind its back.
    return df.astype(dict.fromkeys(columns, int))

@tool
def one_hot_encode(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """A tool to apply one-hot encoding to specified categorical columns using pandas.get_dummies.
    Args:
        df: The DataFrame containing the columns.
        columns: A list of categorical column names to encode. Names that are missing from the DataFrame or are not text columns are skipped.
    Returns:
        df: The updated DataFrame with one-hot encoded columns (the first level of each encoded column is dropped).
    Raises:
        ValueError: If none of the specified columns are found or if they are not categorical.
    """
    valid_columns = [
        col for col in columns
        if col in df.columns
        and (df[col].dtype == 'object' or isinstance(df[col].dtype, pd.StringDtype))
    ]
    if not valid_columns:
        raise ValueError("No valid categorical columns provided for encoding.")
    # Encode only the validated columns: passing the raw list would raise
    # KeyError for a missing name and would also encode numeric columns.
    return pd.get_dummies(df, columns=valid_columns, drop_first=True)

@tool
def split_features_target(df: pd.DataFrame, target_column: str) -> tuple:
    """A tool to split the DataFrame into features (X) and target (y).
    Args:
        df: The DataFrame to split.
        target_column: The name of the column to be used as the target.
    Returns:
        tuple: The features DataFrame (X), holding every column except the target, and the target Series (y).
    Raises:
        ValueError: If the target column is not found in the DataFrame.
    """
    try:
        if target_column in df.columns:
            features = df.drop(columns=[target_column])
            target = df[target_column]
            return features, target
        raise ValueError(f"Target column '{target_column}' not found in DataFrame")

    except Exception as e:
        # Every failure, including the missing-column case above, is reported
        # through this one prefixed message.
        raise ValueError(f"Failed to split features and target: {e}")

@tool
def train_test_split_data(X: pd.DataFrame, y: pd.Series, test_size: float = 0.3, random_state: int = 42) -> tuple:
    """A tool to split the features (X) and target (y) into training and testing sets.
    Args:
        X: The feature set to split.
        y: The target set to split.
        test_size: The proportion of the data to include in the test split (default is 0.3).
        random_state: Controls the randomness of the split (default is 42).
    Returns:
        tuple: The four parts in the order X_train, X_test, y_train, y_test.
    Raises:
        ValueError: If the DataFrame is empty or the split fails.
    """
    from sklearn.model_selection import train_test_split

    try:
        # With two input arrays, train_test_split yields exactly the four
        # pieces listed above, in that order.
        parts = train_test_split(X, y, test_size=test_size, random_state=random_state)
        return tuple(parts)

    except Exception as e:
        raise ValueError(f"Failed to split the dataset: {e}")

@tool
def apply_xgboost_classifier(X_train: pd.DataFrame, y_train: pd.Series, n_estimators: int = 100, max_depth: int = 6, subsample: float = 0.8, colsample_bytree: float = 0.8) -> xgb.XGBClassifier:
    """A tool that applies XGBoostClassifier to the training data if the target is categorical.
    Args:
        X_train: Features for training.
        y_train: Target labels for training (categorical).
        n_estimators: Number of trees in the model (default is 100).
        max_depth: Maximum depth of trees (default is 6).
        subsample: Fraction of samples to be used for training (default is 0.8).
        colsample_bytree: Fraction of features to be used for each tree (default is 0.8).
    Returns:
        xgb.XGBClassifier: The trained XGBClassifier model.
    """
    from xgboost import XGBClassifier

    # Only these four hyperparameters are exposed; everything else keeps the
    # library's defaults.
    hyperparameters = {
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "n_estimators": n_estimators,
        "max_depth": max_depth,
    }
    classifier = XGBClassifier(**hyperparameters)
    classifier.fit(X_train, y_train)
    return classifier

@tool
def apply_xgboost_regressor(X_train: pd.DataFrame, y_train: pd.Series, n_estimators: int = 100, max_depth: int = 6, subsample: float = 0.8, colsample_bytree: float = 0.8) -> xgb.XGBRegressor:
    """A tool that applies XGBoostRegressor to the training data if the target is numerical.
    Args:
        X_train: Features for training.
        y_train: Target values for training (numerical).
        n_estimators: Number of trees in the model (default is 100).
        max_depth: Maximum depth of trees (default is 6).
        subsample: Fraction of samples to be used for training (default is 0.8).
        colsample_bytree: Fraction of features to be used for each tree (default is 0.8).
    Returns:
        xgb.XGBRegressor: The trained XGBoostRegressor model.
    """
    from xgboost import XGBRegressor

    # Only these four hyperparameters are exposed; everything else keeps the
    # library's defaults.
    hyperparameters = {
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "n_estimators": n_estimators,
        "max_depth": max_depth,
    }
    regressor = XGBRegressor(**hyperparameters)
    regressor.fit(X_train, y_train)
    return regressor

@tool
def evaluate_classifier_performance(model:xgb.XGBClassifier, X_test: pd.DataFrame, y_test: pd.Series) -> dict:
    """A tool to evaluate the performance of a classifier model using precision, recall, and F1-score.
    Args:
        model: The trained classifier model used for predictions.
        X_test: The test features.
        y_test: The true target values for the test set.
    Returns:
        dict: A dictionary containing precision, recall, and F1-score. Targets with at most two classes use binary averaging; targets with more classes use support-weighted averaging.
    Raises:
        ValueError: If prediction or metric computation fails.
    """
    from sklearn.metrics import precision_score, recall_score, f1_score

    try:
        y_pred = model.predict(X_test)

        # average='binary' is rejected by scikit-learn for multiclass targets,
        # so switch to weighted averaging once more than two distinct labels
        # appear. Binary targets are scored exactly as before.
        distinct_labels = set(y_test) | set(y_pred)
        average = 'binary' if len(distinct_labels) <= 2 else 'weighted'

        precision = precision_score(y_test, y_pred, average=average)
        recall = recall_score(y_test, y_pred, average=average)
        f1 = f1_score(y_test, y_pred, average=average)
        return {"Precision": precision, "Recall": recall, "F1-Score": f1}

    except Exception as e:
        raise ValueError(f"Failed to evaluate classifier model performance: {e}")

@tool
def evaluate_regressor_performance(model:xgb.XGBRegressor, X_test: pd.DataFrame, y_test: pd.Series) -> dict:
    """A tool to evaluate the performance of a regressor model using MAE, RMSE, and R² score.
    Args:
        model: The trained regressor model used for predictions.
        X_test: The test features.
        y_test: The true target values for the test set.
    Returns:
        dict: A dictionary containing the MAE, RMSE, and R² score under the keys "MAE", "RMSE" and "R2".
    Raises:
        ValueError: If prediction or metric computation fails.
    """
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    import numpy as np

    try:
        predictions = model.predict(X_test)
        return {
            "MAE": mean_absolute_error(y_test, predictions),
            # RMSE is the square root of the mean squared error.
            "RMSE": np.sqrt(mean_squared_error(y_test, predictions)),
            "R2": r2_score(y_test, predictions),
        }

    except Exception as e:
        raise ValueError(f"Failed to evaluate regressor model performance: {e}")

### Running the Agent

In [4]:
# Creating the agent and giving it its tools
# Every @tool function defined above is registered, so the agent can cover
# loading, inspection, cleaning, encoding, splitting, training and evaluation.
# HfApiModel() is built with no arguments, so the model is whatever the
# installed smolagents version uses by default.
# NOTE(review): pass an explicit model id if reproducible runs matter.
agent = CodeAgent(tools=[read_dataset, preview_dataset, dataset_info, describe_numerical,
                         describe_categorical, clean_dataset, delete_columns, convert_to_float,
                         convert_to_int, one_hot_encode, split_features_target, train_test_split_data, 
                         apply_xgboost_classifier, apply_xgboost_regressor, evaluate_regressor_performance,
                         evaluate_classifier_performance],
                         model=HfApiModel(),
                         additional_authorized_imports=['pandas', 'numpy']) # authorizing additional libraries for the agent to be able to import

# Running the agent and giving it its task
# The task is one prompt string assembled from adjacent string literals.
# The call is the last expression of the cell, so the value it returns is
# what the notebook shows as the cell output.
agent.run('You are a data science intern working with a dataset stored in the "insurance.csv" file. '
'Using the given tools, create a DataFrame, preprocess the data, choose a target variable, create a model and evaluate its results. '
'Use 3 different sets for the hyperparameters — starting with the default values, step by step —, always aiming to increase the performance. '
'Return the best model, its metrics and optimal hyperparameters.')

{'Best Model': 'XGBoostRegressor',
 'Metrics': {'MAE': 3728.648516516468,
  'RMSE': 5878.842935972424,
  'R2': 0.7980291189990276},
 'Optimal Hyperparameters': {'n_estimators': 200,
  'subsample': 0.9,
  'colsample_bytree': 0.9}}