In [8]:
#general imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.model_selection import train_test_split

In [9]:
#Read dataset and make ydata-profiling dashboard

#from ydata_profiling import ProfileReport

df = pd.read_csv("./data/datasetData.csv")
#Drop index col
df = df.drop(["Unnamed: 0"], axis=1)
#Remove special symbole
df['AutoML_solution'] = df['AutoML_solution'].str.replace(':', '')
#Apply one hot encoding
df = pd.get_dummies(df, columns=['AutoML_solution'], prefix='', prefix_sep='')

#profile = ProfileReport(df, title="Profiling Report")
#profile.to_notebook_iframe()

In [10]:
#AutoFeat for all automls

from autofeat import AutoFeatRegressor

X = df.drop(["runtime_limit"], axis=1)
y = df["runtime_limit"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### get list of selected features ###
afreg = AutoFeatRegressor(verbose=1)

X_train = afreg.fit_transform(X_train, y_train)
X_test = afreg.transform(X_test)

2024-06-20 08:26:45,971 INFO: [AutoFeat] The 2 step feature engineering process could generate up to 5565 features.
2024-06-20 08:26:45,972 INFO: [AutoFeat] With 256 data points this new feature matrix would use about 0.01 gb of space.
2024-06-20 08:26:45,973 INFO: [feateng] Step 1: transformation of original features


[feateng]               0/             15 features transformed

2024-06-20 08:26:50,312 INFO: [feateng] Generated 21 transformed features from 15 original features - done.
2024-06-20 08:26:50,315 INFO: [feateng] Step 2: first combination of features


[feateng]             400/            630 feature tuples combined

2024-06-20 08:26:51,071 INFO: [feateng] Generated 592 feature combinations from 630 original feature tuples - done.
2024-06-20 08:26:51,073 INFO: [feateng] Generated altogether 646 new features in 2 steps
2024-06-20 08:26:51,074 INFO: [feateng] Removing correlated features, as well as additions at the highest level
2024-06-20 08:26:51,088 INFO: [feateng] Generated a total of 364 additional features


[featsel] Scaling data.../            630 feature tuples combined

2024-06-20 08:26:57,402 INFO: [featsel] Feature selection run 1/5


done.


2024-06-20 08:27:02,097 INFO: [featsel] Feature selection run 2/5
2024-06-20 08:27:03,903 INFO: [featsel] Feature selection run 3/5
2024-06-20 08:27:05,985 INFO: [featsel] Feature selection run 4/5
2024-06-20 08:27:08,174 INFO: [featsel] Feature selection run 5/5
2024-06-20 08:27:10,445 INFO: [featsel] 15 features after 5 feature selection runs
  if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:
2024-06-20 08:27:10,449 INFO: [featsel] 11 features after correlation filtering
2024-06-20 08:27:10,478 INFO: [featsel] 7 features after noise filtering
2024-06-20 08:27:10,479 INFO: [AutoFeat] Computing 6 new features.


[AutoFeat]     5/    6 new features

2024-06-20 08:27:12,631 INFO: [AutoFeat]     6/    6 new features ...done.
2024-06-20 08:27:12,632 INFO: [AutoFeat] Final dataframe with 21 feature columns (6 new).
2024-06-20 08:27:12,633 INFO: [AutoFeat] Training final regression model.
2024-06-20 08:27:12,646 INFO: [AutoFeat] Trained model: largest coefficients:
2024-06-20 08:27:12,646 INFO: 96.524318759094
2024-06-20 08:27:12,646 INFO: 2.805375 * autogluon*sqrt(duplicated_cols)
2024-06-20 08:27:12,647 INFO: 1.260376 * gama*log(dataset_cols)
2024-06-20 08:27:12,647 INFO: 0.000552 * dataset_cols**2*h2o_automl
2024-06-20 08:27:12,649 INFO: [AutoFeat] Final score: 0.1233
2024-06-20 08:27:12,652 INFO: [AutoFeat] Computing 6 new features.
2024-06-20 08:27:12,662 INFO: [AutoFeat]     6/    6 new features ...done.


[AutoFeat]     5/    6 new features

In [11]:
#Create prediction plot
def create_prediction_plot(y_test, predictions):
    prediction_results = pd.DataFrame({
        'runtime_limit_is': y_test,
        'runtime_limit_predicted': predictions
    })

    best_case_x = [0, 5, 10, 20, 40, 80, 160, 320, 640]
    best_case_y = [0, 5, 10, 20, 40, 80, 160, 320, 640]
    plt.figure(figsize=(10, 6))
    sns.scatterplot(
        x='runtime_limit_predicted', 
        y='runtime_limit_is', 
        data=prediction_results, 
        color='gray', marker='o'  # Using a distinct color palette
    )

    plt.plot(best_case_x, best_case_y)
    plt.xscale('log', base=10)  # Logarithmic scale for x-axis
    plt.yscale('log', base=10)  # Logarithmic scale for y-axis


    # Find the limits in log space
    x_min, x_max = 1, 100
    y_min, y_max = 1, np.exp(6.6)

    # Determine the limits to make them symmetrical in log space
    log_min = min(np.log10(x_min), np.log10(y_min))
    log_max = max(np.log10(x_max), np.log10(y_max))

    # Apply the symmetrical limits
    plt.xlim([10**log_min, 10**log_max])
    plt.ylim([10**log_min, 10**log_max])




    plt.xlabel('Optimal runtime predicted')
    plt.ylabel('Optimal runtime measured')
    #plt.legend(title='AutoML Solution', bbox_to_anchor=(1.05, 0.5), loc='center left')
    #plt.title('Actual vs Predicted Runtime Limits')
    plt.grid(True)
    #plt.legend(title='Series')
    plt.show()
    

In [12]:
def train_model(model,):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    
    # Compute MAE
    mae = MAE(predictions, y_test)

    print(f"{type(model)} Mean Absolute Error (MAE):", round(mae))
    return model

In [13]:

import lightgbm as lgb
def train_lgbm_model(model,):
    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)



    # Define parameters for the LightGBM model
    params = {
        'objective': 'regression',  # Set the objective as regression
        'metric': 'mae',            # Use mean absolute error as the evaluation metric
        'verbose': 1                # Disable verbose output
    }

    # Train the LightGBM model
    num_round = 100
    model = lgb.train(params, train_data, num_round, valid_sets=[test_data])

    # Make predictions on the test set
    predictions = model.predict(X_test, num_iteration=model.best_iteration)
    
    # Compute MAE
    mae = MAE(predictions, y_test)

    print(f"{type(model)} Mean Absolute Error (MAE):", round(mae))
    return model

In [14]:

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR

models = { "LightGBM": None, "Baseline": DummyRegressor(strategy="median"), "Linear Regression": LinearRegression(), 
          "Decision Tree": DecisionTreeRegressor(random_state=42), 
          "Sklearn Neural Network": MLPRegressor(random_state=42), "Ridge": Ridge(), 
          "Lasso": Lasso(), "Elastic": ElasticNet(), "Random Forest": RandomForestRegressor(), 
          "Bayesian": BayesianRidge(), "SVM": SVR()}


for model_name, model in models.items():
    if model_name == "LightGBM":
        models[model_name] = train_lgbm_model(model)
    else:
        models[model_name] = train_model(model)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000219 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 228
[LightGBM] [Info] Number of data points in the train set: 256, number of used features: 18
[LightGBM] [Info] Start training from score 105.742188
<class 'lightgbm.basic.Booster'> Mean Absolute Error (MAE): 112
<class 'sklearn.dummy.DummyRegressor'> Mean Absolute Error (MAE): 104
<class 'sklearn.linear_model._base.LinearRegression'> Mean Absolute Error (MAE): 337
<class 'sklearn.tree._classes.DecisionTreeRegressor'> Mean Absolute Error (MAE): 104
<class 'sklearn.neural_network._multilayer_perceptron.MLPRegressor'> Mean Absolute Error (MAE): 2964


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


<class 'sklearn.linear_model._ridge.Ridge'> Mean Absolute Error (MAE): 328
<class 'sklearn.linear_model._coordinate_descent.Lasso'> Mean Absolute Error (MAE): 325
<class 'sklearn.linear_model._coordinate_descent.ElasticNet'> Mean Absolute Error (MAE): 309
<class 'sklearn.ensemble._forest.RandomForestRegressor'> Mean Absolute Error (MAE): 104
<class 'sklearn.linear_model._bayes.BayesianRidge'> Mean Absolute Error (MAE): 306
<class 'sklearn.svm._classes.SVR'> Mean Absolute Error (MAE): 104
