In [31]:
#general imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.model_selection import train_test_split


import lightgbm as lgb
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR

In [32]:
#Read dataset and make ydata-profiling dashboard

#from ydata_profiling import ProfileReport

df = pd.read_csv("../data/datasetRuntimeData.csv")
#Drop index col
df = df.drop(["Unnamed: 0"], axis=1)
#Remove special symbole
df['AutoML_adapter'] = df['AutoML_adapter'].str.replace(':', '')
#Apply one hot encoding
df = pd.get_dummies(df, columns=['AutoML_adapter'], prefix='', prefix_sep='')

#profile = ProfileReport(df, title="Profiling Report")
#profile.to_notebook_iframe()

In [34]:
#Split data all AutoML together

X = df.drop(["runtime_limit"], axis=1)
y = df["runtime_limit"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [35]:
# #Create prediction plot
# def create_prediction_plot(y_test, predictions):
#     prediction_results = pd.DataFrame({
#         'runtime_limit_is': y_test,
#         'runtime_limit_predicted': predictions
#     })

#     best_case_x = [0, 5, 10, 20, 40, 80, 160, 320, 640]
#     best_case_y = [0, 5, 10, 20, 40, 80, 160, 320, 640]
#     plt.figure(figsize=(10, 6))
#     sns.scatterplot(
#         x='runtime_limit_predicted', 
#         y='runtime_limit_is', 
#         data=prediction_results, 
#         color='gray', marker='o'  # Using a distinct color palette
#     )

#     plt.plot(best_case_x, best_case_y)
#     plt.xscale('log', base=10)  # Logarithmic scale for x-axis
#     plt.yscale('log', base=10)  # Logarithmic scale for y-axis


#     # Find the limits in log space
#     x_min, x_max = 1, 100
#     y_min, y_max = 1, np.exp(6.6)

#     # Determine the limits to make them symmetrical in log space
#     log_min = min(np.log10(x_min), np.log10(y_min))
#     log_max = max(np.log10(x_max), np.log10(y_max))

#     # Apply the symmetrical limits
#     plt.xlim([10**log_min, 10**log_max])
#     plt.ylim([10**log_min, 10**log_max])




#     plt.xlabel('Optimal runtime predicted')
#     plt.ylabel('Optimal runtime measured')
#     #plt.legend(title='AutoML Solution', bbox_to_anchor=(1.05, 0.5), loc='center left')
#     #plt.title('Actual vs Predicted Runtime Limits')
#     plt.grid(True)
#     #plt.legend(title='Series')
#     plt.show()
    

In [36]:
def train_model(model,):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    
    # Compute MAE
    mae = MAE(predictions, y_test)

    print(f"{type(model)} Mean Absolute Error (MAE):", round(mae))
    return model

In [37]:

def train_lgbm_model(model,):
    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)



    # Define parameters for the LightGBM model
    params = {
        'objective': 'regression',  # Set the objective as regression
        'metric': 'mae',            # Use mean absolute error as the evaluation metric
        'verbose': 1                # Disable verbose output
    }

    # Train the LightGBM model
    num_round = 100
    model = lgb.train(params, train_data, num_round, valid_sets=[test_data])

    # Make predictions on the test set
    predictions = model.predict(X_test, num_iteration=model.best_iteration)
    
    # Compute MAE
    mae = MAE(predictions, y_test)

    print(f"{type(model)} Mean Absolute Error (MAE):", round(mae))
    return model

In [38]:


models = { "Baseline": DummyRegressor(strategy="median"), 
          "LightGBM": None, 
          "Linear Regression": LinearRegression(), 
          "Decision Tree": DecisionTreeRegressor(random_state=42), 
          "Sklearn Neural Network": MLPRegressor(random_state=42), 
          "Ridge": Ridge(), 
          "Lasso": Lasso(), 
          "Elastic": ElasticNet(), 
          "Random Forest": RandomForestRegressor(), 
          "Bayesian": BayesianRidge(), 
          "SVM": SVR()}


for model_name, model in models.items():
    if model_name == "LightGBM":
        models[model_name] = train_lgbm_model(model)
    else:
        models[model_name] = train_model(model)


<class 'sklearn.dummy.DummyRegressor'> Mean Absolute Error (MAE): 104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000104 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 193
[LightGBM] [Info] Number of data points in the train set: 256, number of used features: 15
[LightGBM] [Info] Start training from score 105.683594
<class 'lightgbm.basic.Booster'> Mean Absolute Error (MAE): 112
<class 'sklearn.linear_model._base.LinearRegression'> Mean Absolute Error (MAE): 105
<class 'sklearn.tree._classes.DecisionTreeRegressor'> Mean Absolute Error (MAE): 106
<class 'sklearn.neural_network._multilayer_perceptron.MLPRegressor'> Mean Absolute Error (MAE): 3087
<class 'sklearn.linear_model._ridge.Ridge'> Mean Absolute Error (MAE): 106
<class 'sklearn.linear_model._coordinate_descent.Lasso'> Mean Absolute Error (MAE): 106
<class 'sklearn.linear_mode