In [4]:
import pandas as pd
from bson.objectid import ObjectId
import os
import collection
import pymongo

In [5]:
metric_name = ":balanced_accuracy"

def insert_automl_rows(df: pd.DataFrame, auto_ml_solution: str, task: str, training_id: str, dataset_name: str, dataset_size_mb: float, dataset_rows: int, dataset_cols: int, dataset_missing_values:float, 
                       dataset_duplicated_row_values:float, dataset_duplicated_col_values:float, dataset_outlier_row_values:float, runtime_limit : int, metric: float, relative_metric: float):
    new_row = {"AutoML_adapter":auto_ml_solution, "task": task, "trainings_id": training_id, "dataset_name": dataset_name, "dataset_size_in_mb" : dataset_size_mb, "dataset_rows": dataset_rows, "dataset_cols": dataset_cols, "missing_values": dataset_missing_values, "duplicated_rows": dataset_duplicated_row_values, "duplicated_cols": dataset_duplicated_col_values, "outliers": dataset_outlier_row_values,
               "runtime_limit": runtime_limit, metric_name: metric, "relative_"+ metric_name: relative_metric}
    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
    return df


def get_dataset_meta_informations(data):
        dataset_size_byte = data["analysis"]["size_bytes"]
        dataset_size_mb = dataset_size_byte / 1000  / 1000
        dataset_rows = data["analysis"]["number_of_rows"]
        dataset_cols = data["analysis"]["number_of_columns"]
        #compute ration of missing values
        dataset_missing_values_total = 0
        for k, v in data["analysis"]["missings_per_column"].items():
            if isinstance(v, dict):
                for k1, v1 in v.items():
                    if isinstance(v1, dict):
                        for k2, v2 in v1.items():
                            dataset_missing_values_total = dataset_missing_values_total + v2
                    else:
                        dataset_missing_values_total = dataset_missing_values_total + v1
            else:
                dataset_missing_values_total = dataset_missing_values_total + v
        dataset_missing_values = dataset_missing_values_total
        dataset_duplicated_row_values = len(data["analysis"]["duplicate_rows"])
        dataset_duplicated_col_values = len(data["analysis"]["duplicate_columns"])
        dataset_outlier_row_values_total = 0
        for k, v in data["analysis"]["outlier"].items():
            dataset_outlier_row_values_total += len(v)
        dataset_outlier_row_values = dataset_outlier_row_values_total
        
        return dataset_size_mb, dataset_missing_values, dataset_duplicated_row_values, dataset_duplicated_col_values, dataset_outlier_row_values, dataset_rows, dataset_cols

def generate_best_automl_dataset(trainings: collection ,datasets: collection, models: collection,file_path: str):
    
    header_row = ["AutoML_adapter", "task", "trainings_id", "dataset_name", "dataset_size_in_mb", "dataset_rows", "dataset_cols", "missing_values", "duplicated_rows", "duplicated_cols", "outliers",
                   "runtime_limit", metric_name, "relative_"+ metric_name]
    df = pd.DataFrame(columns = header_row)
    result_dict = {}
    failed_value = 0
    for dataset in datasets.find():
            #Only use dataset with training series
            if dataset["lifecycle_state"] == "active" and dataset["name"] in ["airlines", "albert", "KDDCup09_appetency", "electricity", "bank-marketing", \
                                                                        "Amazon_employee_access", "riccardo", "eeg-eye-state", "jm1", "SpeedDating", \
                                                                        "mushroom", "christine", "phoneme", "Bioresponse", "kr-vs-kp", \
                                                                        "kc1", "pc4", "profb", "credit-approval", "breast-w", \
                                                                        
                                                                        "covertype", "dionis", "Devnagari-Script", "jannis", "Fashion-MNIST", \
                                                                        "shuttle", "tamilnadu-electricity", "letter", "gas-drift", "har", \
                                                                        "artificial-characters", "optdigits", "waveform-5000", "splice", "car", \
                                                                        "one-hundred-plants-margin", "vehicle", "eucalyptus", "soybean", "LED-display-domain-7digit" ]:
                automl_dict = {}
                #Find all trainings from the training series
                for training in trainings.find({"dataset_id": str(dataset["_id"])}):
                    #Get result scores for all series
                    task = training["configuration"]["task"]
                    training_id = str(training["_id"])
                    for model_id in training["model_ids"]:
                        for data in models.find({"_id": ObjectId(model_id)}):
                            if data["lifecycle_state"] == "active":
                                if data["auto_ml_solution"] in [":autogluon", ":evalml", ":flaml", ":gama", ":lama", ":h2o_automl", ":pycaret", ":tpot"]:

                                    if data["auto_ml_solution"] not in automl_dict:
                                        automl_dict[data["auto_ml_solution"]] = {}
                                    if data["status"] == "failed":
                                        automl_dict[data["auto_ml_solution"]][training["configuration"]["runtime_limit"]] = (failed_value, training_id)
                                    else:
                                        automl_dict[data["auto_ml_solution"]][training["configuration"]["runtime_limit"]] = (data["test_score"][metric_name], training_id)
                    
                def runtimes(dict):
                    return [key for key in [5, 10, 20, 40, 80, 160, 320, 640] if key not in dict]
                

                missing_runtime = runtimes(automl_dict[":gama"])
                if len(missing_runtime) != 0:
                    print(f"MISSING RUNTIMES FOR DATASET {dataset['name']}, {missing_runtime}")
                







                # Step 1: Initialize rankings and relative performance dictionaries
                relative_performance = {}

                # Step 2: Process each runtime
                runtimes_dict = {}
                for automl_name, runtime_data in automl_dict.items():
                    for runtime, (performance, training_id) in runtime_data.items():
                        if runtime not in runtimes_dict:
                            runtimes_dict[runtime] = []
                        runtimes_dict[runtime].append((automl_name, performance, training_id))

                # Step 3: Calculate rankings and relative performance
                for runtime, automl_list in runtimes_dict.items():
                    # Sort by performance in descending order
                    sorted_automl = sorted(automl_list, key=lambda x: x[1], reverse=True)

                    # Calculate max performance
                    max_value = max(performance for _, performance, _ in automl_list)
                    relative_performance[runtime] = []

                    # Compute relative performance
                    if max_value >= 0:
                        for automl_name, performance, training_id in sorted_automl:
                            if max_value == 0:
                                relative_value = 0
                            elif performance >= 0:
                                relative_value = performance / max_value
                            else:
                                relative_value = None
                            relative_performance[runtime].append((automl_name, performance, relative_value, training_id))  

 






                #Meta Informations
                dataset_size_mb, dataset_missing_values, dataset_duplicated_row_values, dataset_duplicated_col_values, dataset_outlier_row_values, dataset_rows, dataset_cols = get_dataset_meta_informations(dataset)
                for runtime, automl_x_scores in relative_performance.items():
                    for automl, metric, relative_metric, training_id in automl_x_scores:
                        df = insert_automl_rows(df, automl, task, training_id, dataset['name'], dataset_size_mb, dataset_rows, dataset_cols, dataset_missing_values, dataset_duplicated_row_values, dataset_duplicated_col_values, \
                                                dataset_outlier_row_values, runtime, metric, relative_metric)
                    
                
    if not os.path.exists(file_path):
        os.makedirs(file_path)

    df.to_csv(os.path.join(file_path,"datasetBestAutoMLData.csv"))
    print()



In [6]:
"""
This script is for updating the runtime prediction parameters

For calculating the new runtime prediction parameters edit the following variables or rename the collections in MongoDBCompass
"""
# Set here your database connection
client = pymongo.MongoClient("mongodb://root:example@localhost:27017/")

# fill in the name of your database
db = client["ai-optimization"]

# Collection Name
#ändere die Namen demenstrpchend nach den collection namen aus deiner Datenbank ab
trainings = db["trainings"]
datasets = db["datasets"]
models = db["models"]

# Only edit when changing the folder structure
file_path = "../data"


#reads the information from the database and saves them in a csv file
generate_best_automl_dataset(trainings, datasets, models, file_path)

  df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)



