In [2]:
import pandas as pd
from bson.objectid import ObjectId
import os
import collection

In [3]:
def insert_automl_rows(df: pd.DataFrame, auto_ml_solution: str, dataset_size_mb: float, 
                       dataset_rows: int, dataset_cols: int, dataset_missing_values:float, dataset_duplicated_row_values:float, dataset_duplicated_col_values:float, dataset_outlier_row_values:float, runtime_limit : int):
    new_row = {"AutoML_solution":auto_ml_solution, "dataset_size_in_mb" : dataset_size_mb, "dataset_rows": dataset_rows, "dataset_cols": dataset_cols, "missing_values": dataset_missing_values, "duplicated_rows": dataset_duplicated_row_values, "duplicated_cols": dataset_duplicated_col_values, "outliers": dataset_outlier_row_values,
               "runtime_limit": runtime_limit}
    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True )
    return df


def get_dataset_meta_informations(data):
        dataset_size_byte = data["analysis"]["size_bytes"]
        dataset_size_mb = dataset_size_byte / 1000  / 1000
        dataset_rows = data["analysis"]["number_of_rows"]
        dataset_cols = data["analysis"]["number_of_columns"]
        #compute ration of missing values
        dataset_missing_values_total = 0
        for k, v in data["analysis"]["missings_per_column"].items():
            if isinstance(v, dict):
                for k1, v1 in v.items():
                    if isinstance(v1, dict):
                        for k2, v2 in v1.items():
                            dataset_missing_values_total = dataset_missing_values_total + v2
                    else:
                        dataset_missing_values_total = dataset_missing_values_total + v1
            else:
                dataset_missing_values_total = dataset_missing_values_total + v
        # dataset_missing_values = round(dataset_missing_values_total / (dataset_rows + dataset_cols), 2)
        dataset_missing_values = dataset_missing_values_total
        #compute ration of duplicated rows
        # dataset_duplicated_row_values_total = 0
        # for v in data["analysis"]["duplicate_rows"]:
        #     dataset_duplicated_row_values_total = dataset_duplicated_row_values_total + len(v)
        # dataset_duplicated_row_values = round(dataset_duplicated_row_values_total / dataset_rows, 2)
        dataset_duplicated_row_values = len(data["analysis"]["duplicate_rows"])
        #compute ration of duplicated col
        # dataset_duplicated_col_values_total = 0
        # for v in data["analysis"]["duplicate_columns"]:
        #     dataset_duplicated_col_values_total = dataset_duplicated_col_values_total + len(v)
        # dataset_duplicated_col_values = round(dataset_duplicated_col_values_total / dataset_cols, 2)
        dataset_duplicated_col_values = len(data["analysis"]["duplicate_columns"])
        #compute ration of outlier rows
        # dataset_outlier_row_values_total = []
        # for k, v in data["analysis"]["outlier"].items():
        #     dataset_outlier_row_values_total += v
        # dataset_outlier_row_values = round(len(set(dataset_outlier_row_values_total)) / dataset_rows, 2)
        dataset_outlier_row_values_total = 0
        for k, v in data["analysis"]["outlier"].items():
            dataset_outlier_row_values_total += len(v)
        dataset_outlier_row_values = dataset_outlier_row_values_total
        
        return dataset_size_mb, dataset_missing_values, dataset_duplicated_row_values, dataset_duplicated_col_values, dataset_outlier_row_values, dataset_rows, dataset_cols

def generate_automl_runtime_dataset(trainings: collection ,datasets: collection, models: collection,file_path: str):
    metric = ":balanced_accuracy"
    header_row = ["AutoML_solution", "dataset_name", "dataset_size_in_mb", "dataset_rows", "dataset_cols", "missing_values", "duplicated_rows", "duplicated_cols", "outliers",
                   "runtime_limit", metric]
    df = pd.DataFrame(columns = header_row)
    result_dict = {}
    failed_value = 0
    for dataset in datasets.find():
            #Only use dataset with training series
            if dataset["lifecycle_state"] == "active" and dataset["name"] in ["airlines", "albert", "KDDCup09_appetency", "electricity", "bank-marketing", \
                                                                        "Amazon_employee_access", "riccardo", "eeg-eye-state", "jm1", "SpeedDating", \
                                                                        "mushroom", "christine", "phoneme", "Bioresponse", "kr-vs-kp", \
                                                                        "kc1", "pc4", "profb", "credit-approval", "breast-w", \
                                                                        
                                                                        "covertype", "dionis", "Devnagari-Script", "jannis", "Fashion-MNIST", \
                                                                        "shuttle", "tamilnadu-electricity", "letter", "gas-drift", "har", \
                                                                        "artificial-characters", "optdigits", "waveform-5000", "splice", "car", \
                                                                        "one-hundred-plants-margin", "vehicle", "eucalyptus", "soybean", "LED-display-domain-7digit" ]:
                automl_dict = {}
                #Find all trainings from the training series
                for training in trainings.find({"dataset_id": str(dataset["_id"])}):
                    #Get result scores for all series
                    for model_id in training["model_ids"]:
                        for data in models.find({"_id": ObjectId(model_id)}):
                            if data["lifecycle_state"] == "active":
                                if data["auto_ml_solution"] in [":autogluon", ":evalml", ":flaml", ":gama", ":lama", ":h2o_automl", ":pycaret", ":tpot"]:

                                    if data["auto_ml_solution"] not in automl_dict:
                                        automl_dict[data["auto_ml_solution"]] = {}
                                    if data["status"] == "failed":
                                        automl_dict[data["auto_ml_solution"]][training["configuration"]["runtime_limit"]] = failed_value
                                    else:
                                        automl_dict[data["auto_ml_solution"]][training["configuration"]["runtime_limit"]] = data["test_score"][metric]
                    
                def runtimes(dict):
                    return [key for key in [5, 10, 20, 40, 80, 160, 320, 640] if key not in dict]
                
                missing_runtime = runtimes(automl_dict[":gama"])
                if len(missing_runtime) != 0:
                    print(f"MISSING RUNTIMES FOR DATASET {dataset['name']}, {missing_runtime}")
                best_scores = {}
                #Find the best score X runtime for each AutoML
                for automl, runtime_scores in automl_dict.items():
                    best_score = None
                    best_runtime = None
                    
                    # Iterate over runtimes starting from the lowest
                    for runtime, score in sorted(runtime_scores.items()):
                        if best_score is None or score >= best_score + 0.001:  # Check for 0.1% improvement
                            best_score = score
                            best_runtime = runtime
                    
                    best_scores[automl] = (best_runtime, best_score)

                #Meta Informations
                for automl, runtime_scores in best_scores.items():
                    dataset_size_mb, dataset_missing_values, dataset_duplicated_row_values, dataset_duplicated_col_values, dataset_outlier_row_values, dataset_rows, dataset_cols = get_dataset_meta_informations(dataset)
                    df = insert_automl_rows(df, automl, dataset["name"], dataset_size_mb, dataset_rows, dataset_cols, dataset_missing_values, dataset_duplicated_row_values, dataset_duplicated_col_values, dataset_outlier_row_values, runtime_scores[0])



    if not os.path.exists(file_path):
        os.makedirs(file_path)

    df.to_csv(os.path.join(file_path,"datasetData.csv"))
    print()









In [5]:
def generate_automl_runtime_dataset(trainings: collection ,datasets: collection, models: collection,file_path: str):
    metric = ":balanced_accuracy"
    header_row = ["AutoML_solution", "dataset_size_in_mb", "dataset_rows", "dataset_cols", "missing_values", "duplicated_rows", "duplicated_cols", "outliers",
                   "runtime_limit"]
    df = pd.DataFrame(columns = header_row)
    result_dict = {}
    failed_value = 0
    for dataset in datasets.find():
            #Only use dataset with training series
            if dataset["lifecycle_state"] == "active" and dataset["name"] in ["airlines", "albert", "KDDCup09_appetency", "electricity", "bank-marketing", \
                                                                        "Amazon_employee_access", "riccardo", "eeg-eye-state", "jm1", "SpeedDating", \
                                                                        "mushroom", "christine", "phoneme", "Bioresponse", "kr-vs-kp", \
                                                                        "kc1", "pc4", "profb", "credit-approval", "breast-w", \
                                                                        
                                                                        "covertype", "dionis", "Devnagari-Script", "jannis", "Fashion-MNIST", \
                                                                        "shuttle", "tamilnadu-electricity", "letter", "gas-drift", "har", \
                                                                        "artificial-characters", "optdigits", "waveform-5000", "splice", "car", \
                                                                        "one-hundred-plants-margin", "vehicle", "eucalyptus", "soybean", "LED-display-domain-7digit"]:
                automl_dict = {}
                #Find all trainings from the training series
                for training in trainings.find({"dataset_id": str(dataset["_id"])}):
                    #Get result scores for all series
                    for model_id in training["model_ids"]:
                        for data in models.find({"_id": ObjectId(model_id)}):
                            if data["lifecycle_state"] == "active":
                                if data["auto_ml_solution"] in [":autogluon", ":evalml", ":flaml", ":gama", ":lama", ":h2o_automl", ":pycaret", ":tpot"]:

                                    if data["auto_ml_solution"] not in automl_dict:
                                        automl_dict[data["auto_ml_solution"]] = {}
                                    if data["status"] == "failed":
                                        automl_dict[data["auto_ml_solution"]][training["configuration"]["runtime_limit"]] = failed_value
                                    else:
                                        automl_dict[data["auto_ml_solution"]][training["configuration"]["runtime_limit"]] = data["test_score"][metric]
                    
                def runtimes(dict):
                    return [key for key in [5, 10, 20, 40, 80, 160, 320, 640] if key not in dict]
                
                missing_runtime = runtimes(automl_dict[":gama"])
                if len(missing_runtime) != 0:
                    print(f"MISSING RUNTIMES FOR DATASET {dataset['name']}, {missing_runtime}")
                best_scores = {}
                #Find the best score X runtime for each AutoML
                for automl, runtime_scores in automl_dict.items():
                    best_score = None
                    best_runtime = None
                    
                    # Iterate over runtimes starting from the lowest
                    for runtime, score in sorted(runtime_scores.items()):
                        if best_score is None or score >= best_score + 0.001:  # Check for 0.1% improvement
                            best_score = score
                            best_runtime = runtime
                    
                    best_scores[automl] = (best_runtime, best_score)

                #Meta Informations
                for automl, runtime_scores in best_scores.items():
                    dataset_size_mb, dataset_missing_values, dataset_duplicated_row_values, dataset_duplicated_col_values, dataset_outlier_row_values, dataset_rows, dataset_cols = get_dataset_meta_informations(dataset)
                    df = insert_automl_rows(df, automl, dataset_size_mb, dataset_rows, dataset_cols, dataset_missing_values, dataset_duplicated_row_values, dataset_duplicated_col_values, dataset_outlier_row_values, runtime_scores[0])



    if not os.path.exists(file_path):
        os.makedirs(file_path)

    df.to_csv(os.path.join(file_path,"datasetData.csv"))
    print()




In [6]:
import pymongo
import os



"""
This script is for updating the runtime prediction parameters

For calculating the new runtime prediction parameters edit the following variables or rename the collections in MongoDBCompass
"""
# Set here your database connection
client = pymongo.MongoClient("mongodb://localhost:5050/")

# fill in the name of your database
db = client["744e63b1-56f6-4fa8-bae4-be31ff8ee100"]

# Collection Name
#ändere die Namen demenstrpchend nach den collection namen aus deiner Datenbank ab
trainings = db["trainings"]
datasets = db["datasets"]
models = db["models"]

# Only edit when changing the folder structure
file_path = os.path.join(os.getcwd(), "data")


#reads the information from the database and saves them in a csv file
generate_automl_runtime_dataset(trainings, datasets, models, file_path)

  df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True )



