In [None]:
import pandas as pd
from pathlib import Path

In [None]:
# Execute the project's helper scripts inside this kernel. `%run` loads each
# script's top-level names into the interactive namespace, which is presumably
# where MlflowHelper, calculate_accuracies_per_percentiles and
# plot_accuracies_per_percentiles (and the `sns` / `plt` aliases used further
# down) come from -- TODO confirm against the scripts themselves.
%run utils/mlflow_query.py
%run utils/loading.py
%run utils/comparison.py
%run utils/percentiles.py

In [None]:
# Helper used by every cell below to look up experiment runs. It is pointed at
# a local pickle file (presumably a cache of the MLflow run table -- confirm in
# utils/mlflow_query.py).
mlflow_helper = MlflowHelper(
    pkl_file=Path("mlflow_run_df.pkl"),
)
# Left disabled; uncomment to call query_all_runs on the helper:
# mlflow_helper.query_all_runs(query_metrics=False)

# Overall accuracy per percentile

In [None]:
# MIMIC runs, requested from the helper with noise and refinement runs
# switched off.
relevant_mimic_run_df = mlflow_helper.mimic_run_df(
    include_noise=False,
    include_refinements=False,
)

# Accuracies per percentile for those runs (k=20, 10 percentiles), computed
# for every "*_percentile" and "*_range" variant listed below.
mimic_accuracy_df = calculate_accuracies_per_percentiles(
    relevant_run_df=relevant_mimic_run_df,
    k=20,
    num_percentiles=10,
    num_input_percentiles=10,
    percentile_names=[
        "avg_input_frequencies_percentile",
        "median_input_frequencies_percentile",
        "min_input_frequencies_percentile",
        "p10_input_frequencies_percentile",
        "unknown_inputs_percentile",
        "output_frequency_percentile",
        "avg_input_frequencies_range",
        "median_input_frequencies_range",
        "min_input_frequencies_range",
        "p10_input_frequencies_range",
        "unknown_inputs_range",
    ],
    local_mlflow_dir=mlflow_helper.local_mlflow_dir,
)

In [None]:
# Plot only the accuracy rows whose "type" is the median input frequency
# percentile.
plot_accuracies_per_percentiles(
    relevant_run_df=relevant_mimic_run_df,
    accuracy_df=mimic_accuracy_df.loc[
        mimic_accuracy_df["type"].eq("median_input_frequencies_percentile")
    ],
)

In [None]:
# Huawei runs, requested from the helper with noise and refinement runs
# switched off.
relevant_huawei_run_df = mlflow_helper.huawei_run_df(
    include_noise=False,
    include_refinements=False,
)

# Accuracies per percentile for those runs (k=5, 10 percentiles), computed for
# every "*_percentile" and "*_range" variant listed below.
huawei_accuracy_df = calculate_accuracies_per_percentiles(
    relevant_run_df=relevant_huawei_run_df,
    k=5,
    num_percentiles=10,
    num_input_percentiles=10,
    percentile_names=[
        "avg_input_frequencies_percentile",
        "median_input_frequencies_percentile",
        "min_input_frequencies_percentile",
        "p10_input_frequencies_percentile",
        "unknown_inputs_percentile",
        "output_frequency_percentile",
        "avg_input_frequencies_range",
        "median_input_frequencies_range",
        "min_input_frequencies_range",
        "p10_input_frequencies_range",
        "unknown_inputs_range",
    ],
    local_mlflow_dir=mlflow_helper.local_mlflow_dir,
)

In [None]:
# Plot the Huawei accuracies for two percentile types: median input frequency
# and unknown inputs.
plot_accuracies_per_percentiles(
    relevant_huawei_run_df,
    huawei_accuracy_df.loc[
        huawei_accuracy_df["type"].isin(
            ["median_input_frequencies_percentile", "unknown_inputs_percentile"]
        )
    ],
)

# Accuracy@k for different input granularities

In [None]:
# MIMIC runs restricted to the three input columns level_0 / level_1 / level_2,
# again requested without noise and refinement runs.
relevant_mimic_run_df2 = mlflow_helper.mimic_run_df(
    include_noise=False,
    include_refinements=False,
    valid_x_columns=["level_0", "level_1", "level_2"],
)

# Only the average input frequency percentile is computed here. The other
# variants (median/min/p10/unknown_inputs/output_frequency percentiles and all
# "*_range" variants) are currently disabled for this comparison.
mimic_accuracy_df2 = calculate_accuracies_per_percentiles(
    relevant_run_df=relevant_mimic_run_df2,
    k=20,
    num_percentiles=10,
    num_input_percentiles=10,
    percentile_names=["avg_input_frequencies_percentile"],
    local_mlflow_dir=mlflow_helper.local_mlflow_dir,
)

In [None]:
# Compare the input columns level_0 / level_1 / level_2 on a shared y-axis,
# using only runs whose hidden-embeddings-trainable parameter equals the
# string "False" (the parameter is compared as text, not as a boolean).
plot_accuracies_per_percentiles(
    relevant_run_df=relevant_mimic_run_df2.loc[
        relevant_mimic_run_df2[
            "data_params_ModelConfigbase_hidden_embeddings_trainable"
        ].eq("False")
    ],
    accuracy_df=mimic_accuracy_df2.loc[
        mimic_accuracy_df2["type"].eq("avg_input_frequencies_percentile")
    ],
    comparison_column="data_params_SequenceConfigx_sequence_column_name",
    comparison_column_order=["level_0", "level_1", "level_2"],
    share_y=True,
)

In [None]:
# NOTE(review): `sns` and `plt` are not imported in this notebook's import
# cell; presumably they are provided by the `%run utils/...` scripts -- confirm.

# Join run metadata (every model type except "causal2") with the average input
# frequency percentile accuracies, then keep the highest accuracy for each
# (model type, trainable flag, input column, run, type, percentile) group.
grouped_df = (
    pd.merge(
        relevant_mimic_run_df2[relevant_mimic_run_df2["data_tags_model_type"] != "causal2"],
        mimic_accuracy_df2[mimic_accuracy_df2["type"] == "avg_input_frequencies_percentile"],
        left_on="info_run_id",
        right_on="run_id",
    )
    .groupby(
        [
            "data_tags_model_type",
            "data_params_ModelConfigbase_hidden_embeddings_trainable",
            "data_params_SequenceConfigx_sequence_column_name",
            "info_run_id",
            "type",
            "percentile",
        ],
        as_index=False,
    )
    # Name the aggregation as a string: passing the builtin `max` to `.agg` is
    # deprecated in pandas >= 2.1 (FutureWarning). "max" yields the same result.
    .agg({"accuracy": "max"})
)

# One panel per (type, model type); line colour encodes the input column and
# line style encodes whether the hidden embeddings are trainable.
g = sns.relplot(
    data=grouped_df,
    x="percentile",
    y="accuracy",
    row="type",
    col="data_tags_model_type",
    hue="data_params_SequenceConfigx_sequence_column_name",
    style="data_params_ModelConfigbase_hidden_embeddings_trainable",
    kind="line",
    palette=None,
)
g.set_titles("Type: {row_name}, Model: {col_name}").set_axis_labels(
    "", "accuracy"
)
# Show x tick labels on every panel, not only on the bottom row.
for ax in g.axes.flatten():
    ax.tick_params(labelbottom=True)
plt.show()

In [None]:
# Huawei runs restricted to the three log-cluster-template input columns,
# requested without noise and refinement runs.
relevant_huawei_run_df2 = mlflow_helper.huawei_run_df(
    include_noise=False,
    include_refinements=False,
    valid_x_columns=[
        "log_cluster_template",
        "fine_log_cluster_template",
        "coarse_log_cluster_template",
    ],
)

# Three percentile types are computed here: average input frequency, median
# input frequency, and unknown inputs. The remaining variants (min/p10/
# output_frequency percentiles and all "*_range" variants) are currently
# disabled for this comparison.
huawei_accuracy_df2 = calculate_accuracies_per_percentiles(
    relevant_run_df=relevant_huawei_run_df2,
    k=5,
    num_percentiles=10,
    num_input_percentiles=10,
    percentile_names=[
        "avg_input_frequencies_percentile",
        "median_input_frequencies_percentile",
        "unknown_inputs_percentile",
    ],
    local_mlflow_dir=mlflow_helper.local_mlflow_dir,
)

In [None]:
# Compare input columns on a shared y-axis, using only runs whose
# hidden-embeddings-trainable parameter equals the string "False", and only the
# average input frequency and unknown inputs percentile types.
# NOTE(review): "fine_log_cluster_template" is among the loaded input columns
# but is not listed in comparison_column_order -- confirm this is intentional.
plot_accuracies_per_percentiles(
    relevant_run_df=relevant_huawei_run_df2.loc[
        relevant_huawei_run_df2[
            "data_params_ModelConfigbase_hidden_embeddings_trainable"
        ].eq("False")
    ],
    accuracy_df=huawei_accuracy_df2.loc[
        huawei_accuracy_df2["type"].isin(
            ["avg_input_frequencies_percentile", "unknown_inputs_percentile"]
        )
    ],
    comparison_column="data_params_SequenceConfigx_sequence_column_name",
    comparison_column_order=["log_cluster_template", "coarse_log_cluster_template"],
    share_y=True,
    show_plot=False,
)

In [None]:
# NOTE(review): `sns` and `plt` are not imported in this notebook's import
# cell; presumably they are provided by the `%run utils/...` scripts -- confirm.

# Join run metadata (every model type except "causal2") with the average input
# frequency and unknown inputs percentile accuracies, then keep the highest
# accuracy for each
# (model type, trainable flag, input column, run, type, percentile) group.
grouped_df = (
    pd.merge(
        relevant_huawei_run_df2[relevant_huawei_run_df2["data_tags_model_type"] != "causal2"],
        huawei_accuracy_df2[
            (huawei_accuracy_df2["type"] == "avg_input_frequencies_percentile")
            | (huawei_accuracy_df2["type"] == "unknown_inputs_percentile")
        ],
        left_on="info_run_id",
        right_on="run_id",
    )
    .groupby(
        [
            "data_tags_model_type",
            "data_params_ModelConfigbase_hidden_embeddings_trainable",
            "data_params_SequenceConfigx_sequence_column_name",
            "info_run_id",
            "type",
            "percentile",
        ],
        as_index=False,
    )
    # Name the aggregation as a string: passing the builtin `max` to `.agg` is
    # deprecated in pandas >= 2.1 (FutureWarning). "max" yields the same result.
    .agg({"accuracy": "max"})
)

# One panel per (type, model type); line colour encodes the input column and
# line style encodes whether the hidden embeddings are trainable.
g = sns.relplot(
    data=grouped_df,
    x="percentile",
    y="accuracy",
    row="type",
    col="data_tags_model_type",
    hue="data_params_SequenceConfigx_sequence_column_name",
    style="data_params_ModelConfigbase_hidden_embeddings_trainable",
    kind="line",
    palette=None,
)
g.set_titles("Type: {row_name}, Model: {col_name}").set_axis_labels(
    "", "accuracy"
)
# Show x tick labels on every panel, not only on the bottom row.
for ax in g.axes.flatten():
    ax.tick_params(labelbottom=True)
plt.show()

In [None]:
# Reload pickled versions of the run frames and accuracy frames. This rebinds
# the variables that the cells above computed, so anything run after this cell
# sees the pickled data instead.
# NOTE(review): no visible cell writes these "percentile_*.pkl" files;
# presumably they were saved in an earlier session -- confirm they still match
# the current computation.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# from a trusted source.
relevant_mimic_run_df = pd.read_pickle("percentile_relevant_mimic_run_df.pkl")
relevant_huawei_run_df = pd.read_pickle("percentile_relevant_huawei_run_df.pkl")
relevant_mimic_run_df2 = pd.read_pickle("percentile_relevant_mimic_run_df2.pkl")
relevant_huawei_run_df2 = pd.read_pickle("percentile_relevant_huawei_run_df2.pkl")

# Accuracy frames matching the run frames above.
mimic_accuracy_df = pd.read_pickle("percentile_mimic_accuracy_df.pkl")
mimic_accuracy_df2 = pd.read_pickle("percentile_mimic_accuracy_df2.pkl")
huawei_accuracy_df = pd.read_pickle("percentile_huawei_accuracy_df.pkl")
huawei_accuracy_df2 = pd.read_pickle("percentile_huawei_accuracy_df2.pkl")