In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
%run utils/mlflow_query.py
%run utils/loading.py
%run utils/comparison.py
%run utils/percentiles.py

In [None]:
# Pickle file handed to MlflowHelper as `pkl_file`. `Path` is imported explicitly in the
# imports cell so this cell does not depend on whatever names the %run scripts leak.
MLFLOW_RUN_PKL = Path("mlflow_run_df.pkl")

# MlflowHelper is not defined in this notebook; presumably it is provided by
# utils/mlflow_query.py through the %run cell above -- confirm.
mlflow_helper = MlflowHelper(pkl_file=MLFLOW_RUN_PKL)
# Uncomment to query all runs again (presumably refreshes the pickle -- confirm):
#mlflow_helper.query_all_runs(query_metrics=False)

In [None]:
def plot_metrics_over_epochs(metric_df, metric_names,
    column_feature_name='data_tags_model_type',
    row_feature_name='data_tags_sequence_type',
    hue_feature_name='data_params_ModelConfighidden_embedding_initializer',
    style_feature_name='data_params_ModelConfigbase_hidden_embeddings_trainable',
    titles="{row_name}, {col_name}",
):
    """Draw and save one faceted line plot per metric, with ``epoch`` on the x-axis.

    Args:
        metric_df: Frame passed to seaborn as ``data``; it must contain the
            columns ``epoch`` and ``info_run_id`` plus every column named by
            the other arguments.
        metric_names: Iterable of column names; each one becomes the y-axis of
            its own figure.
        column_feature_name: Column used for the facet columns.
        row_feature_name: Column used for the facet rows.
        hue_feature_name: Column mapped to line colour.
        style_feature_name: Column mapped to line style.
        titles: Template forwarded to ``FacetGrid.set_titles``.

    Side effects:
        Saves each figure as ``epochs_<metric>.png`` relative to the current
        working directory. Figures are left open so the caller can show them.
    """
    # Settings shared by every figure; only the y-variable changes per metric.
    shared_kwargs = dict(
        data=metric_df,
        x="epoch",
        kind="line",
        col=column_feature_name,
        row=row_feature_name,
        hue=hue_feature_name,
        style=style_feature_name,
        # units + estimator=None: draw every run as its own line, no aggregation.
        units='info_run_id',
        estimator=None,
    )
    for metric_name in metric_names:
        # Independent y-axes per facet.
        grid = sns.relplot(y=metric_name, facet_kws={'sharey':False}, **shared_kwargs)
        grid.set_titles(titles)
        output_file = "epochs_{}.png".format(metric_name)
        grid.savefig(output_file)

def plot_best_metric_strip(metric_df, metric_names, feature_names,
    x_feature_name='data_tags_model_type', 
    x_order=['simple', 'gram', 'text', 'text_paper', 'causal'],
    row_feature_name='data_tags_sequence_type',
    hue_feature_name='data_params_ModelConfighidden_embedding_initializer',
    column_feature_name='data_params_ModelConfigbase_hidden_embeddings_trainable',
    titles="{row_name}, Trainable: {col_name}",
):
    """Draw and save one faceted strip plot per metric, using the maximum per group.

    ``metric_df`` is grouped by ``feature_names`` and every metric is reduced
    to its maximum within each group; the plots are drawn from that aggregated
    frame, which only contains ``feature_names`` and ``metric_names``.

    Args:
        metric_df: Frame holding the grouping columns and the metric columns.
        metric_names: Iterable of metric column names; one figure per metric.
        feature_names: Column(s) to group by. Because only these columns
            survive the aggregation, they have to include every column named
            by ``x_feature_name``, ``row_feature_name``, ``hue_feature_name``
            and ``column_feature_name``.
        x_feature_name: Column shown on the x-axis.
        x_order: Order of the x-axis categories (never mutated here).
        row_feature_name: Column used for the facet rows.
        hue_feature_name: Column mapped to point colour.
        column_feature_name: Column used for the facet columns.
        titles: Template forwarded to ``FacetGrid.set_titles``.

    Side effects:
        Saves each figure as ``strip_<metric>.png`` relative to the current
        working directory. Figures are left open so the caller can show them.
    """
    # metric_names is walked twice (aggregation spec + plotting loop); materialise
    # it so a one-shot iterable does not silently yield zero plots.
    metric_names = list(metric_names)
    # Use the string "max" rather than the builtin: passing the builtin to
    # groupby.agg is deprecated in recent pandas and scheduled to change behaviour.
    grouped_df = metric_df.groupby(feature_names, as_index=False).agg({
        metric_name: "max" for metric_name in metric_names
    })
    for metric_name in metric_names:
        g = sns.catplot(
            data=grouped_df, y=metric_name, 
            x=x_feature_name, order=x_order, row=row_feature_name, 
            hue=hue_feature_name, col=column_feature_name,
            kind="strip", sharey='row',
        )
        g.set_titles(titles).set_axis_labels('', metric_name)
        # Show x tick labels on every facet, not only on the bottom row.
        for ax in g.axes.flatten():
            ax.tick_params(labelbottom=True)

        g.savefig("strip_{}.png".format(metric_name))

def plot_best_metric_bar(metric_df, metric_names,
    x_feature_name='data_tags_model_type',
    x_order=['simple', 'gram', 'text', 'text_paper', 'causal'],
    row_feature_name='data_tags_sequence_type',
    hue_feature_name='data_params_ModelConfighidden_embedding_initializer',
    col_feature_name='data_params_ModelConfigbase_hidden_embeddings_trainable',
    titles="{row_name}, Trainable: {col_name}",
    palette=None,
    dodge=True,
    col_order=None,
):
    """Draw and save one faceted box plot per metric.

    Note that, despite the function name and the ``bar_`` file prefix, the
    figures are box plots (``kind="box"``).

    Args:
        metric_df: Frame passed to seaborn as ``data``.
        metric_names: Iterable of metric column names; one figure per metric.
        x_feature_name: Column shown on the x-axis.
        x_order: Order of the x-axis categories (``None`` lets seaborn decide).
        row_feature_name: Column used for the facet rows, or ``None``.
        hue_feature_name: Column mapped to box colour, or ``None``.
        col_feature_name: Column used for the facet columns.
        titles: Template forwarded to ``FacetGrid.set_titles``.
        palette: Forwarded to ``sns.catplot``.
        dodge: Forwarded to ``sns.catplot``.
        col_order: Order of the facet columns.

    Side effects:
        Saves each figure as ``bar_<metric>.png`` relative to the current
        working directory. Figures are left open so the caller can show them.
    """
    for metric_name in metric_names:
        grid = sns.catplot(
            data=metric_df,
            kind="box",
            x=x_feature_name,
            y=metric_name,
            order=x_order,
            row=row_feature_name,
            col=col_feature_name,
            col_order=col_order,
            hue=hue_feature_name,
            palette=palette,
            dodge=dodge,
            sharey='row',
        )
        grid.set_titles(titles)
        grid.set_axis_labels('', metric_name)
        # Show x tick labels on every facet, not only on the bottom row.
        for facet_ax in grid.axes.flat:
            facet_ax.tick_params(labelbottom=True)
        output_file = "bar_{}.png".format(metric_name)
        grid.savefig(output_file)


In [None]:
# Run ids returned by mimic_run_df with noise and refinement runs switched off,
# then the best-metric table for exactly those runs.
mimic_run_ids = set(
    mlflow_helper.mimic_run_df(include_noise=False, include_refinements=False)["info_run_id"]
)
mimic_df = mlflow_helper.load_best_metrics_for_ids(mimic_run_ids)

In [None]:
# Box plots of best validation top-20 accuracy and of `epoch` per model type,
# one facet column per "embeddings trainable" setting (no rows, no hue).
plot_best_metric_bar(
    mimic_df,
    ['val_top_20_categorical_accuracy_history_best', 'epoch'],
    x_order=["simple", "gram", "text", "causal", "causal2"],
    row_feature_name=None,
    col_feature_name='data_params_ModelConfigbase_hidden_embeddings_trainable',
    hue_feature_name=None,
    titles="Embeddings Trainable: {col_name}",
)
plt.show()

In [None]:
# Same selection as before, but asking mimic_run_df for the x columns
# level_0 / level_1 / level_2; then load the best metrics of those runs.
mimic_level_run_ids = set(
    mlflow_helper.mimic_run_df(
        include_noise=False,
        include_refinements=False,
        valid_x_columns=["level_0", "level_1", "level_2"],
    )["info_run_id"]
)
mimic_df = mlflow_helper.load_best_metrics_for_ids(mimic_level_run_ids)

In [None]:
# Box plots per model type: facet columns are the x-sequence column (level_0..level_2),
# facet rows are the "embeddings trainable" setting.
plot_best_metric_bar(
    mimic_df,
    ['val_top_20_categorical_accuracy_history_best', 'epoch'],
    x_order=["simple", "gram", "text", "causal", "causal2"],
    row_feature_name='data_params_ModelConfigbase_hidden_embeddings_trainable',
    col_feature_name='data_params_SequenceConfigx_sequence_column_name',
    col_order=["level_0", "level_1", "level_2"],
    hue_feature_name=None,
    titles="Level: {col_name}, Embeddings Trainable: {row_name}",
)
plt.show()

In [None]:
# MIMIC runs requested with risk_prediction=True (noise and refinements off),
# followed by the best-metric table for those runs.
mimic_risk_run_ids = set(
    mlflow_helper.mimic_run_df(
        include_noise=False,
        include_refinements=False,
        risk_prediction=True,
    )["info_run_id"]
)
mimic_df = mlflow_helper.load_best_metrics_for_ids(mimic_risk_run_ids)

In [None]:
# Box plots of best validation AUC and of `epoch` per model type,
# one facet column per "embeddings trainable" setting.
plot_best_metric_bar(
    mimic_df,
    ['val_auc_history_best', 'epoch'],
    x_order=["simple", "gram", "text", "causal", "causal2"],
    row_feature_name=None,
    col_feature_name='data_params_ModelConfigbase_hidden_embeddings_trainable',
    hue_feature_name=None,
    titles="Embeddings Trainable: {col_name}",
)
plt.show()

In [None]:
# Huawei runs requested with noise, refinements and risk prediction all switched off,
# followed by the best-metric table for those runs.
huawei_run_ids = set(
    mlflow_helper.huawei_run_df(
        include_noise=False,
        include_refinements=False,
        risk_prediction=False,
    )["info_run_id"]
)
huawei_df = mlflow_helper.load_best_metrics_for_ids(huawei_run_ids)

In [None]:
# Box plots of best validation top-5 / top-10 accuracy and of `epoch` per model type,
# one facet column per "embeddings trainable" setting.
plot_best_metric_bar(
    huawei_df,
    [
        'val_top_5_categorical_accuracy_history_best',
        'val_top_10_categorical_accuracy_history_best',
        'epoch',
    ],
    x_order=["simple", "gram", "text", "causal", "causal2"],
    row_feature_name=None,
    col_feature_name='data_params_ModelConfigbase_hidden_embeddings_trainable',
    hue_feature_name=None,
    titles="Embeddings Trainable: {col_name}",
)
plt.show()

In [None]:
# Huawei runs requested for the three log-template x columns (noise, refinements and
# risk prediction off), followed by the best-metric table for those runs.
huawei_template_run_ids = set(
    mlflow_helper.huawei_run_df(
        include_noise=False,
        include_refinements=False,
        risk_prediction=False,
        valid_x_columns=[
            "log_cluster_template",
            "fine_log_cluster_template",
            "coarse_log_cluster_template",
        ],
    )["info_run_id"]
)
huawei_df = mlflow_helper.load_best_metrics_for_ids(huawei_template_run_ids)

In [None]:
# Box plots per model type: facet columns are the x-sequence column,
# facet rows are the "embeddings trainable" setting.
# NOTE(review): "fine_log_cluster_template" is requested when the runs are loaded but is
# not listed in col_order here -- confirm it is meant to be left out of the figure.
plot_best_metric_bar(
    huawei_df,
    ['val_top_5_categorical_accuracy_history_best', 'epoch'],
    x_order=["simple", "gram", "text", "causal"],
    row_feature_name='data_params_ModelConfigbase_hidden_embeddings_trainable',
    col_feature_name='data_params_SequenceConfigx_sequence_column_name',
    col_order=["log_cluster_template", "coarse_log_cluster_template"],
    hue_feature_name=None,
    titles="Level: {col_name}, Embeddings Trainable: {row_name}",
)
plt.show()

In [None]:
# Huawei runs requested with risk_prediction=True; this time the per-run metric
# history (not the best-metric table) is loaded for them.
huawei_risk_run_ids = set(
    mlflow_helper.huawei_run_df(
        include_noise=False,
        include_refinements=False,
        risk_prediction=True,
    )["info_run_id"]
)
huawei_df = mlflow_helper.load_metric_history_for_ids(huawei_risk_run_ids)

In [None]:
# Line plots over epochs for the two AUC history columns, using the default
# facet / hue / style columns of plot_metrics_over_epochs.
plot_metrics_over_epochs(
    huawei_df,
    metric_names=['auc_history', 'val_auc_history'],
)
plt.show()

In [None]:
# Huawei runs requested with risk_prediction=True, followed by the best-metric
# table for those runs (rebinding huawei_df).
huawei_risk_run_ids = set(
    mlflow_helper.huawei_run_df(
        include_noise=False,
        include_refinements=False,
        risk_prediction=True,
    )["info_run_id"]
)
huawei_df = mlflow_helper.load_best_metrics_for_ids(huawei_risk_run_ids)

In [None]:
# Box plots of best validation AUC, best training AUC and `epoch` per model type,
# one facet column per "embeddings trainable" setting.
plot_best_metric_bar(
    huawei_df,
    ['val_auc_history_best', 'auc_history_best', 'epoch'],
    x_order=["simple", "gram", "text", "causal", "causal2"],
    row_feature_name=None,
    col_feature_name='data_params_ModelConfigbase_hidden_embeddings_trainable',
    hue_feature_name=None,
    titles="Embeddings Trainable: {col_name}",
)
plt.show()

# NOISE

In [None]:
def _normalize_noise_type(noise_type):
    """Map a raw noise tag (already cast to ``str``) to the label used downstream.

    * ``''`` (missing tag after ``fillna``) or the literal ``'nan'`` -> ``'no_noise'``
    * tags no longer than ``'added0.0_removed0.1'`` get ``'_threshold0.0'`` appended
      (presumably these are tags written without a threshold part -- confirm)
    * anything longer is returned unchanged
    """
    # Handle the "no tag" case first. Appending the suffix first would turn 'nan'
    # into 'nan_threshold0.0' and this mapping could never apply.
    if noise_type in ('', 'nan'):
        return 'no_noise'
    if len(noise_type) <= len('added0.0_removed0.1'):
        return noise_type + '_threshold0.0'
    return noise_type


# Best-metric table for the MIMIC runs returned with include_noise=True.
mimic_noise_run_ids = set(mlflow_helper.mimic_run_df(include_noise=True)["info_run_id"])
mimic_noise_df = mlflow_helper.load_best_metrics_for_ids(mimic_noise_run_ids)
mimic_noise_df['data_tags_noise_type'] = (
    mimic_noise_df['data_tags_noise_type']
    .fillna('')
    .astype(str)
    .apply(_normalize_noise_type)
)

In [None]:
def _signed_noise(noise_type):
    """Return ``added - removed`` parsed from a tag shaped like ``added0.0_removed0.0...``.

    The two numbers are read from fixed character positions, so each must be
    exactly three characters wide (e.g. ``0.1``). ``'no_noise'`` maps to ``0.0``.
    """
    if noise_type == 'no_noise':
        return 0.0
    added = float(noise_type[len('added'):len('added0.0')])
    removed = float(noise_type[len('added0.0_removed'):len('added0.0_removed0.0')])
    return added - removed


# 1) Keep only the three model types that get a facet column in the figure.
mimic_noise_df = mimic_noise_df[
    mimic_noise_df['data_tags_model_type'].isin(['gram', 'causal', 'text'])
].reset_index(drop=True)

# 2) Keep runs tagged with threshold 0.0 and runs without noise.
noise_type = mimic_noise_df['data_tags_noise_type']
mimic_noise_df = mimic_noise_df[
    noise_type.str.endswith('_threshold0.0') | noise_type.str.endswith('no_noise')
].reset_index(drop=True)

# 3) Keep runs whose embeddings are not trainable. The explicit .copy() makes the
#    column assignments below write to an independent frame instead of a slice
#    (avoids pandas' SettingWithCopyWarning).
mimic_noise_df = mimic_noise_df[
    mimic_noise_df['data_params_ModelConfigbase_hidden_embeddings_trainable'].astype(str) == 'False'
].copy()

mimic_noise_df['noise'] = mimic_noise_df['data_tags_noise_type'].apply(_signed_noise)
# NOTE(review): a tag with equal added and removed amounts yields noise == 0.0 and is
# therefore labelled 'False' here -- confirm such runs do not occur or are meant to be.
mimic_noise_df['noisy_knowledge'] = mimic_noise_df['noise'].apply(
    lambda noise: 'False' if noise == 0.0 else 'True'
)

# Box plots of best validation top-20 accuracy against the signed noise level, one
# facet column per model type. The row facet is the "trainable" flag, which step 3
# has already restricted to 'False'.
plot_best_metric_bar(mimic_noise_df, ['val_top_20_categorical_accuracy_history_best'], 
    col_feature_name='data_tags_model_type', 
    row_feature_name='data_params_ModelConfigbase_hidden_embeddings_trainable',
    x_feature_name='noise',
    hue_feature_name='noisy_knowledge',
    titles="Knowledge type: {col_name}, Embeddings trainable: {row_name}",
    palette='Set2', 
    dodge=False,
    x_order=None,
    col_order=['gram', 'causal', 'text']
    )
plt.show()