In [None]:
import pandas as pd
from pathlib import Path

In [None]:
# Execute the project helper scripts via IPython's %run magic.
# NOTE(review): the cells below use names that are neither imported nor defined
# in this notebook (MlflowHelper, load_attention_weights,
# calculate_attention_importances, tqdm, sns, plt) — presumably they are provided
# by these scripts; confirm, or import them explicitly in the import cell.
%run utils/mlflow_query.py
%run utils/loading.py
%run utils/comparison.py
%run utils/attention_graph.py

In [None]:
# Create the MLflow helper, pointing it at the pickle file "mlflow_run_df.pkl".
# NOTE(review): presumably this file is a cached DataFrame of MLflow runs — confirm
# against utils/mlflow_query.py.
mlflow_helper = MlflowHelper(pkl_file=Path("mlflow_run_df.pkl"))
# Disabled call; presumably re-queries all runs and refreshes the pickle file above —
# TODO confirm before re-enabling.
#mlflow_helper.query_all_runs(pkl_file=Path("mlflow_run_df.pkl"))

# Mimic

In [None]:
def calculate_cluster_attributes_for(run_id, cluster_threshold=0.5, local_mlflow_dir=None):
    """Summarise how the attention weights of one run group features into clusters.

    The attention weights are loaded with ``load_attention_weights`` and are read
    here as a mapping ``feature -> {node: weight}``. The result of
    ``calculate_attention_importances`` is read as a mapping
    ``node -> iterable of (feature, weight)`` pairs. Weights are converted with
    ``float`` before being compared against ``cluster_threshold``.

    Args:
        run_id: Identifier of the run; forwarded to ``load_attention_weights``
            and echoed back under the ``'run_id'`` key.
        cluster_threshold: A weight must be strictly greater than this value to
            count as a cluster membership.
        local_mlflow_dir: Directory forwarded to ``load_attention_weights``.
            ``None`` (the default) means "use ``mlflow_helper.local_mlflow_dir``",
            looked up when the function is called rather than when it is defined,
            so a re-created ``mlflow_helper`` is picked up without re-running
            this cell.

    Returns:
        ``{}`` when no attention weights are available for the run. Otherwise a
        dict with the keys ``run_id``, ``features``, ``indecided_features``,
        ``clusters``, ``features_in_clusters``, ``shared_clusters``,
        ``features_in_shared_clusters`` and the ``*_p`` ratios. All ``*_p``
        ratios except ``avg_shared_cluster_size_p`` are relative to the number
        of features. ``avg_shared_cluster_size_p`` is the number of distinct
        features in shared clusters divided by the number of shared clusters;
        it is NaN when the run has no shared cluster (this used to raise
        ``ZeroDivisionError``).
    """
    if local_mlflow_dir is None:
        local_mlflow_dir = mlflow_helper.local_mlflow_dir

    attention_weights = load_attention_weights(run_id, local_mlflow_dir)
    if attention_weights is None or len(attention_weights) == 0:
        return {}

    attention_importances = calculate_attention_importances(attention_weights)
    # Non-zero here because of the guard above, so the ratios below are safe.
    n_features = len(attention_weights)

    def _features_of(cluster_map):
        # Distinct features that appear as a member of any cluster in the map.
        return {feature for members in cluster_map.values() for feature, _ in members}

    # A feature is "indecided" when none of its own weights exceeds the threshold.
    indecided_features = {
        feature
        for feature, node_weights in attention_weights.items()
        if not any(float(weight) > cluster_threshold for weight in node_weights.values())
    }

    # A cluster is a node together with its members above the threshold;
    # nodes without any such member are dropped.
    clusters = {}
    for node, members in attention_importances.items():
        strong_members = [
            (feature, float(weight))
            for feature, weight in members
            if float(weight) > cluster_threshold
        ]
        if strong_members:
            clusters[node] = strong_members

    shared_clusters = {node: members for node, members in clusters.items() if len(members) > 1}
    single_clusters = {node: members for node, members in clusters.items() if len(members) == 1}

    features_in_clusters = _features_of(clusters)
    features_in_shared_clusters = _features_of(shared_clusters)
    features_in_single_clusters = _features_of(single_clusters)

    # The average size of zero clusters is undefined: report NaN instead of crashing.
    if shared_clusters:
        avg_shared_cluster_size = len(features_in_shared_clusters) / len(shared_clusters)
    else:
        avg_shared_cluster_size = float("nan")

    return {
        'run_id': run_id,
        'features': n_features,
        'indecided_features': len(indecided_features),
        'indecided_features_p': len(indecided_features) / n_features,
        'clusters': len(clusters),
        'features_in_clusters': len(features_in_clusters),
        'features_in_clusters_p': len(features_in_clusters) / n_features,
        'shared_clusters': len(shared_clusters),
        'features_in_shared_clusters': len(features_in_shared_clusters),
        'features_in_shared_clusters_p': len(features_in_shared_clusters) / n_features,
        'features_in_single_clusters_p': len(features_in_single_clusters) / n_features,
        'avg_shared_cluster_size_p': avg_shared_cluster_size,
    }

In [None]:
# Select the runs to analyse, excluding noise and refinement runs.
# NOTE(review): this section is headed "Mimic" and the variables are named *_mimic_*,
# but the runs are fetched with `huawei_run_df` — confirm which data set is intended.
relevant_mimic_run_df = mlflow_helper.huawei_run_df(include_noise=False, include_refinements=False)
# Distinct run ids of the selected runs.
run_ids = set(relevant_mimic_run_df["info_run_id"])
# NOTE(review): this global is not passed to calculate_cluster_attributes_for in the
# cell below; confirm whether it is still needed.
local_mlflow_dir=mlflow_helper.local_mlflow_dir
# Threshold handed to calculate_cluster_attributes_for: weights must be strictly
# greater than this to count as a cluster membership.
cluster_threshold=0.5

In [None]:
# One cluster summary per selected run, with a tqdm progress bar.
# Runs without attention weights contribute an empty dict.
records = list(
    calculate_cluster_attributes_for(selected_run_id, cluster_threshold=cluster_threshold)
    for selected_run_id in tqdm(run_ids)
)

In [None]:
# Long-format table: one row per (run, metric), joined with the run metadata.
# Empty summaries (runs without attention weights) are skipped.
mimic_df = (
    pd.DataFrame.from_records([summary for summary in records if summary])
    .melt(id_vars=["run_id"], var_name="metric")
    .merge(relevant_mimic_run_df, left_on="run_id", right_on="info_run_id")
)
mimic_df.head()

In [None]:
# Descriptive statistics of the "_p" metrics per metric and model type,
# restricted to runs whose feature embeddings were not trainable
# (the parameter is compared as the string "False").
(
    mimic_df.loc[
        mimic_df["metric"].map(lambda metric_name: metric_name.endswith("_p"))
        & (mimic_df["data_params_ModelConfigbase_feature_embeddings_trainable"] == "False")
    ]
    .groupby(by=["metric", "data_tags_model_type"])
    .describe()
)

In [None]:
# Box plots of the "_p" metrics, one panel per metric with independent y-axes,
# for runs whose feature embeddings were not trainable.
sns.catplot(
    data=mimic_df.loc[
        mimic_df["metric"].map(lambda metric_name: metric_name.endswith("_p"))
        & (mimic_df["data_params_ModelConfigbase_feature_embeddings_trainable"] == "False")
    ],
    kind="box",
    col="metric",
    x="data_tags_model_type",
    order=["gram", "causal", "text"],
    y="value",
    sharey=False,
)
plt.show()