In [None]:
%run utils/attention_graph.py
%run utils/mlflow_query.py
%run utils/loading.py
%run utils/comparison.py
%run utils/ranks.py

In [None]:
# Helper used by all following cells to obtain run tables and the local MLflow directory.
# It is constructed with the pickle file "mlflow_run_df.pkl".
# NOTE(review): presumably the pickle is a cache of previously queried runs - confirm in utils/mlflow_query.py.
mlflow_helper = MlflowHelper(pkl_file=Path("mlflow_run_df.pkl"))
# Uncomment the next line to call query_all_runs with the same pickle path.
#mlflow_helper.query_all_runs(pkl_file=Path("mlflow_run_df.pkl"))

In [None]:
def _first_mimic_run_id(run_df, model_type, noise_type=None):
    """Return the `info_run_id` of the first run matching the given filters.

    Args:
        run_df: run table with the `data_tags_*` / `data_params_*` columns used below.
        model_type: required value of `data_tags_model_type`.
        noise_type: required value of `data_tags_noise_type`; with `None` only
            runs whose noise tag is missing or empty are accepted.

    Only runs whose hidden-embeddings-trainable parameter is the string 'False'
    are considered. Raises IndexError when no run matches.
    """
    noise_tags = run_df['data_tags_noise_type'].fillna('')
    if noise_type is None:
        noise_matches = noise_tags.apply(len) == 0
    else:
        noise_matches = noise_tags == noise_type
    candidates = run_df[
        noise_matches &
        (run_df['data_tags_model_type'] == model_type) &
        (run_df['data_params_ModelConfigbase_hidden_embeddings_trainable'] == 'False')
    ]
    return candidates.iloc[0].get('info_run_id')


relevant_mimic_run_df = mlflow_helper.mimic_run_df(include_noise=True, include_refinements=False)

# "_00" ids: runs without a noise tag; "_10" ids: runs tagged with 0.1 added noise.
mimic_gram_false_00_run_id = _first_mimic_run_id(relevant_mimic_run_df, 'gram')
mimic_gram_false_10_run_id = _first_mimic_run_id(
    relevant_mimic_run_df, 'gram', noise_type='added0.1_removed0.0_threshold0.0')
mimic_text_false_00_run_id = _first_mimic_run_id(relevant_mimic_run_df, 'text')
mimic_text_false_10_run_id = _first_mimic_run_id(
    relevant_mimic_run_df, 'text', noise_type='added0.1_removed0.0_threshold0.0')
mimic_causal_false_00_run_id = _first_mimic_run_id(relevant_mimic_run_df, 'causal')
mimic_causal_false_10_run_id = _first_mimic_run_id(
    relevant_mimic_run_df, 'causal', noise_type='added0.1_removed0.0_threshold0.0')

print('Gram', mimic_gram_false_00_run_id, 'Text', mimic_text_false_00_run_id, 'Causal', mimic_causal_false_00_run_id)
print('NOISE 10%: Gram', mimic_gram_false_10_run_id, 'Text', mimic_text_false_10_run_id, 'Causal', mimic_causal_false_10_run_id)

In [None]:
relevant_huawei_run_df = mlflow_helper.huawei_run_df(include_noise=False, include_refinements=False)

# Pick the first "gram" run that has no noise tag and non-trainable hidden embeddings.
huawei_has_no_noise = relevant_huawei_run_df['data_tags_noise_type'].fillna('').apply(len) == 0
huawei_is_gram = relevant_huawei_run_df['data_tags_model_type'] == 'gram'
huawei_hidden_frozen = (
    relevant_huawei_run_df['data_params_ModelConfigbase_hidden_embeddings_trainable'] == 'False'
)
huawei_gram_false_00_run_id = (
    relevant_huawei_run_df[huawei_has_no_noise & huawei_is_gram & huawei_hidden_frozen]
    .iloc[0]
    .get('info_run_id')
)


In [None]:
# Build the attention graph visualization for the Huawei "gram" run selected above
# (no noise tag, hidden embeddings not trainable), using threshold 0.2.
# NOTE(review): run_name "huawe_gram" looks like a typo of "huawei_gram"; it is left
# unchanged because the name may be used for output artifacts - confirm before renaming.
create_graph_visualization(
    run_id=huawei_gram_false_00_run_id,
    local_mlflow_dir=mlflow_helper.local_mlflow_dir,
    threshold=0.2,
    run_name="huawe_gram",
    use_node_mapping=False
)

# Attention Weights

In [None]:
def calculate_shared_attention_weights(attention_weights: Dict[str, Dict[str, float]]):
    """Sum, per input feature, the attention weights that go to shared connections.

    A connected feature counts as shared when `calculate_attention_importances`
    lists more than one entry for it.

    Args:
        attention_weights: mapping input feature -> {connected feature -> weight};
            weights are converted with `float()`. May be None.

    Returns:
        One sum per input feature in iteration order of `attention_weights`,
        or `[0.0]` when `attention_weights` is None.
    """
    if attention_weights is None:
        return [0.0]

    importances = calculate_attention_importances(attention_weights)
    per_feature_sums = []
    for connections in attention_weights.values():
        shared_total = sum(
            float(weight)
            for connected, weight in connections.items()
            if len(importances[connected]) > 1
        )
        per_feature_sums.append(shared_total)
    return per_feature_sums

In [None]:
rel_runs = mlflow_helper.mimic_run_df()

# One record per distinct run id: the run id plus its list of shared attention sums.
shared_weights = [
    {
        "run_id": run_id,
        "shared_weights": calculate_shared_attention_weights(
            load_attention_weights(run_id=run_id, local_mlflow_dir=mlflow_helper.local_mlflow_dir)
        ),
    }
    for run_id in set(rel_runs["info_run_id"])
]

In [None]:
# Attach the per-run shared weights to the run table and summarise each list.
shared_df = rel_runs.merge(
    pd.DataFrame.from_records(shared_weights),
    left_on="info_run_id",
    right_on="run_id",
)
shared_df["avg_shared_weights"] = shared_df["shared_weights"].map(np.mean)
shared_df["median_shared_weights"] = shared_df["shared_weights"].map(np.median)
shared_df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Display names for the model types shown in the plot.
model_type_labels = {
    "gram": "hierarchy",
    "causal": "causal_old",
    "causal2": "causal",
}
# Relabel from a preserved copy of the raw tags. Relabeling the column in place
# is not idempotent: on a second run of this cell "causal" (mapped from
# "causal2") would be turned into "causal_old" and vanish from the plot.
if "data_tags_model_type_raw" not in shared_df.columns:
    shared_df["data_tags_model_type_raw"] = shared_df["data_tags_model_type"]
shared_df["data_tags_model_type"] = shared_df["data_tags_model_type_raw"].apply(
    lambda x: model_type_labels.get(x, x)
)
shared_df["Embeddings Trainable"] = shared_df["data_params_ModelConfigbase_hidden_embeddings_trainable"]

# Only these model types are plotted, in this order; one box per model type and hue.
plotted_model_types = ["hierarchy", "causal", "text"]
sns.catplot(
    data=shared_df[
        shared_df["data_tags_model_type"].isin(plotted_model_types)
    ].explode("shared_weights"),  # one row per individual shared weight
    x="data_tags_model_type",
    y="shared_weights",
    hue="Embeddings Trainable",
    kind="box",
    order=plotted_model_types,
    palette="Set2",
).set_axis_labels("", "shared attention importance")
plt.savefig("sharedimportances_trainable_healthcare.png", dpi=100, bbox_inches="tight")
plt.show()

In [None]:
import json

texts = load_icd9_text()


def _is_unknown_description(description):
    """Tell whether a description starts or ends with one of the catch-all phrases."""
    lowered = description.lower()
    return (
        lowered.startswith(("other", "unspecified"))
        or lowered.endswith(("unspecified", "unspecified type", "not elsewhere classified"))
    )


# Codes whose description is a catch-all ("other ...", "... unspecified", ...).
unknowns = {
    code for code, info in texts.items()
    if _is_unknown_description(info["description"])
}

attentions = load_attention_weights(
    mimic_gram_false_00_run_id,
    mlflow_helper.local_mlflow_dir
)
print(sum(len(connections) for connections in attentions.values()))

# Keep a connection when it is the feature itself or not one of the catch-all codes.
attentions_without_unknowns = {
    feature: [
        connected for connected in connections
        if connected == feature or connected not in unknowns
    ]
    for feature, connections in attentions.items()
}
print(sum(len(connections) for connections in attentions_without_unknowns.values()))

with open('gram_without_unknowns.json', 'w') as f:
    json.dump(attentions_without_unknowns, f)

In [None]:
import string

def transform_to_words(description: str) -> Set[str]:
    """Split a description into its set of lower-cased words.

    Every character of `string.punctuation` is replaced by a space before the
    text is split on whitespace, so punctuation never ends up inside a word.

    Args:
        description: free text to tokenize.

    Returns:
        The distinct non-empty words of `description`, lower-cased.
    """
    punctuation_to_space = {ord(symbol): " " for symbol in string.punctuation}
    without_punctuation = description.translate(punctuation_to_space)
    lowered_tokens = (str(token).lower().strip() for token in without_punctuation.split())
    return {token for token in lowered_tokens if len(token) > 0}

# (code, word set) for every described code that also has attention weights.
input_descriptions = [
    (code, transform_to_words(info["description"]))
    for code, info in texts.items()
    if code in attentions
]

# Maps the sorted, space-joined shared words of a pair of codes to all codes
# that take part in at least one pair with exactly that overlap.
word_overlaps = {}
for x, x_desc in tqdm(input_descriptions):
    for y, y_desc in input_descriptions:
        if x != y:
            word_overlap = x_desc & y_desc
            if len(word_overlap) > 0:
                overlap_string = " ".join(sorted(word_overlap))
                word_overlaps.setdefault(overlap_string, set()).update((x, y))

print(len(word_overlaps))
print(sum(len(ws) for ws in word_overlaps.values()))

In [None]:
max_size_diff = 0.2
max_intersection_diff = 0.25
cleaned_word_overlaps = {}
replacements = {}


def _is_near_duplicate(features, other_features):
    """Check whether `other_features` has a similar size and enough overlap with `features`."""
    size = len(features)
    other_size = len(other_features)
    if other_size > (1 + max_size_diff) * size:
        return False
    if other_size < (1 - max_size_diff) * size:
        return False
    return len(other_features.intersection(features)) >= max_intersection_diff * size


# Greedy pass in dictionary order: an overlap group is dropped when an already
# kept group is a near duplicate of it; the first such kept group records the
# dropped key in `replacements`.
for words, features in tqdm(word_overlaps.items()):
    for other_words, other_features in cleaned_word_overlaps.items():
        if _is_near_duplicate(features, other_features):
            replacements.setdefault(other_words, set()).add(words)
            break
    else:
        # No kept group was similar enough, so this group is kept itself.
        cleaned_word_overlaps[words] = features

print(len(cleaned_word_overlaps))
print(sum(len(ws) for ws in cleaned_word_overlaps.values()))

In [None]:
# Show the codes grouped under one specific overlap key. Plain indexing raises
# KeyError when the key is absent (e.g. because the group was merged into another one).
cleaned_word_overlaps["10 any body burn degree involving less of or percent surface than third unspecified with"]

In [None]:
print(len(word_overlaps))
print(sum([len(ws) for ws in word_overlaps.values()]))
# Keys of `word_overlaps` are joined from the output of `transform_to_words`,
# which replaces all punctuation (including parentheses) by spaces. A key such
# as "(acute) asthma exacerbation with" can therefore never exist, so the
# punctuation-free key is looked up; `.get` keeps the cell from failing when
# the data contains no such overlap.
word_overlaps.get("acute asthma exacerbation with", set())

In [None]:
# Build the attention graph visualization for the MIMIC "gram" run without noise
# (threshold 0.2) and keep the returned value as `feature_node_mapping`.
feature_node_mapping = create_graph_visualization(
    run_id=mimic_gram_false_00_run_id, 
    local_mlflow_dir=mlflow_helper.local_mlflow_dir,
    threshold=0.2, 
    run_name='mimic_gram_false_00', 
    use_node_mapping=False)

In [None]:
# Visualize the 10%-noise "gram" run against the noise-free "gram" run as reference.
# The call returns a pair; `feature_node_mapping` from the previous cell is overwritten.
# NOTE(review): run_name is 'mimic_gram_false_00', identical to the previous cell,
# although the visualized run is the "_10" one - confirm whether 'mimic_gram_false_10'
# was intended (outputs named after run_name might overwrite each other).
colored_connections, feature_node_mapping = create_graph_visualization_reference(
    run_id=mimic_gram_false_10_run_id, 
    reference_run_id=mimic_gram_false_00_run_id, 
    local_mlflow_dir=mlflow_helper.local_mlflow_dir,
    threshold=0.2, 
    run_name='mimic_gram_false_00', 
    use_node_mapping=False)

## Drain Hierarchy

In [None]:
huawei_run_df = mlflow_helper.huawei_run_df(include_drain_hierarchy=True)

# A run qualifies when the string form of its drain_log_sts parameter is longer
# than two characters (missing values are treated as "[]") and it is a "gram" model.
drain_log_sts_text = (
    huawei_run_df["data_params_HuaweiPreprocessorConfigdrain_log_sts"]
    .fillna("[]")
    .astype(str)
)
has_drain_log_sts = drain_log_sts_text.apply(len) > 2
is_gram_run = huawei_run_df["data_tags_model_type"] == "gram"

drain_run_id = huawei_run_df[has_drain_log_sts & is_gram_run]["info_run_id"].iloc[0]
drain_run_id

In [None]:
# Attention weights of the drain-hierarchy run and the importances derived from them.
# NOTE(review): other cells guard against load_attention_weights returning None;
# there is no such guard here.
aw = load_attention_weights(run_id=drain_run_id, local_mlflow_dir=mlflow_helper.local_mlflow_dir)
aimp = calculate_attention_importances(aw)

In [None]:
def _is_numbered_drain_template(name):
    """True for names containing "log_cluster_template" that start with a digit."""
    return "log_cluster_template" in name and name[0].isdigit()


# (template name, sources whose importance for that template exceeds 0.9)
drain_clusters = [
    (name, [source for source, importance in members if float(importance) > 0.9])
    for name, members in aimp.items()
    if _is_numbered_drain_template(name)
]
# Display only the templates with more than one such source.
[cluster for cluster in drain_clusters if len(cluster[1]) > 1]

In [None]:
# Show every drain cluster, including those with no or only one member above the threshold.
drain_clusters

In [None]:
# All connected features that refer to a drain cluster template.
drain_levels = [
    connected
    for connections in aw.values()
    for connected in connections
    if "log_cluster_template" in connected
]
# Number of distinct template features per level prefix 0, 1 and 2.
drain_levels_ = {
    level: len({name for name in drain_levels if f"{level}_log_cluster_template" in name})
    for level in range(3)
}

drain_levels_

In [None]:
# Build the attention graph visualization for the drain-hierarchy run (threshold 0.2).
# This overwrites `feature_node_mapping` from the earlier MIMIC cells.
feature_node_mapping = create_graph_visualization(
    run_id=drain_run_id, 
    local_mlflow_dir=mlflow_helper.local_mlflow_dir,
    threshold=0.2, 
    run_name='drain_hierarchy', 
    use_node_mapping=False)

In [None]:
# MIMIC runs requested without noise, refinements and risk prediction, with
# valid_x_columns set to the three hierarchy levels.
# NOTE(review): presumably valid_x_columns restricts the runs to those trained on
# one of these input columns - confirm in utils/mlflow_query.py.
mimic_df = mlflow_helper.mimic_run_df(include_noise=False, include_refinements=False, risk_prediction=False, valid_x_columns=["level_0", "level_1", "level_2"])
mimic_df

In [None]:
import numpy as np

# Average graph and vocabulary sizes per input level and model type.
# The string alias "mean" is used instead of the callable `np.mean`: passing
# numpy reducers to `agg` is deprecated since pandas 2.1 and the alias is the
# documented way to keep the same groupby mean.
mimic_df.groupby(by=["data_params_SequenceConfigx_sequence_column_name", "data_tags_model_type"]).agg({
    "data_metrics_num_connections": "mean",
    "data_metrics_x_vocab_size": "mean",
    "data_metrics_y_vocab_size": "mean",
})

In [None]:
# ICD9 hierarchy table; later cells read its "level_0" column together with the
# level column a run was trained on.
icd9_hierarchy = pd.read_csv('data/hierarchy_icd9.csv')
icd9_hierarchy

In [None]:
def load_icd9_hierarchy_parents_for_level(
                icd9_hierarchy: pd.DataFrame, 
                all_features: Set[str], 
                max_level: str) -> Dict[str, str]:
    """Map every feature to its parent in column `max_level` of the ICD9 hierarchy.

    Args:
        icd9_hierarchy: table with a "level_0" column holding the features and
            a column named `max_level` holding the corresponding parents.
        all_features: features (values of "level_0") to look up.
        max_level: name of the hierarchy column to take the parent from.

    Returns:
        Dictionary feature -> parent, in iteration order of `all_features`.
        If the hierarchy lists several different parents for a feature, a
        message is printed and the first one in table order is used.

    Raises:
        IndexError: if a feature does not occur in the "level_0" column.
    """
    # Index the hierarchy once instead of filtering the whole table per feature.
    wanted = set(all_features)
    parents_by_feature = {}
    for leaf, parent in zip(icd9_hierarchy["level_0"], icd9_hierarchy[max_level]):
        if leaf not in wanted:
            continue
        known_parents = parents_by_feature.setdefault(leaf, [])
        if parent not in known_parents:
            known_parents.append(parent)

    parent_infos = {}
    for feature in tqdm(all_features, desc="Processing icd9 hierarchy clusters for level " + max_level):
        parents = parents_by_feature.get(feature, [])
        if len(parents) == 0:
            # Same exception type as the former `list(parents)[0]`, but with context.
            raise IndexError(
                "Feature " + repr(feature) + " not found in column 'level_0' of the icd9 hierarchy"
            )
        if len(parents) > 1:
            print("Found more than one parent!", feature, set(parents))
        # First parent in table order, so the choice is reproducible across runs.
        parent_infos[feature] = parents[0]

    return parent_infos

def add_icd9_hierarchy_attention_weights_for_level(
                feature_parents: Dict[str, str],
                attention_weights: Dict[str, Dict[str, float]]) -> Dict[str, Dict[str, float]]:
    """Build attention weights for every feature in `feature_parents`.

    For each feature the weights are taken, in this order of preference, from
    the feature's own entry in `attention_weights`, from its parent's entry, or
    from a fresh `{parent: 1.0}` entry. Existing entries are reused as the same
    dictionary objects (not copied).

    Args:
        feature_parents: mapping feature -> parent.
        attention_weights: mapping feature -> {connected feature -> weight}.

    Returns:
        Mapping with exactly the keys of `feature_parents`.
    """
    extended_weights = {}
    for feature, parent in feature_parents.items():
        lookup_key = feature if feature in attention_weights else parent
        if lookup_key in attention_weights:
            extended_weights[feature] = attention_weights[lookup_key]
        else:
            extended_weights[feature] = {parent: 1.0}
    return extended_weights

In [None]:
# Reference run: first run trained on "level_0" inputs whose model type is not "simple".
trained_on_level_0 = mimic_df["data_params_SequenceConfigx_sequence_column_name"] == "level_0"
not_simple_model = mimic_df["data_tags_model_type"] != "simple"
reference_run_id = list(mimic_df[trained_on_level_0 & not_simple_model]["info_run_id"])[0]

reference_attention = load_attention_weights(reference_run_id, mlflow_helper.local_mlflow_dir)
# The input features of the reference run define the feature universe for later cells.
all_features = set(reference_attention)
len(all_features)

In [None]:
# Minimum importance for an input to count as a member of a cluster.
CLUSTER_IMPORTANCE_THRESHOLD = 0.9


def _safe_ratio(count, total):
    """Return `count / total`, or NaN when `total` is 0 (instead of raising)."""
    return count / total if total > 0 else float("nan")


def _cluster_statistics(attention_weights, suffix=""):
    """Summarise the clusters found in one set of attention weights.

    A cluster is a key of `calculate_attention_importances(attention_weights)`
    together with those of its entries whose second element exceeds
    CLUSTER_IMPORTANCE_THRESHOLD; the first element of an entry is treated as
    the clustered input. Clusters with more than one member are "shared",
    clusters with exactly one member are "single".

    Args:
        attention_weights: mapping input feature -> {connected feature -> weight}.
        suffix: appended to every key of the returned dictionary.

    Returns:
        Counts and ratios (`*_p`) of inputs and clusters; ratios are NaN when
        their denominator is 0.
    """
    importances = calculate_attention_importances(attention_weights)
    clusters_around = {}
    for center, members in importances.items():
        strong_members = [member for member in members if member[1] > CLUSTER_IMPORTANCE_THRESHOLD]
        if len(strong_members) > 0:
            clusters_around[center] = strong_members
    shared_clusters = {c: ms for c, ms in clusters_around.items() if len(ms) > 1}
    single_clusters = {c: ms for c, ms in clusters_around.items() if len(ms) == 1}

    def _inputs_of(clusters):
        return {member[0] for members in clusters.values() for member in members}

    all_inputs = set(attention_weights.keys())
    clustered_inputs = _inputs_of(clusters_around)
    shared_clustered_inputs = _inputs_of(shared_clusters)
    single_clustered_inputs = _inputs_of(single_clusters)
    non_clustered_inputs = all_inputs - clustered_inputs

    num_inputs = len(all_inputs)
    num_clusters = len(clusters_around)
    statistics = {
        'all_inputs': num_inputs,
        'clustered_inputs': len(clustered_inputs),
        'clustered_inputs_p': _safe_ratio(len(clustered_inputs), num_inputs),
        'shared_clustered_inputs': len(shared_clustered_inputs),
        'shared_clustered_inputs_p': _safe_ratio(len(shared_clustered_inputs), num_inputs),
        'single_clustered_inputs': len(single_clustered_inputs),
        'single_clustered_inputs_p': _safe_ratio(len(single_clustered_inputs), num_inputs),
        'non_clustered_inputs': len(non_clustered_inputs),
        'non_clustered_inputs_p': _safe_ratio(len(non_clustered_inputs), num_inputs),
        'clusters': num_clusters,
        'shared_clusters': len(shared_clusters),
        'shared_clusters_p': _safe_ratio(len(shared_clusters), num_clusters),
        'single_clusters': len(single_clusters),
        'single_clusters_p': _safe_ratio(len(single_clusters), num_clusters),
    }
    return {name + suffix: value for name, value in statistics.items()}


cluster_infos = []

for level in set(mimic_df["data_params_SequenceConfigx_sequence_column_name"]):
    icd9_parents = load_icd9_hierarchy_parents_for_level(
                icd9_hierarchy=icd9_hierarchy, 
                all_features=all_features, 
                max_level=level)

    for run_id in set(
        mimic_df[
            (mimic_df["data_params_SequenceConfigx_sequence_column_name"] == level)
        ]["info_run_id"]
    ):
        original_attention = load_attention_weights(run_id, mlflow_helper.local_mlflow_dir)
        if original_attention is None:
            original_attention = {}

        # Attention expressed for all reference features (see
        # add_icd9_hierarchy_attention_weights_for_level for the fallbacks).
        attention = add_icd9_hierarchy_attention_weights_for_level(
                feature_parents=icd9_parents,
                attention_weights=original_attention)

        # Runs without stored attention are described by self-attention of the parents.
        if len(original_attention) == 0:
            original_attention = {
                x:{x:1.0}
                for x in icd9_parents.values()
            }

        # Keys without suffix describe the extended attention, keys ending in
        # "_o" the run's original attention.
        cluster_infos.append({
            'run_id': run_id,
            **_cluster_statistics(attention),
            **_cluster_statistics(original_attention, suffix="_o"),
        })

pd.DataFrame.from_records(cluster_infos)

In [None]:
# All records share the same keys; the first record is used so that the cell
# also works when only a single run was collected (index 1 would not exist).
added_columns = cluster_infos[0].keys()
# Long format: one row per (run, statistic), joined with the run metadata.
merged = pd.merge(
    pd.melt(pd.DataFrame.from_records(cluster_infos), id_vars="run_id", value_vars=[x for x in added_columns if x != "run_id"]),
    mimic_df,
    left_on="run_id",
    right_on="info_run_id",)
merged[["variable", "value", "data_tags_model_type"]]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One panel per statistic (columns) and embeddings-trainable setting (rows);
# boxes are grouped by input level and coloured by model type.
f = sns.catplot(
    data=merged,
    kind="box",
    x="data_params_SequenceConfigx_sequence_column_name",
    y="value",
    hue="data_tags_model_type",
    col="variable",
    row="data_params_ModelConfigbase_hidden_embeddings_trainable",
    order=["level_0", "level_1", "level_2"],
    sharey=False,
)
f.set_titles("Trainable: {row_name}, Metric: {col_name}")
plt.show()

In [None]:
# Same layout as the full plot, restricted to the three clustered-input ratios.
ratio_variables = ["clustered_inputs_p", "shared_clustered_inputs_p", "single_clustered_inputs_p"]
f = sns.catplot(
    data=merged[merged["variable"].isin(ratio_variables)],
    kind="box",
    x="data_params_SequenceConfigx_sequence_column_name",
    y="value",
    hue="data_tags_model_type",
    col="variable",
    row="data_params_ModelConfigbase_hidden_embeddings_trainable",
    order=["level_0", "level_1", "level_2"],
    sharey=False,
)
f.set_titles("Trainable: {row_name}, Metric: {col_name}")
plt.show()

In [None]:
def calculate_clusters(run_id, local_mlflow_dir, icd9_parents, threshold=0.9):
    """Compute the attention clusters of one run.

    The run's attention weights (an empty mapping if none could be loaded) are
    extended via `add_icd9_hierarchy_attention_weights_for_level`. A cluster is
    a key of the resulting attention importances together with the inputs whose
    importance exceeds `threshold`; keys without such inputs are dropped.

    Args:
        run_id: run whose attention weights are loaded.
        local_mlflow_dir: directory passed on to `load_attention_weights`.
        icd9_parents: mapping feature -> parent used to extend the attention.
        threshold: minimum importance (exclusive) for cluster membership.

    Returns:
        Dictionary with the clusters ("clusters_around", "shared_clusters" with
        more than one member, "single_clusters" with exactly one member) and
        the corresponding input sets, plus "non_clustered_inputs".
    """
    loaded_attention = load_attention_weights(run_id, local_mlflow_dir)
    attention = add_icd9_hierarchy_attention_weights_for_level(
            feature_parents=icd9_parents,
            attention_weights={} if loaded_attention is None else loaded_attention)

    clusters_around, shared_clusters, single_clusters = {}, {}, {}
    for center, entries in calculate_attention_importances(attention).items():
        members = [entry[0] for entry in entries if entry[1] > threshold]
        if len(members) == 0:
            continue
        clusters_around[center] = members
        if len(members) > 1:
            shared_clusters[center] = members
        else:
            single_clusters[center] = members

    def _members_of(clusters):
        return {member for members in clusters.values() for member in members}

    clustered_inputs = _members_of(clusters_around)
    return {
        "clusters_around": clusters_around,
        "shared_clusters": shared_clusters,
        "single_clusters": single_clusters,
        "clustered_inputs": clustered_inputs,
        "non_clustered_inputs": set(attention.keys()) - clustered_inputs,
        "shared_clustered_inputs": _members_of(shared_clusters),
        "single_clustered_inputs": _members_of(single_clusters),
    }
    

def compare_clusters(run_id_1, run_id_2, local_mlflow_dir, icd9_parents_1, icd9_parents_2, cluster_threshold=0.99):
    """Compare the attention clusters of two runs.

    Both runs are clustered with `calculate_clusters` (using its default
    membership threshold). A cluster of run 1 counts as "same" when some
    cluster of run 2 of the same kind has a Jaccard similarity (size of the
    intersection divided by size of the union of the members) above
    `cluster_threshold`.

    Returns:
        Dictionary holding the clusters of each run under its run id, the input
        sets common to both runs ("same_*_inputs") and the clusters of run 1
        that have a match in run 2 ("same_clusters", "same_shared_clusters",
        "same_single_clusters").
    """
    clusters_1 = calculate_clusters(run_id_1, local_mlflow_dir, icd9_parents_1)
    clusters_2 = calculate_clusters(run_id_2, local_mlflow_dir, icd9_parents_2)

    def _jaccard(members, other_members):
        left, right = set(members), set(other_members)
        return len(right.intersection(left)) / len(left.union(right))

    def _matched_clusters(kind):
        candidates = clusters_2[kind].values()
        return [
            members
            for members in clusters_1[kind].values()
            if any(_jaccard(members, other) > cluster_threshold for other in candidates)
        ]

    def _common_inputs(kind):
        return clusters_1[kind].intersection(clusters_2[kind])

    result = {run_id_1: clusters_1, run_id_2: clusters_2}
    result["same_clustered_inputs"] = _common_inputs("clustered_inputs")
    result["same_nonclustered_inputs"] = _common_inputs("non_clustered_inputs")
    result["same_shared_clustered_inputs"] = _common_inputs("shared_clustered_inputs")
    result["same_single_clustered_inputs"] = _common_inputs("single_clustered_inputs")
    result["same_clusters"] = _matched_clusters("clusters_around")
    result["same_shared_clusters"] = _matched_clusters("shared_clusters")
    result["same_single_clusters"] = _matched_clusters("single_clusters")
    return result

In [None]:
comparisons = []
level_parents = {}
mimic_run_ids = set(mimic_df["info_run_id"])
counted_comparison_keys = (
    "same_clusters",
    "same_shared_clusters",
    "same_single_clusters",
    "same_clustered_inputs",
    "same_nonclustered_inputs",
    "same_shared_clustered_inputs",
    "same_single_clustered_inputs",
)


def _sequence_level_of(run_id):
    """Return the input column (hierarchy level) the given run was trained on."""
    return mimic_df[mimic_df["info_run_id"] == run_id]["data_params_SequenceConfigx_sequence_column_name"].iloc[0]


def _cached_parents_for(level):
    """Return the feature -> parent mapping for a level, loading it on first use."""
    if level not in level_parents:
        level_parents[level] = load_icd9_hierarchy_parents_for_level(
            icd9_hierarchy=icd9_hierarchy,
            all_features=all_features,
            max_level=level)
    return level_parents[level]


# Compare every run with every run (including itself) and record the sizes of
# the matching clusters and input sets.
for run_id_1 in mimic_run_ids:
    level_1 = _sequence_level_of(run_id_1)
    icd9_parents_1 = _cached_parents_for(level_1)
    for run_id_2 in mimic_run_ids:
        level_2 = _sequence_level_of(run_id_2)
        icd9_parents_2 = _cached_parents_for(level_2)
        comparison = compare_clusters(run_id_1, run_id_2, mlflow_helper.local_mlflow_dir, icd9_parents_1, icd9_parents_2, cluster_threshold=0.9)
        record = {"run_id_1": run_id_1, "run_id_2": run_id_2}
        record.update((key, len(comparison[key])) for key in counted_comparison_keys)
        comparisons.append(record)

pd.DataFrame.from_records(comparisons)

In [None]:
level_1 = "level_2"
level_2 = "level_0"
comp_1 = "simple"
comp_2 = "gram"

icd9_parents_1 = load_icd9_hierarchy_parents_for_level(
    icd9_hierarchy=icd9_hierarchy,
    all_features=all_features,
    max_level=level_1)
icd9_parents_2 = load_icd9_hierarchy_parents_for_level(
    icd9_hierarchy=icd9_hierarchy,
    all_features=all_features,
    max_level=level_2)

# Shared filter parts for picking the two runs.
# NOTE(review): this cell filters on "..._feature_embeddings_trainable" while the
# earlier cells use "..._hidden_embeddings_trainable" - confirm this is intended.
mimic_sequence_level = mimic_df["data_params_SequenceConfigx_sequence_column_name"]
mimic_model_type = mimic_df["data_tags_model_type"]
mimic_feature_embeddings_frozen = (
    mimic_df["data_params_ModelConfigbase_feature_embeddings_trainable"] == "False"
)

run_id_1 = mimic_df[
    (mimic_sequence_level == level_1)
    & (mimic_model_type == comp_1)
    & mimic_feature_embeddings_frozen
]["info_run_id"].iloc[0]
# The second run must differ from the first one.
run_id_2 = mimic_df[
    (mimic_sequence_level == level_2)
    & (mimic_model_type == comp_2)
    & mimic_feature_embeddings_frozen
    & (mimic_df["info_run_id"] != run_id_1)
]["info_run_id"].iloc[0]

ccomparison = compare_clusters(run_id_1, run_id_2, mlflow_helper.local_mlflow_dir, icd9_parents_1, icd9_parents_2, cluster_threshold=0.9)
len(ccomparison["same_clusters"])

In [None]:
# Number of inputs that are clustered in both compared runs.
len(ccomparison["same_clustered_inputs"])

In [None]:
# Compare the two selected runs; the suffixes are built from model type and level
# and are used below to address the per-run columns of the comparison.
# The feature -> parent mapping of the first run's level is passed as feature_replacements.
# NOTE(review): this rebinds `comparison`, which an earlier cell used for the result
# of compare_clusters.
comparison = Comparison(
    run_id_1=run_id_1, 
    suffix_1="_" + comp_1 + level_1, 
    run_id_2=run_id_2, 
    suffix_2="_" + comp_2 + level_2, 
    local_mlflow_dir=mlflow_helper.local_mlflow_dir,
    num_percentiles=10,
    feature_replacements=icd9_parents_1)
plot_rank_comparison(comparison)
plot_outlier_distances(comparison)
# The ICD9 descriptions are loaded again here and passed as `descriptions`.
analyse_best_worst_sequences(comparison, num_best_sequences=1, num_worst_sequences=1, descriptions=load_icd9_text())

In [None]:
# Rank comparison coloured by the input-frequency percentile of the first run,
# then of the second run; both percentile columns are shown on hover.
frequency_percentile_1 = "avg_input_frequencies_percentile" + comparison.suffix_1
frequency_percentile_2 = "avg_input_frequencies_percentile" + comparison.suffix_2

plot_rank_comparison(
    comparison,
    color=frequency_percentile_1,
    hover_data=[frequency_percentile_1, frequency_percentile_2],
)
plot_rank_comparison(
    comparison,
    color=frequency_percentile_2,
    hover_data=[frequency_percentile_1, frequency_percentile_2],
)

In [None]:
# Plot the comparison using the "avg_input_frequencies" column.
plot_comparison(comparison,
    plot_column="avg_input_frequencies")

In [None]:
# Inspect the row with the largest index label of the comparison table.
index = max(comparison.comparison_df.index)
selected_row = comparison.comparison_df.loc[index]

display(selected_row["input" + comparison.suffix_1])
display(selected_row["input" + comparison.suffix_2])
display(selected_row[[
    "output_rank_noties" + comparison.suffix_1,
    "output_rank_noties" + comparison.suffix_2,
    "avg_input_frequencies" + comparison.suffix_1,
    "avg_input_frequencies" + comparison.suffix_2,
    "outlier_distance"]])

# For each run, show the attention weights of every original input of the row
# that has an entry. The loop variable is deliberately not called `input`, which
# would shadow the builtin for the rest of the notebook session.
for suffix in (comparison.suffix_1, comparison.suffix_2):
    print(suffix)
    # Looked up once per run; assumes attention_weights_for returns the same
    # mapping on every call for a given suffix.
    suffix_attention_weights = comparison.attention_weights_for(suffix)
    for original_input in selected_row["original_inputs" + suffix].split(','):
        feature_name = original_input.strip()
        if feature_name in suffix_attention_weights:
            display(suffix_attention_weights.get(feature_name))