In [None]:
import json
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Put the parent directory on the import path so that the `src` package resolves.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from src.refinement import knowledge

In [None]:
# Execute the helper scripts inside the notebook namespace (%run makes the names they
# define available to later cells).
# NOTE(review): helpers used below without an import (e.g. MlflowHelper,
# load_attention_weights, plot_refinement_improvement) presumably come from these
# scripts - confirm.
%run utils/attention_graph.py
%run utils/mlflow_query.py
%run utils/percentiles.py
%run utils/loading.py
%run utils/comparison.py
%run utils/refinement.py

In [None]:
# Helper used by every later cell to look up runs and metrics.
# NOTE(review): presumably reads cached run metadata from the given pickle file - confirm.
mlflow_helper = MlflowHelper(pkl_file=Path("mlflow_run_df.pkl"))
# Left disabled; NOTE(review): presumably re-queries all runs to refresh the cache - confirm.
#mlflow_helper.query_all_runs(query_metrics=False)

# Experiment Results

## Mimic

In [None]:
# MIMIC runs (refinements included, noise excluded) that carry a non-empty refinement tag.
relevant_mimic_ref_df = mlflow_helper.mimic_run_df(include_noise=False, include_refinements=True)
relevant_mimic_ref_df = relevant_mimic_ref_df.loc[
    relevant_mimic_ref_df["data_tags_refinement_type"].fillna("").astype(str).ne("")
].copy()
# The tag is split at its first underscore: the prefix becomes `refinement_run`,
# everything after it becomes `refinement_type`.
relevant_mimic_ref_df["refinement_run"] = relevant_mimic_ref_df["data_tags_refinement_type"].apply(
    lambda tag: tag.partition("_")[0]
)
relevant_mimic_ref_df["refinement_type"] = relevant_mimic_ref_df["data_tags_refinement_type"].apply(
    lambda tag: tag.partition("_")[2]
)
relevant_mimic_ref_df

In [None]:
# Best metrics for the selected MIMIC refinement runs.
mimic_accuracy_df = mlflow_helper.load_best_metrics_for_ids(
    run_ids=set(relevant_mimic_ref_df["info_run_id"])
)
# Tag is split at the first underscore: prefix -> run, remainder -> type.
mimic_accuracy_df["refinement_run"] = mimic_accuracy_df["data_tags_refinement_type"].apply(
    lambda tag: tag.partition("_")[0]
)
mimic_accuracy_df["refinement_type"] = mimic_accuracy_df["data_tags_refinement_type"].apply(
    lambda tag: tag.partition("_")[2]
)
# Numeric rank per refinement stage, used to sort the x axis of the line plots.
mimic_accuracy_df["refinement_type_order"] = mimic_accuracy_df["refinement_type"].replace(
    {
        stage: rank
        for rank, stage in enumerate(
            ["reference", "original", "refinement_0", "refinement_1", "refinement_2"]
        )
    }
)

mimic_accuracy_df

In [None]:
# Best validation top-20 accuracy per refinement stage for MIMIC runs that have a
# reference knowledge file and edges_to_add == 0.0 (missing values count as 0.0).
# One line is drawn per refinement run, coloured by the original knowledge file.
g = sns.lineplot(
    data=mimic_accuracy_df.loc[
        lambda runs: (
            runs['data_params_RefinementConfigreference_file_knowledge'].fillna('').apply(len).gt(0)
            & runs['data_params_RefinementConfigedges_to_add'].fillna(0.0).astype(float).eq(0.0)
            & runs['val_top_20_categorical_accuracy_history_best'].fillna(-1).gt(0)
        )
    ].sort_values(by="refinement_type_order"),
    x="refinement_type",
    y="val_top_20_categorical_accuracy_history_best",
    hue="data_params_RefinementConfigoriginal_file_knowledge",
    units="refinement_run",
    estimator=None,
    sort=False,
)
# Place the legend outside the axes, anchored at the top-right corner.
plt.legend(loc="upper left", bbox_to_anchor=(1.05, 1), borderaxespad=0.0)
plt.show()

In [None]:
# Same plot as for edges_to_add == 0.0, restricted to MIMIC runs with
# edges_to_add == 0.1 (missing values are excluded via the -1 fill value).
g = sns.lineplot(
    data=mimic_accuracy_df.loc[
        lambda runs: (
            runs['data_params_RefinementConfigreference_file_knowledge'].fillna('').apply(len).gt(0)
            & runs['data_params_RefinementConfigedges_to_add'].fillna(-1).astype(float).eq(0.1)
            & runs['val_top_20_categorical_accuracy_history_best'].fillna(-1).gt(0)
        )
    ].sort_values(by="refinement_type_order"),
    x="refinement_type",
    y="val_top_20_categorical_accuracy_history_best",
    hue="data_params_RefinementConfigoriginal_file_knowledge",
    units="refinement_run",
    estimator=None,
    sort=False,
)
# Place the legend outside the axes, anchored at the top-right corner.
plt.legend(loc="upper left", bbox_to_anchor=(1.05, 1), borderaxespad=0.0)
plt.show()

In [None]:
# Accuracies per percentile bucket for the selected MIMIC refinement runs.
mimic_accuracy_df_p = calculate_accuracies_per_percentiles(
    relevant_run_df=relevant_mimic_ref_df,
    k=20,
    num_percentiles=10,
    num_input_percentiles=10,
    percentile_names=[
        *(f"{stat}_input_frequencies_percentile" for stat in ("avg", "median", "min", "p10")),
        "unknown_inputs_percentile",
        "output_frequency_percentile",
        *(f"{stat}_input_frequencies_range" for stat in ("avg", "median", "min", "p10")),
        "unknown_inputs_range",
    ],
    local_mlflow_dir=mlflow_helper.local_mlflow_dir,
)
mimic_accuracy_df_p

In [None]:
# MIMIC runs with edges_to_add "0.0" (missing counts as "0.0"), compared against "original".
plot_refinement_improvement(
    accuracy_df=mimic_accuracy_df_p,
    refinement_df=relevant_mimic_ref_df.loc[
        relevant_mimic_ref_df["data_params_RefinementConfigedges_to_add"].fillna("0.0").eq("0.0")
    ],
    reference_refinement_type="original",
)

In [None]:
# MIMIC runs with edges_to_add "0.1", compared against "reference".
plot_refinement_improvement(
    accuracy_df=mimic_accuracy_df_p,
    refinement_df=relevant_mimic_ref_df.loc[
        relevant_mimic_ref_df["data_params_RefinementConfigedges_to_add"].fillna("0.0").eq("0.1")
    ],
    reference_refinement_type="reference",
)

### GRAM without unknowns

In [None]:
def load_icd9_text(path="../data/icd9.csv"):
    """Load ICD9 code descriptions from a CSV file.

    Args:
        path: CSV file containing at least the columns ``child_name`` and
            ``child_code``. Defaults to the location this notebook has always read.

    Returns:
        dict mapping each ``child_code`` to ``{"description": child_name}``.
        When a code appears with several names, the first one in file order is kept.
    """
    # NOTE(review): codes are parsed with pandas' default dtype inference; if the file
    # holds purely numeric-looking codes they will not be strings - confirm against the data.
    icd9_df = pd.read_csv(path)
    return (
        icd9_df[["child_name", "child_code"]]
        # Keep one row per code: to_dict("index") raises ValueError on a non-unique
        # index, which de-duplicating on (name, code) pairs does not rule out.
        .drop_duplicates(subset="child_code")
        .rename(columns={"child_name": "description", "child_code": "code"})
        .set_index("code")
        .to_dict("index")
    )

In [None]:
mimic_df = mlflow_helper.mimic_run_df(include_noise=False, include_refinements=False)

# First GRAM run without a noise tag and with base_hidden_embeddings_trainable == 'False'.
mimic_gram_run_id = mimic_df.loc[
    mimic_df['data_tags_noise_type'].fillna('').apply(len).eq(0)
    & mimic_df['data_params_ModelConfigbase_hidden_embeddings_trainable'].eq('False')
    & mimic_df['data_tags_model_type'].eq('gram')
].iloc[0].get('info_run_id')

texts = load_icd9_text()
# Variant 1: codes whose description starts or ends with one of the listed phrases.
unknowns = {
    code
    for code, entry in texts.items()
    if entry["description"].lower().startswith(("other", "unspecified"))
    or entry["description"].lower().endswith(
        ("unspecified", "unspecified type", "not elsewhere classified")
    )
}

attentions = load_attention_weights(mimic_gram_run_id, mlflow_helper.local_mlflow_dir)
print(sum(len(parents) for parents in attentions.values()))
# Drop every parent flagged as unknown, but always keep a node's entry for itself.
attentions_without_unknowns = {
    child: [parent for parent in parents if parent not in unknowns or child == parent]
    for child, parents in attentions.items()
}
print(sum(len(parents) for parents in attentions_without_unknowns.values()))
with open('gram_without_unknowns.json', 'w') as f:
    json.dump(attentions_without_unknowns, f)

# Variant 2: codes whose description contains one of the keywords anywhere.
unknowns2 = {
    code
    for code, entry in texts.items()
    if any(keyword in entry["description"].lower() for keyword in ["other", "unspecified", "elsewhere"])
}

attentions_without_unknowns2 = {
    child: [parent for parent in parents if parent not in unknowns2 or child == parent]
    for child, parents in attentions.items()
}
print(sum(len(parents) for parents in attentions_without_unknowns2.values()))
with open('gram_without_unknowns2.json', 'w') as f:
    json.dump(attentions_without_unknowns2, f)

# Variant 3: like variant 2, and a child that is itself flagged keeps only itself.
attentions_without_unknowns3 = {
    child: (
        [child]
        if child in unknowns2
        else [parent for parent in parents if parent not in unknowns2 or child == parent]
    )
    for child, parents in attentions.items()
}
print(sum(len(parents) for parents in attentions_without_unknowns3.values()))
with open('gram_without_unknowns3.json', 'w') as f:
    json.dump(attentions_without_unknowns3, f)

## Huawei

In [None]:
# Huawei runs (refinements included, noise excluded) that carry a non-empty refinement tag.
relevant_huawei_ref_df = mlflow_helper.huawei_run_df(include_noise=False, include_refinements=True)
relevant_huawei_ref_df = relevant_huawei_ref_df.loc[
    relevant_huawei_ref_df["data_tags_refinement_type"].fillna("").astype(str).ne("")
].copy()
# The tag is split at its first underscore: the prefix becomes `refinement_run`,
# everything after it becomes `refinement_type`.
relevant_huawei_ref_df["refinement_run"] = relevant_huawei_ref_df["data_tags_refinement_type"].apply(
    lambda tag: tag.partition("_")[0]
)
relevant_huawei_ref_df["refinement_type"] = relevant_huawei_ref_df["data_tags_refinement_type"].apply(
    lambda tag: tag.partition("_")[2]
)
relevant_huawei_ref_df

In [None]:
# Best metrics for the selected Huawei refinement runs.
huawei_accuracy_df = mlflow_helper.load_best_metrics_for_ids(
    run_ids=set(relevant_huawei_ref_df["info_run_id"])
)
# Tag is split at the first underscore: prefix -> run, remainder -> type.
huawei_accuracy_df["refinement_run"] = huawei_accuracy_df["data_tags_refinement_type"].apply(
    lambda tag: tag.partition("_")[0]
)
huawei_accuracy_df["refinement_type"] = huawei_accuracy_df["data_tags_refinement_type"].apply(
    lambda tag: tag.partition("_")[2]
)
# Numeric rank per refinement stage, used to sort the x axis of the line plots.
huawei_accuracy_df["refinement_type_order"] = huawei_accuracy_df["refinement_type"].replace(
    {
        stage: rank
        for rank, stage in enumerate(
            ["reference", "original", "refinement_0", "refinement_1", "refinement_2"]
        )
    }
)
huawei_accuracy_df

In [None]:
# Best validation top-5 accuracy per refinement stage for Huawei runs that have a
# reference knowledge file and edges_to_add <= 0 (missing values are filled with -1
# and therefore included). One line per refinement run, coloured by original knowledge file.
g = sns.lineplot(
    data=huawei_accuracy_df.loc[
        lambda runs: (
            runs['data_params_RefinementConfigreference_file_knowledge'].fillna('').apply(len).gt(0)
            & runs['data_params_RefinementConfigedges_to_add'].fillna(-1).astype(float).le(0)
            & runs['val_top_5_categorical_accuracy_history_best'].fillna(-1).gt(0)
        )
    ].sort_values(by="refinement_type_order"),
    x="refinement_type",
    y="val_top_5_categorical_accuracy_history_best",
    hue="data_params_RefinementConfigoriginal_file_knowledge",
    units="refinement_run",
    estimator=None,
    sort=False,
)
# Place the legend outside the axes, anchored at the top-right corner.
plt.legend(loc="upper left", bbox_to_anchor=(1.05, 1), borderaxespad=0.0)
plt.show()

In [None]:
# Same plot as for edges_to_add <= 0, restricted to Huawei runs with
# edges_to_add == 0.1 (missing values are excluded via the -1 fill value).
g = sns.lineplot(
    data=huawei_accuracy_df.loc[
        lambda runs: (
            runs['data_params_RefinementConfigreference_file_knowledge'].fillna('').apply(len).gt(0)
            & runs['data_params_RefinementConfigedges_to_add'].fillna(-1).astype(float).eq(0.1)
            & runs['val_top_5_categorical_accuracy_history_best'].fillna(-1).gt(0)
        )
    ].sort_values(by="refinement_type_order"),
    x="refinement_type",
    y="val_top_5_categorical_accuracy_history_best",
    hue="data_params_RefinementConfigoriginal_file_knowledge",
    units="refinement_run",
    estimator=None,
    sort=False,
)
# Place the legend outside the axes, anchored at the top-right corner.
plt.legend(loc="upper left", bbox_to_anchor=(1.05, 1), borderaxespad=0.0)
plt.show()

In [None]:
# Accuracies per percentile bucket for the selected Huawei refinement runs.
# NOTE(review): k=20 is passed here although the Huawei accuracy plots in this notebook
# read the top-5 metric (val_top_5_categorical_accuracy_history_best) - confirm k=20 is intended.
huawei_accuracy_df_p = calculate_accuracies_per_percentiles(
    relevant_run_df=relevant_huawei_ref_df,
    k=20,
    num_percentiles=10,
    num_input_percentiles=10,
    percentile_names=[
        *(f"{stat}_input_frequencies_percentile" for stat in ("avg", "median", "min", "p10")),
        "unknown_inputs_percentile",
        "output_frequency_percentile",
        *(f"{stat}_input_frequencies_range" for stat in ("avg", "median", "min", "p10")),
        "unknown_inputs_range",
    ],
    local_mlflow_dir=mlflow_helper.local_mlflow_dir,
)
huawei_accuracy_df_p

In [None]:
# Huawei runs with edges_to_add "0.0" (missing counts as "0.0"), compared against "original".
# NOTE(review): the parameter is compared as a string here, whereas the Huawei line plots
# filter the same column with astype(float) <= 0 - confirm both select the same runs.
plot_refinement_improvement(
    accuracy_df=huawei_accuracy_df_p,
    refinement_df=relevant_huawei_ref_df.loc[
        relevant_huawei_ref_df["data_params_RefinementConfigedges_to_add"].fillna("0.0").eq("0.0")
    ],
    reference_refinement_type="original",
)

In [None]:
# Huawei runs with edges_to_add "0.1", compared against "reference".
plot_refinement_improvement(
    accuracy_df=huawei_accuracy_df_p,
    refinement_df=relevant_huawei_ref_df.loc[
        relevant_huawei_ref_df["data_params_RefinementConfigedges_to_add"].fillna("0.0").eq("0.1")
    ],
    reference_refinement_type="reference",
)

# Graph Plotting

In [None]:
# One example refinement run per scenario and knowledge type: the first row whose
# original knowledge file and edges_to_add value (as a string, missing -> '0.0') match.
mimic_example_runs = {
    scenario: {
        knowledge_type: relevant_mimic_ref_df[
            (relevant_mimic_ref_df['data_params_RefinementConfigoriginal_file_knowledge']
             == f'data/{knowledge_type}_original_file_knowledge.json') &
            (relevant_mimic_ref_df['data_params_RefinementConfigedges_to_add'].fillna('0.0')
             == edges_to_add)
        ]['refinement_run'].iloc[0]
        for knowledge_type in ('gram', 'text', 'causal')
    }
    for scenario, edges_to_add in (('edges_added', '0.1'), ('edges_removed', '0.0'))
}

mimic_example_runs

In [None]:
# One example refinement run per scenario and knowledge type for the Huawei data.
huawei_example_runs = {
    # edges_to_add == 0.1; first matching row in frame order.
    'edges_added': {
        knowledge_type: relevant_huawei_ref_df[
            (relevant_huawei_ref_df['data_params_RefinementConfigoriginal_file_knowledge']
             == f'data/huawei_{knowledge_type}_original_file_knowledge.json') &
            (relevant_huawei_ref_df['data_params_RefinementConfigedges_to_add'].fillna(-1).astype(float) == 0.1)
        ]['refinement_run'].iloc[0]
        for knowledge_type in ('gram', 'text', 'causal')
    },
    # edges_to_add <= 0 (missing filled with -1); first matching row by info_start_time.
    'edges_removed': {
        knowledge_type: relevant_huawei_ref_df[
            (relevant_huawei_ref_df['data_params_RefinementConfigoriginal_file_knowledge']
             == f'data/huawei_{knowledge_type}_original_file_knowledge.json') &
            (relevant_huawei_ref_df['data_params_RefinementConfigedges_to_add'].fillna(-1).astype(float) <= 0)
        ].sort_values(by="info_start_time")['refinement_run'].iloc[0]
        for knowledge_type in ('gram', 'text', 'causal')
    },
}

huawei_example_runs

In [None]:
class RefinementConfig:
    """Notebook-local settings object handed to ``knowledge.KnowledgeProcessor``.

    Cells in this notebook create an instance and override individual attributes
    before loading refined knowledge.
    NOTE(review): presumably mirrors the project's own refinement configuration -
    confirm the attribute set against ``src.refinement``.
    """
    # Class-level defaults; the notebook overrides several of them per instance.
    min_edge_weight: float = 0.8
    max_train_examples: int = 100
    refinement_metric: str = "mean_outlier_score"
    refinement_metric_maxrank: int = 100
    max_edges_to_remove: int = 100
    max_refinement_metric: int = -1
    # Relative path to a local MLflow run directory; specific to the author's checkout.
    mlflow_dir: str = "../gsim01/mlruns/1/"

In [None]:
def plot_for_removed_edges(original_run_id, reference_run_id, local_mlflow_dir, use_node_mapping=False):
    """Visualise the attention graph of a run with highlighted refinement connections.

    Loads the attention weights and input frequencies of ``original_run_id``, loads
    the refined knowledge for the (original, reference) run pair with a locally
    adjusted ``RefinementConfig``, and passes the refined knowledge edges together
    with the original attention to ``calculate_colored_connections``. The number of
    resulting connections is printed with the label "Removed".

    Args:
        original_run_id: run whose attention weights and frequencies are loaded.
        reference_run_id: reference run passed to ``load_refined_knowledge``.
        local_mlflow_dir: directory handed to the loading helpers.
        use_node_mapping: forwarded to ``convert_to_node_mapping``.

    Returns:
        Tuple of (result of ``_create_graph_visualization``, input frequencies).
    """
    attention = load_attention_weights(original_run_id, local_mlflow_dir)
    input_frequencies = load_input_frequency_dict(original_run_id, local_mlflow_dir)

    # Settings specific to this plot; everything else keeps the class defaults.
    refinement_config = RefinementConfig()
    refinement_config.min_edge_weight = 0.5
    refinement_config.max_train_examples = 50
    refinement_config.max_refinement_metric = -2
    processor = knowledge.KnowledgeProcessor(refinement_config)
    refined = processor.load_refined_knowledge(
        refinement_run_id=original_run_id,
        reference_run_id=reference_run_id,
    )

    mapping = convert_to_node_mapping(list(attention), use_node_mapping)
    refined_edges = {(child, parent) for child, parents in refined.items() for parent in parents}
    highlighted = calculate_colored_connections(
        reference_connections=refined_edges,
        attention_weights=attention,
        feature_node_mapping=mapping,
    )
    print("Removed", len(highlighted), "edges")

    visualization_mapping = _create_graph_visualization(
        attention_weights=attention,
        threshold=0.25,
        run_name="refinement_edges_removed",
        node_mapping=mapping,
        colored_connections=highlighted,
    )
    return visualization_mapping, input_frequencies

def plot_for_added_edges(original_run_id, reference_run_id, local_mlflow_dir, use_node_mapping=False):
    """Visualise the refined attention graph of a run with highlighted connections.

    Loads the attention weights of both runs and the input frequencies of
    ``original_run_id``, loads the refined knowledge for the run pair with a locally
    adjusted ``RefinementConfig``, and restricts the original attention to the
    (child, parent) pairs present in the refined knowledge. The reference run's
    edges and this refined attention are passed to ``calculate_colored_connections``;
    the number of resulting connections is printed with the label "Added".

    Args:
        original_run_id: run whose attention weights and frequencies are loaded.
        reference_run_id: run providing the reference attention edges.
        local_mlflow_dir: directory handed to the loading helpers.
        use_node_mapping: forwarded to ``convert_to_node_mapping``.

    Returns:
        Tuple of (result of ``_create_graph_visualization``, input frequencies).
    """
    original_attention = load_attention_weights(original_run_id, local_mlflow_dir)
    reference_attention = load_attention_weights(reference_run_id, local_mlflow_dir)
    frequencies = load_input_frequency_dict(original_run_id, local_mlflow_dir)

    # Settings specific to this plot; everything else keeps the class defaults.
    config = RefinementConfig()
    config.min_edge_weight = 0.5
    config.max_train_examples = 50
    config.max_edges_to_remove = 1000
    config.max_refinement_metric = 2
    refined_knowledge = knowledge.KnowledgeProcessor(config).load_refined_knowledge(
        refinement_run_id=original_run_id,
        reference_run_id=reference_run_id,
    )

    # One entry per child of the refined knowledge, holding only those original
    # attention weights whose parent is also listed in the refined knowledge.
    refined_attention = {
        child: {
            parent: weight
            for parent, weight in original_attention.get(child, {}).items()
            if parent in parents
        }
        for child, parents in refined_knowledge.items()
    }

    feature_node_mapping = convert_to_node_mapping(list(original_attention), use_node_mapping)
    colored_connections = calculate_colored_connections(
        reference_connections={
            (child, parent)
            for child, parents in reference_attention.items()
            for parent in parents
        },
        attention_weights=refined_attention,
        feature_node_mapping=feature_node_mapping,
    )
    print("Added", len(colored_connections), "edges")
    node_mapping = _create_graph_visualization(
        attention_weights=refined_attention,
        threshold=0.25,
        # Was "refinement_edges_removed" (copy-paste from plot_for_removed_edges),
        # which gave both plots the same run name.
        run_name="refinement_edges_added",
        node_mapping=feature_node_mapping,
        colored_connections=colored_connections,
    )
    return node_mapping, frequencies

In [None]:
# Removed-edges plot for the MIMIC 'gram' example run: the "original" and "reference"
# runs of the same refinement run are looked up by their refinement type.
_, frequencies = plot_for_removed_edges(
    original_run_id=relevant_mimic_ref_df.loc[
        relevant_mimic_ref_df['refinement_run'].eq(mimic_example_runs['edges_removed']['gram'])
        & relevant_mimic_ref_df['refinement_type'].eq("original"),
        "info_run_id",
    ].iloc[0],
    reference_run_id=relevant_mimic_ref_df.loc[
        relevant_mimic_ref_df['refinement_run'].eq(mimic_example_runs['edges_removed']['gram'])
        & relevant_mimic_ref_df['refinement_type'].eq("reference"),
        "info_run_id",
    ].iloc[0],
    local_mlflow_dir=mlflow_helper.local_mlflow_dir,
    use_node_mapping=False,
)

In [None]:
# Removed-edges plot for the Huawei 'gram' example run, with node mapping enabled.
_, frequencies = plot_for_removed_edges(
    original_run_id=relevant_huawei_ref_df.loc[
        relevant_huawei_ref_df['refinement_run'].eq(huawei_example_runs['edges_removed']['gram'])
        & relevant_huawei_ref_df['refinement_type'].eq("original"),
        "info_run_id",
    ].iloc[0],
    reference_run_id=relevant_huawei_ref_df.loc[
        relevant_huawei_ref_df['refinement_run'].eq(huawei_example_runs['edges_removed']['gram'])
        & relevant_huawei_ref_df['refinement_type'].eq("reference"),
        "info_run_id",
    ].iloc[0],
    local_mlflow_dir=mlflow_helper.local_mlflow_dir,
    use_node_mapping=True,
)

In [None]:
# Step-by-step version of the removed-edges computation for the Huawei 'gram' example
# run, with different config overrides, keeping the intermediates for inspection below.
original_run_id = relevant_huawei_ref_df.loc[
    relevant_huawei_ref_df['refinement_run'].eq(huawei_example_runs['edges_removed']['gram'])
    & relevant_huawei_ref_df['refinement_type'].eq("original"),
    "info_run_id",
].iloc[0]
reference_run_id = relevant_huawei_ref_df.loc[
    relevant_huawei_ref_df['refinement_run'].eq(huawei_example_runs['edges_removed']['gram'])
    & relevant_huawei_ref_df['refinement_type'].eq("reference"),
    "info_run_id",
].iloc[0]

original_attention = load_attention_weights(original_run_id, mlflow_helper.local_mlflow_dir)
frequencies = load_input_frequency_dict(original_run_id, mlflow_helper.local_mlflow_dir)

config = RefinementConfig()
config.min_edge_weight = 0.5
config.max_train_examples = 1000
config.max_refinement_metric = 0
refined_knowledge = knowledge.KnowledgeProcessor(config).load_refined_knowledge(
    refinement_run_id=original_run_id,
    reference_run_id=reference_run_id,
)

feature_node_mapping = convert_to_node_mapping(list(original_attention), False)
colored_connections = calculate_colored_connections(
    reference_connections={
        (child, parent)
        for child, parents in refined_knowledge.items()
        for parent in parents
    },
    attention_weights=original_attention,
    feature_node_mapping=feature_node_mapping,
)
print("Removed", len(colored_connections), "edges")

In [None]:
# Display the connections returned by calculate_colored_connections for inspection.
colored_connections

In [None]:
# One line per connection whose second element is "server", paired with the
# absolute frequency recorded for the connection's first element.
print(
    *(
        (connection, frequencies[connection[0]]['absolute_frequency'])
        for connection in colored_connections
        if connection[1] == "server"
    ),
    sep='\n',
)

In [None]:
# Write every key of original_attention whose weight towards "server" exceeds 0.5,
# one per line, to a scratch file.
with open("asldkfj.txt", "w") as file:
    file.write(
        "\n".join(
            child
            for child, parents in original_attention.items()
            if "server" in parents and float(parents.get("server", -1)) > 0.5
        )
    )

In [None]:
# Render the original attention graph with the computed connections highlighted;
# only the run name differs from the call inside plot_for_removed_edges.
node_mapping = _create_graph_visualization(
    attention_weights=original_attention, 
    threshold=0.25, 
    run_name="refinement_edges_removed_gram", 
    node_mapping=feature_node_mapping,
    colored_connections=colored_connections)

In [None]:
# Keep a reference to the refined knowledge under a second name.
# NOTE(review): refined_knowledge was computed for the Huawei 'gram' example run in
# this notebook, while the alias is named "_text" - confirm which run was intended.
refined_knowledge_text = refined_knowledge