In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sys
import os

# Append the absolute path of the parent of the current working directory to
# sys.path (only if it is not already listed). Presumably this is what lets the
# `src` package imported just below be resolved -- confirm against repo layout.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.features import preprocessing

In [None]:
# Execute the helper scripts with IPython's `%run`, which makes the names they
# define available to the cells below.
# NOTE(review): MlflowHelper, Path, create_graph_visualization,
# load_attention_weights, calculate_attention_importances and
# load_input_frequency_dict are used later but never imported in this notebook;
# presumably they are defined by these scripts -- confirm.
%run utils/base.py
%run utils/loading.py
%run utils/attention_graph.py
%run utils/mlflow_query.py

In [None]:
# Helper object used by every later cell to look up runs, metrics and the local
# MLflow directory. It is pointed at the pickle file "mlflow_run_df.pkl".
mlflow_helper = MlflowHelper(pkl_file=Path("mlflow_run_df.pkl"))
# Disabled call kept for reference. NOTE(review): presumably re-queries all runs
# and refreshes the pickle -- confirm before re-enabling.
#mlflow_helper.query_all_runs(pkl_file=Path("mlflow_run_df.pkl"))

In [None]:
# Load the Huawei runs for the listed input/target columns and keep only those
# whose feature_embeddings_trainable parameter is the string "False".
huawei_df = mlflow_helper.huawei_run_df(
    valid_x_columns=[
        "log_cluster_template",
        "fine_log_cluster_template",
        "coarse_log_cluster_template",
        "attention_log_cluster_template_90",
    ],
    valid_y_columns=["attributes", "coarse_log_cluster_template"],
    include_drain_hierarchy=True,
).loc[
    lambda runs: runs["data_params_ModelConfigbase_feature_embeddings_trainable"].astype(str) == "False"
]

# Number of runs per (input column, target column, model type) combination.
huawei_df.groupby(
    by=[
        "data_params_SequenceConfigx_sequence_column_name",
        "data_params_SequenceConfigy_sequence_column_name",
        "data_tags_model_type",
    ]
).agg({"info_run_id": len})

# Drain Hierarchy - Suggested Templates

In [None]:
# First run that maps fine Drain templates to attributes with the "gram_logs"
# model type; every Drain-hierarchy cell below analyses this single run.
run_id = huawei_df.loc[
    lambda runs: (
        (runs["data_params_SequenceConfigx_sequence_column_name"] == "fine_log_cluster_template")
        & (runs["data_params_SequenceConfigy_sequence_column_name"] == "attributes")
        & (runs["data_tags_model_type"] == "gram_logs")
    ),
    "info_run_id",
].iloc[0]
run_id

In [None]:
# Build the attention-graph visualisation for the selected run.
# NOTE(review): the meaning of `threshold=0.2` (presumably a minimum attention
# weight for drawing an edge) and of the returned mapping is defined in
# utils/attention_graph.py -- confirm there.
feature_node_mapping = create_graph_visualization(
    run_id=run_id, 
    local_mlflow_dir=mlflow_helper.local_mlflow_dir,
    threshold=0.2, 
    run_name='drain_hierarchy', 
    use_node_mapping=False)

In [None]:
# Re-create the fine-grained Drain templates for every distinct log payload,
# using the Drain depth and `st` values recorded for the selected run, and pair
# each payload with its template.
original_logs = pd.DataFrame(
    pd.read_csv('../data/logs_aggregated_concurrent.csv')["Payload"]
    # After fillna("") + astype(str) no missing values remain, so no further
    # NaN replacement or dropna is needed before de-duplicating.
    .fillna("")
    .astype(str)
    .drop_duplicates()
    .reset_index(drop=True)
)
# The original cell also bound this frame to `df`; the alias is kept so the
# kernel namespace stays unchanged.
df = original_logs

# Parameters of the selected run, looked up once instead of once per parameter.
selected_run = huawei_df[huawei_df["info_run_id"] == run_id]
drain = preprocessing.Drain(
    preprocessing.DrainParameters(
        depth=selected_run["data_params_HuaweiPreprocessorConfigfine_drain_log_depth"].astype(int).iloc[0],
        st=selected_run["data_params_HuaweiPreprocessorConfigfine_drain_log_st"].astype(float).iloc[0],
        # Raw strings: the patterns contain regex escapes (\. \- \d \s) that are
        # invalid escape sequences in ordinary string literals and trigger a
        # SyntaxWarning on recent Python versions. The pattern text is unchanged.
        # NOTE(review): pairs look like (pattern, replacement) -- confirm in
        # src.features.preprocessing.
        rex=[
            (r"(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)", ""),
            (r"[^a-zA-Z0-9\-\.]", " "),
            (r"[^a-zA-Z\d\s:]", ""),
        ],
    ),
    data_df=original_logs,
    data_df_column_name="Payload",
)
drain_result = drain.load_data().drop_duplicates().set_index("log_idx")
# Index-on-index left join: every payload row is kept, joined to the Drain
# result row whose "log_idx" equals the payload's row index.
log_result_df = pd.merge(
    original_logs,
    drain_result,
    left_index=True,
    right_index=True,
    how="left",
).rename(columns={
    "cluster_template": "fine_log_cluster_template"
})[["Payload", "fine_log_cluster_template"]]

log_result_df

In [None]:
# For every fine template, find the representation with the highest attention
# weight and record it -- once unconditionally and once per threshold, where the
# template keeps its own key unless the best weight exceeds the threshold.
attention_weights = load_attention_weights(run_id=run_id, local_mlflow_dir=mlflow_helper.local_mlflow_dir)
attention_representations = []

for fine_template in attention_weights:
    ranked = sorted(
        ((name, float(weight)) for name, weight in attention_weights[fine_template].items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    best_name, best_weight = ranked[0]

    record = {}
    for threshold in [0.5, 0.9]:
        record["attention_representation_" + str(threshold)] = (
            best_name if best_weight > threshold else fine_template
        )
    # Strip the leading "fine_log_cluster_template#" marker so the value can be
    # joined against the plain template text later on.
    record["fine_log_cluster_template"] = fine_template[len("fine_log_cluster_template#"):]
    record["attention_representation"] = best_name
    attention_representations.append(record)

pd.DataFrame.from_records(attention_representations)

In [None]:
def extract_type_template(attention_representation: str):
    """Split an attention representation key into a ``(type, template)`` pair.

    Recognised shapes:

    * contains ``"_log_cluster_template#"``: type is the text before the first
      ``"#"``; template is the remainder with further ``"#"`` turned into spaces.
    * starts with ``"coarse_log_cluster_path#"``: as above, with ``" ***"``
      appended to the template.
    * starts with ``"coarse_log_cluster_path->"``: the key is split on ``"->"``;
      the type is ``"coarse_log_cluster_path_<k>"`` where ``k`` is the number of
      segments after the second one, and the template is those segments joined
      by spaces plus ``" ***"``.
    * anything else: ``("???", attention_representation)``.
    """
    text = attention_representation

    is_template = "_log_cluster_template#" in text
    if is_template or text.startswith("coarse_log_cluster_path#"):
        kind, _, remainder = text.partition("#")
        template = remainder.replace("#", " ")
        # Only path keys get the trailing wildcard marker.
        return (kind, template) if is_template else (kind, template + " ***")

    if text.startswith("coarse_log_cluster_path->"):
        root, *hops = text.split("->")
        # hops[0] is skipped when building the template; only the later
        # segments count towards the depth suffix.
        return (f"{root}_{len(hops) - 1}", " ".join(hops[1:]) + " ***")

    return ("???", text)

In [None]:
# Attach the attention representations to every payload (left join on the fine
# template, missing values become ""), then derive a "<column>_type" and a
# "<column>_template" column from each attention_representation* column.
drain_df = log_result_df.merge(
    pd.DataFrame.from_records(attention_representations),
    how="left",
    on="fine_log_cluster_template",
).fillna("")
representation_columns = [
    name for name in drain_df.columns if name.startswith("attention_representation")
]
for column in representation_columns:
    parsed = [extract_type_template(str(value)) for value in drain_df[column]]
    drain_df[column + "_type"] = [kind for kind, _ in parsed]
    drain_df[column + "_template"] = [template for _, template in parsed]

drain_df[["attention_representation", "attention_representation_type", "attention_representation_template"]].describe()

In [None]:
# Per representation type: number of distinct payloads, fine templates and
# representation templates.
drain_grouped = drain_df.groupby(by=["attention_representation_type"]).agg({
    column: (lambda values: len(set(values)))
    for column in (
        "Payload",
        "fine_log_cluster_template",
        "attention_representation_template",
    )
})
drain_grouped

In [None]:
# Bar chart: number of distinct payloads per attention representation type,
# saved as drain_distribution.png.
melt_df = drain_grouped.reset_index(drop=False).melt(
    id_vars=["attention_representation_type"],
    value_vars=["Payload"],
    value_name="num_examples",
    var_name="type",
)

# Numeric suffixes of the "coarse_log_cluster_path_<k>" groups that are present.
path_depths = [
    int(group.split("_")[-1])
    for group in drain_grouped.index
    if group.startswith("coarse_log_cluster_path_")
]
# default=-1 yields an empty tail instead of a ValueError when no path group
# exists in the data.
deepest_path = max(path_depths, default=-1)
type_order = [
    "fine_log_cluster_template", "0_log_cluster_template", "1_log_cluster_template", "2_log_cluster_template",
    "coarse_log_cluster_template", "coarse_log_cluster_path",
] + ["coarse_log_cluster_path_" + str(depth) for depth in reversed(range(deepest_path + 1))]

g = sns.catplot(
    data=melt_df,
    x="attention_representation_type",
    y="num_examples",
    hue="type",
    order=type_order,
    kind="bar",
    sharey=False,
    palette="Set2",
    legend=False,
)
g.set_xticklabels(rotation=90)
g.set_axis_labels("", "number of log lines")
plt.tight_layout()
plt.savefig("drain_distribution.png", dpi=100)

In [None]:
# Payloads whose best representation has type "coarse_log_cluster_path_0".
drain_df.loc[
    drain_df["attention_representation_type"] == "coarse_log_cluster_path_0",
    ["Payload", "attention_representation"],
]

In [None]:
# Copy the thresholded representations into the "attention_log_cluster_template_*"
# columns and export them together with the payload text.
for suffix, threshold_column in (
    ("50", "attention_representation_0.5"),
    ("90", "attention_representation_0.9"),
):
    drain_df["attention_log_cluster_template_" + suffix] = drain_df[threshold_column]

export_columns = ["Payload", "attention_log_cluster_template_50", "attention_log_cluster_template_90"]
drain_df[export_columns].to_csv("drain_attention_clusters.csv", index=False)

In [None]:
# Sanity check: read the exported CSV back and summarise its columns.
pd.read_csv("drain_attention_clusters.csv").describe()

In [None]:
attention_importances = calculate_attention_importances(attention_weights)


def _examples_above_09(path_key):
    """List the second "#"-separated field of entry[0] for every entry stored
    under ``path_key`` whose entry[1] is greater than 0.9."""
    return [
        entry[0].split("#")[1]
        for entry in attention_importances[path_key]
        if entry[1] > 0.9
    ]


# Hand-picked hierarchy paths used as illustrative examples.
examples_extensions = _examples_above_09("coarse_log_cluster_path->9->extension")
examples_instance = _examples_above_09("coarse_log_cluster_path->7->instance")
examples_instance2 = _examples_above_09("coarse_log_cluster_path->9->instance->*")
examples_automatically = _examples_above_09("coarse_log_cluster_path->10->automatically")
examples_cleaning = _examples_above_09("coarse_log_cluster_path->4->cleaning->stale")
examples_date = _examples_above_09("coarse_log_cluster_path->17")

examples_date

In [None]:
# Show the attention weights stored for one specific fine template (key is
# hard-coded; raises KeyError if it is not present in attention_weights).
attention_weights["fine_log_cluster_template#25 nov 2019 19 * * 0100 get v3 auth tokens http 11 200 * * pythonkeystoneclient"]

In [None]:
# Take the first entry under "coarse_log_cluster_path->9->extension" whose
# entry[1] is below 0.9 and show the attention weights stored under its
# entry[0]. Raises IndexError when no such entry exists.
attention_weights[[x for x in attention_importances["coarse_log_cluster_path->9->extension"] if x[1] < 0.9][0][0]]

In [None]:
# Coarse templates mentioning "either ensure your deployment is ready" that
# contain a run of twelve wildcards and have exactly three entries with a value
# above 0.2.
[
    (name, entries)
    for name, entries in attention_importances.items()
    if name.startswith("coarse_log_cluster_template")
    and "either ensure your deployment is ready" in name
    and "* * * * * * * * * * * *" in name
    and sum(1 for entry in entries if entry[1] > 0.2) == 3
]

# Experiment Results

In [None]:
# Load metrics for the distinct run ids selected above.
# NOTE(review): the "*_history_best" columns plotted below are assumed to come
# from this helper -- confirm in utils/mlflow_query.py.
huawei_metrics_df = mlflow_helper.load_best_metrics_for_ids(set(huawei_df["info_run_id"]))
huawei_metrics_df

In [None]:
# Box plots of the best validation top-5 accuracy per input column and model
# type, one row per target column; runs using "log_cluster_template" as input
# are left out.
sns.catplot(
    data=huawei_metrics_df.loc[
        huawei_metrics_df["data_params_SequenceConfigx_sequence_column_name"].ne("log_cluster_template")
    ],
    x="data_params_SequenceConfigx_sequence_column_name",
    y="val_top_5_categorical_accuracy_history_best",
    hue="data_tags_model_type",
    row="data_params_SequenceConfigy_sequence_column_name",
    order=[
        "fine_log_cluster_template",
        "coarse_log_cluster_template",
        "attention_log_cluster_template_90",
    ],
    kind="box",
    sharey="row",
).set_xticklabels(rotation=90)

In [None]:
# Box plot comparing Drain templates with attention-derived templates as model
# input ("simple" model type, target "attributes"); saved as drain_results.png.
df = huawei_metrics_df.copy()

# Display names; values without an entry are kept as they are.
model_type_names = {
    "gram": "hierarchy",
}
input_column_names = {
    "log_cluster_template": "fine_log_cluster_template",
    "attention_log_cluster_template_90": "attention_log_cluster_template",
}
df["data_tags_model_type"] = df["data_tags_model_type"].apply(
    lambda value: model_type_names.get(value, value)
)
df["data_params_SequenceConfigx_sequence_column_name"] = df["data_params_SequenceConfigx_sequence_column_name"].apply(
    lambda value: input_column_names.get(value, value)
)
df["Log Template"] = df["data_params_SequenceConfigx_sequence_column_name"].apply(
    lambda value: "attention" if "attention" in value else "drain"
)

selected_runs = (
    (df["data_params_SequenceConfigy_sequence_column_name"] == "attributes")
    & (df["data_tags_model_type"] == "simple")
    # astype(str) mirrors how the feature_embeddings_trainable flag is filtered
    # in this notebook, so the comparison also matches when the flag is a bool.
    & (df["data_params_ModelConfigbase_hidden_embeddings_trainable"].astype(str) == "False")
)
g = sns.catplot(
    data=df[selected_runs],
    hue="Log Template",
    x="data_params_SequenceConfigx_sequence_column_name",
    y="val_top_5_categorical_accuracy_history_best",
    row="data_params_SequenceConfigy_sequence_column_name",
    order=["fine_log_cluster_template", "coarse_log_cluster_template", "attention_log_cluster_template"],
    kind="box",
    palette="Set2",
    dodge=False,
).set_xticklabels(rotation=45).set_titles("").set_axis_labels('', "val_top_5_categorical_accuracy")
plt.savefig("drain_results.png", dpi=100, bbox_inches="tight")
plt.show()

# Using MIMIC Clusters as Inputs

In [None]:
# Load the ICD9 hierarchy table and show, as an example, the first row whose
# level_0 value is "976.1".
hierarchy = pd.read_csv('../data/hierarchy_icd9.csv')
hierarchy.loc[hierarchy["level_0"] == "976.1"].iloc[0].to_dict()

In [None]:
# For one example "gram" run on MIMIC with level_0 inputs, determine for every
# input which representation receives the highest attention weight, at which
# hierarchy level that representation sits, and export the result to CSV.
mimic_df = mlflow_helper.mimic_run_df()
example_mimic_run_id = mimic_df[
    # astype(str) matches how this same column is filtered further down in the
    # notebook, so the filter also works when the flag is stored as a bool.
    (mimic_df["data_params_ModelConfigbase_feature_embeddings_trainable"].astype(str) == "False")
    & (mimic_df["data_tags_model_type"] == "gram")
    & (mimic_df["data_params_SequenceConfigx_sequence_column_name"] == "level_0")
]["info_run_id"].iloc[2]  # third matching run; NOTE(review): why index 2 is not visible here

attention_weights = load_attention_weights(example_mimic_run_id, mlflow_helper.local_mlflow_dir)
frequencies = load_input_frequency_dict(example_mimic_run_id, mlflow_helper.local_mlflow_dir)  # not used in this cell
hierarchy = pd.read_csv('../data/hierarchy_icd9.csv')

attention_representations = []
diff_09 = []  # inputs whose best representation does not pass the 0.9 threshold
diff_05 = []  # inputs whose best representation does not pass the 0.5 threshold
# The loop variable is deliberately not called `input`, which would shadow the
# builtin for the rest of the kernel session.
for level_0_code in attention_weights:
    levels = hierarchy[hierarchy["level_0"] == level_0_code].iloc[0].to_dict()
    # Representation with the highest weight (first one wins on ties).
    best_name, best_weight = max(
        ((name, float(weight)) for name, weight in attention_weights[level_0_code].items()),
        key=lambda pair: pair[1],
    )
    attention_representation = {
        "attention_representation_" + str(threshold): (best_name if best_weight > threshold else level_0_code)
        for threshold in [0.5, 0.9]
    }
    attention_representation["original_level_cluster"] = level_0_code
    attention_representation["attention_representation"] = best_name
    # Iterate over a list snapshot (not a set) so the "<key>_level" columns are
    # added in a deterministic order and the CSV layout is reproducible.
    for key in list(attention_representation):
        # Alphabetically smallest hierarchy column whose value equals this representation.
        attention_representation[key + "_level"] = min(
            level for level, value in levels.items() if value == attention_representation[key]
        )

    if attention_representation["attention_representation"] != attention_representation["attention_representation_0.9"]:
        diff_09.append(level_0_code)

    if attention_representation["attention_representation"] != attention_representation["attention_representation_0.5"]:
        diff_05.append(level_0_code)

    attention_representations.append(attention_representation)

print(len(diff_05))
print(len(diff_09))
pd.DataFrame.from_records(attention_representations).to_csv("gram_attention_levels2.csv", index=False)
icd_df = pd.read_csv("gram_attention_levels2.csv")
icd_df.head()

In [None]:
# Number of distinct original inputs per hierarchy level of their best
# attention representation.
icd_df_grouped = (
    icd_df
    .groupby(by=["attention_representation_level"])
    .agg({"original_level_cluster": lambda codes: len(set(codes))})
)
icd_df_grouped

In [None]:
# Bar chart: number of distinct inputs per attention representation level,
# saved as icd_distribution.png.
melt_df = icd_df_grouped.reset_index(drop=False).melt(
    id_vars=["attention_representation_level"],
    value_vars=["original_level_cluster"],
    value_name="num_examples",
    var_name="type",
)
g = sns.catplot(
    data=melt_df,
    x="attention_representation_level",
    y="num_examples",
    hue="type",
    order=["level_0", "level_1", "level_2", 'level_3', "level_4"],
    kind="bar",
    sharey=False,
    palette="Set2",
    legend=False,
)
# Both setters return the same FacetGrid, so `g` is the object the chained form
# would have produced.
g = g.set_xticklabels(rotation=90)
g = g.set_axis_labels("", "number of level_0 features")
plt.tight_layout()
plt.savefig("icd_distribution.png", dpi=100)

In [None]:
# Reload the MIMIC runs for the listed input columns, keep those whose
# feature_embeddings_trainable parameter is "False", then replace the run table
# with the metrics table for those run ids.
mimic_df = mlflow_helper.mimic_run_df(
    valid_x_columns=[
        "level_0",
        "level_1",
        "level_2",
        "level_3",
        "attention_representation",
        "attention_representation_0.5",
        "attention_representation_0.9",
    ],
).loc[
    lambda runs: runs["data_params_ModelConfigbase_feature_embeddings_trainable"].astype(str) == "False"
]
mimic_df = mlflow_helper.load_best_metrics_for_ids(set(mimic_df["info_run_id"]))
mimic_df

In [None]:
# Box plots of the best validation top-20 accuracy per input column, coloured
# by model type. "level_3" is not part of `order`, so runs using it as input
# are not drawn.
sns.catplot(
    data=mimic_df,
    hue="data_tags_model_type", 
    x="data_params_SequenceConfigx_sequence_column_name",
    y="val_top_20_categorical_accuracy_history_best",
    # Optional facetting by target column, currently disabled:
    #row="data_params_SequenceConfigy_sequence_column_name",
    order=["level_0", "level_1", "level_2", "attention_representation", "attention_representation_0.9", "attention_representation_0.5"],
    kind="box",
    sharey="row",
).set_xticklabels(rotation=90)

In [None]:
# Same metric restricted to the "simple" model type and to three input columns.
sns.catplot(
    data=mimic_df.loc[mimic_df["data_tags_model_type"] == "simple"],
    x="data_params_SequenceConfigx_sequence_column_name",
    y="val_top_20_categorical_accuracy_history_best",
    order=["level_0", "level_1", "attention_representation"],
    kind="box",
    sharey="row",
).set_xticklabels(rotation=90)

In [None]:
# Box plot comparing ICD9 hierarchy levels with the attention representation as
# model input ("simple" model type, target "level_3"); saved as mimic_results.png.
df = mimic_df.copy()
df["Input Feature"] = df["data_params_SequenceConfigx_sequence_column_name"].apply(
    lambda value: "ICD9" if "attention" not in value else "attention"
)

selected_runs = (
    (df["data_params_SequenceConfigy_sequence_column_name"] == "level_3")
    & (df["data_tags_model_type"] == "simple")
    # astype(str) mirrors how the feature_embeddings_trainable flag is filtered
    # in this notebook, so the comparison also matches when the flag is a bool.
    & (df["data_params_ModelConfigbase_hidden_embeddings_trainable"].astype(str) == "False")
)
g = sns.catplot(
    data=df[selected_runs],
    hue="Input Feature",
    x="data_params_SequenceConfigx_sequence_column_name",
    y="val_top_20_categorical_accuracy_history_best",
    order=["level_0", "level_1", "level_2", "attention_representation"],
    hue_order=["ICD9", "attention"],
    kind="box",
    palette="Set2",
    dodge=False,
).set_xticklabels(rotation=45).set_titles("").set_axis_labels('', "val_top_20_categorical_accuracy")
plt.savefig("mimic_results.png", dpi=100, bbox_inches="tight")
plt.show()