## Import libraries and modules

In [61]:
import sys
# Machine-specific absolute path to a local project directory; adjust when running elsewhere.
parentdir = '/Users/jerzykaminski/Documents/GitHub/BAMT-old'
# Insert at position 0 so this directory is searched before every other sys.path entry
# (presumably so the project packages imported further down resolve to it — verify).
sys.path.insert(0, parentdir)


In [62]:
# Additionally put the `bayesian` subdirectory itself on the module search path
# (hard-coded, machine-specific absolute path).
sys.path.append('/Users/jerzykaminski/Documents/GitHub/BAMT-old/bayesian')

In [63]:
import time
from tqdm.notebook import tqdm
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import random

from bayesian.train_bn import structure_learning
from graph.precision_recall import precision_recall
from preprocess.discretization import discretization, get_nodes_type, code_categories
from visualization.visualization import draw_BN, draw_comparative_hist
from sklearn.metrics import mutual_info_score
from sklearn.feature_selection import mutual_info_regression

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*" and the
# old names were removed in a later release, so try the new name first and fall back
# to the legacy one on older installations (an unknown style name raises OSError).
try:
    mpl.style.use('seaborn-v0_8')
except OSError:
    mpl.style.use('seaborn')
# Larger default font for every figure produced in this notebook.
plt.rcParams.update({'font.size': 22})


## Functions to calculate possible edges

In [64]:
def get_possible_edges_randomly(df, length):
    """Draws a random sample of directed candidate edges between the columns of the dataframe

    Every ordered pair of two distinct columns is a candidate. Self-loops
    ``(c, c)`` are excluded because a node cannot be its own parent in a
    Bayesian network.

    Args:
        df (DataFrame): data; its column labels are the graph nodes
        length (int or float): number of edges to draw, passed through ``round()`` first

    Returns:
        possible_edges: list of distinct (source, target) tuples in random order

    Raises:
        ValueError: raised by ``random.sample`` when ``length`` is negative or
            larger than the number of candidate pairs
    """
    columns = df.columns.tolist()
    candidates = [(c1, c2) for c1 in columns for c2 in columns if c1 != c2]

    return random.sample(candidates, int(round(length)))

In [65]:
def get_n_nearest(data, columns, corr=False, number_close=5):
    """Collects, for every requested column, the index labels at the top of that column once it is sorted

    Args:
        data (DataFrame): Proximity matrix
        columns (list): column labels to process, e.g. df.columns.tolist()
        corr (bool, optional): If truthy, each column is sorted in descending order
            (largest values first); otherwise in ascending order. Defaults to False.
        number_close (int, optional): Number of nearest neighbors. Defaults to 5.

    Returns:
        groups: list with one entry per column, each a list of the first
            ``number_close + 1`` index labels of the sorted column
    """
    # One label more than number_close is kept — presumably because a column
    # usually ranks itself first in its own proximity column; verify against callers.
    take = number_close + 1
    smallest_first = not corr

    return [
        data[name].sort_values(ascending=smallest_first).index.tolist()[:take]
        for name in columns
    ]


def get_proximity_matrix(df, df_coded, proximity_metric):
    """Returns a square column-by-column proximity matrix of the dataframe, dataframe must be coded first if it contains categorical data

    Args:
        df (DataFrame): data; for 'MI' its columns label the rows and columns of the result
        df_coded (DataFrame): same data, but coded
        proximity_metric (str): 'MI' (mutual information score) or 'corr' (Pearson correlation)

    Returns:
        df_distance: for 'MI', a DataFrame indexed by df.columns on both axes holding the
            pairwise mutual information scores; for 'corr', the result of
            ``df_coded.corr(method='pearson')``

    Raises:
        ValueError: if proximity_metric is neither 'MI' nor 'corr' (previously an
            all-zero matrix was returned silently)
    """
    if proximity_metric == 'corr':
        return df_coded.corr(method='pearson')

    if proximity_metric != 'MI':
        raise ValueError(
            f"Unknown proximity_metric {proximity_metric!r}; expected 'MI' or 'corr'")

    columns = df.columns.tolist()
    size = len(columns)
    # Pull the coded value arrays once instead of once per pair.
    coded = [df_coded[c].values for c in columns]
    scores = np.zeros((size, size))

    # Mutual information is symmetric, so each unordered pair (including the
    # diagonal) is scored once and mirrored into both triangles.
    for i in range(size):
        for j in range(i, size):
            score = mutual_info_score(coded[i], coded[j])
            scores[i, j] = score
            scores[j, i] = score

    return pd.DataFrame(scores, index=df.columns, columns=df.columns)


def get_brave_matrix(df, proximity_matrix, n_nearest=5):
    """Returns matrix of Brave coefficients of the DataFrame, requires proximity measure to be calculated

    The neighbor groups come from ``get_n_nearest(..., corr=True)``. For an
    ordered pair of distinct columns (c1, c2) the groups are tallied as
    a = groups containing both, b = only c1, c = only c2, d = neither, and with
    n = number of groups the coefficient is

        (a * n + (a + c) * (a + b)) / (sqrt((a + c) * (b + d)) * sqrt((a + b) * (c + d)))

    Args:
        df (DataFrame): data
        proximity_matrix (DataFrame): square proximity matrix, e.g. from get_proximity_matrix()
        n_nearest (int, optional): number of nearest neighbors used to build each group. Defaults to 5.

    Returns:
        brave_matrix: DataFrame of Brave coefficients indexed by df.columns on both axes.
            The diagonal is 0. Pairs whose denominator is zero (a column that is in
            every group or in none) are also 0 instead of raising ZeroDivisionError.
    """
    columns = df.columns.tolist()
    groups = get_n_nearest(proximity_matrix, columns,
                           corr=True, number_close=n_nearest)
    n_groups = len(groups)

    # membership[g, k] is 1.0 when column k belongs to group g.
    group_sets = [set(g) for g in groups]
    membership = np.array(
        [[col in members for col in columns] for members in group_sets],
        dtype=float).reshape(n_groups, len(columns))

    # All four contingency counts for every (c1, c2) pair at once:
    # rows are c1, columns are c2.
    in_groups = membership.sum(axis=0)
    a = membership.T @ membership
    b = in_groups[:, np.newaxis] - a
    c = in_groups[np.newaxis, :] - a
    d = n_groups - a - b - c

    # NOTE(review): the phi coefficient of the same 2x2 table has a minus between
    # the two numerator terms; the plus of the original code is kept — confirm it is intended.
    numerator = a * n_groups + (a + c) * (a + b)
    denominator = np.sqrt((a + c) * (b + d)) * np.sqrt((a + b) * (c + d))

    # Undefined ratios (zero denominator) are reported as 0.0.
    coefficients = np.divide(numerator, denominator,
                             out=np.zeros_like(numerator),
                             where=denominator > 0)
    # A column is never paired with itself.
    np.fill_diagonal(coefficients, 0.0)

    return pd.DataFrame(coefficients, index=df.columns, columns=df.columns)


def get_possible_edges_by_brave(df, proximity_matrix, n_nearest=5, custom_threshold=False, threshold=.0):
    """Returns list of possible edges for structure learning

    An ordered pair (c1, c2) is kept when its Brave coefficient is strictly
    greater than ``fraction * max(brave_matrix)``, where ``fraction`` is 0.3
    unless a custom threshold is enabled.

    Args:
        df (DataFrame): data
        proximity_matrix (DataFrame): square proximity matrix, e.g. from get_proximity_matrix(), correlation etc.
        n_nearest (int, optional): Number of Nearest neighbors, hyperparameter. Defaults to 5.
        custom_threshold (bool, optional): Must be set true to use ``threshold`` instead of the built-in 0.3. Defaults to False.
        threshold (float, optional): Fraction of the largest Brave coefficient an edge candidate
            must exceed; only used when custom_threshold is true. Defaults to 0.0.

    Returns:
        Possible edges: list of (c1, c2) tuples, ordered by c1 and then c2 in df.columns order
    """
    brave_matrix = get_brave_matrix(df, proximity_matrix, n_nearest)

    columns = df.columns.tolist()
    if not columns:
        return []

    fraction = threshold if custom_threshold else 0.3
    # The cut-off is the same for every pair, so compute it once up front.
    cutoff = brave_matrix.max(numeric_only=True).max() * fraction

    values = brave_matrix.loc[columns, columns].to_numpy()

    return [
        (c1, c2)
        for i, c1 in enumerate(columns)
        for j, c2 in enumerate(columns)
        if values[i, j] > cutoff
    ]


## Import data and encode categorical data

In [77]:
# Base name of the dataset; the cells below build their data/<dataset>*.csv paths from it.
dataset = 'andes'

In [78]:
# Load the dataset from data/<dataset>.csv; the bare `df` displays it as the cell output.
df = pd.read_csv(f'data/{dataset}.csv')
df

Unnamed: 0,GOAL_2,SNode_3,SNode_4,SNode_5,SNode_6,SNode_7,DISPLACEM0,RApp1,GIVEN_1,RApp2,...,RApp13,GOAL_147,TRY76,GOAL_149,APPLY77,GOAL_150,GRAV78,SNode_151,GOAL_153,SNode_155
0,1,1,1,0,1,1,1,1,1,1,...,0,0,1,0,1,1,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1,...,0,0,1,0,1,0,0,0,0,0
2,1,1,1,1,1,1,0,0,1,1,...,0,0,1,0,0,0,1,0,0,0
3,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,1,1,1,1,0,0
4,1,1,1,1,1,1,0,0,1,1,...,1,1,1,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,1,1,1,1,1,1,1,1,1,...,1,1,0,1,0,0,1,0,0,0
9996,1,1,0,1,1,1,0,0,1,1,...,0,1,1,1,1,0,0,1,0,0
9997,1,1,1,1,1,1,0,0,1,1,...,0,1,1,1,1,0,0,0,1,0
9998,1,1,1,1,1,1,1,1,1,1,...,0,0,1,1,0,1,0,0,0,0


In [79]:
# Display the per-column node types reported by the project helper get_nodes_type.
get_nodes_type(df)


{'GOAL_2': 'disc',
 'SNode_3': 'disc',
 'SNode_4': 'disc',
 'SNode_5': 'disc',
 'SNode_6': 'disc',
 'SNode_7': 'disc',
 'DISPLACEM0': 'disc',
 'RApp1': 'disc',
 'GIVEN_1': 'disc',
 'RApp2': 'disc',
 'SNode_8': 'disc',
 'SNode_9': 'disc',
 'SNode_10': 'disc',
 'SNode_11': 'disc',
 'SNode_12': 'disc',
 'SNode_13': 'disc',
 'SNode_14': 'disc',
 'SNode_15': 'disc',
 'SNode_16': 'disc',
 'SNode_17': 'disc',
 'SNode_18': 'disc',
 'SNode_19': 'disc',
 'NEED1': 'disc',
 'SNode_20': 'disc',
 'GRAV2': 'disc',
 'SNode_21': 'disc',
 'VALUE3': 'disc',
 'SNode_24': 'disc',
 'SLIDING4': 'disc',
 'SNode_25': 'disc',
 'CONSTANT5': 'disc',
 'SNode_26': 'disc',
 'KNOWN6': 'disc',
 'VELOCITY7': 'disc',
 'SNode_47': 'disc',
 'RApp3': 'disc',
 'KNOWN8': 'disc',
 'RApp4': 'disc',
 'SNode_27': 'disc',
 'COMPO16': 'disc',
 'GOAL_48': 'disc',
 'TRY12': 'disc',
 'TRY11': 'disc',
 'GOAL_49': 'disc',
 'CHOOSE19': 'disc',
 'GOAL_50': 'disc',
 'SYSTEM18': 'disc',
 'SNode_51': 'disc',
 'KINEMATI17': 'disc',
 'SNode_5

In [81]:
# Encode every column with the project helper code_categories using the 'label' method;
# it returns the coded frame plus a second object kept here as `coder`
# (presumably the fitted encoders — verify in preprocess.discretization).
df_coded, coder = code_categories(df, 'label', df.columns)
df_coded


Unnamed: 0,GOAL_2,SNode_3,SNode_4,SNode_5,SNode_6,SNode_7,DISPLACEM0,RApp1,GIVEN_1,RApp2,...,RApp13,GOAL_147,TRY76,GOAL_149,APPLY77,GOAL_150,GRAV78,SNode_151,GOAL_153,SNode_155
0,1,1,1,0,1,1,1,1,1,1,...,0,0,1,0,1,1,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1,...,0,0,1,0,1,0,0,0,0,0
2,1,1,1,1,1,1,0,0,1,1,...,0,0,1,0,0,0,1,0,0,0
3,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,1,1,1,1,0,0
4,1,1,1,1,1,1,0,0,1,1,...,1,1,1,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,1,1,1,1,1,1,1,1,1,...,1,1,0,1,0,0,1,0,0,0
9996,1,1,0,1,1,1,0,0,1,1,...,0,1,1,1,1,0,0,1,0,0
9997,1,1,1,1,1,1,0,0,1,1,...,0,1,1,1,1,0,0,0,1,0
9998,1,1,1,1,1,1,1,1,1,1,...,0,0,1,1,0,1,0,0,0,0


In [89]:
# Reference structure for the selected dataset.
structure_right = pd.read_csv(f'data/{dataset}_true.csv')
# Edge lists from the two baseline methods (sparsebn and bidag).
sparsebn_edges = pd.read_csv(f'data/{dataset}_sparsebn.csv')
bidag_edges = pd.read_csv(f'data/{dataset}_bidag.csv')

# Quick visual check of the three edge tables.
print(structure_right, sparsebn_edges, bidag_edges)


           V1         V2
0      GOAL_2    GOAL_48
1     SNode_3      RApp1
2     SNode_3   SNode_47
3     SNode_4   SNode_75
4     SNode_4  SNode_123
..        ...        ...
333  GOAL_149   GOAL_150
334   APPLY77   GOAL_150
335  GOAL_150  SNode_151
336    GRAV78  SNode_151
337  GOAL_153  SNode_155

[338 rows x 2 columns]            V1         V2
0      GOAL_2    COMPO16
1      GOAL_2    GOAL_48
2     SNode_3      RApp1
3     SNode_3  VELOCITY7
4     SNode_3   SNode_47
..        ...        ...
349     TRY76   GOAL_147
350     TRY76   GOAL_149
351  GOAL_149   GOAL_147
352  GOAL_150  SNode_151
353    GRAV78  SNode_151

[354 rows x 2 columns]            V1          V2
0      GOAL_2     SNode_7
1      GOAL_2     GOAL_48
2      GOAL_2    CHOOSE35
3     SNode_3  DISPLACEM0
4     SNode_3       RApp1
..        ...         ...
326  GOAL_149    GOAL_150
327   APPLY77    GOAL_150
328  GOAL_150   SNode_151
329    GRAV78   SNode_151
330  GOAL_153   SNode_155

[331 rows x 2 columns]


## Experiments

Calculate the SHD scores of the baselines

In [82]:
# Score each baseline edge table against the reference structure and keep the 'SHD' entry.
# NOTE(review): the output recorded below this cell is a ValueError ("too many values to
# unpack"). Presumably precision_recall expects iterables of (source, target) pairs — as
# built for brave_edges further down — rather than the DataFrames read from CSV;
# TODO confirm against graph.precision_recall and convert if so.
pr_sparsebn = precision_recall(sparsebn_edges, structure_right)['SHD']
pr_bidag = precision_recall(bidag_edges, structure_right)['SHD']

print(pr_sparsebn, pr_bidag)


ValueError: too many values to unpack (expected 2)

In [84]:
# Brave experiment: build the MI proximity matrix, derive the candidate edges,
# run structure learning restricted to them, then score the learned edges.
time_of_experiment = []
brave_shd = []

start_time = time.time()
df_mis = get_proximity_matrix(df, df_coded, proximity_metric='MI')
possible_edges = get_possible_edges_by_brave(df, proximity_matrix=df_mis)
bn = structure_learning(
    df_coded, 'HC', get_nodes_type(df), 'K2', white_list=possible_edges)
# Keep only the first two items of every learned edge as a (source, target) tuple.
brave_edges = [(edge[0], edge[1]) for edge in bn['E']]
# The timing covers everything up to here; scoring below is not included.
time_of_experiment.append(time.time() - start_time)

pr_brave = precision_recall(brave_edges, structure_right)['SHD']
brave_shd.append(pr_brave)


In [90]:
# Random baseline: draw as many random candidate edges as the Brave run produced
# and score them directly against the reference structure.
time_of_experiment_random = []
random_shd = []

start_time = time.time()
possible_edges = get_possible_edges_randomly(df, len(brave_edges))
# Record the elapsed time; previously start_time was taken but never used.
time_of_experiment_random.append(time.time() - start_time)

pr_random = precision_recall(possible_edges, structure_right)['SHD']
# Store the score the same way the Brave cell fills brave_shd.
random_shd.append(pr_random)


In [92]:
# Compare the Brave SHD list with the random-baseline SHD value.
# NOTE(review): the recorded output shows brave_shd as an empty list, i.e. nothing had
# been appended to it in that session — re-run the Brave cell before comparing.
print(brave_shd)
print(pr_random)

[]
651


In [75]:
# Display the edges learned in the Brave experiment.
# NOTE(review): the recorded output lists node names (e.g. 'AppOK', 'PrtData') that do not
# appear among the 'andes' columns shown earlier, so it presumably stems from an earlier
# run on a different dataset — verify by re-running.
brave_edges

[('AppOK', 'AppData'),
 ('DataFile', 'AppData'),
 ('AppData', 'DS_LCLOK'),
 ('AppData', 'GDIIN'),
 ('AppData', 'EMFOK'),
 ('PrtOn', 'PrtStatOff'),
 ('PrtOn', 'PrtData'),
 ('PrtPaper', 'PrtStatPaper'),
 ('PrtDriver', 'GDIOUT'),
 ('EMFOK', 'DskLocal'),
 ('GDIIN', 'EMFOK'),
 ('GDIIN', 'GDIOUT'),
 ('GDIIN', 'PrtDriver'),
 ('GDIIN', 'DskLocal'),
 ('DrvSet', 'PrtPScript'),
 ('GDIOUT', 'PrtDataOut'),
 ('PrtDataOut', 'PC2PRT'),
 ('PrtDataOut', 'PrtFile'),
 ('PrtPath', 'NetOK'),
 ('PrtPath', 'DS_NTOK'),
 ('NtwrkCnfg', 'PrtIcon'),
 ('NtwrkCnfg', 'REPEAT'),
 ('PTROFFLINE', 'NetOK'),
 ('PTROFFLINE', 'PrtIcon'),
 ('PTROFFLINE', 'DS_NTOK'),
 ('NetOK', 'DS_NTOK'),
 ('NetOK', 'PC2PRT'),
 ('PrtCbl', 'LclOK'),
 ('PrtCbl', 'DS_LCLOK'),
 ('PrtPort', 'LclOK'),
 ('PrtPort', 'DS_LCLOK'),
 ('CblPrtHrdwrOK', 'LclOK'),
 ('CblPrtHrdwrOK', 'REPEAT'),
 ('CblPrtHrdwrOK', 'DS_LCLOK'),
 ('LclOK', 'DS_LCLOK'),
 ('LclOK', 'PC2PRT'),
 ('PrtMpTPth', 'DS_NTOK'),
 ('DS_NTOK', 'PC2PRT'),
 ('DS_LCLOK', 'PC2PRT'),
 ('PC2PRT',

In [76]:
# Display the recorded wall-clock durations (seconds, from time.time()) of the Brave run.
time_of_experiment

[10.52835988998413]