# Imports

In [1]:
import sys
from typing import Optional, Union, List

parentdir = '/Users/jerzypro/Documents/GitHub/FEDOT/'
bamtdir = '/Users/jerzypro/Documents/GitHub/BAMT'
sys.path.insert(0, parentdir)
sys.path.insert(0, bamtdir)

from math import ceil
from pgmpy.estimators import K2Score
from pgmpy.models import BayesianNetwork
from fedot.core.pipelines.convert import graph_structure_as_nx_graph
from fedot.core.optimisers.optimizer import GraphGenerationParams
from fedot.core.optimisers.graph import OptGraph, OptNode
from fedot.core.optimisers.objective.objective_eval import ObjectiveEvaluate
from fedot.core.optimisers.objective.objective import Objective
from fedot.core.optimisers.gp_comp.operators.selection import SelectionTypesEnum
from fedot.core.optimisers.gp_comp.gp_optimiser import EvoGraphOptimiser, GPGraphOptimiserParameters, \
    GeneticSchemeTypesEnum
from fedot.core.optimisers.adapters import DirectAdapter
from fedot.core.dag.verification_rules import has_no_cycle, has_no_self_cycled_nodes
from examples.divided_bn import DividedBN
from fedot.core.composer.gp_composer.gp_composer import PipelineComposerRequirements
import bamt.Preprocessors as pp
import bamt.Networks as Nets
from sklearn import preprocessing
import random
import pandas as pd
import numpy as np
from copy import deepcopy
from fedot.core.dag.graph import Graph
import time



# Functions

In [2]:
class CustomGraphModel(Graph):
    """Graph subclass passed to the optimiser's adapter as the base graph class.

    Construction is delegated to ``Graph``; in addition every instance gets
    the attribute ``unique_pipeline_id`` set to 1.
    """

    def __init__(self, nodes: Optional[Union[OptNode, List[OptNode]]] = None):
        # Forward the (optional) node or list of nodes to the base initialiser.
        super().__init__(nodes)
        # Constant identifier set on every instance.
        # NOTE(review): presumably read by the optimisation framework -- confirm.
        self.unique_pipeline_id = 1


class CustomGraphNode(OptNode):
    """Optimiser node whose string form is the ``"name"`` entry of its content."""

    def __str__(self):
        node_content = self.content
        return node_content["name"]


# define the metric
def custom_metric(graph: CustomGraphModel, data: pd.DataFrame):
    """Return the negated K2 score of the network implied by a candidate graph.

    The edges of ``graph`` are combined with edges unpacked from the
    module-level ``evolutionary_edges`` and with every edge list stored in
    ``local_edges``. The resulting Bayesian network is scored on ``data``
    joined column-wise with ``initial_df``.

    Module-level names read here: ``evolutionary_edges``, ``root_nodes``,
    ``child_nodes``, ``local_edges`` and ``initial_df``.

    Args:
        graph: candidate graph being evaluated.
        data: data frame whose columns are added as nodes of the network.

    Returns:
        A one-element list containing the negated K2 score.
    """
    graph_nx, labels = graph_structure_as_nx_graph(graph)
    struct = [[str(labels[meta_edge[0]]), str(labels[meta_edge[1]])]
              for meta_edge in graph_nx.edges()]

    bn_model = BayesianNetwork(struct)
    bn_model.add_nodes_from(data.columns)

    # Unpack every meta edge into edges between the meta parent node and the
    # root/child nodes registered for the two ends of that edge.
    # The pairs are accumulated in a plain list: np.append returns a new array
    # and never modifies its argument, so its result must not be discarded.
    # NOTE(review): `evolutionary_edges` is a module-level name that the
    # visible code assigns only after the optimiser run has finished; confirm
    # it is defined before this metric is first evaluated.
    unpacked_hidden_edges = []
    for meta_edge in evolutionary_edges:
        meta_parent_node = str(meta_edge[0])
        local_root_nodes = root_nodes[int(meta_parent_node)]
        local_child_nodes = child_nodes[int(str(meta_edge[1]))]
        for root_node in local_root_nodes:
            for child_node in local_child_nodes:
                unpacked_hidden_edges.append([meta_parent_node, str(root_node)])
                unpacked_hidden_edges.append([str(child_node), meta_parent_node])

    bn_model.add_nodes_from(initial_df.columns)
    bn_model.add_edges_from(unpacked_hidden_edges)
    for key in local_edges:
        bn_model.add_edges_from(local_edges[key])

    # The model holds nodes from both frames, so score it on their column-wise join.
    full_data = pd.concat([data, initial_df], axis=1, join='inner')

    score = K2Score(full_data).score(bn_model)
    # Negated score is returned as the objective value.
    return [-score]


# define the crossover (edge exchange)
def custom_crossover_exchange_edges(graph_first: OptGraph, graph_second: OptGraph, max_depth):
    """Exchange a random half of the edges between copies of two parent graphs.

    Up to 100 attempts are made. In each attempt both parents are deep-copied,
    ``ceil(min(edge counts) / 2)`` edges are sampled from each copy and
    removed, and the edges sampled from one copy are added to the other copy
    unless they are already present there. The attempts stop as soon as one of
    the copies has gained an edge.

    If a copy was offered edges but gained none of them, it is replaced by a
    fresh deep copy of its parent. Exceptions raised along the way are printed
    and the graphs built so far are returned.

    Args:
        graph_first: first parent graph (not modified).
        graph_second: second parent graph (not modified).
        max_depth: accepted for interface compatibility; not used here.

    Returns:
        Tuple ``(new_graph_first, new_graph_second)``.
    """
    def find_node(graph: OptGraph, node):
        # Look up the node with the same name in `graph` through the
        # module-level name -> index mapping `dir_of_nodes`.
        return graph.nodes[dir_of_nodes[node.content['name']]]

    num_cros = 100
    # Bind the results before entering the try block so the final return is
    # always defined, even if the very first deepcopy raises.
    new_graph_first, new_graph_second = graph_first, graph_second
    try:
        for _ in range(num_cros):
            new_graph_first = deepcopy(graph_first)
            new_graph_second = deepcopy(graph_second)

            edges_1 = new_graph_first.operator.get_all_edges()
            edges_2 = new_graph_second.operator.get_all_edges()
            count = ceil(min(len(edges_1), len(edges_2)) / 2)
            choice_edges_1 = random.sample(edges_1, count)
            choice_edges_2 = random.sample(edges_2, count)

            # Remove the sampled edges from their own graphs.
            for meta_edge in choice_edges_1:
                new_graph_first.operator.disconnect_nodes(meta_edge[0], meta_edge[1], False)
            for meta_edge in choice_edges_2:
                new_graph_second.operator.disconnect_nodes(meta_edge[0], meta_edge[1], False)

            # Snapshot of the edges left after removal, used to detect changes.
            old_edges1 = new_graph_first.operator.get_all_edges()
            old_edges2 = new_graph_second.operator.get_all_edges()

            # Re-express each sampled edge with the nodes of the other graph.
            new_edges_2 = [[find_node(new_graph_second, i[0]), find_node(new_graph_second, i[1])]
                           for i in choice_edges_1]
            new_edges_1 = [[find_node(new_graph_first, i[0]), find_node(new_graph_first, i[1])]
                           for i in choice_edges_2]
            for meta_edge in new_edges_1:
                if meta_edge not in old_edges1:
                    new_graph_first.operator.connect_nodes(meta_edge[0], meta_edge[1])
            for meta_edge in new_edges_2:
                if meta_edge not in old_edges2:
                    new_graph_second.operator.connect_nodes(meta_edge[0], meta_edge[1])

            changed = (old_edges1 != new_graph_first.operator.get_all_edges()
                       or old_edges2 != new_graph_second.operator.get_all_edges())
            # When there is nothing to exchange (count == 0) every attempt is
            # identical, so one pass is enough.
            if changed or count == 0:
                break

        # A graph that was offered edges but gained none only lost edges:
        # fall back to an untouched copy of its parent.
        if old_edges1 == new_graph_first.operator.get_all_edges() and new_edges_1:
            new_graph_first = deepcopy(graph_first)
        if old_edges2 == new_graph_second.operator.get_all_edges() and new_edges_2:
            new_graph_second = deepcopy(graph_second)
    except Exception as ex:
        print(ex)
    return new_graph_first, new_graph_second


# define three mutation variants: adding, removing and reversing
# (each operates on a connection between two randomly chosen nodes)
def custom_mutation_add(graph: OptGraph, **kwargs):
    """Try to connect two randomly chosen nodes of ``graph``.

    Up to 100 random pairs are drawn. A pair is accepted when neither node's
    ``descriptive_id`` occurs in the other's ``ordered_subnodes_hierarchy()``;
    the first accepted pair is connected and the search stops. Any exception
    is reported through ``graph.log.warn``.

    Returns:
        The same ``graph`` object, modified in place when a pair was accepted.
    """
    def hierarchy_ids(node):
        return [n.descriptive_id for n in node.ordered_subnodes_hierarchy()]

    attempts = 100
    try:
        for _ in range(attempts):
            first = random.choice(graph.nodes)
            second = random.choice(graph.nodes)
            # Reject the pair if either node already appears in the other's hierarchy.
            if first.descriptive_id in hierarchy_ids(second):
                continue
            if second.descriptive_id in hierarchy_ids(first):
                continue
            graph.operator.connect_nodes(first, second)
            break

    except Exception as ex:
        graph.log.warn(f'Incorrect connection: {ex}')
    return graph


def custom_mutation_delete(graph: OptGraph, **kwargs):
    """Try to remove one randomly chosen existing connection from ``graph``.

    Up to 100 random node pairs are drawn; the first pair in which the second
    node is listed in the first node's ``nodes_from`` is disconnected.
    Exceptions are printed.

    Returns:
        The same ``graph`` object, modified in place when a connection was found.
    """
    attempts = 100
    try:
        for _ in range(attempts):
            target = random.choice(graph.nodes)
            candidate_parent = random.choice(graph.nodes)
            parents = target.nodes_from
            if parents is None or candidate_parent not in parents:
                continue
            graph.operator.disconnect_nodes(candidate_parent, target, False)
            break
    except Exception as ex:
        print(ex)
    return graph


def custom_mutation_reverse(graph: OptGraph, **kwargs):
    """Try to reverse one randomly chosen existing connection in ``graph``.

    Up to 100 random node pairs are drawn; for the first pair in which the
    second node is listed in the first node's ``nodes_from``,
    ``reverse_edge`` is called. Exceptions are printed.

    Returns:
        The same ``graph`` object, modified in place when a connection was found.
    """
    attempts = 100
    try:
        for _ in range(attempts):
            target = random.choice(graph.nodes)
            candidate_parent = random.choice(graph.nodes)
            parents = target.nodes_from
            if parents is None or candidate_parent not in parents:
                continue
            graph.operator.reverse_edge(candidate_parent, target)
            break
    except Exception as ex:
        print(ex)
    return graph


# define the rule that forbids duplicated nodes
def _has_no_duplicates(graph):
    """Verification rule: raise ``ValueError`` if the graph's labels repeat.

    Returns:
        ``True`` when every label value is distinct.
    """
    _, labels = graph_structure_as_nx_graph(graph)
    label_values = list(labels.values())
    distinct_values = set(label_values)
    if len(distinct_values) == len(label_values):
        return True
    raise ValueError('Custom graph has duplicates')

# Experiments

In [None]:
# source data file (must be located in 'examples/data/')
file = 'pigs'
# population size
pop_size = 10
# number of generations
n_generation = 50
# crossover probability
crossover_probability = 0.8
# mutation probability
mutation_probability = 0.9

data = pd.read_csv(parentdir + 'examples/data/' + file + '.csv')
if 'Unnamed: 0' in data.columns:
    # DataFrame.drop(..., inplace=True) returns None, so its result must not be
    # assigned back; use the non-inplace form instead.
    data = data.drop(columns=['Unnamed: 0'])

data.dropna(inplace=True)
data.reset_index(inplace=True, drop=True)

# The names local_edges, root_nodes, child_nodes and initial_df assigned below
# are module-level and are read by custom_metric.
initial_df = data

# initialize divided_bn

start_time = time.time()

divided_bn = DividedBN(data=data, max_local_structures=20)

divided_bn.set_local_structures(data, datatype="discrete")

local_edges = divided_bn.local_structures_edges

divided_bn.set_hidden_nodes(data=data)

hidden_df = pd.DataFrame.from_dict(divided_bn.hidden_nodes)

# Column labels are converted to strings so they can serve as node names.
hidden_df.columns = hidden_df.columns.astype(str)

root_nodes = divided_bn.root_nodes
child_nodes = divided_bn.child_nodes

vertices = list(hidden_df.columns)

encoder = preprocessing.LabelEncoder()
discretizer = preprocessing.KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
p = pp.Preprocessor([('encoder', encoder), ('discretizer', discretizer)])
discretized_data, _ = p.apply(hidden_df)

# dictionary: {node_name: unique_node_index}; read by the crossover's node lookup
dir_of_nodes = {name: index for index, name in enumerate(hidden_df.columns)}

# rules for Bayesian networks: no self-loops, no cycles, no duplicated nodes
rules = [has_no_self_cycled_nodes, has_no_cycle, _has_no_duplicates]

# set the fitness function for the optimiser
objective = Objective(custom_metric)
objective_eval = ObjectiveEvaluate(objective, data=discretized_data)
# initialise the starting network (no edges)
initial = [CustomGraphModel(nodes=[CustomGraphNode(nodes_from=None,
                                                   content={'name': vertex}) for vertex in vertices])]

requirements = PipelineComposerRequirements(
    primary=vertices,
    secondary=vertices,
    max_arity=100,
    max_depth=100,
    pop_size=pop_size,
    num_of_generations=n_generation,
    crossover_prob=crossover_probability,
    mutation_prob=mutation_probability
)

optimiser_parameters = GPGraphOptimiserParameters(
    genetic_scheme_type=GeneticSchemeTypesEnum.steady_state,
    selection_types=[SelectionTypesEnum.tournament],
    mutation_types=[custom_mutation_add, custom_mutation_delete, custom_mutation_reverse],
    crossover_types=[custom_crossover_exchange_edges]
)

graph_generation_params = GraphGenerationParams(
    adapter=DirectAdapter(base_graph_class=CustomGraphModel, base_node_class=CustomGraphNode),
    rules_for_constraint=rules)

optimiser = EvoGraphOptimiser(
    graph_generation_params=graph_generation_params,
    parameters=optimiser_parameters,
    requirements=requirements,
    initial_graph=initial,
    objective=objective)

# run the optimiser and keep the first returned graph
optimized_graph = optimiser.optimise(objective_eval)[0]

# report the elapsed time and the edges of the resulting graph
print("--- %s seconds ---" % (time.time() - start_time))

evolutionary_edges = optimized_graph.operator.get_all_edges()

print('evolutionary_edges', evolutionary_edges)


In [None]:
# optimized_graph.show()

# Both lists stay empty in this cell; the code that would fill them is
# disabled below.
external_edges = []
local_edges_merged = []

# for key in local_edges:
#     local_edges_merged += local_edges[key]

# for meta_edge in evolutionary_edges:
#     for root_node in root_nodes[meta_edge[0]]:
#         for child_node in child_nodes[meta_edge[1]]:
#             external_edges.append([root_node, child_node])

# For every edge found by the optimiser, pair the (integer) parent id with
# each root node of the parent entry and each child node of the child entry.
unpacked_hidden_edges = []

for evo_edge in evolutionary_edges:
    parent_id = int(str(evo_edge[0]))
    roots = root_nodes[parent_id]
    children = child_nodes[int(str(evo_edge[1]))]
    for root in roots:
        for child in children:
            unpacked_hidden_edges.extend(([parent_id, root], [child, parent_id]))

print('Unpacked hidden edges:', unpacked_hidden_edges)

all_edges = local_edges_merged + external_edges

print("Evo edges:", evolutionary_edges)

In [None]:
def custom_metric(graph: CustomGraphModel, data: pd.DataFrame):
    """Return the negated K2 score of the network implied by a candidate graph.

    The edges of ``graph`` are combined with edges unpacked from the
    module-level ``evolutionary_edges`` and with every edge list stored in
    ``local_edges``. The resulting Bayesian network is scored on ``data``
    joined column-wise with ``initial_df``.

    Module-level names read here: ``evolutionary_edges``, ``root_nodes``,
    ``child_nodes``, ``local_edges`` and ``initial_df``.

    Args:
        graph: candidate graph being evaluated.
        data: data frame whose columns are added as nodes of the network.

    Returns:
        A one-element list containing the negated K2 score.
    """
    graph_nx, labels = graph_structure_as_nx_graph(graph)
    struct = [[str(labels[meta_edge[0]]), str(labels[meta_edge[1]])]
              for meta_edge in graph_nx.edges()]

    bn_model = BayesianNetwork(struct)
    bn_model.add_nodes_from(data.columns)

    # Unpack every meta edge into edges between the meta parent node and the
    # root/child nodes registered for the two ends of that edge.
    # Node names are kept as strings so that they refer to the same nodes as
    # the string labels used in `struct`; an int id would create a second,
    # unrelated node.
    unpacked_hidden_edges = []
    for meta_edge in evolutionary_edges:
        meta_parent_node = str(meta_edge[0])
        local_root_nodes = root_nodes[int(meta_parent_node)]
        local_child_nodes = child_nodes[int(str(meta_edge[1]))]
        for root_node in local_root_nodes:
            for child_node in local_child_nodes:
                unpacked_hidden_edges.append([meta_parent_node, str(root_node)])
                unpacked_hidden_edges.append([str(child_node), meta_parent_node])

    bn_model.add_nodes_from(initial_df.columns)
    bn_model.add_edges_from(unpacked_hidden_edges)
    for key in local_edges:
        bn_model.add_edges_from(local_edges[key])

    # The model also contains the columns of initial_df as nodes, so the data
    # used for scoring must provide them as well.
    full_data = pd.concat([data, initial_df], axis=1, join='inner')

    score = K2Score(full_data).score(bn_model)
    # Negated score is returned as the objective value.
    return [-score]

In [60]:
# Assemble the network: start with the hidden-node columns as nodes.
bn_model = BayesianNetwork()
bn_model.add_nodes_from(hidden_df.columns)

# Edges are accumulated in a plain list: np.append returns a new array and
# never modifies its argument, so discarding its result collects nothing.
unpacked_hidden_edges = []

for meta_edge in evolutionary_edges:
    meta_parent_node = str(meta_edge[0])
    local_root_nodes = root_nodes[int(meta_parent_node)]
    local_child_nodes = child_nodes[int(str(meta_edge[1]))]
    for root_node in local_root_nodes:
        for child_node in local_child_nodes:
            unpacked_hidden_edges.append([meta_parent_node, str(root_node)])
            unpacked_hidden_edges.append([str(child_node), meta_parent_node])

# Add the original variables, then the unpacked edges and every local edge list.
bn_model.add_nodes_from(initial_df.columns)
bn_model.add_edges_from(unpacked_hidden_edges)
for key in local_edges:
    bn_model.add_edges_from(local_edges[key])

In [61]:
# Display the nodes currently registered in the assembled network.
bn_model.nodes()

NodeView(('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', 'p630400490', 'p48124091', 'p627270088', 'p392115290', 'p627276488', 'p392150190', 'p48109691', 'p630071089', 'p630067789', 'p630384190', 'p48109791', 'p83306289', 'p83456290', 'p277195691', 'p277114088', 'p197132888', 'p277155690', 'p277195791', 'p277111088', 'p277162190', 'p216124491', 'p230416387', 'p216077190', 'p216124591', 'p630396290', 'p630182291', 'p630031389', 'p630299990', 'p82019685', 'p803043885', 'p751512889', 'p392157391', 'p48147992', 'p630439091', 'p48148092', 'p630388390', 'p83567891', 'p83314589', 'p83470790', 'p48084891', 'p630014189', 'p630323790', 'p630155091', 'p543072191', 'p543472088', 'p543654190', 'p543072291', 'p547629489', 'p547097990', 'p197119188', 'p197140688', 'p753023491', 'p609183992', 'p543036891', 'p547633289', 'p547620489', 'p547054590', 'p543036991', 'p543378087', 'p197125588', 'p543517389', 'p543657690', 'p543084792', 'p63043009

In [None]:
# bn_model contains both the hidden-node columns and the original variables,
# so it is scored on the discretised hidden-node data joined column-wise with
# the original data; `data` alone lacks the hidden-node columns.
full_data = pd.concat([discretized_data, initial_df], axis=1, join='inner')
score = K2Score(full_data).score(bn_model)