In [51]:
# Importar bibliotecas necessárias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx

# Para estrutura de rede e aprendizado dos parâmetros
from pgmpy.estimators import HillClimbSearch, BDeuScore, MaximumLikelihoodEstimator
from pgmpy.models import BayesianModel
from pgmpy.inference import VariableElimination

# Exemplo: Carregar o dataset "asia"
import bnlearn as bn
data = bn.import_example('asia')


# Visualizar as primeiras linhas do dataset
print("Dataset ASIA:")
print(data.head())

from pgmpy.estimators import HillClimbSearch, K2Score

hc = HillClimbSearch(data)
best_model = hc.estimate(scoring_method=K2Score(data))
print("Arestas aprendidas:", best_model.edges())

print("\nEstrutura da rede aprendida (arestas):")
print(best_model.edges())

# Criar o modelo bayesiano com base na estrutura aprendida
model = BayesianModel(best_model.edges())

# Aprender os CPDs (tabelas de probabilidades condicionais) com estimador de máxima verossimilhança
model.fit(data, estimator=MaximumLikelihoodEstimator)

print("\nCPDs da rede:")
for cpd in model.get_cpds():
    print(cpd, "\n")


Dataset ASIA:
   asia  tub  smoke  lung  bronc  either  xray  dysp
0     1    1      0     1      1       1     1     1
1     1    1      1     1      0       1     1     1
2     1    1      1     1      0       1     1     1
3     1    1      1     1      1       1     1     1
4     1    1      1     1      1       1     1     1


  0%|          | 0/1000000 [00:00<?, ?it/s]



Arestas aprendidas: [('asia', 'tub'), ('tub', 'either'), ('tub', 'bronc'), ('lung', 'either'), ('lung', 'smoke'), ('bronc', 'smoke'), ('bronc', 'dysp'), ('either', 'xray'), ('either', 'dysp'), ('either', 'bronc')]

Estrutura da rede aprendida (arestas):
[('asia', 'tub'), ('tub', 'either'), ('tub', 'bronc'), ('lung', 'either'), ('lung', 'smoke'), ('bronc', 'smoke'), ('bronc', 'dysp'), ('either', 'xray'), ('either', 'dysp'), ('either', 'bronc')]

CPDs da rede:
+---------+--------+
| asia(0) | 0.0089 |
+---------+--------+
| asia(1) | 0.9911 |
+---------+--------+ 

+--------+----------------------+----------------------+
| asia   | asia(0)              | asia(1)              |
+--------+----------------------+----------------------+
| tub(0) | 0.056179775280898875 | 0.011098779134295227 |
+--------+----------------------+----------------------+
| tub(1) | 0.9438202247191011   | 0.9889012208657048   |
+--------+----------------------+----------------------+ 

+-----------+---------+------

In [52]:
print("Nodos do modelo:", model.nodes())
print("Arestas do modelo:", model.edges())

Nodos do modelo: ['asia', 'tub', 'either', 'bronc', 'lung', 'smoke', 'dysp', 'xray']
Arestas do modelo: [('asia', 'tub'), ('tub', 'either'), ('tub', 'bronc'), ('either', 'xray'), ('either', 'dysp'), ('either', 'bronc'), ('bronc', 'smoke'), ('bronc', 'dysp'), ('lung', 'either'), ('lung', 'smoke')]


In [66]:
import plotly.graph_objects as go

G = nx.DiGraph(best_model.edges())

pos = nx.spring_layout(G)

# Prepare edge coordinates
edge_x = []
edge_y = []
for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=2, color='#888'),
    hoverinfo='none',
    mode='lines'
)

# Prepare node coordinates and texts (including CPT if available)
node_x = []
node_y = []
node_text = []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    # Search for the CPD corresponding to the node.
    cpd_str = None
    for cpd in model.get_cpds():
        if cpd.variable == node:
            percent_values = np.array(cpd.values) * 100
            # Format the CPD as a table-like string
            if percent_values.ndim == 0:
                rows = [f"{percent_values.item():.0f}%"]
            elif percent_values.ndim == 1:
                rows = [" | ".join(f"{val:.0f}%" for val in percent_values)]
            elif percent_values.ndim == 2:
                rows = []
                for row in percent_values:
                    rows.append(" | ".join(f"{val:.0f}%" for val in row))
            elif percent_values.ndim == 3:
                rows = []
                for matrix in percent_values:
                    rows.append("\n".join(" | ".join(f"{val:.0f}%" for val in row) for row in matrix))
            else:
                rows = [str(percent_values)]
            cpd_str = "\n".join(rows)
            break
    if cpd_str:
        node_text.append(f"{node}\n{cpd_str}")
    else:
        node_text.append(node)

node_trace = go.Scatter(
    x=node_x, 
    y=node_y,
    mode='markers+text',
    text=node_text,
    textposition='bottom center',
    marker=dict(
        size=20,
        color='lightblue',
        line_width=2
    ),
    hoverinfo='text'
)

fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title="Interactive DAG da Rede Bayesiana com CPTs",
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=20, l=5, r=5, t=40)
                )
)

fig.show()


In [None]:
# Criar o motor de inferência (equivalente ao InferenceEngine)
inference = VariableElimination(model)

# Consultar as marginais para a variável 'dysp'
marginal_dysp = inference.query(variables=['dysp'])
print("\nMarginais para 'dysp':")
print(marginal_dysp)

# Exemplo de intervenção: fixar o valor da variável 'tub' para 1 e ver o efeito em 'dysp'
marginal_dysp_intervened = inference.query(variables=['dysp'], evidence={'tub': 1})
print("\nMarginais para 'dysp' com intervenção (tub=1):")
print(marginal_dysp_intervened)





Marginais para 'dysp':
+---------+-------------+
| dysp    |   phi(dysp) |
| dysp(0) |      0.4467 |
+---------+-------------+
| dysp(1) |      0.5533 |
+---------+-------------+

Marginais para 'dysp' com intervenção (tub=1):
+---------+-------------+
| dysp    |   phi(dysp) |
| dysp(0) |      0.4427 |
+---------+-------------+
| dysp(1) |      0.5573 |
+---------+-------------+


In [55]:
# ============================================================
# Imputação de Dados Faltantes
# ============================================================
# Para imputação de dados faltantes, podemos utilizar o IterativeImputer do scikit-learn
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# Simular dados faltantes: introduzindo NaN em 10% das linhas da coluna 'dysp'
data_missing = data.copy()
missing_indices = data_missing.sample(frac=0.1, random_state=42).index
data_missing.loc[missing_indices, 'dysp'] = np.nan

# Aplicar imputação iterativa
imputer = IterativeImputer(random_state=0)
data_imputed = pd.DataFrame(imputer.fit_transform(data_missing), columns=data_missing.columns)
print("\nDataset com dados imputados:")
print(data_imputed.head())



Dataset com dados imputados:
   asia  tub  smoke  lung  bronc  either  xray      dysp
0   1.0  1.0    0.0   1.0    1.0     1.0   1.0  0.876489
1   1.0  1.0    1.0   1.0    0.0     1.0   1.0  1.000000
2   1.0  1.0    1.0   1.0    0.0     1.0   1.0  1.000000
3   1.0  1.0    1.0   1.0    1.0     1.0   1.0  0.893497
4   1.0  1.0    1.0   1.0    1.0     1.0   1.0  1.000000


In [56]:
# ============================================================
# Bootstrapping do Dataset
# ============================================================
# Para realizar bootstrapping, podemos usar a função 'resample' do scikit-learn
from sklearn.utils import resample

data_bootstrap = resample(data, n_samples=len(data), random_state=0)
print("\nExemplo de dataset bootstrapped:")
print(data_bootstrap.head())


Exemplo de dataset bootstrapped:
      asia  tub  smoke  lung  bronc  either  xray  dysp
2732     1    1      0     1      0       1     1     0
9845     1    1      0     1      0       1     1     0
3264     1    1      1     1      0       1     1     1
4859     1    1      1     1      1       1     1     0
9225     1    1      1     1      1       1     1     1
