# Learn CPTs of Bayesian Netork

In [47]:
%matplotlib inline
from pylab import *
import matplotlib.pyplot as plt

import os

In [48]:
import pyAgrum as gum
import pyAgrum.lib.notebook as gnb

## Loading two BNs

In [49]:
bn=gum.loadBN(os.path.join("out","VisitAsia.bif"))
bn2=gum.loadBN(os.path.join("out","VisitAsia.bif"))

gnb.sideBySide(bn,bn2,
               captions=['First bn','Second bn'])

0,1
G A A T T A->T E E T->E B B D D B->D E->D X X E->X S S S->B L L S->L L->E,G A A T T A->T E E T->E B B D D B->D E->D X X E->X S S S->B L L S->L L->E
First bn,Second bn


In [50]:
bn.cpt("D")

Unnamed: 0_level_0,Unnamed: 1_level_0,D,D
E,B,0,1
0,0,0.9,0.1
0,1,0.2,0.8
1,0,0.3,0.7
1,1,0.1,0.9


## Randomizing the parameters

In [51]:
bn.generateCPTs()
bn2.generateCPTs()

## Direct comparison of parameters

In [52]:
from IPython.display import HTML

gnb.sideBySide(bn.cpt(3),
               bn2.cpt(3),
               captions=['<h3>cpt of node 3 in first bn</h3>','<h3>same cpt in second bn</h3>'])


0,1
L S01 00.47190.5281 10.80730.1927,L S01 00.88720.1128 10.04900.9510
cpt of node 3 in first bn,same cpt in second bn

Unnamed: 0_level_0,L,L
S,0,1
0,0.4719,0.5281
1,0.8073,0.1927

Unnamed: 0_level_0,L,L
S,0,1
0,0.8872,0.1128
1,0.049,0.951


## Exact KL-divergence 

Since the BN is not too big, BruteForceKL can be computed ...

In [53]:
g1=gum.ExactBNdistance(bn,bn2)
before_learning=g1.compute()
print(before_learning['klPQ'])

4.6894916879993085


Just to be sure that the distance between a BN and itself is 0 :

In [54]:
g0=gum.ExactBNdistance(bn,bn)
print(g0.compute()['klPQ'])

0.0


## Generate a database from the original BN

In [24]:
gum.generateCSV(bn,os.path.join("out","test.csv"),10000,False)

 out/test.csv : [ ############################################################ ] 100%
Log2-Likelihood : -32684.47588536407


-32684.47588536407

## Using pandas for _counting

As an exercise, we will use pandas to learn the parameters. However **<font color="red">the simplest way to learn parameters is to use `BNLearner` :-)</font>**. Moreover, you will be able to add priors, etc.


In [55]:
# using bn as a template for the specification of variables in test.csv
learner=gum.BNLearner(os.path.join("out","test.csv"),bn) 
bn3=learner.learnParameters(bn.dag())

#the same but we add a Laplace adjustment as a Prior
learner=gum.BNLearner(os.path.join("out","test.csv"),bn) 
learner.useAprioriSmoothing(1000) # a count C is replaced by C+1000
bn4=learner.learnParameters(bn.dag())

after_pyAgrum_learning=gum.ExactBNdistance(bn,bn3).compute()
after_pyAgrum_learning_with_laplace=gum.ExactBNdistance(bn,bn4).compute()
print("without priori :{}".format(after_pyAgrum_learning['klPQ']))
print("with prior smooting(1000):{}".format(after_pyAgrum_learning_with_laplace['klPQ']))

without priori :3.1708503197342965
with prior smooting(1000):2.2272613442612275


### Now, let's try to learn the parameters with pandas

In [56]:
import pandas
df=pandas.read_csv(os.path.join("out","test.csv"))
df.head()

Unnamed: 0,X,L,A,D,T,B,S,E
0,0,0,0,1,0,1,1,0
1,0,0,0,1,0,1,1,0
2,0,0,0,0,0,1,1,0
3,0,0,0,1,0,1,0,0
4,0,0,0,1,0,1,1,0


We use the crosstab function in pandas

In [57]:
c=pandas.crosstab(df['D'],[df['T'],df['B']])
c

T,0,0,1,1
B,0,1,0,1
D,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,4679,911,15,5
1,697,3603,47,43


Playing with numpy reshaping, we retrieve the good form for the CPT from the pandas cross-table

In [58]:
gnb.sideBySide('<pre>'+str(np.array((c/c.sum().apply(np.float32)).transpose()).reshape(2,2,2))+'</pre>',
               bn.cpt(bn.idFromName('D')),
               captions=["<h3>Learned parameters in crosstab","<h3>Original parameters in bn</h3>"])

0,1
[[[0.8703497 0.1296503 ]  [0.20181657 0.79818343]]  [[0.24193548 0.75806452]  [0.10416667 0.89583333]]],D EB01 000.97350.0265 10.66760.3324 100.48850.5115 10.85650.1435
Learned parameters in crosstab,Original parameters in bn

Unnamed: 0_level_0,Unnamed: 1_level_0,D,D
E,B,0,1
0,0,0.9735,0.0265
0,1,0.6676,0.3324
1,0,0.4885,0.5115
1,1,0.8565,0.1435


## A global method for estimating Bayesian network parameters from CSV file using PANDAS

In [59]:
def computeCPTfromDF(bn,df,name):
    """
    Compute the CPT of variable "name" in the BN bn from the database df
    """
    id=bn.idFromName(name)
    domains=[bn.variableFromName(name).domainSize() 
             for name in bn.cpt(id).var_names]

    parents=list(bn.cpt(id).var_names)
    parents.pop()
    
    if (len(parents)>0):
        c=pandas.crosstab(df[name],[df[parent] for parent in parents])
        s=c/c.sum().apply(np.float32)
    else:
        s=df[name].value_counts(normalize=True)
        
    bn.cpt(id)[:]=np.array((s).transpose()).reshape(*domains)
    
def ParametersLearning(bn,df):
    """
    Compute the CPTs of every varaible in the BN bn from the database df
    """
    for name in bn.names():
        computeCPTfromDF(bn,df,name)

In [60]:
ParametersLearning(bn2,df)

KL has decreased a lot (if everything's OK)

In [61]:
g1=gum.ExactBNdistance(bn,bn2)
print("BEFORE LEARNING")
print(before_learning['klPQ'])
print
print("AFTER LEARNING")
print(g1.compute()['klPQ'])

BEFORE LEARNING
4.6894916879993085
AFTER LEARNING
3.1780361926481295


And CPTs should be close

In [62]:
gnb.sideBySide(bn.cpt(3),
               bn2.cpt(3),
               captions=["<h3>Original BN","<h3>learned BN</h3>"])

0,1
L S01 00.47190.5281 10.80730.1927,L S01 00.98830.0117 10.89470.1053
Original BN,learned BN

Unnamed: 0_level_0,L,L
S,0,1
0,0.4719,0.5281
1,0.8073,0.1927

Unnamed: 0_level_0,L,L
S,0,1
0,0.9883,0.0117
1,0.8947,0.1053


## Influence of the size of the database on the quality of learned parameters

What is the effect of increasing the size of the database on the KL ? We expect that the KL decreases to 0.

In [63]:
res=[]
for i in range(200,10001,50):
    ParametersLearning(bn2,df[:i])
    g1=gum.ExactBNdistance(bn,bn2)
    res.append(g1.compute()['klPQ'])
fig=figure(figsize=(10,6))
ax  = fig.add_subplot(1, 1, 1)
ax.plot(range(200,10001,50),res)
ax.set_xlabel("size of the database")
ax.set_ylabel("KL")
t=ax.set_title("klPQ(bn,learnedBN(x))")

ValueError: cannot reshape array of size 6 into shape (2,2,2)