# Probabilistic Graphical Models

In [1]:
# import subprocess
# import sys

# def install(packages):
#     for package in packages:
#         subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        
# install(["pgmpy"])

pip install pgmpy

conda install -c ankurankan pgmpy

In [2]:
import pgmpy as pgm
print(pgm.__version__)

from pgmpy.models import BayesianModel
from pgmpy.factors.discrete import TabularCPD

0.1.17


* **Joint Probability Distribution**: Probabilities are distributed simultaneously. All variables related to one another occur in the same time, so you need to specify all variables behavior to get an answer.
* **Conditional Probability Distribution**: Probabilities of A are distributed given a certain value for probaility B.
* **Marginal Probability Distribution**: Average out the probability of A when A and B are related.

<img src="PGM1.png">

In [3]:
model = BayesianModel([('Diff', 'Grade'), ('Intel', 'Grade'), ('Grade', 'Letter'), ('Intel', 'SAT')])

#diff = 'Diff'

cpd_d = TabularCPD(variable='Diff', variable_card=2, values=[[0.6], [0.4]])
cpd_i = TabularCPD(variable='Intel', variable_card=2, values=[[0.7], [0.3]])



In [4]:
cpd_g = TabularCPD(variable='Grade', variable_card=3,
                   values=[[0.3, 0.05, 0.9, 0.5],
                          [0.4, 0.25, 0.08, 0.3],
                          [0.3, 0.7, 0.02, 0.2]],
                  evidence = ['Intel', 'Diff'],
                   evidence_card=[2, 2])

cpd_l = TabularCPD(variable='Letter', variable_card=2,
                   values=[[0.1, 0.4, 0.99],
                          [0.9, 0.6, 0.01]],
                  evidence = ['Grade'],
                   evidence_card=[3])

cpd_s = TabularCPD(variable='SAT', variable_card=2,
                   values=[[0.95, 0.2],
                          [0.05, 0.8]],
                  evidence = ['Intel'],
                   evidence_card=[2])

In [5]:
model.add_cpds(cpd_d, cpd_i, cpd_g, cpd_l, cpd_s)

In [6]:
model.check_model()

True

In [7]:
print(cpd_g)

+----------+----------+----------+----------+----------+
| Intel    | Intel(0) | Intel(0) | Intel(1) | Intel(1) |
+----------+----------+----------+----------+----------+
| Diff     | Diff(0)  | Diff(1)  | Diff(0)  | Diff(1)  |
+----------+----------+----------+----------+----------+
| Grade(0) | 0.3      | 0.05     | 0.9      | 0.5      |
+----------+----------+----------+----------+----------+
| Grade(1) | 0.4      | 0.25     | 0.08     | 0.3      |
+----------+----------+----------+----------+----------+
| Grade(2) | 0.3      | 0.7      | 0.02     | 0.2      |
+----------+----------+----------+----------+----------+


In [8]:
print(model.get_cpds('Grade'))

+----------+----------+----------+----------+----------+
| Intel    | Intel(0) | Intel(0) | Intel(1) | Intel(1) |
+----------+----------+----------+----------+----------+
| Diff     | Diff(0)  | Diff(1)  | Diff(0)  | Diff(1)  |
+----------+----------+----------+----------+----------+
| Grade(0) | 0.3      | 0.05     | 0.9      | 0.5      |
+----------+----------+----------+----------+----------+
| Grade(1) | 0.4      | 0.25     | 0.08     | 0.3      |
+----------+----------+----------+----------+----------+
| Grade(2) | 0.3      | 0.7      | 0.02     | 0.2      |
+----------+----------+----------+----------+----------+


In [9]:
model.get_cpds()

[<TabularCPD representing P(Diff:2) at 0x7f7a71bf3750>,
 <TabularCPD representing P(Intel:2) at 0x7f7a78cb88d0>,
 <TabularCPD representing P(Grade:3 | Intel:2, Diff:2) at 0x7f7a78cfa090>,
 <TabularCPD representing P(Letter:2 | Grade:3) at 0x7f7a78cfa050>,
 <TabularCPD representing P(SAT:2 | Intel:2) at 0x7f7a78cb8a90>]

In [10]:
cpd_d_sn = TabularCPD(variable='Diff', variable_card=2, values=[[0.6], [0.4]], 
                      state_names={'Diff': ['Easy', 'Hard']})
cpd_i_sn = TabularCPD(variable='Intel', variable_card=2, values=[[0.7], [0.3]], 
                      state_names={'Intel': ['Dumb', 'Intelligent']})
cpd_g_sn = TabularCPD(variable='Grade', variable_card=3, 
                      values=[[0.3, 0.05, 0.9,  0.5],
                              [0.4, 0.25, 0.08, 0.3],
                              [0.3, 0.7,  0.02, 0.2]],
                      evidence=['Intel', 'Diff'],
                      evidence_card=[2, 2],
                      state_names={'Grade': ['A', 'B', 'C'],
                                   'Intel': ['Dumb', 'Intelligent'],
                                   'Diff': ['Easy', 'Hard']})

cpd_l_sn = TabularCPD(variable='Letter', variable_card=2, 
                      values=[[0.1, 0.4, 0.99],
                              [0.9, 0.6, 0.01]],
                      evidence=['Grade'],
                      evidence_card=[3],
                      state_names={'Letter': ['Bad', 'Good'],
                                   'Grade': ['A', 'B', 'C']})

cpd_s_sn = TabularCPD(variable='SAT', variable_card=2,
                      values=[[0.95, 0.2],
                              [0.05, 0.8]],
                      evidence=['Intel'],
                      evidence_card=[2],
                      state_names={'SAT': ['Bad', 'Good'],
                                   'Intel': ['Dumb', 'Intelligent']})

In [11]:
model.add_cpds(cpd_d_sn, cpd_i_sn, cpd_g_sn, cpd_l_sn, cpd_s_sn)
model.check_model()

True

In [12]:
print(model.get_cpds('Grade'))

+----------+-------------+-----+--------------------+
| Intel    | Intel(Dumb) | ... | Intel(Intelligent) |
+----------+-------------+-----+--------------------+
| Diff     | Diff(Easy)  | ... | Diff(Hard)         |
+----------+-------------+-----+--------------------+
| Grade(A) | 0.3         | ... | 0.5                |
+----------+-------------+-----+--------------------+
| Grade(B) | 0.4         | ... | 0.3                |
+----------+-------------+-----+--------------------+
| Grade(C) | 0.3         | ... | 0.2                |
+----------+-------------+-----+--------------------+


In [13]:
# Getting the local independencies of a variable.
model.local_independencies('Grade')

(Grade ⟂ SAT | Intel, Diff)

In [14]:
# Getting all the local independencies in the network.
model.local_independencies(['Diff', 'Intel', 'SAT', 'Grade', 'Letter'])

(Diff ⟂ SAT, Intel)
(Intel ⟂ Diff)
(SAT ⟂ Grade, Diff, Letter | Intel)
(Grade ⟂ SAT | Intel, Diff)
(Letter ⟂ SAT, Intel, Diff | Grade)

In [15]:
model.active_trail_nodes('Diff')

{'Diff': {'Diff', 'Grade', 'Letter'}}

In [16]:
model.active_trail_nodes('Diff', observed='Grade')

{'Diff': {'Diff', 'Intel', 'SAT'}}

In [17]:
model.get_parents('Grade')

['Diff', 'Intel']

## Case 1 - Head to Tail

What is the probability of someone getting a Grade A if we know nothing about it?

In [18]:
from pgmpy.inference import VariableElimination

infer = VariableElimination(model)
infG = infer.query(['Grade'])
print(infG)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

+----------+--------------+
| Grade    |   phi(Grade) |
| Grade(A) |       0.3620 |
+----------+--------------+
| Grade(B) |       0.2884 |
+----------+--------------+
| Grade(C) |       0.3496 |
+----------+--------------+


What is the probability of someone getting a Grade A in a particular case?

In [19]:
print(infer.query(['Grade'], evidence={'Diff': 'Easy', 'Intel': 'Intelligent'}))

0it [00:00, ?it/s]

0it [00:00, ?it/s]

+----------+--------------+
| Grade    |   phi(Grade) |
| Grade(A) |       0.9000 |
+----------+--------------+
| Grade(B) |       0.0800 |
+----------+--------------+
| Grade(C) |       0.0200 |
+----------+--------------+


### Highest probability

In [20]:
infer.map_query(['Grade'])

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

{'Grade': 'A'}

In [21]:
#Highest probability of a particular case
infer.map_query(['Grade'], evidence={'Diff': 'Easy', 'Intel': 'Intelligent'})

0it [00:00, ?it/s]

0it [00:00, ?it/s]

{'Grade': 'A'}

What is the highest probability of a Letter given one evidence?

In [22]:
infer.map_query(['Letter'], evidence={'Diff': 'Hard'})

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

{'Letter': 'Bad'}

## Case 2 - Tail to Tail

In [23]:
print(infer.query(['Grade'], evidence={'Intel': 'Dumb'}))

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

+----------+--------------+
| Grade    |   phi(Grade) |
| Grade(A) |       0.2000 |
+----------+--------------+
| Grade(B) |       0.3400 |
+----------+--------------+
| Grade(C) |       0.4600 |
+----------+--------------+


What is the probability of someone getting a Grade A knowing the SAT score?

In [24]:
print(infer.query(['Grade'], evidence={'SAT': 'Bad'}))

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

+----------+--------------+
| Grade    |   phi(Grade) |
| Grade(A) |       0.2447 |
+----------+--------------+
| Grade(B) |       0.3258 |
+----------+--------------+
| Grade(C) |       0.4295 |
+----------+--------------+


## Case 3 - Head to Head

In [25]:
print(infer.query(['Diff']))

0it [00:00, ?it/s]

0it [00:00, ?it/s]

+------------+-------------+
| Diff       |   phi(Diff) |
| Diff(Easy) |      0.6000 |
+------------+-------------+
| Diff(Hard) |      0.4000 |
+------------+-------------+


In [26]:
print(infer.query(['Diff'], evidence={'Intel': 'Intelligent'}))

0it [00:00, ?it/s]

0it [00:00, ?it/s]

+------------+-------------+
| Diff       |   phi(Diff) |
| Diff(Easy) |      0.6000 |
+------------+-------------+
| Diff(Hard) |      0.4000 |
+------------+-------------+


In [27]:
grade = infer.map_query(['Grade'], evidence={'Intel': 'Intelligent'})

print(infer.query(['Diff'], evidence={'Grade':grade['Grade'], 'Intel': 'Intelligent'}))

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

+------------+-------------+
| Diff       |   phi(Diff) |
| Diff(Easy) |      0.7297 |
+------------+-------------+
| Diff(Hard) |      0.2703 |
+------------+-------------+


In [28]:
from pgmpy.inference import BeliefPropagation

bp = BeliefPropagation(model)
bp.calibrate()
print(bp.query(variables=['Diff', 'Intel', 'SAT', 'Grade', 'Letter']))

0it [00:00, ?it/s]

+--------------------+------------+----------+-----------+--------------+------------------------------------+
| Intel              | Diff       | Grade    | SAT       | Letter       |   phi(Intel,Diff,Grade,SAT,Letter) |
| Intel(Dumb)        | Diff(Easy) | Grade(A) | SAT(Bad)  | Letter(Bad)  |                             0.0120 |
+--------------------+------------+----------+-----------+--------------+------------------------------------+
| Intel(Dumb)        | Diff(Easy) | Grade(A) | SAT(Bad)  | Letter(Good) |                             0.1077 |
+--------------------+------------+----------+-----------+--------------+------------------------------------+
| Intel(Dumb)        | Diff(Easy) | Grade(A) | SAT(Good) | Letter(Bad)  |                             0.0006 |
+--------------------+------------+----------+-----------+--------------+------------------------------------+
| Intel(Dumb)        | Diff(Easy) | Grade(A) | SAT(Good) | Letter(Good) |                             0.0057 |
+

### Classification

In [29]:
import pandas as pd
import numpy as np

raw_data = np.random.randint(low=0,high=2,size=(1000, 5))


data = pd.DataFrame(raw_data,columns=["Diff", "Intel", "Grade","Letter", "SAT"])

print(data)

     Diff  Intel  Grade  Letter  SAT
0       1      1      0       1    1
1       1      0      1       1    0
2       1      1      1       0    1
3       0      0      1       1    1
4       1      0      0       1    0
..    ...    ...    ...     ...  ...
995     0      0      1       0    0
996     1      1      1       0    0
997     1      0      0       1    0
998     1      0      0       1    1
999     1      1      0       0    1

[1000 rows x 5 columns]


In [30]:
train = data[: int(data.shape[0] * 0.75)]

print(train)

     Diff  Intel  Grade  Letter  SAT
0       1      1      0       1    1
1       1      0      1       1    0
2       1      1      1       0    1
3       0      0      1       1    1
4       1      0      0       1    0
..    ...    ...    ...     ...  ...
745     1      1      1       1    0
746     1      1      1       1    1
747     0      0      0       1    0
748     0      1      0       1    0
749     0      0      0       1    0

[750 rows x 5 columns]


In [31]:
model.fit(train)
print(model.get_cpds('Diff'))

+---------+----------+
| Diff(0) | 0.497333 |
+---------+----------+
| Diff(1) | 0.502667 |
+---------+----------+


In [32]:
test = data[int(0.75 * data.shape[0]) : data.shape[0]]

test.drop('Grade', axis=1, inplace=True)

model.predict(test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


  0%|          | 0/16 [00:00<?, ?it/s]

Unnamed: 0,Grade
0,0
1,0
2,0
3,0
4,0
...,...
245,1
246,0
247,0
248,0
