### Construct a Bayesian Belief Network model using pomegranate

## Importing required libraries

In [1]:
import pandas as pd
from pandas_profiling import ProfileReport
from sklearn import preprocessing
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from pomegranate import *

### Loading dataset

In [2]:
kmdf = pd.read_excel('Data_User_Modeling_Dataset.xls', sheet_name='Training_Data')
print(kmdf.head())

    STG   SCG   STR   LPR   PEG       UNS
0  0.00  0.00  0.00  0.00  0.00  very_low
1  0.08  0.08  0.10  0.24  0.90      High
2  0.06  0.06  0.05  0.25  0.33       Low
3  0.10  0.10  0.15  0.65  0.30    Middle
4  0.08  0.08  0.08  0.98  0.24       Low


# Exploratory Data Analysis

### Performing pandas profile to understand the given data

In [3]:
profile = ProfileReport(kmdf)
profile.to_file(output_file='data_profiling_output.html')

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=19.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Export report to file'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [4]:
print (kmdf.info())
print("---------------------------")
print (kmdf.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 258 entries, 0 to 257
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   STG     258 non-null    float64
 1   SCG     258 non-null    float64
 2   STR     258 non-null    float64
 3   LPR     258 non-null    float64
 4   PEG     258 non-null    float64
 5    UNS    258 non-null    object 
dtypes: float64(5), object(1)
memory usage: 12.2+ KB
None
---------------------------
              STG         SCG         STR         LPR         PEG
count  258.000000  258.000000  258.000000  258.000000  258.000000
mean     0.371147    0.355674    0.468004    0.432713    0.458539
std      0.210271    0.211962    0.245940    0.248108    0.255211
min      0.000000    0.000000    0.000000    0.000000    0.000000
25%      0.240750    0.210000    0.291250    0.250000    0.250000
50%      0.327000    0.302500    0.490000    0.330000    0.500000
75%      0.495000    0.497500    0.690000    0.64750

In [5]:
# Checking for any null values in the dataset
print(kmdf.isnull().values.any())
print("---------------------------")
print(kmdf.isnull().sum())

False
---------------------------
STG     0
SCG     0
STR     0
LPR     0
PEG     0
 UNS    0
dtype: int64


In [6]:
# List of columns from dataset
print (kmdf.columns)
print (list(kmdf.columns))

Index(['STG', 'SCG', 'STR', 'LPR', 'PEG', ' UNS'], dtype='object')
['STG', 'SCG', 'STR', 'LPR', 'PEG', ' UNS']


# Data Preprocessing

In [7]:
# Replacing extra spaces in the column names
kmdf.columns = kmdf.columns.str.replace(' ', '')
kmdf.replace(to_replace='very_low', value='Very Low', inplace=True)

#### Mapping UNS values to numericals {'UNS': {'High': 1, 'Low': 2, 'Middle': 3, 'Very Low': 4}}

In [8]:
replace_map_comp = {}
labels = kmdf['UNS'].astype('category').cat.categories.tolist()
replace_map_comp['UNS'] = {k: v for k,v in zip(labels,list(range(1,len(labels)+1)))}

print (replace_map_comp)


{'UNS': {'High': 1, 'Low': 2, 'Middle': 3, 'Very Low': 4}}


In [9]:
kmdf.replace(replace_map_comp, inplace=True)
print (kmdf.head())
print("---------------------------")
print (kmdf.info())
print("---------------------------")
kmdf['UNS'] = kmdf['UNS'].astype('float64')
print (kmdf.info())

    STG   SCG   STR   LPR   PEG  UNS
0  0.00  0.00  0.00  0.00  0.00    4
1  0.08  0.08  0.10  0.24  0.90    1
2  0.06  0.06  0.05  0.25  0.33    2
3  0.10  0.10  0.15  0.65  0.30    3
4  0.08  0.08  0.08  0.98  0.24    2
---------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 258 entries, 0 to 257
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   STG     258 non-null    float64
 1   SCG     258 non-null    float64
 2   STR     258 non-null    float64
 3   LPR     258 non-null    float64
 4   PEG     258 non-null    float64
 5   UNS     258 non-null    int64  
dtypes: float64(5), int64(1)
memory usage: 12.2 KB
None
---------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 258 entries, 0 to 257
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   STG     258 non-null    float64
 1   SCG     258 non-null    float64
 2   STR     2

In [10]:
# Processed data
print (kmdf.head())

    STG   SCG   STR   LPR   PEG  UNS
0  0.00  0.00  0.00  0.00  0.00  4.0
1  0.08  0.08  0.10  0.24  0.90  1.0
2  0.06  0.06  0.05  0.25  0.33  2.0
3  0.10  0.10  0.15  0.65  0.30  3.0
4  0.08  0.08  0.08  0.98  0.24  2.0


## 1) Constructing the Bayesian Belief Network (BBN) model from given data

#### Assigning dataframe to X

In [11]:
X=kmdf.copy()

### Assuming edges between nodes (variables)

In [12]:
columns_list = list(kmdf.columns)
columns_tuples = [('STG','SCG'),('SCG','STR'),('SCG','LPR'),('SCG','PEG'),('LPR','UNS'),('UNS','PEG'),('STR','UNS')]

## Algorithm : chow-liu

In [13]:
model_Chowliu = BayesianNetwork.from_samples(X, algorithm='chow-liu')
print(model_Chowliu.structure)
model_Chowliu.bake()
model_Chowliu.fit(X)

((), (0,), (0,), (0,), (0,), (4,))


{
    "class" : "BayesianNetwork",
    "name" : "2041195769920",
    "structure" : [
        [],
        [
            0
        ],
        [
            0
        ],
        [
            0
        ],
        [
            0
        ],
        [
            4
        ]
    ],
    "states" : [
        {
            "class" : "State",
            "distribution" : {
                "class" : "Distribution",
                "dtype" : "numpy.float64",
                "name" : "DiscreteDistribution",
                "parameters" : [
                    {
                        "0.0" : 0.007751937984496124,
                        "0.24" : 0.007751937984496124,
                        "0.25" : 0.007751937984496124,
                        "0.26" : 0.003875968992248062,
                        "0.276" : 0.003875968992248062,
                        "0.385" : 0.003875968992248062,
                        "0.27" : 0.015503875968992248,
                        "0.28" : 0.015503875968992248,
   

# 2) Predict the probability of the user having the following characteristics :
#### STG - Study time for Goal Objective : 0.09
#### SCG - repetitions for Goal Objective : 0.15
#### STR - study time for relative objective : 0.4
#### LPR - Performance of user for relative objective : 0.1
#### PEG - Performace of user in goal objective : 0.6
#### UNS - 'middle' level performer

In [14]:
print(model_Chowliu.probability([[0.09, 0.15, 0.4, 0.1, 0.6, 3]]))

0.0


# 3) Predict the type of the user’s knowledge with the following characteristics:
#### STG - Study time for Goal Objective : 0
#### SCG - repetitions for Goal Objective : 0
#### STR - study time for relative objective : 0.5
#### LPR - Performance of user for relative objective : 0.2
#### PEG - Performace of user in goal objective : 0.85

In [19]:
print(model_Chowliu.predict([[0.00, 0.00, 0.5, 0.2, 0.85, None]]))

[array([0.0, 0.0, 0.5, 0.2, 0.85, 1.0], dtype=object)]


# 4) Infer the probability for the data :
#### STG - Study time for Goal Objective : 0.8
#### SCG - repetitions for Goal Objective : 0
#### STR - study time for relative objective : 0.78
#### STR - study time for relative objective : 0.8
#### LPR - Performance of user for relative objective : 0.9
#### PEG - Performace of user in goal objective : 0.9
#### UNS - 'middle' level performer

In [20]:
print(model_Chowliu.predict_proba([[0.8, 0, 0.78, 0.9, 0.9, None]]))

[array([0.8, 0, 0.78, 0.9, 0.9, {
                                   "class" : "Distribution",
                                   "dtype" : "numpy.float64",
                                   "name" : "DiscreteDistribution",
                                   "parameters" : [
                                       {
                                           "1.0" : 1.0,
                                           "2.0" : 0.0,
                                           "3.0" : 0.0,
                                           "4.0" : 0.0
                                       }
                                   ],
                                   "frozen" : false
                               }                                   ],
      dtype=object)]
