# Installing Libraries to run Bayesian Model in python

In [1]:
#pip install --upgrade pip
#pip install pgmpy

# Default Bayesian Model Python Code

In [2]:
import pandas as pd
import numpy as np
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator, BayesianEstimator
from pgmpy.estimators import HillClimbSearch, BicScore
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset into a pandas DataFrame
data = pd.read_csv('/Users/hittanshubhanderi/Downloads/train.csv')

# Select the relevant columns
cols = ['Loan Amount', 'Funded Amount', 'Funded Amount Investor', 'Term', 'Batch Enrolled',
        'Interest Rate', 'Employment Duration', 'Home Ownership', 'Verification Status', 'Loan Title',
        'Debt to Income', 'Delinquency - two years', 'Inquiries - six months', 'Open Account',
        'Public Record', 'Revolving Balance', 'Revolving Utilization', 'Total Accounts',
        'Initial List Status', 'Total Received Interest', 'Total Received Late Fee', 'Recoveries',
        'Collection Recovery Fee', 'Collections 12 Months Excl Med', 'Application Type',
        'Last Week Pay', 'Accounts Now Delinquent', 'Total Collection Amount',
        'Total Current Balance', 'Total Revolving Credit Limit', 'Loan Status']
data = data[cols]

# Preprocess the dataset by converting categorical variables into numerical values using one-hot encoding
data = pd.get_dummies(data, columns=['Term', 'Batch Enrolled', 'Home Ownership', 'Verification Status',
                                     'Loan Title', 'Initial List Status', 'Application Type', 'Loan Status'])

# Split the data into training and testing sets
train_data = data.sample(frac=0.8, random_state=1)
test_data = data.drop(train_data.index)

# Define the structure of the Bayesian Network using the BayesianModel class from pgmpy
model = BayesianModel([('Loan Amount', 'Loan Status'),
                       ('Funded Amount', 'Loan Status'),
                       ('Funded Amount Investor', 'Loan Status'),
                       ('Term_ 36 months', 'Loan Status'),
                       ('Batch Enrolled_', 'Loan Status'),
                       ('Interest Rate', 'Loan Status'),
                       ('Employment Duration', 'Loan Status'),
                       ('Home Ownership_MORTGAGE', 'Loan Status'),
                       ('Home Ownership_NONE', 'Loan Status'),
                       ('Home Ownership_OTHER', 'Loan Status'),
                       ('Home Ownership_OWN', 'Loan Status'),
                       ('Home Ownership_RENT', 'Loan Status'),
                       ('Verification Status_Not Verified', 'Loan Status'),
                       ('Verification Status_Source Verified', 'Loan Status'),
                       ('Verification Status_Verified', 'Loan Status'),
                       ('Loan Title_Business Loan', 'Loan Status'),
                       ('Loan Title_Car financing', 'Loan Status'),
                       ('Loan Title_Credit card refinancing', 'Loan Status'),
                       ('Loan Title_Debt consolidation', 'Loan Status'),
                       ('Loan Title_Green loan', 'Loan Status'),
                       ('Loan Title_Home buying', 'Loan Status'),
                       ('Loan Title_Home improvement', 'Loan Status'),
                       ('Loan Title_Major purchase', 'Loan Status'),
                       ('Loan Title_Medical expenses', 'Loan Status'),
                       ('Loan Title_Moving and relocation', 'Loan Status'),
                       ('Loan Title_Other', 'Loan Status'),
                       ('Loan Title_Vacation', 'Loan Status'),
                       ('Loan Title_Wedding Loan', 'Loan Status'),
                       ('Debt to Income', 'Loan Status'),
                       ('Delinquency - two years', 'Loan Status'),
                       ('Inquiries - six months', 'Loan Status'),
                       ('Open Account', 'Loan Status'),
                       ('Public Record', 'Loan Status'),
                       ('Revolving Balance', 'Loan Status'),
                       ('Revolving Utilization', 'Loan Status'),
                       ('Total Accounts', 'Loan Status'),
                       ('Initial List Status_f', 'Loan Status'),
                       ('Initial List Status_w', 'Loan Status'),
                       ('Total Received Interest', 'Loan Status'),
                       ('Total Received Late Fee', 'Loan Status'),
                       ('Recoveries', 'Loan Status'),
                       ('Collection Recovery Fee', 'Loan Status'),
                       ('Collections 12 Months Excl Med', 'Loan Status'),
                       ('Application Type_INDIVIDUAL', 'Loan Status'),
                       ('Application Type_JOINT', 'Loan Status'),
                       ('Last Week Pay', 'Loan Status'),
                       ('Accounts Now Delinquent', 'Loan Status'),
                       ('Total Collection Amount', 'Loan Status'),
                       ('Total Current Balance', 'Loan Status'),
                       ('Total Revolving Credit Limit', 'Loan Status')])

# Use Tan's algorithm to learn the structure of the Bayesian Network from the training data
estimator = BayesianEstimator(model, train_data)
model.fit(train_data, estimator=estimator, prior_type='BDeu')

# Evaluate the performance of the Bayesian Network on the testing data
preds = model.predict(test_data.drop('Loan Status', axis=1))
accuracy = accuracy_score(test_data['Loan Status'], preds)
precision = precision_score(test_data['Loan Status'], preds)
recall = recall_score(test_data['Loan Status'], preds)
f1 = f1_score(test_data['Loan Status'], preds)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)

# Use the Markov algorithm to learn the structure of the Bayesian Network from the training data
estimator = MaximumLikelihoodEstimator(model, train_data)
model.fit(train_data, estimator=estimator)

# Evaluate the performance of the Bayesian Network on the testing data
preds = model.predict(test_data.drop('Loan Status', axis=1))
accuracy = accuracy_score(test_data['Loan Status'], preds)
precision = precision_score(test_data['Loan Status'], preds)
recall = recall_score(test_data['Loan Status'], preds)
f1 = f1_score(test_data['Loan Status'], preds)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)

# Use the Markov algorithm with feature selection to learn the structure of the Bayesian Network from the training data
hc = HillClimbSearch(train_data, scoring_method=BicScore(train_data))
best_model = hc.estimate()
estimator = MaximumLikelihoodEstimator(best_model, train_data)
best_model.fit(train_data, estimator=estimator)

# Evaluate the performance of the Bayesian Network on the testing data
preds = best_model.predict(test_data.drop('Loan Status', axis=1))
accuracy = accuracy_score(test_data['Loan Status'], preds)
precision = precision_score(test_data['Loan Status'], preds)
recall = recall_score(test_data['Loan Status'], preds)
f1 = f1_score(test_data['Loan Status'], preds)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/hittanshubhanderi/Downloads/train.csv'

# Bayesian Model Python Code with Discretizetion the continuous variables

In [3]:
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator, BayesianEstimator
from pgmpy.estimators import HillClimbSearch, BDeuScore, K2Score
from pgmpy.estimators import PC
from pgmpy.inference import VariableElimination
from sklearn.preprocessing import KBinsDiscretizer

# Load the dataset
data = pd.read_csv('/Users/hittanshubhanderi/Downloads/trtr.csv')

# Select relevant columns for analysis
columns = ['Loan Amount', 'Term', 'Interest Rate', 'Employment Duration', 'Home Ownership', 'Verification Status', 'Loan Title', 'Debit to Income', 'Delinquency - two years', 'Inquires - six months', 'Open Account', 'Revolving Balance', 'Total Accounts', 'Loan Status']
data = data[columns]

# Discretize the continuous variables
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
discretized_cols = discretizer.fit_transform(data[['Loan Amount', 'Interest Rate', 'Debit to Income', 'Delinquency - two years', 'Inquires - six months', 'Open Account', 'Revolving Balance', 'Total Accounts']])
data[['Loan Amount', 'Interest Rate', 'Debit to Income', 'Delinquency - two years', 'Inquires - six months', 'Open Account', 'Revolving Balance', 'Total Accounts']] = discretized_cols.astype('int64')

# Split the data into training and test sets
train_data = data.sample(frac=0.7, random_state=1)
test_data = data.drop(train_data.index)

# Define the structure of the Bayesian Network using the Tan algorithm
model_tan = BayesianModel([('Loan Amount', 'Loan Status'), ('Term', 'Loan Status'), ('Interest Rate', 'Loan Status'), ('Employment Duration', 'Loan Status'),
                           ('Home Ownership', 'Loan Status'), ('Verification Status', 'Loan Status'), ('Loan Title', 'Loan Status'), ('Debit to Income', 'Loan Status'),
                           ('Delinquency - two years', 'Loan Status'), ('Inquires - six months', 'Loan Status'), ('Open Account', 'Loan Status'), ('Revolving Balance', 'Loan Status'),
                           ('Total Accounts', 'Loan Status')])

# Estimate the parameters of the Bayesian Network using the Maximum Likelihood Estimator
estimator_tan = MaximumLikelihoodEstimator(model_tan, train_data)
model_tan.fit(train_data)

# Define the structure of the Bayesian Network using the Markov algorithm
hc = HillClimbSearch(train_data, scoring_method=BDeuScore(train_data))
best_model = hc.estimate()
model_markov = BayesianModel(best_model.edges())

# Estimate the parameters of the Bayesian Network using the Bayesian Estimator
estimator_markov = BayesianEstimator(model_markov, train_data)
model_markov.fit(train_data, estimator_markov)

# Define the structure of the Bayesian Network using the Markov with feature selection algorithm
est = PC(train_data)
skeleton = est.estimate_skeleton()
model_markov_fs = est.skeleton_to_bn(skeleton)
model_markov_fs.fit(train_data, estimator=BayesianEstimator)

# Predict the loan status for the test data using the Variable Elimination method for the Tan algorithm
inference_tan = VariableElimination(model_tan)
query_tan = inference_tan.query(variables=['Loan Status'], evidence=test_data.to_dict('records')[0], joint=False)
print('Tan Algorithm:', query_tan['Loan Status'])

# Predict the loan status for the test data using the Variable Elimination method for the Markov algorithm
inference_markov = VariableElimination(model_markov)
query_markov = inference_markov.query(variables=['Loan Status'], evidence=test_data.to_dict('records')[0], joint=False)
print('Markov Algorithm:', query_markov['Loan Status'])

# Predict the loan status for the test data using the Variable Elimination method for the Markov with feature selection algorithm
inference_markov_fs = VariableElimination(model_markov_fs)
query_markov_fs = inference_markov_fs.query(variables=['Loan Status'], evidence=test_data.to_dict('records')[0], joint=False)
print('Markov with Feature Selection Algorithm:', query_markov_fs['Loan Status'])

: 

: 

# Bayesian Model with smaller sample dataset
### Taken from the SPSS Modeler with sample node

In [None]:
import numpy as np
import pandas as pd
import csv 
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.models import BayesianModel
from pgmpy.inference import VariableElimination

heartDisease = pd.read_csv('/Users/hittanshubhanderi/Downloads/outputpy.csv')
heartDisease = heartDisease.replace('?',np.nan)

print('Sample instances from the dataset are given below')
print(heartDisease.head())

print('\n Attributes and datatypes')
print(heartDisease.dtypes)

model= BayesianModel([('Loan Amount', 'Loan Status'), ('Term', 'Loan Status'), ('Interest Rate', 'Loan Status'), ('Home Amount', 'Loan Status'), ('Debit to Income', 'Loan Status'), ('Delinquency - two years', 'Loan Status'), ('Total Accounts', 'Loan Status')])
print('\nLearning CPD using Maximum likelihood estimators')
model.fit(heartDisease,estimator=MaximumLikelihoodEstimator)

print('\n Inferencing with Bayesian Network:')
HeartDiseasetest_infer = VariableElimination(model)

print('\n 1. Probability of HeartDisease given evidence= restecg')
q1=HeartDiseasetest_infer.query(variables=['Loan Status'],evidence={'Term':1})
print(q1)

print('\n 2. Probability of HeartDisease given evidence= cp')
q2=HeartDiseasetest_infer.query(variables=['Loan Status'],evidence={'Home Ownership':2})
print(q2)

Sample instances from the dataset are given below
         ID  Loan Amount  Funded Amount  Funded Amount Investor  Term   
0  65087372        10000          32236            12329.362860    59  \
1  41969824        34653          18985             4917.125714    58   
2  16437877        10997           7637            13290.569560    58   
3   4359149        17583          31569             8380.168440    58   
4  29423305        11410          25249            12361.413910    36   

   Interest Rate Grade Sub Grade Home Type   Home Amount  ...   
0      11.135007     B        C4  MORTGAGE  176346.62670  ...  \
1      10.590153     F        B1  MORTGAGE   60883.55633  ...   
2      14.649411     D        D2       OWN   65773.90572  ...   
3       8.999372     B        D1  MORTGAGE  187490.51610  ...   
4      13.415264     C        A1  MORTGAGE   82789.56019  ...   

  Total Collection Amount  Total Current Balance   
0                      31                 311301  \
1               



MemoryError: Unable to allocate 414. TiB for an array with shape (227862581760000,) and data type int16

# Bayesian Model with public runned model dataset 

In [None]:
import numpy as np
import pandas as pd
import csv 
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.models import BayesianModel
from pgmpy.inference import VariableElimination

heartDisease = pd.read_csv('/Users/hittanshubhanderi/Downloads/7-dataset.csv')
heartDisease = heartDisease.replace('?',np.nan)

print('Sample instances from the dataset are given below')
print(heartDisease.head())

print('\n Attributes and datatypes')
print(heartDisease.dtypes)

model= BayesianModel([('age','heartdisease'),('gender','heartdisease'),('exang','heartdisease'),('cp','heartdisease'),('heartdisease','restecg'),('heartdisease','chol')])
print('\nLearning CPD using Maximum likelihood estimators')
model.fit(heartDisease,estimator=MaximumLikelihoodEstimator)

print('\n Inferencing with Bayesian Network:')
HeartDiseasetest_infer = VariableElimination(model)

print('\n 1. Probability of HeartDisease given evidence= restecg')
q1=HeartDiseasetest_infer.query(variables=['heartdisease'],evidence={'restecg':1})
print(q1)

print('\n 2. Probability of HeartDisease given evidence= cp ')
q2=HeartDiseasetest_infer.query(variables=['heartdisease'],evidence={'cp':2})
print(q2)

Sample instances from the dataset are given below
   age  gender  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak   
0   63       1   1       145   233    1        2      150      0      2.3  \
1   67       1   4       160   286    0        2      108      1      1.5   
2   67       1   4       120   229    0        2      129      1      2.6   
3   37       1   3       130   250    0        0      187      0      3.5   
4   41       0   2       130   204    0        2      172      0      1.4   

   slope ca thal  heartdisease  
0      3  0    6             0  
1      2  3    3             2  
2      2  2    7             1  
3      3  0    3             0  
4      1  0    3             0  

 Attributes and datatypes
age               int64
gender            int64
cp                int64
trestbps          int64
chol              int64
fbs               int64
restecg           int64
thalach           int64
exang             int64
oldpeak         float64
slope             int6

  tabular_cpd.values = (cpd / cpd.sum(axis=0)).reshape(tabular_cpd.cardinality)


ValueError: Sum or integral of conditional probabilities for node heartdisease is not equal to 1.