In [23]:
import pandas as pd
import numpy as np
from config import *
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


from pgmpy.models import BayesianNetwork
from pgmpy.estimators import *
from pgmpy.inference import DBNInference, VariableElimination, BeliefPropagation

In [24]:

# read csv
application_df = pd.read_csv(application_record, index_col=None)
credit_df = pd.read_csv(credit_record, index_col=None)
# get label for each sample
ID_set = set(application_df.ID) & set(credit_df.ID)
eval_credit_record = credit_df[credit_df["ID"].isin(ID_set)]
# We consider people who past due 30 days pluse as bad credit
val = { "3", "4", "5"}
bad_record_ID = eval_credit_record[credit_df["STATUS"].isin(val)].drop(["MONTHS_BALANCE"], axis = 1).reset_index(drop=True).ID.unique()
bad_record_ID = list(bad_record_ID)
good_record_ID = list(set(eval_credit_record.ID.unique()) - set(bad_record_ID))
# 0 for good user, 1 for bad user 
label_df = pd.DataFrame({"ID": good_record_ID + (bad_record_ID), "Label": ([0] * len(good_record_ID)) + ([1]*len(bad_record_ID))})
label_df = label_df.sort_values(["ID"])
# split numercial feature into categorical 
feature_df = application_df[application_df.ID.isin(set(label_df.ID))]
feature_df["AMT_INCOME_TOTAL"] = pd.qcut(feature_df["AMT_INCOME_TOTAL"], q=5, labels=False) 
feature_df = feature_df.sort_values(["ID"])
# split dataset
X_train, X_test, y_train, y_test = train_test_split(feature_df, label_df,test_size=0.2, random_state=42)
train_df = pd.merge(X_train, y_train, on="ID")
test_df = pd.merge(X_test, y_test, on="ID")
test_dict = test_df[["AMT_INCOME_TOTAL"]].to_dict("records")


  bad_record_ID = eval_credit_record[credit_df["STATUS"].isin(val)].drop(["MONTHS_BALANCE"], axis = 1).reset_index(drop=True).ID.unique()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_df["AMT_INCOME_TOTAL"] = pd.qcut(feature_df["AMT_INCOME_TOTAL"], q=5, labels=False)


In [33]:
len(feature_df)

36457

In [25]:
model = BayesianNetwork([('AMT_INCOME_TOTAL', 'Label'), ('NAME_EDUCATION_TYPE', 'Label')])
model.fit(train_df, estimator=BayesianEstimator)

In [26]:
# Perform inference
infer = BeliefPropagation(model)
result = []
for i in test_dict:
    query_result = infer.query(variables=['Label'], evidence=i)
    result.append(np.argmax(query_result.values))
confusion_matrix(y_test["Label"], result, labels=[0, 1])

array([[7237,    0],
       [  55,    0]])

In [27]:
from pgmpy.factors.discrete.CPD import TabularCPD

def print_full(cpd):
    backup = TabularCPD._truncate_strtable
    TabularCPD._truncate_strtable = lambda self, x: x
    print(cpd)
    TabularCPD._truncate_strtable = backup

In [28]:
print_full(model.get_cpds('Label'))

+---------------------+--------------------------------------+---------------------------------------+----------------------------------------+--------------------------------------+----------------------------------------------------+--------------------------------------+---------------------------------------+----------------------------------------+--------------------------------------+----------------------------------------------------+--------------------------------------+---------------------------------------+----------------------------------------+--------------------------------------+----------------------------------------------------+--------------------------------------+---------------------------------------+----------------------------------------+--------------------------------------+----------------------------------------------------+--------------------------------------+---------------------------------------+----------------------------------------+---------

In [30]:
print_full(model.get_cpds('AMT_INCOME_TOTAL'))

+---------------------+----------+
| AMT_INCOME_TOTAL(0) | 0.199073 |
+---------------------+----------+
| AMT_INCOME_TOTAL(1) | 0.162049 |
+---------------------+----------+
| AMT_INCOME_TOTAL(2) | 0.220301 |
+---------------------+----------+
| AMT_INCOME_TOTAL(3) | 0.214421 |
+---------------------+----------+
| AMT_INCOME_TOTAL(4) | 0.204156 |
+---------------------+----------+


In [32]:
test_dict

[{'AMT_INCOME_TOTAL': 0},
 {'AMT_INCOME_TOTAL': 0},
 {'AMT_INCOME_TOTAL': 4},
 {'AMT_INCOME_TOTAL': 2},
 {'AMT_INCOME_TOTAL': 0},
 {'AMT_INCOME_TOTAL': 0},
 {'AMT_INCOME_TOTAL': 3},
 {'AMT_INCOME_TOTAL': 0},
 {'AMT_INCOME_TOTAL': 4},
 {'AMT_INCOME_TOTAL': 3},
 {'AMT_INCOME_TOTAL': 3},
 {'AMT_INCOME_TOTAL': 2},
 {'AMT_INCOME_TOTAL': 4},
 {'AMT_INCOME_TOTAL': 4},
 {'AMT_INCOME_TOTAL': 1},
 {'AMT_INCOME_TOTAL': 0},
 {'AMT_INCOME_TOTAL': 2},
 {'AMT_INCOME_TOTAL': 1},
 {'AMT_INCOME_TOTAL': 3},
 {'AMT_INCOME_TOTAL': 0},
 {'AMT_INCOME_TOTAL': 3},
 {'AMT_INCOME_TOTAL': 2},
 {'AMT_INCOME_TOTAL': 1},
 {'AMT_INCOME_TOTAL': 1},
 {'AMT_INCOME_TOTAL': 2},
 {'AMT_INCOME_TOTAL': 4},
 {'AMT_INCOME_TOTAL': 2},
 {'AMT_INCOME_TOTAL': 1},
 {'AMT_INCOME_TOTAL': 1},
 {'AMT_INCOME_TOTAL': 3},
 {'AMT_INCOME_TOTAL': 1},
 {'AMT_INCOME_TOTAL': 2},
 {'AMT_INCOME_TOTAL': 4},
 {'AMT_INCOME_TOTAL': 3},
 {'AMT_INCOME_TOTAL': 2},
 {'AMT_INCOME_TOTAL': 0},
 {'AMT_INCOME_TOTAL': 3},
 {'AMT_INCOME_TOTAL': 0},
 {'AMT_INCOM