In [123]:
from causalnex.evaluation import classification_report
from sklearn.model_selection import train_test_split
from causalnex.network import BayesianNetwork
from causalnex.structure import StructureModel
import warnings
import pandas as pd
from functions import cross_entropy
from credit import *

warnings.filterwarnings('ignore')

In [125]:
credit_card = pd.read_csv('credit_default.csv')
credit_card = credit_card.rename(
    columns={'default payment next month': 'TARGET'})

for col in credit_card.columns.tolist():
    credit_card[col] = pd.to_numeric(credit_card[col])

credit_card.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,TARGET
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [126]:
credit_card['TOTAL_PAY'] = credit_card[['PAY_AMT1', 'PAY_AMT2',
                                        'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']].sum(axis=1)
credit_card['TOTAL_BILL'] = credit_card[['BILL_AMT1', 'BILL_AMT2',
                                         'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']].sum(axis=1)

credit_card['PAY_TO_BILL'] = credit_card['TOTAL_PAY'] / \
    credit_card['TOTAL_BILL']

credit_card['DISCRETIZED_PAY_TO_BILL'] = credit_card.PAY_TO_BILL.apply(
    pay_to_bill_discretizer)

credit_card_ss = credit_card[["EDUCATION",
                              "AGE", "MARRIAGE", "TARGET", "DISCRETIZED_PAY_TO_BILL"]]
credit_card_ss["AGE_GROUP"] = credit_card_ss.AGE.apply(age_discretizer)
del credit_card_ss["AGE"]

In [127]:
credit_card_ss

Unnamed: 0,EDUCATION,MARRIAGE,TARGET,DISCRETIZED_PAY_TO_BILL,AGE_GROUP
0,2,1,1,0,0
1,2,2,1,1,1
2,2,2,0,0,1
3,2,1,0,0,2
4,2,1,0,2,3
...,...,...,...,...,...
29995,3,1,0,0,2
29996,3,2,0,2,2
29997,2,2,1,1,2
29998,3,1,1,2,2


# LABELS

In [128]:
education_map = {
    1: 'graduate school',
    2: 'university',
    3: 'high school',
    4: 'others',
    5: 'others',
    6: 'others',
    0: 'others',

}

marriage_map = {
    1: 'married',
    2: 'single',
    3: 'others',
    0: 'others'
}

target_map = {
    1: 'default',
    0: 'no default'
}

age_map = {
    0: '< 25',
    1: '< 35',
    2: '< 45',
    3: '>= 45'
}

ptb_map = {
    0: '< 0.25',
    1: '< 0.50',
    2: '< 1',
    3: '>= 1'
}

In [129]:
credit_card_ss['EDUCATION'] = credit_card_ss.EDUCATION.map(education_map)
credit_card_ss['AGE_GROUP'] = credit_card_ss.AGE_GROUP.map(age_map)
credit_card_ss['MARRIAGE'] = credit_card_ss.MARRIAGE.map(marriage_map)
credit_card_ss['TARGET'] = credit_card_ss.TARGET.map(target_map)
credit_card_ss['DISCRETIZED_PAY_TO_BILL'] = credit_card_ss.DISCRETIZED_PAY_TO_BILL.map(ptb_map)

In [130]:
credit_card_ss

Unnamed: 0,EDUCATION,MARRIAGE,TARGET,DISCRETIZED_PAY_TO_BILL,AGE_GROUP
0,university,married,default,< 0.25,< 25
1,university,single,default,< 0.50,< 35
2,university,single,no default,< 0.25,< 35
3,university,married,no default,< 0.25,< 45
4,university,married,no default,< 1,>= 45
...,...,...,...,...,...
29995,high school,married,no default,< 0.25,< 45
29996,high school,single,no default,< 1,< 45
29997,university,single,default,< 0.50,< 45
29998,high school,married,default,< 1,< 45


# MODEL

In [131]:
X, y = credit_card_ss[[
    i for i in credit_card_ss.columns if i != "TARGET"]], credit_card_ss["TARGET"]

print(X)
print(y)


train, test = train_test_split(credit_card_ss, train_size=0.9)


# Build Model
sm = StructureModel()

sm.add_edges_from([
    ('AGE_GROUP', 'MARRIAGE'),
    ('AGE_GROUP', 'EDUCATION'),
    ('EDUCATION', 'TARGET'),
    ('AGE_GROUP', 'TARGET'),
    ('DISCRETIZED_PAY_TO_BILL', 'TARGET'),
    ('AGE_GROUP', 'DISCRETIZED_PAY_TO_BILL'),
    ('EDUCATION', 'DISCRETIZED_PAY_TO_BILL')
])

# Train Model

bn = BayesianNetwork(sm)

bn = bn.fit_node_states(credit_card_ss)
bn = bn.fit_cpds(train, method="BayesianEstimator", bayes_prior="K2")

classification_report(bn, test, "TARGET")

         EDUCATION MARRIAGE DISCRETIZED_PAY_TO_BILL AGE_GROUP
0       university  married                  < 0.25      < 25
1       university   single                  < 0.50      < 35
2       university   single                  < 0.25      < 35
3       university  married                  < 0.25      < 45
4       university  married                     < 1     >= 45
...            ...      ...                     ...       ...
29995  high school  married                  < 0.25      < 45
29996  high school   single                     < 1      < 45
29997   university   single                  < 0.50      < 45
29998  high school  married                     < 1      < 45
29999   university  married                  < 0.25     >= 45

[30000 rows x 4 columns]
0           default
1           default
2        no default
3        no default
4        no default
            ...    
29995    no default
29996    no default
29997       default
29998       default
29999       default
Name: TARG

{'TARGET_default': {'precision': 0.0,
  'recall': 0.0,
  'f1-score': 0.0,
  'support': 664},
 'TARGET_no default': {'precision': 0.7786666666666666,
  'recall': 1.0,
  'f1-score': 0.8755622188905547,
  'support': 2336},
 'accuracy': 0.7786666666666666,
 'macro avg': {'precision': 0.3893333333333333,
  'recall': 0.5,
  'f1-score': 0.43778110944527737,
  'support': 3000},
 'weighted avg': {'precision': 0.6063217777777777,
  'recall': 0.7786666666666666,
  'f1-score': 0.6817711144427786,
  'support': 3000}}

In [132]:
bn.cpds['TARGET']

AGE_GROUP,< 25,< 25,< 25,< 25,< 25,< 25,< 25,< 25,< 25,< 25,...,>= 45,>= 45,>= 45,>= 45,>= 45,>= 45,>= 45,>= 45,>= 45,>= 45
DISCRETIZED_PAY_TO_BILL,< 0.25,< 0.25,< 0.25,< 0.25,< 0.50,< 0.50,< 0.50,< 0.50,< 1,< 1,...,< 0.50,< 0.50,< 1,< 1,< 1,< 1,>= 1,>= 1,>= 1,>= 1
EDUCATION,graduate school,high school,others,university,graduate school,high school,others,university,graduate school,high school,...,others,university,graduate school,high school,others,university,graduate school,high school,others,university
TARGET,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
default,0.285714,0.313084,0.166667,0.314597,0.226415,0.137931,0.1,0.114286,0.179775,0.25,...,0.125,0.186567,0.123596,0.18,0.2,0.152027,0.255738,0.268421,0.125,0.271654
no default,0.714286,0.686916,0.833333,0.685403,0.773585,0.862069,0.9,0.885714,0.820225,0.75,...,0.875,0.813433,0.876404,0.82,0.8,0.847973,0.744262,0.731579,0.875,0.728346


# PREDICTIONS

In [133]:
random_sample = pd.DataFrame({'EDUCATION': ['high school'], 'MARRIAGE': ['married'], 'AGE_GROUP': ['>= 45'], 'DISCRETIZED_PAY_TO_BILL': ['>= 1']})

prediction = bn.predict_probability(random_sample, 'TARGET')

prediction

Unnamed: 0,TARGET_default,TARGET_no default
0,0.268421,0.731579


In [134]:
predictions = bn.predict_probability(test, 'TARGET')
predictions

Unnamed: 0,TARGET_default,TARGET_no default
7175,0.180000,0.820000
27240,0.207455,0.792545
4313,0.136725,0.863275
7555,0.246183,0.753817
25852,0.271654,0.728346
...,...,...
8697,0.207455,0.792545
10075,0.172234,0.827766
17475,0.152778,0.847222
13063,0.207455,0.792545


In [135]:
default = [1 if i == 'default' else 0 for i in test.TARGET.values]

In [136]:
cross_entropy(predictions['TARGET_no default'], default)

0.06007739163750479

# INFERENCE ENGINE

## PURE INFERENCE

In [137]:
from causalnex.inference import InferenceEngine

ie = InferenceEngine(bn)
marginals = ie.query()
marginals

{'AGE_GROUP': {'< 25': 0.08880165901347947,
  '< 35': 0.4358243223226188,
  '< 45': 0.29873352095985783,
  '>= 45': 0.17664049770404386},
 'MARRIAGE': {'married': 0.45446777142795114,
  'others': 0.012697699469718951,
  'single': 0.53283452910233},
 'EDUCATION': {'graduate school': 0.3524357190937588,
  'high school': 0.16422882523000518,
  'others': 0.0161385104517222,
  'university': 0.4671969452245137},
 'TARGET': {'default': 0.22260303322544223, 'no default': 0.777396966774558},
 'DISCRETIZED_PAY_TO_BILL': {'< 0.25': 0.608182075788787,
  '< 0.50': 0.08600339793313116,
  '< 1': 0.1594711639758108,
  '>= 1': 0.1463433623022711}}

## INFERENCE INTERVENTION

In [138]:
ie.do_intervention('EDUCATION',{'graduate school': 0.0,
  'high school': 0.0,
  'others': 0.0,
  'university': 1.0}
  )

ie.query()['EDUCATION']

{'graduate school': 0.0, 'high school': 0.0, 'others': 0.0, 'university': 1.0}

In [139]:
ie.query()

{'AGE_GROUP': {'< 25': 0.08880165901347947,
  '< 35': 0.4358243223226189,
  '< 45': 0.2987335209598578,
  '>= 45': 0.17664049770404386},
 'MARRIAGE': {'married': 0.45446777142795114,
  'others': 0.012697699469718951,
  'single': 0.53283452910233},
 'EDUCATION': {'graduate school': 0.0,
  'high school': 0.0,
  'others': 0.0,
  'university': 1.0},
 'TARGET': {'default': 0.23829874779520982, 'no default': 0.7617012522047902},
 'DISCRETIZED_PAY_TO_BILL': {'< 0.25': 0.6770727458547593,
  '< 0.50': 0.08503392185177683,
  '< 1': 0.1267394266403521,
  '>= 1': 0.11115390565311171}}