In [1]:
import sys
# Add the parent directory to the module search path; presumably this is what
# lets the project-local `models` package imported in a later cell resolve
# when the notebook is run from its own sub-directory -- TODO confirm layout.
sys.path.append("..")

In [21]:
import random
import pandas as pd
import numpy as np
from sklearn.neural_network import BernoulliRBM
import matplotlib.pyplot as plt
import seaborn
import models.xgb as xgb
import models.feature_util
from models.data_util import get_col_matcher

In [3]:
# Load the cleaned table and the previously trained classifier from disk.
# NOTE: unpickling executes arbitrary code -- only load pickle files you trust.
db = pd.read_pickle("../data/clean.pkl")
model = xgb.from_file("../data/xgb_model.bin")

# Feature matrix: every column except the 'isfailed' label.
x = db.drop(columns='isfailed').to_numpy()
# Label kept as a 2-D, single-column array (double brackets select a frame).
y_true = db[['isfailed']].to_numpy()

In [14]:
# Names of the feature columns, in the same order as the columns of `x`
# (everything except the 'isfailed' label). Working on the column Index
# avoids materialising a second copy of the whole frame.
dbcols = db.columns.drop('isfailed').tolist()

In [22]:
# `pred` is a per-column-name test built from the project's optional-column
# specification; keep the columns of `db` for which it is truthy, in order.
pred = get_col_matcher(models.feature_util.optional_cols)
optional_cols = list(filter(pred, db.columns))

In [5]:
# Presence mask over the optional columns: True where the value is non-null,
# False where it is missing.
db_present = db.loc[:, optional_cols].notna()

In [6]:
# Restricted Boltzmann machine fitted on the boolean presence mask, i.e. it
# models which combinations of optional fields tend to be present together.
rbm = BernoulliRBM(
    n_components=100,
    n_iter=10,
    learning_rate=0.01,
    random_state=0,  # fixed seed for a repeatable fit
)

# Left as a bare expression so the cell still displays the fitted estimator.
rbm.fit(db_present)

BernoulliRBM(batch_size=10, learning_rate=0.01, n_components=100, n_iter=10,
       random_state=0, verbose=0)

In [31]:
# Consider the effect of removing one accounting field for a given company.
#
# For each randomly sampled company we blank out a single optional field that
# is currently present and compare, before vs. after the removal:
#   * the classifier's predicted failure probability, and
#   * exp(rbm.score_samples(...)) of the presence/absence pattern. Note that
#     score_samples is documented as a *pseudo*-likelihood, so the value
#     printed as "likelihood" is only a proxy.
# Only companies whose prediction actually changes are printed.
N_TRIALS = 1000
rng = random.Random(0)  # local seeded generator: re-running the cell gives the same sample

# Sample from the real number of rows rather than a hard-coded upper bound.
n_companies = len(db_present)

for _ in range(N_TRIALS):
    comp_index = rng.randrange(n_companies)

    # Work on private copies. `x[comp_index]` is a view into `x` (and the row
    # of `db_present` may alias the frame or be read-only), so corrupting it
    # in place would permanently alter the shared data for later cells and
    # for re-runs of this one.
    present = db_present.iloc[comp_index].to_numpy().copy()
    values = x[comp_index].copy()

    p_failed_prior = model.predict_proba(values.reshape((1, -1)))[0][1]
    # score_samples returns one value per input row; take the single scalar.
    p_true_prior = np.exp(rbm.score_samples(present.reshape((1, -1))))[0]

    # Positions (within optional_cols) of the fields this company has filed.
    published_positions = [pos for pos, flag in enumerate(present) if flag]
    if not published_positions:
        continue  # nothing to corrupt for this company

    # Corrupt a single column whose data is currently present.
    opt_pos = rng.choice(published_positions)
    col = optional_cols[opt_pos]

    present[opt_pos] = False
    values[dbcols.index(col)] = np.nan  # explicit missing-value marker

    p_failed_posterior = model.predict_proba(values.reshape((1, -1)))[0][1]
    p_true_posterior = np.exp(rbm.score_samples(present.reshape((1, -1))))[0]

    if p_failed_posterior != p_failed_prior:
        print("Company", comp_index,
              "model prediction:",
              p_failed_prior,
              "likelihood:",
              p_true_prior)

        print("---> corrupting", col, "gives",
              "model prediction difference:",
              (p_failed_posterior - p_failed_prior),
              "likelihood difference:",
              (p_true_posterior - p_true_prior))

        print()
    

Company 247581 model prediction: 0.19516353 likelihood: 0.9999730992755771
---> corrupting Field465 gives model prediction difference: 0.0028456002 likelihood difference: 2.1479216573250248e-05

Company 315943 model prediction: 0.19339311 likelihood: 0.9999999939323536
---> corrupting Field306 gives model prediction difference: 0.0028268695 likelihood difference: -5.07142191574772e-08

Company 177838 model prediction: 0.19398965 likelihood: 0.9999999985136516
---> corrupting dAccountsNextDueDate gives model prediction difference: 0.0015010238 likelihood difference: -9.674661072267554e-11

Company 152 model prediction: 0.32914627 likelihood: 0.10977987263121303
---> corrupting dReturnsLastMadeUpDate gives model prediction difference: 0.0041635633 likelihood difference: 0.2115530775733886

Company 330457 model prediction: 0.19339311 likelihood: 0.9999999961884596
---> corrupting Field306 gives model prediction difference: 0.0028268695 likelihood difference: -1.6411205505661997e-08

Compa