In [1]:
import math
import os
import time

import numpy as np
import pandas as pd

# internal tools
from dsgtools.reporting import make_format
from dsgtools.reporting import TableWriter
from dsgtools.reporting import freq
from dsgtools.reporting import bivariate
from dsgtools.reporting import ImpactAnalysisReport

In [2]:
# Locations of the three result extracts compared in this notebook.
# NOTE(review): baseline and test2 are 202206 extracts under Processing_v2, while
# test1 is a 202201 extract under Processing_v3 -- confirm this pairing is intended.
bs_path = (
    "Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/AIR_937/Processing_v2/"
    "air_937_business_shell_v31_baseline_202206_results_w20230725-073810.csv"
)
# Superseded test1 extract, kept for reference only (Processing_v2 folder):
#   air_937_business_shell_v31_test1_202206_results_w20230725-120106.csv
test1_path = (
    "Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/AIR_937/Processing_v3/"
    "air_937_business_shell_v31_test1_202201_results_w20230729-102909.csv"
)
test2_path = (
    "Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/AIR_937/Processing_v2/"
    "air_937_business_shell_v31_test2_202206_results_w20230725-153558.csv"
)

In [3]:
# Columns forced to ``str`` when the CSV extracts are read (this mapping is passed
# as ``dtype`` to the reader). Presumably this protects identifier-like values
# such as ZIP, TIN and phone from numeric inference -- confirm.
_text_columns = (
    'lnlexidsele',
    'outbestbusname',
    'outbestbusstreetaddr',
    'outbestbuscity',
    'outbestbusstate',
    'outbestbuszip',
    'outbestbustin',
    'outbestbusphone',
    'outbestnaics',
    'outbestsic',
    'liennewesttype',
    'judgmentnewesttype',
    'inputaddrtype',
)
# Despite its name this is a dict (column name -> dtype), not a list.
string_list = {column: str for column in _text_columns}

In [4]:
def _load_results(path):
    """Read one results CSV through ``pd.read_adls``, print its shape and return it.

    ``pd.read_adls`` is not part of stock pandas; presumably it is attached by the
    internal tooling and forwards the extra keyword arguments to ``reader`` --
    confirm against that tool's documentation.
    """
    frame = pd.read_adls(
        path,
        reader=pd.read_csv,
        encoding='iso-8859-1',
        dtype=string_list,
        verbose=False,
        low_memory=False,
    )
    print(frame.shape)
    return frame


# Load order is kept (baseline, test1, test2) so the printed shapes appear in the
# same sequence as before.
baseline = _load_results(bs_path)
test_1 = _load_results(test1_path)
test_2 = _load_results(test2_path)

(99988, 2304)
(100000, 2304)
(100000, 2304)


In [5]:
def _align_to_baseline(test_df, baseline_keys):
    """Keep only the test rows whose account exists in baseline with the same seleid.

    Parameters
    ----------
    test_df : DataFrame with at least ``accountnumber`` and ``seleid`` columns.
    baseline_keys : DataFrame with ``accountnumber`` and ``bs_seleid`` columns.

    Returns
    -------
    DataFrame: the inner join of ``test_df`` and ``baseline_keys`` on
    ``accountnumber``, filtered to rows where ``seleid == bs_seleid``. The helper
    column ``bs_seleid`` is kept in the result, as it was before this rewrite.

    Prints the frame's shape after the join and again after the filter.
    """
    # Drop a ``bs_seleid`` left over from a previous run of this cell. Without
    # this, a second run makes pandas suffix the clashing column to
    # ``bs_seleid_x`` / ``bs_seleid_y`` and the filter below raises KeyError.
    fresh = test_df.drop(columns="bs_seleid", errors="ignore")
    merged = fresh.merge(baseline_keys, on="accountnumber", how="inner")
    print(merged.shape)
    # Rows with a missing seleid on either side never compare equal, so they are
    # removed here as well.
    matched = merged[merged["seleid"] == merged["bs_seleid"]]
    print(matched.shape)
    return matched


# ``rename`` builds a new frame, so no labels are assigned in place on a slice of
# ``baseline``.
bs_seleid = baseline[["accountnumber", "seleid"]].rename(columns={"seleid": "bs_seleid"})

test_1 = _align_to_baseline(test_1, bs_seleid)
test_2 = _align_to_baseline(test_2, bs_seleid)

(99988, 2305)
(99988, 2305)
(99988, 2305)
(99988, 2305)


In [6]:
# Identifier, model-output and bookkeeping columns that are left out of ``rv_attr``.
exclude = [
    'accountnumber', 'historydateyyyymm', 'bus_company_name', 'powid', 'proxid',
    'seleid', 'orgid', 'ultid', 'lnlexidsele',
    'model1name', 'model1score', 'model1rc1', 'model1rc2', 'model1rc3', 'model1rc4', 'model1rc5', 'model1rc6',
    'model2name', 'model2score', 'model2rc1', 'model2rc2', 'model2rc3', 'model2rc4', 'model2rc5', 'model2rc6',
    'model3name', 'model3score', 'model3rc1', 'model3rc2', 'model3rc3', 'model3rc4', 'model3rc5', 'model3rc6',
    'model4name', 'model4score', 'model4rc1', 'model4rc2', 'model4rc3', 'model4rc4', 'model4rc5', 'model4rc6',
    'model5name', 'model5score', 'model5rc1', 'model5rc2', 'model5rc3', 'model5rc4', 'model5rc5', 'model5rc6',
    'model6name', 'model6score', 'model6rc1', 'model6rc2', 'model6rc3', 'model6rc4', 'model6rc5', 'model6rc6',
    'model7name', 'model7score', 'model7rc1', 'model7rc2', 'model7rc3', 'model7rc4', 'model7rc5', 'model7rc6',
    'model8name', 'model8score', 'model8rc1', 'model8rc2', 'model8rc3', 'model8rc4', 'model8rc5', 'model8rc6',
    'model9name', 'model9score', 'model9rc1', 'model9rc2', 'model9rc3', 'model9rc4', 'model9rc5', 'model9rc6',
    'model10name', 'model10score', 'model10rc1', 'model10rc2', 'model10rc3', 'model10rc4', 'model10rc5', 'model10rc6',
    'errorcode', 'time_ms',
]

# Column groups, all taken from the baseline header:
#   sbfe_attr - every column whose name contains "sbfe"
#   rv_attr   - every remaining column that is not listed in ``exclude``
#   model     - the two score columns that receive the score exception codes
sbfe_attr = [name for name in baseline.columns if "sbfe" in name]
_not_rv = set(sbfe_attr).union(exclude)
rv_attr = [name for name in baseline.columns if name not in _not_rv]
model = ["model1score", "model2score"]

# Exception values per group, highest priority first. The order mirrors the
# previous sequential assignments, where a later match overwrote an earlier one.
# The meaning of these codes is defined by the report tooling -- presumably
# special values that are reported separately from regular data; confirm.
_exception_rules = (
    (frozenset(model), (0, 100, 200, 222)),
    (frozenset(rv_attr), (-1,)),
    (frozenset(sbfe_attr), (-99, -98, -97)),
)

# Keys are inserted in baseline column order; columns matching no group are omitted.
exception_dict = {}
for name in baseline.columns:
    for members, codes in _exception_rules:
        if name in members:
            exception_dict[name] = list(codes)  # fresh list per column
            break

In [7]:
# Impact analysis: baseline vs. test1, written to an Excel workbook.
# ``ImpactAnalysisReport`` is imported with the other dsgtools names in the import
# cell at the top of the notebook.

# Format for the two model scores: band edges in ``cuts``, with 0/100/200/222
# passed as ``exceptions`` -- presumably special score codes kept out of the bands;
# confirm against the make_format documentation.
fmt_score = make_format(
    cuts=[-np.inf, 500, 550, 600, 625, 650, 675, 700, 725, 750, 775, 800, 825, 850, 900, np.inf],
    exceptions=[0, 100, 200, 222],
)

cr = ImpactAnalysisReport(
    baseline,
    test_1,
    keys="accountnumber",
    score_columns=["model1score", "model2score", "sbfehitindex"],
    # sbfehitindex is listed as a score column but has no entry here.
    score_formats={"model1score": fmt_score, "model2score": fmt_score},
    exceptions=exception_dict,
    dataframe_names=("baseline", "test1"),
)

# A fresh checkout or kernel may not have the output folder yet.
os.makedirs("./_temp", exist_ok=True)
# The trailing semicolon keeps the returned writer's repr (which contains a memory
# address) out of the cell output.
cr.to_xlsx("./_temp/air937_202201_test1_data-comparison.xlsx", overwrite=True);

<dsgtools.reporting.tablewriter.TableWriter at 0x7f9910113070>

<Figure size 640x480 with 0 Axes>

In [8]:
# Impact analysis: baseline vs. test2, written to an Excel workbook.

# Format for the two model scores: band edges in ``cuts``, with 0/100/200/222
# passed as ``exceptions`` -- presumably special score codes kept out of the bands;
# confirm against the make_format documentation.
fmt_score = make_format(
    cuts=[-np.inf, 500, 550, 600, 625, 650, 675, 700, 725, 750, 775, 800, 825, 850, 900, np.inf],
    exceptions=[0, 100, 200, 222],
)

cr = ImpactAnalysisReport(
    baseline,
    test_2,
    keys="accountnumber",
    score_columns=["model1score", "model2score", "sbfehitindex"],
    # sbfehitindex is listed as a score column but has no entry here.
    score_formats={"model1score": fmt_score, "model2score": fmt_score},
    exceptions=exception_dict,
    dataframe_names=("baseline", "test2"),
)

# A fresh checkout or kernel may not have the output folder yet.
os.makedirs("./_temp", exist_ok=True)
# NOTE(review): the workbook name says 202201, but test2_path points at a 202206
# extract -- confirm which is right before sharing this file. The name is left
# unchanged here so existing references to it keep working.
# The trailing semicolon keeps the returned writer's repr (which contains a memory
# address) out of the cell output.
cr.to_xlsx("./_temp/air937_202201_test2_data-comparison.xlsx", overwrite=True);

<dsgtools.reporting.tablewriter.TableWriter at 0x7f980dd42a10>

<Figure size 640x480 with 0 Axes>