In [1]:
import time
import pandas as pd
import numpy as np
import math

# internal tools
from dsgtools.reporting import make_format
from dsgtools.reporting import TableWriter
from dsgtools.reporting import freq
from dsgtools.reporting import bivariate

In [2]:
bs_path = "Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/AIR_937/air_937_business_shell_v31_baseline_unchanged_archive_date_results_w20230812-084055.csv"
test1_path = 'Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/AIR_937/air_937_business_shell_v31_test1_unchanged_archive_date_results_w20230812-110244.csv'
test2_path = "Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/AIR_937/air_937_business_shell_v31_test2_unchanged_archive_date_results_w20230812-172620.csv"

In [3]:
string_list = {'lnlexidsele': str, 'outbestbusname': str, 'outbestbusstreetaddr': str, 'outbestbuscity': str, 'outbestbusstate': str, 'outbestbuszip': str, 
               'outbestbustin': str, 'outbestbusphone': str, 'outbestnaics': str, 'outbestsic': str, 'liennewesttype': str, 'judgmentnewesttype': str, 
               'inputaddrtype': str, }

In [4]:
baseline = pd.read_adls(bs_path, reader = pd.read_csv, encoding='iso-8859-1', dtype = string_list, verbose = False, low_memory = False)
print(baseline.shape)
test_1 = pd.read_adls(test1_path, reader = pd.read_csv, encoding='iso-8859-1', dtype = string_list, verbose = False, low_memory = False)
print(test_1.shape)
test_2 = pd.read_adls(test2_path, reader = pd.read_csv, encoding='iso-8859-1', dtype = string_list, verbose = False, low_memory = False)
print(test_2.shape)

(99978, 2304)
(99889, 2304)
(100000, 2304)


In [5]:
bs_seleid = baseline[["accountnumber", "seleid"]]
bs_seleid.columns = ["accountnumber", "bs_seleid"]
test_1 = test_1.merge(bs_seleid, on = "accountnumber")
print(test_1.shape)
test_1 = test_1[test_1["seleid"] == test_1["bs_seleid"]]
print(test_1.shape)

test_2 = test_2.merge(bs_seleid, on = "accountnumber")
print(test_2.shape)
test_2 = test_2[test_2["seleid"] == test_2["bs_seleid"]]
print(test_2.shape)

(99867, 2305)
(99867, 2305)
(99978, 2305)
(99978, 2305)


In [6]:
input_df = pd.read_adls("Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/AIR_937/sbfe_sample_for_DS_wperf.parquet")
print(input_df.shape)

(50000, 53)


In [7]:
input_df.unique_id = input_df.unique_id.astype(str)

In [8]:
input_df["year"] = input_df.ArchiveDate.str.slice(0, 4)

In [9]:
keep = ['accountnumber', 'model1score', 'model2score']
test_1 = test_1[keep]
test_1.columns = ["t1_"+x for x in test_1.columns]
test_2 = test_2[keep]
test_2.columns = ["t2_"+x for x in test_2.columns]
baseline = baseline[keep]
baseline = baseline
baseline.columns = ["b_"+x for x in keep]
merged = test_1.merge(test_2, left_on = "t1_accountnumber", right_on = "t2_accountnumber")
merged = merged.merge(baseline, left_on = "t1_accountnumber", right_on = "b_accountnumber")
merged = merged.merge(input_df[['unique_id', 'AnalysisCreditBad', "ArchiveDate"]], left_on = "t1_accountnumber", right_on = "unique_id")
print(merged.shape)
merged["t1_b_mld1score"] = np.where((merged.t1_model1score >500) & (merged.b_model1score >500), merged.t1_model1score - merged.b_model1score, -999_999)
merged["t1_b_mld2score"] = np.where((merged.t1_model2score >500) & (merged.b_model2score >500), merged.t1_model2score - merged.b_model2score, -999_999)
merged["t2_b_mld1score"] = np.where((merged.t2_model1score >500) & (merged.b_model1score >500), merged.t2_model1score - merged.b_model1score, -999_999)
merged["t2_b_mld2score"] = np.where((merged.t2_model2score >500) & (merged.b_model2score >500), merged.t2_model2score - merged.b_model2score, -999_999)

(49956, 12)


In [10]:
merged.AnalysisCreditBad = merged.AnalysisCreditBad.astype(int)

In [11]:
from dsgtools.reporting import ks_tables
score_fmt = make_format(cuts = [-np.inf, 299, 400, 425, 450, 475, 500, 525, 550, 575, 600, 625, 
                               650, 675, 700, 725, 750, 775, 800, 850, np.inf], exceptions = [0, 100, 200, 222])

kss_tbl = ks_tables(['b_model1score', 't1_model1score', 't2_model1score', 'b_model2score', 't1_model2score', 't2_model2score'],
    y = "AnalysisCreditBad",
    format = score_fmt,
    df=merged,
    groups = "year"
)
kss_tbl.to_xlsx("./_temp/ks.xlsx", dof = [0.05, 0.1, 0.2, 0.3], overwrite = True)

Exception: year is not a column of df.

In [17]:
fmt = make_format(cuts = [-np.inf, -51, -41, -31, -21, -11, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, np.inf], exceptions = [-999_999])
result = dict()

result["f1"] = freq(merged["t1_b_mld1score"], format = fmt, observed = True)
result["f2"] = freq(merged["t1_b_mld2score"], format = fmt, observed = True)
result["f3"] = freq(merged["t2_b_mld1score"], format = fmt, observed = True)
result["f4"] = freq(merged["t2_b_mld2score"], format = fmt, observed = True)

wb = TableWriter(filename = "./_temp/freq_201506.xlsx", options={'nan_inf_to_errors': True}, overwrite = True)
for k in result.keys():
    wb.write_table(
        result[k],
        sheetname="biv",
        conditional_fmt_cols=[1],
    )
wb.close()

In [6]:
exclude = ['accountnumber', 'historydateyyyymm', 'bus_company_name', 'powid', 'proxid', 'seleid', 'orgid', 'ultid', 'lnlexidsele','model1name', 'model1score',
           'model1rc1', 'model1rc2', 'model1rc3', 'model1rc4', 'model1rc5', 'model1rc6', 'model2name', 'model2score', 'model2rc1', 'model2rc2', 'model2rc3', 'model2rc4',
           'model2rc5', 'model2rc6', 'model3name', 'model3score', 'model3rc1', 'model3rc2', 'model3rc3', 'model3rc4', 'model3rc5', 'model3rc6', 'model4name', 'model4score',
           'model4rc1', 'model4rc2', 'model4rc3', 'model4rc4', 'model4rc5', 'model4rc6', 'model5name', 'model5score', 'model5rc1', 'model5rc2', 'model5rc3', 'model5rc4',
           'model5rc5', 'model5rc6', 'model6name', 'model6score', 'model6rc1', 'model6rc2', 'model6rc3', 'model6rc4', 'model6rc5', 'model6rc6', 'model7name', 'model7score',
           'model7rc1', 'model7rc2', 'model7rc3', 'model7rc4', 'model7rc5', 'model7rc6', 'model8name', 'model8score', 'model8rc1', 'model8rc2', 'model8rc3', 'model8rc4',
           'model8rc5', 'model8rc6', 'model9name', 'model9score', 'model9rc1', 'model9rc2', 'model9rc3', 'model9rc4', 'model9rc5', 'model9rc6', 'model10name', 'model10score',
           'model10rc1', 'model10rc2', 'model10rc3', 'model10rc4', 'model10rc5', 'model10rc6', 'errorcode', 'time_ms']
sbfe_attr = [x for x in baseline.columns if "sbfe" in x]
rv_attr = [x for x in baseline.columns if x not in sbfe_attr and x not in exclude]
model = ["model1score", "model2score"]

exception_dict = dict()
for x in baseline.columns:
    if x in sbfe_attr:
        exception_dict[x] = [-99, -98, -97]
    if x in rv_attr:
        exception_dict[x] = [-1]
    if x in model:
        exception_dict[x] = [0, 100, 200, 222]

In [7]:

from dsgtools.reporting import ImpactAnalysisReport
fmt_score = make_format(cuts = [-np.inf, 500, 550, 600, 625, 650, 675, 700, 725, 750, 775, 800, 825, 850, 900, np.inf], exceptions = [0, 100, 200, 222])
cr = ImpactAnalysisReport(baseline, test_1, keys="accountnumber",
           score_formats = {"model1score" : fmt_score, 
                         'model2score': fmt_score, },
           score_columns = ["model1score", "model2score", "sbfehitindex"], exceptions = exception_dict,
           dataframe_names = ("baseline", "test1"))
cr.to_xlsx("./_temp/air937_perf_test1_data-comparison.xlsx", overwrite= True)

<dsgtools.reporting.tablewriter.TableWriter at 0x7f44b8e9e8f0>

<Figure size 640x480 with 0 Axes>

In [8]:
fmt_score = make_format(cuts = [-np.inf, 500, 550, 600, 625, 650, 675, 700, 725, 750, 775, 800, 825, 850, 900, np.inf], exceptions = [0, 100, 200, 222])
cr = ImpactAnalysisReport(baseline, test_2, keys="accountnumber",
           score_columns = ["model1score", "model2score", "sbfehitindex"], 
           score_formats = {"model1score" : fmt_score, 
                         'model2score': fmt_score, },
           exceptions=exception_dict, 
           dataframe_names = ("baseline", "test2"))
cr.to_xlsx("./_temp/air937_perf_test2_data-comparison.xlsx", overwrite= True)

<dsgtools.reporting.tablewriter.TableWriter at 0x7f44b9547340>

<Figure size 640x480 with 0 Axes>

In [16]:

# from dsgtools.reporting import ImpactAnalysisReport
# fmt_score = make_format(cuts = [-np.inf, 500, 550, 600, 625, 650, 675, 700, 725, 750, 775, 800, 825, 850, 900, np.inf], exceptions = [0, 100, 200, 222])
# cr = ImpactAnalysisReport(test_1, test_2, keys="accountnumber",
#            score_columns = ["model1score", "model2score", "sbfehitindex"], 
#            score_formats = {"model1score" : fmt_score, 
#                          'model2score': fmt_score, },
#            exceptions=exception_dict, 
#            dataframe_names = ("test_1", "test2"))
# cr.to_xlsx("./_temp/temp_air937_201506_test1_test2_data-comparison.xlsx", overwrite= True)