In [1]:
import math
import time

import numpy as np
import pandas as pd

# internal tools
from dsgtools.reporting import make_format
from dsgtools.reporting import TableWriter
from dsgtools.reporting import freq
from dsgtools.reporting import bivariate
from dsgtools.reporting import ImpactAnalysisReport

Matplotlib is building the font cache; this may take a moment.


In [2]:
# The three result extracts live in the same folder; only the file name differs.
_results_dir = "Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/AIR_937/Processing/202306"

bs_path = _results_dir + "/" + "baseline_air_937_business_shell_v31_baseline_results_w20230707-114805.csv"
test1_path = _results_dir + "/" + "test1_air_937_business_shell_v31_baseline_results_w20230707-140142.csv"
test2_path = _results_dir + "/" + "test2_air_937_business_shell_v31_baseline_results_w20230707-155449.csv"

In [3]:
def _read_results(adls_path):
    """Read one results CSV via ``pd.read_adls`` and print the frame's shape.

    Parameters
    ----------
    adls_path : str
        Location of the CSV, passed unchanged to ``pd.read_adls``.

    Returns
    -------
    The object returned by ``pd.read_adls`` (its ``.shape`` is printed).
    """
    # NOTE(review): read_adls is not part of stock pandas; presumably it is
    # registered by the internal tooling imported above — confirm.
    frame = pd.read_adls(adls_path, reader=pd.read_csv, encoding='iso-8859-1')
    print(frame.shape)
    return frame


baseline = _read_results(bs_path)
test_1 = _read_results(test1_path)
test_2 = _read_results(test2_path)

  data = reader(file, **kwargs)


(99995, 2304)


  data = reader(file, **kwargs)


(99997, 2304)


  data = reader(file, **kwargs)


(99582, 2304)


In [41]:
# Display the column labels of the baseline frame.
# NOTE(review): the captured output lists an 'id' column and 2305 labels, while the
# load cell reported 2304 columns — so this cell was executed after the cell that
# assigns baseline["id"], even though that cell appears later in the notebook.
baseline.columns

Index(['accountnumber', 'historydateyyyymm', 'bus_company_name', 'powid',
       'proxid', 'seleid', 'orgid', 'ultid', 'lnlexidsele',
       'inputcheckbusname',
       ...
       'model10score', 'model10rc1', 'model10rc2', 'model10rc3', 'model10rc4',
       'model10rc5', 'model10rc6', 'errorcode', 'time_ms', 'id'],
      dtype='object', length=2305)

In [42]:
# Spot-check the entity identifiers of a handful of hand-picked baseline records.
#
# The lookup key is rebuilt here (account number + "_" + company name with spaces
# replaced by underscores, missing names treated as empty) instead of reading
# baseline["id"]: that column is only created by a cell further down the notebook,
# so reading it here raises KeyError on a fresh top-to-bottom run.
_spot_check_ids = [
    '172863781R584309_ALFRED_W_AMORE',
    '172862631R433824_BULK_CANDY_STORE_MAIN',
    '172862611R456068_MANAGEMENT_SYS_INTL',
    '172863881R462978_SIMPLY_SOLAR_LLC',
    '172862631R660998_TRINITY_EVANGELICAL_LUTHERAN_CHURCH',
]
_spot_check_columns = [
    'accountnumber', 'historydateyyyymm', 'bus_company_name',
    'powid', 'proxid', 'seleid', 'orgid', 'ultid',
]
_spot_check_key = (
    baseline.accountnumber
    + "_"
    + baseline.bus_company_name.fillna("").replace(' ', '_', regex=True)
)
# Last expression of the cell: rendered as the cell's output table.
baseline.loc[_spot_check_key.isin(_spot_check_ids), _spot_check_columns]

Unnamed: 0,accountnumber,historydateyyyymm,bus_company_name,powid,proxid,seleid,orgid,ultid
50120,172863781R584309,202306,ALFRED W AMORE,15812976747,15812976747,15812976747,15812976747,15812976747
50671,172862631R433824,202306,BULK CANDY STORE MAIN,15039071657,15039071657,15039071657,15039071657,15039071657
51610,172862631R660998,202306,TRINITY EVANGELICAL LUTHERAN CHURCH,149586581,149586581,149586581,149586581,149586581
52453,172863881R462978,202306,SIMPLY SOLAR LLC,137356580125,137356580125,28881689682,28881689682,28881689682
62227,172862611R456068,202306,MANAGEMENT SYS INTL,7252432254,7252432254,7252432254,7252432254,7252432254


In [36]:
def _record_id(frame):
    """Return the row key for ``frame``.

    The key is the account number, an underscore, and the company name with
    every space replaced by an underscore; a missing company name contributes
    an empty string.
    """
    name_part = frame.bus_company_name.fillna("").replace(' ', '_', regex=True)
    return frame.accountnumber + "_" + name_part


# Attach the same key column to all three frames so they can be joined on it.
for _frame in (baseline, test_1, test_2):
    _frame["id"] = _record_id(_frame)

In [32]:
# Columns that are cast to str in every frame so all three share one dtype for them.
# NOTE(review): depending on the pandas version, astype(str) may turn missing values
# into the literal text 'nan' — confirm that is acceptable for the comparison.
_string_columns = ['liennewesttype', 'judgmentnewesttype', 'inputaddrtype']

for _frame in (baseline, test_1, test_2):
    _frame[_string_columns] = _frame[_string_columns].astype(str)

In [38]:
def _keep_rows_matching_baseline_seleid(frame, baseline_keys):
    """Keep only the rows of ``frame`` whose ``seleid`` equals the baseline's.

    Parameters
    ----------
    frame : DataFrame
        Test frame with ``id`` and ``seleid`` columns.
    baseline_keys : DataFrame
        Two columns: ``id`` and ``bs_seleid`` (the baseline's ``seleid``).

    Returns
    -------
    DataFrame
        ``frame`` inner-joined to ``baseline_keys`` on ``id`` and filtered to
        rows where ``seleid == bs_seleid``. The shape is printed after the
        join and again after the filter.
    """
    # Drop a bs_seleid column left behind by a previous run of this cell.
    # Without this, re-running the cell makes merge() emit bs_seleid_x /
    # bs_seleid_y and the comparison below fails with KeyError.
    merged = frame.drop(columns="bs_seleid", errors="ignore").merge(baseline_keys, on="id")
    print(merged.shape)
    # NOTE(review): '==' is False when either side is missing, so rows with a
    # missing seleid are dropped as well — confirm this is intended.
    matched = merged[merged["seleid"] == merged["bs_seleid"]]
    print(matched.shape)
    return matched


# rename() yields a new two-column frame instead of relabelling a slice of baseline.
bs_seleid = baseline[["id", "seleid"]].rename(columns={"seleid": "bs_seleid"})

test_1 = _keep_rows_matching_baseline_seleid(test_1, bs_seleid)
test_2 = _keep_rows_matching_baseline_seleid(test_2, bs_seleid)

(99994, 2306)
(99994, 2306)
(99579, 2306)
(98777, 2306)


In [39]:
# Number of test_2 rows removed by the seleid-match filter: 99579 rows after the
# merge on "id" minus 98777 rows after filtering (both figures are copied by hand
# from the shapes printed by the merge/filter cell).
99579 - 98777

802

In [31]:
# List the baseline columns stored with object dtype.
object_columns = baseline.select_dtypes(include=[object]).columns
object_columns

Index(['accountnumber', 'bus_company_name', 'outbestbusname',
       'outbestbusstreetaddr', 'outbestbuscity', 'outbestbusstate',
       'liennewesttype', 'judgmentnewesttype', 'inputaddrtype', 'model1name',
       'model1rc1', 'model1rc2', 'model1rc3', 'model1rc4', 'model2name',
       'model2rc1', 'model2rc2', 'model2rc3', 'model2rc4', 'errorcode', 'id'],
      dtype='object')

In [35]:
# Baseline vs. test1 comparison, written to an Excel workbook.
# ImpactAnalysisReport is imported with the other dsgtools helpers in the
# notebook's top import cell rather than in the middle of the analysis.

# Shared format for both model scores: band edges plus the score values that
# are passed to make_format as exceptions.
fmt_score = make_format(
    cuts=[-np.inf, 500, 550, 600, 625, 650, 675, 700, 725, 750, 775, 800, 825, 850, 900, np.inf],
    exceptions=[0, 100, 200, 222],
)

# NOTE(review): "sbfehitindex" is listed in score_columns but has no entry in
# score_formats — confirm the report's default handling is what is wanted.
cr = ImpactAnalysisReport(
    baseline,
    test_1,
    keys="id",
    score_columns=["model1score", "model2score", "sbfehitindex"],
    score_formats={
        "model1score": fmt_score,
        'model2score': fmt_score,
    },
    exceptions=[-1, -99, -98, -97, 0, 100, 200, 222],
    dataframe_names=("baseline", "test1"),
)

# Last expression of the cell: its return value is what the cell displays.
cr.to_xlsx("./_temp/air937_202306_test1_data-comparison.xlsx", overwrite=True)

<dsgtools.reporting.tablewriter.TableWriter at 0x7f204dea6a40>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

In [40]:
# Baseline vs. test2 comparison, written to an Excel workbook.

# Band edges for the two model scores.
_score_cut_points = [-np.inf, 500, 550, 600, 625, 650, 675, 700, 725, 750, 775, 800, 825, 850, 900, np.inf]
fmt_score = make_format(cuts=_score_cut_points, exceptions=[0, 100, 200, 222])

cr = ImpactAnalysisReport(
    baseline,
    test_2,
    keys="id",
    score_columns=["model1score", "model2score", "sbfehitindex"],
    score_formats={"model1score": fmt_score, 'model2score': fmt_score},
    exceptions=[-1, -99, -98, -97, 0, 100, 200, 222],
    dataframe_names=("baseline", "test2"),
)

# Last expression of the cell: its return value is what the cell displays.
cr.to_xlsx("./_temp/air937_202306_test2_data-comparison.xlsx", overwrite=True)

<dsgtools.reporting.tablewriter.TableWriter at 0x7f21ce658fd0>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>