In [1]:
import time
import pandas as pd
import numpy as np
import math

# internal tools
from dsgtools.reporting import make_format
from dsgtools.reporting import TableWriter
from dsgtools.reporting import freq
from dsgtools.reporting import bivariate

In [2]:
# Folder of the air_1587 ticket extracts; keeps its trailing slash so a file
# name can be appended directly.
path = "Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/air_1587/"

In [3]:
from dsgtools import azure  # NOTE(review): presumably this import is what exposes pd.read_adls -- confirm


def _read_shell(filename):
    """Load one business-shell CSV extract from the ticket folder.

    Parameters
    ----------
    filename : str
        Bare file name; it is appended to the module-level `path`, which
        already ends with a slash.

    Returns
    -------
    The frame produced by `pd.read_adls` using `pd.read_csv` as reader.
    """
    return pd.read_adls(path + filename,
                        reader = pd.read_csv, encoding='iso-8859-1', verbose = False, low_memory = False)


## link to 20 ipynb
# Both extracts live in the same folder, so build their locations from `path`
# instead of repeating the directory in each call.
base = _read_shell("1019_air_1587_business_shell_v31_base_w20231018-164942_sas_layout_busshell.csv")
test = _read_shell("1019_air_1587_business_shell_v31_test_w20231018-174956_sas_layout_busshell.csv")
print(base.shape)
print(test.shape)

(99995, 3115)
(99997, 3115)


In [4]:
# Columns that are left out of rv_attr (they can still land in sbfe_attr if
# their name contains "sbfe").
exclude = [
    'account', 'seq', 'id_powid', 'id_proxid', 'id_seleid', 'id_orgid', 'id_ultid',
    'id_seleid_change_flag', 'id_seleid_change_code', 'id_weight',
    'id_score', 'id_category', 'id_status', 'id_truebiz',
    'history_date', 'history_datetime',
]

# Split the base columns by name: anything containing "sbfe" vs. the rest
# that is not explicitly excluded.
sbfe_attr = [col for col in base.columns if "sbfe" in col]
rv_attr = [col for col in base.columns if not (col in sbfe_attr or col in exclude)]

# Per-column exception values, in base column order:
#   sbfe columns -> [-99, -98, -97]; rv columns -> [-1]; excluded columns get no entry.
exception_dict = {
    col: ([-99, -98, -97] if col in sbfe_attr else [-1])
    for col in base.columns
    if col in sbfe_attr or col in rv_attr
}

In [5]:
from dsgtools.reporting import ImpactAnalysisReport

# Build the base-vs-test report keyed on "account", passing the per-column
# exception values, then export it to a workbook (existing file is overwritten).
cr = ImpactAnalysisReport(
    base,
    test,
    keys="account",
    exceptions=exception_dict,
    dataframe_names=("base", "test"),
)
cr.to_xlsx("./_temp/air_1587comparison.xlsx", overwrite=True)

<dsgtools.reporting.tablewriter.TableWriter at 0x7fbbc418f550>

In [6]:
# Attributes of interest: 'AssetPropAssessedTotalEver', "InputAddrOwnership", "InputAddrAssessedTotal"
# Value domains noted for the columns used below:
#   prop_assessed_value_total: -1, 0, pos
#   addr_input_ownership:      -1, 0, 1
#   addr_input_assessed_value: -1, 0, pos
# One cut at 0, with -1 declared as an exception value; ownership is passed
# without a format.
fmt = make_format(cuts=[-np.inf, 0, np.inf], exceptions=[-1])
freq(
    "addr_input_ownership",
    "addr_input_assessed_value",
    "prop_assessed_value_total",
    df=base,
    format=[None, fmt, fmt],
    cross=False,
    observed=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Count,Pct,Cuml Count,Cuml Pct
addr_input_ownership,addr_input_assessed_value,prop_assessed_value_total,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,<= 0,<= 0,27.0,0.00027,27.0,0.00027
1,<= 0,1+,160.0,0.0016,187.0,0.00187
1,1+,<= 0,14553.0,0.145537,14740.0,0.147407
1,1+,1+,84440.0,0.844442,99180.0,0.99185
0,<= 0,<= 0,29.0,0.00029,99209.0,0.99214
0,<= 0,1+,113.0,0.00113,99322.0,0.99327
0,1+,<= 0,228.0,0.00228,99550.0,0.99555
0,1+,1+,326.0,0.00326,99876.0,0.99881
-1,1+,-1,8.0,8e-05,99884.0,0.99889
-1,-1,-1,111.0,0.00111,99995.0,1.0


In [7]:
# Same three-way frequency as for base, this time on the test extract
# (reuses `fmt` from the previous cell).
freq(
    "addr_input_ownership",
    "addr_input_assessed_value",
    "prop_assessed_value_total",
    df=test,
    format=[None, fmt, fmt],
    cross=False,
    observed=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Count,Pct,Cuml Count,Cuml Pct
addr_input_ownership,addr_input_assessed_value,prop_assessed_value_total,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,<= 0,<= 0,4.0,4e-05,4.0,4e-05
1,<= 0,1+,181.0,0.00181,185.0,0.00185
1,1+,<= 0,706.0,0.00706,891.0,0.00891
1,1+,1+,96336.0,0.963389,97227.0,0.972299
0,<= 0,<= 0,15.0,0.00015,97242.0,0.972449
0,<= 0,1+,129.0,0.00129,97371.0,0.973739
0,1+,<= 0,216.0,0.00216,97587.0,0.975899
0,1+,1+,2291.0,0.022911,99878.0,0.99881
-1,1+,-1,8.0,8e-05,99886.0,0.99889
-1,-1,-1,111.0,0.00111,99997.0,1.0


In [16]:
# Spot check: assessed value total for one id_seleid in the test extract.
is_target_seleid = test["id_seleid"] == 28892246199
test.loc[is_target_seleid, "prop_assessed_value_total"]

29362    0
Name: prop_assessed_value_total, dtype: int64

In [18]:
# Spot check: assessed value total for the same id_seleid in the base extract.
is_target_seleid = base["id_seleid"] == 28892246199
base.loc[is_target_seleid, "prop_assessed_value_total"]

48086    0
Name: prop_assessed_value_total, dtype: int64

In [4]:
# Key columns plus the property attributes compared side by side.
keep = [
    "account", "id_seleid",
    'prop_assessed_value_total', 'prop_assessed_value_total_curr',
    'prop_state_count_curr', 'prop_bldg_size_total_curr', 'prop_bldg_size_total',
    'prop_count_curr', 'prop_lot_size_total', 'prop_count',
    'prop_lot_size_total_curr', 'prop_state_count', 'addr_input_ownership',
]

# Base keeps its column names; every test column gets a "t_" prefix so both
# sides can sit in one frame.
base_temp = base[keep].copy()
test_temp = test[keep].add_prefix("t_")

# Default (inner) join on the account key.
merged = pd.merge(test_temp, base_temp, left_on="t_account", right_on="account")

# Accounts whose assessed value total is positive in base but exactly 0 in test.
zeroed_in_test = merged[
    (merged["prop_assessed_value_total"] > 0) & (merged["t_prop_assessed_value_total"] == 0)
]
print(zeroed_in_test.shape)
zeroed_in_test.head()

(47, 26)


Unnamed: 0,t_account,t_id_seleid,t_prop_assessed_value_total,t_prop_assessed_value_total_curr,t_prop_state_count_curr,t_prop_bldg_size_total_curr,t_prop_bldg_size_total,t_prop_count_curr,t_prop_lot_size_total,t_prop_count,...,prop_assessed_value_total_curr,prop_state_count_curr,prop_bldg_size_total_curr,prop_bldg_size_total,prop_count_curr,prop_lot_size_total,prop_count,prop_lot_size_total_curr,prop_state_count,addr_input_ownership
605,371375,159253993,0,0,1,0,0,1,0,1,...,0,1,0,0,1,0,1,0,1,1
681,844307,129958016,0,0,0,0,0,0,0,1,...,20000,1,0,0,1,0,1,0,1,1
2192,874304,2561352286,0,0,0,0,0,0,0,1,...,0,1,0,0,1,0,1,0,1,1
2943,55664,52874755,0,0,1,0,0,1,0,1,...,0,1,0,0,1,0,1,0,1,1
4142,935337,23421450,0,0,0,0,0,0,0,0,...,0,0,0,14940,0,140,1,0,1,1


In [8]:
diff_list = [
    'prop_assessed_value_total', 'prop_assessed_value_total_curr',
    'prop_state_count_curr', 'prop_bldg_size_total_curr', 'prop_bldg_size_total',
    'prop_count_curr', 'prop_lot_size_total', 'prop_count',
    'prop_lot_size_total_curr', 'prop_state_count', 'addr_input_ownership',
]

# For every attribute add "<attr>_diff" = test - base, except when either side
# is negative; those cases are replaced by a sentinel:
#   both negative            -> -999_999_999
#   base >= 0, test negative -> -999_999_998
#   base negative, test >= 0 -> -999_999_997
for attr in diff_list:
    base_vals = merged[attr]
    test_vals = merged["t_" + attr]
    merged[attr + "_diff"] = np.select(
        [
            (base_vals < 0) & (test_vals < 0),
            (base_vals >= 0) & (test_vals < 0),
            (base_vals < 0) & (test_vals >= 0),
        ],
        [-999_999_999, -999_999_998, -999_999_997],
        default=test_vals - base_vals,
    )

In [12]:
# Alternative quick look at the diff columns (disabled):
# pd.options.display.float_format = '{:.2f}'.format
# merged[[x + "_diff" for x in diff_list]].describe([0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95])

# fmt_1: wide cuts, used for the value / size diffs.
# fmt_2: narrow cuts, used for the count diffs.
# Both declare the three sentinel codes of the *_diff columns as exceptions.
fmt_1 = make_format(cuts = [-np.inf, -500_000_001, -1_000_001, -500_001, -100_001, -50_001, -10_001, -5_001, -1, 0, 5_000, 10_000, 50_000,
                            100_000, 500_000, 1_000_000, 500_000_000, np.inf], exceptions = [-999_999_999, -999_999_998, -999_999_997])
fmt_2 = make_format(cuts = [-np.inf, -51, -21, -16, -11, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 10, 15, 20, 50, np.inf], exceptions = [-999_999_999, -999_999_998, -999_999_997])

# (result key, diff column, format) -- one row per frequency table, in the
# order the tables are written.  A format of None means freq is called
# without a `format` argument.
freq_plan = [
    ("f1", "addr_input_ownership_diff", None),
    ("f2", "prop_assessed_value_total_diff", fmt_1),
    ("f3", "prop_assessed_value_total_curr_diff", fmt_1),
    ("f4", "prop_bldg_size_total_diff", fmt_1),
    ("f5", "prop_bldg_size_total_curr_diff", fmt_1),
    ("f6", "prop_lot_size_total_diff", fmt_1),
    ("f7", "prop_lot_size_total_curr_diff", fmt_1),
    ("f8", "prop_count_diff", fmt_2),
    ("f9", "prop_count_curr_diff", fmt_2),
    ("f10", "prop_state_count_diff", fmt_2),
    ("f11", "prop_state_count_curr_diff", fmt_2),
]

result = dict()
for key, column, column_format in freq_plan:
    freq_kwargs = {"observed": True}
    if column_format is not None:
        freq_kwargs["format"] = [column_format]
    result[key] = freq(merged[column], **freq_kwargs)

# All tables go to the same sheet.  The writer is closed in `finally` so the
# workbook is not left open when one of the writes raises.
wb = TableWriter(filename = "./_temp/temp.xlsx", options={'nan_inf_to_errors': True}, overwrite = True)
try:
    for table in result.values():
        wb.write_table(
            table,
            sheetname="fair_lending",
        )
finally:
    wb.close()

In [16]:
# Rows whose assessed-value diff lies strictly between -500M and -100M; the
# -999_999_99x sentinel codes fall below the lower bound and are not selected.
value_diff = merged["prop_assessed_value_total_diff"]
merged[(value_diff > -500_000_000) & (value_diff < -100_000_000)]

Unnamed: 0,t_account,t_id_seleid,t_prop_assessed_value_total,t_prop_assessed_value_total_curr,t_prop_state_count_curr,t_prop_bldg_size_total_curr,t_prop_bldg_size_total,t_prop_count_curr,t_prop_lot_size_total,t_prop_count,t_prop_lot_size_total_curr,t_prop_state_count,t_addr_input_ownership,account,id_seleid,prop_assessed_value_total,prop_assessed_value_total_curr,prop_state_count_curr,prop_bldg_size_total_curr,prop_bldg_size_total,prop_count_curr,prop_lot_size_total,prop_count,prop_lot_size_total_curr,prop_state_count,addr_input_ownership,prop_assessed_value_total_diff,prop_assessed_value_total_curr_diff,prop_state_count_curr_diff,prop_bldg_size_total_curr_diff,prop_bldg_size_total_diff,prop_count_curr_diff,prop_lot_size_total_diff,prop_count_diff,prop_lot_size_total_curr_diff,prop_state_count_diff,addr_input_ownership_diff
1940,1875285,13501654,383869,383869,1,580642,580642,1,0,1,0,1,1,1875285,13501654,241458830,241204630,1,580642,580642,1,0,1,0,1,1,-241074961,-240820761,0,0,0,0,0,0,0,0,0
17242,1300415,64622154,33274455,33267464,1,57323,1044237,2,0,2,0,1,1,1300415,64622154,149797544,106997544,1,313061,1299975,1,0,1,0,1,1,-116523089,-73730080,0,-255738,-255738,1,0,1,0,0,0
19776,2275539,150427651,886300,0,0,0,3615,0,160,3,0,1,1,2275539,150427651,171576408,171037008,1,0,3615,1,0,2,0,1,1,-170690108,-171037008,-1,0,0,-1,160,1,0,0,0
24526,1098992,63201768,16684670,16684670,1,407409,407409,1,0,1,0,1,1,1098992,63201768,175036100,175036100,1,407409,407409,1,0,1,0,1,1,-158351430,-158351430,0,0,0,0,0,0,0,0,0
31485,1260235,58574316,99007818,0,0,0,569304,0,0,1,0,0,0,1260235,58574316,237675674,237675674,0,107876,107876,1,0,1,0,1,1,-138667856,-237675674,0,-107876,461428,-1,0,0,0,-1,-1
40631,1509394,11447953,13690,0,0,0,125000,0,0,1,0,1,1,1509394,11447953,297592994,297592994,1,0,0,1,0,1,0,1,1,-297579304,-297592994,-1,0,125000,-1,0,0,0,0,0
41318,1723784,90839988,299100,0,1,0,273214,1,71516,1,71516,1,1,1723784,90839988,178035600,0,1,0,125726,1,71516,1,71516,1,1,-177736500,0,0,0,147488,0,0,0,0,0,0
53517,2668712,11262373,189583100,189583100,1,424316,424316,1,0,1,0,1,1,2668712,11262373,564873656,564873656,1,424316,424316,1,0,1,0,1,1,-375290556,-375290556,0,0,0,0,0,0,0,0,0
66973,3018075,53522960,123204000,123204000,1,0,0,1,0,1,0,1,1,3018075,53522960,238897400,238897400,1,492110,492110,1,0,1,0,1,1,-115693400,-115693400,0,-492110,-492110,0,0,0,0,0,0
67213,1432941,105643100,453317150,338859500,1,6097,409060,4,0,8,0,1,0,1432941,105643100,739375750,200000000,0,701,368035,1,0,5,0,2,1,-286058600,138859500,1,5396,41025,3,0,3,0,-1,-1


In [14]:
# Accounts with addr_input_ownership 1 in base and 0 in test.
owned_in_base = merged["addr_input_ownership"] == 1
not_owned_in_test = merged["t_addr_input_ownership"] == 0
merged[owned_in_base & not_owned_in_test]

Unnamed: 0,t_account,t_id_seleid,t_prop_assessed_value_total,t_prop_assessed_value_total_curr,t_prop_state_count_curr,t_prop_bldg_size_total_curr,t_prop_bldg_size_total,t_prop_count_curr,t_prop_lot_size_total,t_prop_count,t_prop_lot_size_total_curr,t_prop_state_count,t_addr_input_ownership,account,id_seleid,prop_assessed_value_total,prop_assessed_value_total_curr,prop_state_count_curr,prop_bldg_size_total_curr,prop_bldg_size_total,prop_count_curr,prop_lot_size_total,prop_count,prop_lot_size_total_curr,prop_state_count,addr_input_ownership,prop_assessed_value_total_diff,prop_assessed_value_total_curr_diff,prop_state_count_curr_diff,prop_bldg_size_total_curr_diff,prop_bldg_size_total_diff,prop_count_curr_diff,prop_lot_size_total_diff,prop_count_diff,prop_lot_size_total_curr_diff,prop_state_count_diff,addr_input_ownership_diff
108,920431,50204287,514899,514899,0,5000,5000,1,0,1,0,0,0,920431,50204287,60416,60416,1,5000,5000,1,0,1,0,1,1,454483,454483,-1,0,0,0,0,0,0,-1,-1
180,963606,96583435,0,0,0,0,0,0,0,0,0,0,0,963606,96583435,0,0,1,0,0,1,0,1,0,1,1,0,0,-1,0,0,-1,0,-1,0,-1,-1
195,1190833,122316337882,0,0,0,0,0,0,0,0,0,0,0,1190833,122316337882,0,0,1,0,0,1,100,1,100,1,1,0,0,-1,0,0,-1,-100,-1,-100,-1,-1
221,455436,60350772,171978,171978,0,11814,11814,1,0,1,0,0,0,455436,60350772,180656,180656,1,11814,11814,1,0,1,0,1,1,-8678,-8678,-1,0,0,0,0,0,0,-1,-1
351,1682514,65589880,280117,0,0,0,4400,0,0,1,0,0,0,1682514,65589880,152019,152019,1,0,0,1,0,1,0,1,1,128098,-152019,-1,0,4400,-1,0,0,0,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99828,22628,151672935,127511,0,0,0,954084,0,0,1,0,0,0,22628,151672935,3848516,3848516,1,431971,431971,1,0,1,0,1,1,-3721005,-3848516,-1,-431971,522113,-1,0,0,0,-1,-1
99843,2282994,101824669,479948,345661,0,288,3648,1,0,1,0,0,0,2282994,101824669,272202,272202,1,4852,4852,1,0,1,0,1,1,207746,73459,-1,-4564,-1204,0,0,0,0,-1,-1
99876,257310,69198389,1740,1740,0,0,0,1,0,1,0,0,0,257310,69198389,587640,1740,0,0,2385,1,0,1,0,1,1,-585900,0,0,0,-2385,0,0,0,0,-1,-1
99917,2911459,32943914,1366940,1366940,0,21100,21100,1,0,1,0,0,0,2911459,32943914,116689,116689,1,0,0,1,0,1,0,1,1,1250251,1250251,-1,21100,21100,0,0,0,0,-1,-1


In [7]:
# Lift the column display limit (global pandas option, stays in effect afterwards).
pd.set_option('display.max_columns', None)

# Base shows no property (count <= 0 and assessed total <= 0) while test
# shows a positive count or a positive assessed total.
nothing_in_base = (merged["prop_count"] <= 0) & (merged["prop_assessed_value_total"] <= 0)
something_in_test = (merged["t_prop_count"] > 0) | (merged["t_prop_assessed_value_total"] > 0)
merged[nothing_in_base & something_in_test]

Unnamed: 0,t_account,t_id_seleid,t_prop_assessed_value_total,t_prop_assessed_value_total_curr,t_prop_state_count_curr,t_prop_bldg_size_total_curr,t_prop_bldg_size_total,t_prop_count_curr,t_prop_lot_size_total,t_prop_count,t_prop_lot_size_total_curr,t_prop_state_count,t_addr_input_ownership,account,id_seleid,prop_assessed_value_total,prop_assessed_value_total_curr,prop_state_count_curr,prop_bldg_size_total_curr,prop_bldg_size_total,prop_count_curr,prop_lot_size_total,prop_count,prop_lot_size_total_curr,prop_state_count,addr_input_ownership
72873,358524,107073775,4968,4968,0,0,0,1,0,1,0,0,0,358524,107073775,0,0,0,0,0,0,0,0,0,0,0
97456,3082758,159233466,119416,119416,0,2837,2837,1,0,1,0,0,0,3082758,159233466,0,0,0,0,0,0,0,0,0,0,0


In [7]:
# Base rows with a positive property count but an assessed total that is <= 0.
has_property = base_temp["prop_count"] > 0
no_assessed_value = base_temp["prop_assessed_value_total"] <= 0
base_temp[has_property & no_assessed_value]

Unnamed: 0,account,id_seleid,prop_assessed_value_total,prop_assessed_value_total_curr,prop_state_count_curr,prop_bldg_size_total_curr,prop_bldg_size_total,prop_count_curr,prop_lot_size_total,prop_count,prop_lot_size_total_curr,prop_state_count,addr_input_ownership
0,28547,28914536474,0,0,1,0,0,1,0,1,0,1,1
1,30314,727771974,0,0,0,0,0,0,0,1,0,1,1
2,20789,127476823,0,0,1,0,0,1,0,1,0,1,1
3,56728,161105404,0,0,0,0,0,0,0,1,0,1,1
4,38181,122659884,0,0,1,0,0,1,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99599,2907039,54237904276,0,0,0,0,0,0,0,1,0,1,1
99600,2908384,52958339027,0,0,0,0,0,0,0,1,0,1,1
99601,2911355,22435794292,0,0,0,0,0,0,0,1,0,1,1
99602,2919034,133787799955,0,0,1,0,0,1,0,1,0,1,1


In [8]:
# Accounts where test reports more properties than base (base - test < 0).
# The difference is computed here rather than read from the
# `prop_count_compare` column, because that column is only added by a later
# cell and would not exist yet on a top-to-bottom run.
more_props_in_test = (merged["prop_count"] - merged["t_prop_count"]) < 0
merged.loc[more_props_in_test, ["t_account", 't_id_seleid', "prop_count", "t_prop_count"]]

Unnamed: 0,t_account,t_id_seleid,prop_count,t_prop_count
22,1530356,19534803,1,2
73,552396,137393276,1,2
95,891650,1285986279,1,3
97,892468,18164533,3,4
125,968318,161110133,1,2
...,...,...,...,...
99961,941060,44591014075,2,5
99973,2134477,52005885,2,4
99977,2212131,21387926,2,3
99989,1412672,13419188,1,2


In [6]:
# Base minus test property count (negative = test has more), then its
# frequency table.
base_minus_test_count = merged["prop_count"] - merged["t_prop_count"]
merged["prop_count_compare"] = base_minus_test_count
freq(merged["prop_count_compare"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
prop_count_compare,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-32,1.0,1e-05,1.0,1e-05
-22,1.0,1e-05,2.0,2e-05
-14,1.0,1e-05,3.0,3e-05
-11,3.0,3e-05,6.0,6e-05
-9,1.0,1e-05,7.0,7e-05
-8,2.0,2e-05,9.0,9e-05
-7,4.0,4e-05,13.0,0.00013
-6,12.0,0.00012,25.0,0.00025
-5,29.0,0.00029,54.0,0.00054
-4,74.0,0.00074,128.0,0.00128


In [13]:
# Summary statistics of prop_bldg_size_total in the test extract.
test["prop_bldg_size_total"].describe()

count    99997.00000
mean        -0.00098
std          0.03129
min         -1.00000
25%          0.00000
50%          0.00000
75%          0.00000
max          0.00000
Name: prop_bldg_size_total, dtype: float64

In [14]:
# Summary statistics of prop_bldg_size_total in the base extract.
base["prop_bldg_size_total"].describe()

count    9.999900e+04
mean     1.460349e+04
std      3.817340e+05
min     -1.000000e+00
25%      0.000000e+00
50%      2.640000e+03
75%      8.280000e+03
max      7.888696e+07
Name: prop_bldg_size_total, dtype: float64