In [1]:
# Libraries needed: 
import time
import pickle
import pandas as pd
import numpy as np
import math

# internal tools
from dsgtools.reporting import make_format
from dsgtools.reporting import TableWriter
from dsgtools.reporting import freq
from dsgtools.reporting import bivariate
from dsgtools.reporting import col_summary
from dsgtools import azure

In [5]:
# Column names, in file order, for the pipe-delimited extract loaded below
# (passed as `names=` to the reader, which is called with `header=None`).
# Names must not contain stray whitespace: they become DataFrame / parquet
# column names and are selected by exact string match.
layout = ['seleid_1',
'rep_1_lexid',
'rep_2_lexid',
'rep_3_lexid',
'ultid',
'orgid',
'seleid_2',
'proxid',
'powid',
'acctno',
'slbb1809_1_0',
'slbb_rc1',
'slbb_rc2',
'slbb_rc3',
'slbb_rc4',
'slbo1809_1_0',
'slbo_rc1',
'slbo_rc2',
'slbo_rc3',
'slbo_rc4',
'inputcheckbusname',
'inputcheckbusaltname',
'inputcheckbusaddr',
'inputcheckbuscity',
'inputcheckbusstate',
'inputcheckbuszip',
'inputcheckbustin',
'inputcheckbusphone',
'inputcheckbussic',
'inputcheckbusnaics',
'filler_1',
'inputcheckbusage',
'filler_2',
'filler_3',
'inputcheckbusfax',
'inputcheckauthrepfirstname',
'inputcheckauthreplastname',
'inputcheckauthrepmiddlename',
'inputcheckauthrepaddr',
'inputcheckauthrepcity',
'inputcheckauthrepstate',
'inputcheckauthrepzip',
'inputcheckauthrepssn',
'inputcheckauthrepphone',
'inputcheckauthrepdob',
'inputcheckauthrepage',
'filler_4',
'inputcheckauthrepdl',
'inputcheckauthrepdlstate',
'inputcheckauthrep2firstname',
'inputcheckauthrep2lastname',
'inputcheckauthrep2middlename',
'inputcheckauthrep2addr',
'inputcheckauthrep2city',
'inputcheckauthrep2state',
'inputcheckauthrep2zip',
'inputcheckauthrep2ssn',
'inputcheckauthrep2phone',
'inputcheckauthrep2dob',
'inputcheckauthrep2age',
'filler_5',
'inputcheckauthrep2dl',
'inputcheckauthrep2dlstate',
'inputcheckauthrep3firstname',
'inputcheckauthrep3lastname',
'inputcheckauthrep3middlename',
'inputcheckauthrep3addr',
'inputcheckauthrep3city',
'inputcheckauthrep3state',
'inputcheckauthrep3zip',
'inputcheckauthrep3ssn',
'inputcheckauthrep3phone',
'inputcheckauthrep3dob',
'inputcheckauthrep3age',
'filler_6',
'inputcheckauthrep3dl',
'inputcheckauthrep3dlstate',
'verificationbusinputname',
'verificationbusinputaddr',
'verificationbusinputphone',
'verificationbusinputtin',
'verificationbusinputindustry',
'businessrecordtimeoldest',
'businessrecordtimenewest',
'businessrecordupdated12m',
'businessactivity03m',
'businessactivity06m',
'businessactivity12m',
'businessaddrcount',
'firmageobserved',
'outbestsic',
'outbestnaics',
'firmemployeecount',
'firmreportedsales',
'firmreportedearnings',
'firmirsretirementplan',
'firmnonprofit',
'orglocationcount',
'orgrelatedcount',
'firmparentcompanyind',
'orglegalentitycount',
'orgaddrlegalentitycount',
'orgsinglelocation',
'sosincorporationtimeoldest',
'sostimeagentchange',
'sosstandingdefunct',
'sosstatecount',
'sosforeignstateflag',
'bankruptcycount',
'bankruptcycount12m',
'bankruptcycount24m',
'bankruptcychapter',
'bankruptcytimenewest',
'liencount',
'liencount12m',
'liencount24m',
'liennewesttype',
'lientimenewest',
'lientimeoldest',
'lientotalamount',
'judgmentcount',
'judgmentcount12m',
'judgmentcount24m',
'judgmentnewesttype',
'judgmenttimenewest',
'judgmenttimeoldest',
'judgmenttotalamount',
'lienjudgmentdollartotal',
'assetpropcountever',
'assetpropstatecountcurrent',
'assetproplotsizetotalever',
'assetpropassessedtotalever',
'assetpropsqfootagetotalever',
'assetaircraftcount',
'assetwatercraftcount',
'ucccount',
'ucctimenewest',
'ucctimeoldest',
'governmentdebarred',
'inquiryhighriskcount12m',
'inquiryhighriskcount03m',
'inquirycreditcount12m',
'inquirycreditcount03m',
'inquirycount12m',
'inquirycount03m',
'inquiryconsumeraddress',
'inquiryconsumerphone',
'inquiryconsumeraddressssn',
'busexeclinkauthreplexid',
'busexeclinkrepnameonfile',
'busexeclinkrepaddronfile',
'busexeclinkrepssnonfile',
'busexeclinkrepphoneonfile',
'busexeclinkbusnamerepfirst',
'busexeclinkbusnamereplast',
'busexeclinkbusnamerepfull',
'busexeclinkrepssnbustin',
'busexeclinkpropoverlapcount',
'busexeclinkrepaddrbusaddr',
'busexeclinkutiloverlapcount',
'busexeclinkinqoverlapcount',
'busexeclinkbusaddrrepowned',
'busexeclinkrepphonebusphone',
'busexeclinkauthrep2lexid',
'busexeclinkrep2nameonfile',
'busexeclinkrep2addronfile',
'busexeclinkrep2phoneonfile',
'busexeclinkrep2ssnonfile',
'busexeclinkbusnamerep2first',
'busexeclinkbusnamerep2last',
'busexeclinkbusnamerep2full',
'busexeclinkrep2ssnbustin',
'busexeclinkpropoverlapcount2',
'busexeclinkbusaddrrep2owned',
'busexeclinkutiloverlapcount2',
'busexeclinkinqoverlapcount2',
'busexeclinkrep2addrbusaddr',
'busexeclinkrep2phonebusphone',
'busexeclinkauthrep3lexid',
'busexeclinkrep3nameonfile',
'busexeclinkrep3addronfile',
'busexeclinkrep3phoneonfile',
'busexeclinkrep3ssnonfile',
'busexeclinkbusnamerep3first',
'busexeclinkbusnamerep3last',
'busexeclinkbusnamerep3full',
'busexeclinkrep3ssnbustin',
'busexeclinkpropoverlapcount3',
'busexeclinkbusaddrrep3owned',
'busexeclinkutiloverlapcount3',
'busexeclinkinqoverlapcount3',
'busexeclinkrep3addrbusaddr',
'busexeclinkrep3phonebusphone',
'bustinpersonoverlap',
'bustinpersonaddroverlap',
'bustinpersonphoneoverlap',
'busaddrpersonnameoverlap',
'inputaddrconsumercount',
'inputaddrsourcecount',
'inputaddrtype',
'inputaddrownership',
'inputaddrlotsize',
'inputaddrassessedtotal',
'inputaddrbuildingsize',
'inputphoneproblems',
'inputphoneentitycount',
'inputphonetype',
'associatecount',
'associatebankruptcount',
'associatecountwithbankruptcy',
'associatebankrupt12mcount',
'associateliencount',
'associatecountwithlien',
'associatejudgmentcount',
'associatecountwithjudgment',
'associatehighriskaddrcount',
'associatewatchlistcount',
'associatebusinesscount',
'associatecitycount',
'associatecountycount',
'outbestbusname',
'outbestbusstreetaddr',
'outbestbuscity',
'outbestbusstate',
'outbestbuszip',
'outbestbustin',
'outbestbusphone',
'firmemployeerangecount',
'firmownershiptype',
'firmreportedsalesrange',
'sosincorporationfilingscount',
'sosstandingbest',
'bankruptcycount84m',
'liencount03m',
'liencount36m',
'lienfedtaxcount',
'lienfedtaxtotalamount',
'lienforeclosurecount',
'lienforeclosuretotalamount',
'lientenantcount',
'lientenanttotalamount',
'lienmechanicscount',
'lienmechanicstotalamount',
'lienothercount',
'lienothertotalamount',
'judgmentcount03m',
'judgmentcount36m',
'judgmentcivilcourtcount',
'judgmentcivilcourttotalamount',
'judgmentsmallclaimscount',
'judgmentsmallclaimstotalamount',
'judgmentsuitscount',
'judgmentssuitstotalamount',
'judgmentsothercount',
'judgmentothertotalamount',
'assetpropcountcurrent',
'assetproplotsizetotalcurrent',
'assetpropsqfootagetotalcurrent',
'assetpropassessedtotalcurrent',
'uccactivecount',
'uccroles',
'uccrolesactive',
'inquiryothercount03m',
'inquiryothercount12m',
'inputtinentitycount',
'inputbusaddrcurrentcount',
'inputaddrtincount',
'inputbusnameotherbusnamematch',
'inputaddrzipmismatch',
'inputaddrvacancy',
'inputaddrtimeoldest',
'inputphoneresidential',
'inputtinbiimismatch',
'inputtinhitindex',
'associatecurrcount',
'associatecurrcountwithfelony',
'associatecurrcountwithbkrpt',
'associatecurrcountwithlien',
'associatecurrcountwithjudgment',
'associatecurrcountwithprop',
'associatecurrbusinessestotal',
'associatecurrsoscount',
'b2bcnt2y',
'b2bcarrcnt2y',
'b2bfltcnt2y',
'b2bmatcnt2y',
'b2bopscnt2y',
'b2bothcnt2y',
'b2bcarrpct2y',
'b2bfltpct2y',
'b2bmatpct2y',
'b2bopspct2y',
'b2bothpct2y',
'b2boldmsnc2y',
'b2bnewmsnc2y',
'b2bactvcnt',
'b2bactvcarrcnt',
'b2bactvfltcnt',
'b2bactvmatcnt',
'b2bactvopscnt',
'b2bactvothcnt',
'b2bactvcarrpct',
'b2bactvfltpct',
'b2bactvmatpct',
'b2bactvopspct',
'b2bactvothpct',
'b2bactvcntgrow1y',
'b2bactvcarrcntgrow1y',
'b2bactvfltcntgrow1y',
'b2bactvmatcntgrow1y',
'b2bactvopscntgrow1y',
'b2bactvothcntgrow1y',
'b2bactvbaltot',
'b2bactvcarrbaltot',
'b2bactvfltbaltot',
'b2bactvmatbaltot',
'b2bactvopsbaltot',
'b2bactvothbaltot',
'b2bactvcarrbaltotpct',
'b2bactvfltbalpct',
'b2bactvmatbalpct',
'b2bactvopsbalpct',
'b2bactvothbalpct',
'b2bactvbaltotrnge',
'b2bactvcarrbaltotrnge',
'b2bactvfltbaltotrnge',
'b2bactvmatbaltotrnge',
'b2bactvopsbaltotrnge',
'b2bactvothbaltotrnge',
'b2bactvbalavg',
'b2bactvcarrbalavg',
'b2bactvfltbalavg',
'b2bactvmatbalavg',
'b2bactvopsbalavg',
'b2bactvothbalavg',
'b2bactvbaltotgrow1y',
'b2bactvcarrbaltotgrow1y',
'b2bactvfltbaltotgrow1y',
'b2bactvmatbaltotgrow1y',
'b2bactvopsbaltotgrow1y',
'b2bactvothbaltotgrow1y',
'b2bactvbaltotgrowindx1y',
'b2bactvcarrbaltotgrowindx1y',
'b2bactvfltbaltotgrowindx1y',
'b2bactvmatbaltotgrowindx1y',
'b2bactvopsbaltotgrowindx1y',
'b2bactvothbaltotgrowindx1y',
'b2bbalmax2y',
'b2bcarrbalmax2y',
'b2bfltbalmax2y',
'b2bmatbalmax2y',
'b2bopsbalmax2y',
'b2bothbalmax2y',
'b2bbalmaxsegtype2y',
'b2bbalmaxmsnc2y',
'b2bcarrbalmaxmsnc2y',
'b2bfltbalmaxmsnc2y',
'b2bmatbalmaxmsnc2y',
'b2bopsbalmaxmsnc2y',
'b2bothbalmaxmsnc2y',
'b2bactvworstperfindx',
'b2bactvcarrworstperfindx',
'b2bactvfltworstperfindx',
'b2bactvmatworstperfindx',
'b2bactvopsworstperfindx',
'b2bactvothworstperfindx',
'b2bworstperfindx2y',
'b2bcarrworstperfindx2y',
'b2bfltworstperfindx2y',
'b2bmatworstperfindx2y',
'b2bopsworstperfindx2y',
'b2bothworstperfindx2y',
'b2bworstperfmsnc2y',
'b2bcarrworstperfmsnc2y',
'b2bfltworstperfmsnc2y',
'b2bmatworstperfmsnc2y',
'b2bopsworstperfmsnc2y',
'b2bothworstperfmsnc2y',
'b2bactv1pdpdcnt',
'b2bactv31pdpdcnt',
'b2bactv61pdpdcnt',
'b2bactv91pdpdcnt',
'b2bactv1pdpdpct',
'b2bactv31pdpdpct',
'b2bactv61pdpdpct',
'b2bactv91pdpdpct',
'b2bactv1pdpdbaltot',
'b2bactv31pdpdbaltot',
'b2bactv61pdpdbaltot',
'b2bactv91pdpdbaltot',
'b2bactv1pdpdbaltotpct',
'b2bactv31pdpdbaltotpct',
'b2bactv61pdpdbaltotpct',
'b2bactv91pdpdbaltotpct',
'b2bactv1pdpdbaltotgrow1y',
'b2bactv31pdpdbaltotgrow1y',
'b2bactv61pdpdbaltotgrow1y',
'b2bactv91pdpdbaltotgrow1y',
'lexidbusinesschangeflag',
'lexidbusinesschangecode',
]

In [10]:
# Load the pipe-delimited extract from ADLS. The file is read with no header
# row (`header=None`), every column as text (`dtype=str`), and columns named
# from `layout`.
# NOTE(review): the first displayed row of the frame (seleid_1 =
# 'H000104092023000000001US', every other field empty) looks like a file
# header record being read as data — confirm whether it should be skipped.
path = "Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/LN_Output_AEGC20230904.txt"
csv_options = dict(
    encoding='iso-8859-1',
    low_memory=False,
    verbose=False,
    header=None,
    names=layout,
    delimiter="|",
    dtype=str,
    skiprows=0,
)
amex_sample = pd.read_adls(path, reader=pd.read_csv, **csv_options)
print(amex_sample.shape)

(3124881, 400)


In [11]:
# Preview the first rows of the loaded extract.
amex_sample.head()

Unnamed: 0,seleid_1,rep_1_lexid,rep_2_lexid,rep_3_lexid,ultid,orgid,seleid_2,proxid,powid,acctno,...,b2bactv1pdpdbaltotpct,b2bactv31pdpdbaltotpct,b2bactv61pdpdbaltotpct,b2bactv91pdpdbaltotpct,b2bactv1pdpdbaltotgrow1y,b2bactv31pdpdbaltotgrow1y,b2bactv61pdpdbaltotgrow1y,b2bactv91pdpdbaltotgrow1y,lexidbusinesschangeflag,lexidbusinesschangecode
0,H000104092023000000001US,,,,,,,,,,...,,,,,,,,,,
1,58655779,1282371444.0,0.0,0.0,58655779.0,58655779.0,58655779.0,0.0,0.0,1.0,...,100.0,100.0,0.0,0.0,-99998.0,-99998.0,-99998.0,-99998.0,False,0.0
2,58667766,1205158212.0,0.0,0.0,58667766.0,58667766.0,58667766.0,0.0,0.0,2.0,...,-99998.0,-99998.0,-99998.0,-99998.0,-99998.0,-99998.0,-99998.0,-99998.0,False,0.0
3,58726293,451285081.0,0.0,0.0,58726293.0,58726293.0,58726293.0,0.0,0.0,3.0,...,-99998.0,-99998.0,-99998.0,-99998.0,-99998.0,-99998.0,-99998.0,-99998.0,False,0.0
4,58732188,2183147626.0,0.0,0.0,58732188.0,58732188.0,58732188.0,0.0,0.0,4.0,...,-99998.0,-99998.0,-99998.0,-99998.0,-99998.0,-99998.0,-99998.0,-99998.0,False,0.0


In [12]:
# Persist the full loaded extract to ADLS as parquet, next to the source .txt.
full_extract_parquet_path = "Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/LN_Output_AEGC20230904.parquet"
amex_sample.to_adls(full_extract_parquet_path)

In [16]:
# Narrow the frame to the identifier/score columns plus the three attributes
# under investigation. Names are lower-cased to match the column names in use.
# Note: this rebinds `amex_sample`, so the full-width frame is no longer
# available after this cell runs.
id_and_score_cols = [
    'seleid_1', 'rep_1_lexid', 'rep_2_lexid', 'rep_3_lexid', 'ultid', 'orgid',
    'seleid_2', 'proxid', 'powid', 'acctno', 'slbb1809_1_0', 'slbo1809_1_0',
]
attribute_cols = ['AssetPropAssessedTotalEver', "InputAddrOwnership", "InputAddrAssessedTotal"]
keep = list(map(str.lower, id_and_score_cols + attribute_cols))
amex_sample = amex_sample[keep]
print(amex_sample.shape)

(3124881, 15)


In [14]:
# Display the lower-cased list of retained column names.
keep

['seleid_1',
 'rep_1_lexid',
 'rep_2_lexid',
 'rep_3_lexid',
 'ultid',
 'orgid',
 'seleid_2',
 'proxid',
 'powid',
 'acctno',
 'slbb1809_1_0',
 'slbo1809_1_0',
 'assetpropassessedtotalever',
 'inputaddrownership',
 'inputaddrassessedtotal']

In [17]:
# The three attribute columns are cast to float so they can be compared
# numerically in the filters that follow.
numeric_cols = ['assetpropassessedtotalever', 'inputaddrownership', 'inputaddrassessedtotal']
amex_sample[numeric_cols] = amex_sample[numeric_cols].astype(float)

In [18]:
# Group 1: assessed-total-ever is exactly 0 while the input address has
# ownership == 1 and a positive assessed total.
# (presumably ownership == 1 means "owned" — confirm against the attribute spec)
total_ever_is_zero = amex_sample.assetpropassessedtotalever == 0
input_addr_ownership_is_1 = amex_sample.inputaddrownership == 1
input_addr_has_value = amex_sample.inputaddrassessedtotal > 0
temp = amex_sample[total_ever_is_zero & input_addr_ownership_is_1 & input_addr_has_value].copy()
print(temp.shape)

(15056, 15)


In [22]:
# Group 2: assessed-total-ever is positive but strictly smaller than the input
# address's assessed total (ownership == 1, input total > 0).
total_ever_positive = amex_sample.assetpropassessedtotalever > 0
input_addr_ownership_is_1 = amex_sample.inputaddrownership == 1
input_addr_has_value = amex_sample.inputaddrassessedtotal > 0
total_ever_below_input = amex_sample.assetpropassessedtotalever < amex_sample.inputaddrassessedtotal
temp_2 = amex_sample.loc[
    total_ever_positive & input_addr_ownership_is_1 & input_addr_has_value & total_ever_below_input
].copy()
print(temp_2.shape)

(66039, 15)


In [23]:
# Rows still needed to reach the 100,000-row sample after the two issue groups.
# Derived from the frames themselves rather than from counts copied out of
# earlier cell outputs, so it stays correct if the input or filters change.
100_000 - len(temp_2) - len(temp)

18905

In [24]:
# Group 3 (control): assessed-total-ever is positive and at least as large as
# the input address's assessed total (ownership == 1, input total > 0).
# The group is down-sampled to fill the remainder of the 100,000-row sample.
total_ever_positive = amex_sample.assetpropassessedtotalever > 0
input_addr_ownership_is_1 = amex_sample.inputaddrownership == 1
input_addr_has_value = amex_sample.inputaddrassessedtotal > 0
total_ever_covers_input = amex_sample.assetpropassessedtotalever >= amex_sample.inputaddrassessedtotal

# Size of the fill is computed from the two issue groups instead of being
# hard-coded, so it cannot drift from the actual group counts.
n_fill = 100_000 - len(temp) - len(temp_2)

temp_3 = amex_sample[
    total_ever_positive & input_addr_ownership_is_1 & input_addr_has_value & total_ever_covers_input
]
# Fixed seed so that re-running the notebook reproduces the same delivered sample.
temp_3 = temp_3.sample(n=n_fill, random_state=42, ignore_index=True)
print(temp_3.shape)

(18905, 15)


In [26]:
# Tag each group with its issue label (added in place as an "issue" column),
# then stack the three groups into one frame with a fresh 0..n-1 index.
labelled_groups = (
    (temp, "own_input_with_value_no_total"),
    (temp_2, "own_input_with_value_total_less"),
    (temp_3, "work_as_expected"),
)
for group_frame, issue_label in labelled_groups:
    group_frame["issue"] = issue_label
sample = pd.concat([group_frame for group_frame, _ in labelled_groups], ignore_index=True)

In [27]:
# Write the combined sample to ADLS twice: once as parquet and once as CSV
# (the CSV call passes index=False).
sample_parquet_path = 'Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/amex_sample_100K_1004.parquet'
sample_csv_path = 'Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/amex_sample_100K_1004.csv'
sample.to_adls(sample_parquet_path)
sample.to_adls(sample_csv_path, index=False)

In [21]:
# Frequency table of assetpropassessedtotalever over fixed value bands.
# -1 is passed as an exception value to make_format
# (presumably so it is reported as its own row rather than inside a band — confirm).
assessed_value_cuts = [-np.inf, -1, 0, 100_000, 200_000, 300_000, 500_000, 1_000_000, np.inf]
fmt = make_format(cuts=assessed_value_cuts, exceptions=[-1])
freq(amex_sample.assetpropassessedtotalever, format=fmt, observed=True)

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
assetpropassessedtotalever,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2390552.0,0.765006,2390552.0,0.765006
1-100000,242683.0,0.077662,2633235.0,0.842667
100001-200000,49838.0,0.015949,2683073.0,0.858616
200001-300000,31366.0,0.010038,2714439.0,0.868654
300001-500000,37989.0,0.012157,2752428.0,0.880811
500001-1000000,42820.0,0.013703,2795248.0,0.894513
1000001+,60984.0,0.019516,2856232.0,0.914029
-1,268447.0,0.085906,3124679.0,0.999935
Missing,202.0,6.5e-05,3124881.0,1.0
