In [1]:
import time
import pandas as pd
import numpy as np
import math

# internal tools
from dsgtools.reporting import make_format
from dsgtools.reporting import TableWriter
from dsgtools.reporting import freq
from dsgtools.reporting import bivariate

#### fraud_intelli

In [2]:
# Base folder for this project's modeling data; the load cell appends a relative file name to it.
# NOTE(review): the value ends with "/" and the load cell appends "/_temp/...", so the final
# path contains "//" — confirm the reader tolerates that.
path = "Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Data Modeling Clean/"

In [3]:
# Assemble the location of the fraud_intelli extract, load it, and report its dimensions.
source_file = path + "/_temp/" + "fraud_intelli" + "_all.parquet"
biid = pd.read_adls(source_file, reader=pd.read_parquet)
print(biid.shape)

(7971880, 163)


In [4]:
# use input_df - we don't need this step
# input_df = pd.read_parquet("./_temp/profile_seleid.parquet")
# print(input_df.shape)
# input_df.columns = ["accountnumber", "ecl_seleid"]

In [5]:
# List the column names of the loaded frame.
biid.columns

Index(['accountnumber', 'p_lexid', 'p_inpacct', 'p_inpclnarchdt',
       'p_inpssnflag', 'p_inpssnlength', 'p_inpnamefirstflag',
       'p_inpnamemidflag', 'p_inpnamelastflag', 'p_inpaddrstflag',
       ...
       'pi_inpemaillnameconseccharcnt', 'pi_inpemailhasphoneall10flag',
       'pi_inpemailhasdobmonthdayflag', 'pi_inpemailhasdobyearlast2flag',
       'pi_inpemailhasdobyearall4flag', 'pi_inpemailhas13pcharflag',
       'pi_inpemailhas4pnumflag', 'pi_inpemailqwertyonerowflag',
       'pi_inpemailhas3repeatcharflag', 'pi_inpemailhasonly1charflag'],
      dtype='object', length=163)

In [6]:
# Cast p_lexid to float (in place on biid) so it can be bucketed numerically, then tabulate it.
biid['p_lexid'] = biid['p_lexid'].astype(float)
# Cut points at -inf / -1 / 0 / +inf; the rendered table labels the bins "<= -1", "0" and "1+",
# with a separate "Missing" row.
fmt = make_format(cuts = [-np.inf, -1, 0, np.inf])
freq(biid['p_lexid'], format = [fmt])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
p_lexid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<= -1,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0
1+,1249854.0,0.156783,1249854.0,0.156783
Missing,6722026.0,0.843217,7971880.0,1.0


In [7]:
# Preview the first rows of the frame.
biid.head()

Unnamed: 0,accountnumber,p_lexid,p_inpacct,p_inpclnarchdt,p_inpssnflag,p_inpssnlength,p_inpnamefirstflag,p_inpnamemidflag,p_inpnamelastflag,p_inpaddrstflag,...,pi_inpemaillnameconseccharcnt,pi_inpemailhasphoneall10flag,pi_inpemailhasdobmonthdayflag,pi_inpemailhasdobyearlast2flag,pi_inpemailhasdobyearall4flag,pi_inpemailhas13pcharflag,pi_inpemailhas4pnumflag,pi_inpemailqwertyonerowflag,pi_inpemailhas3repeatcharflag,pi_inpemailhasonly1charflag
0,AAA000000040222346,,1,20211028,0,0,0,0,0,1,...,-99999,-99999,-99999,-99999,-99999,-99999,-99999,-99999,-99999,-99999
1,AAA000000106363593,,2,20210801,0,0,0,0,0,1,...,-99999,-99999,-99999,-99999,-99999,-99999,-99999,-99999,-99999,-99999
2,AAA000000055985951,1320427000.0,3,20211227,0,0,0,0,0,1,...,-99999,-99999,-99999,-99999,-99999,-99999,-99999,-99999,-99999,-99999
3,AAA000000037609379,,1,20211019,0,0,0,0,0,1,...,-99999,-99999,-99999,-99999,-99999,-99999,-99999,-99999,-99999,-99999
4,AAA000000083180600,,10,20220329,0,0,0,0,0,1,...,-99999,-99999,-99999,-99999,-99999,-99999,-99999,-99999,-99999,-99999


In [8]:
def left_closed_labels(b, include_max=False, fmt=".3f", step=1):
    """Build display labels for consecutive left-closed bins ``[b[i], b[i+1])``.

    Parameters
    ----------
    b : sequence of numbers
        Bin edges (assumed to be in ascending order); ``len(b) - 1`` labels
        are produced.
    include_max : bool, default False
        When False, the last bin is labelled as open-ended (``"<b[-2]>+"``).
        When True, it is labelled like every other bin.
    fmt : str, default ".3f"
        Format spec applied to every edge value that appears in a label.
    step : number, default 1
        Amount subtracted from a bin's right edge to get the upper bound
        shown in its label. A bin whose left edge equals that bound is
        labelled with the single value instead of a range.

    Returns
    -------
    list of str
        One label per bin; an empty list when ``b`` has fewer than two edges.
    """
    # Fewer than two edges means there are no bins at all. Without this guard
    # the open-ended relabelling below would raise IndexError, while the
    # include_max=True path already returned [] for the same input.
    if len(b) < 2:
        return []

    labels = []
    for lower, upper in zip(b[:-1], b[1:]):
        last_included = upper - step
        if lower == last_included:
            # The bin holds exactly one value: show it on its own.
            labels.append(f"{lower:{fmt}}")
        else:
            labels.append(f"{lower:{fmt}}-{last_included:{fmt}}")

    if not include_max:
        # The final bin has no meaningful upper bound; render it as "<lower>+".
        labels[-1] = f"{b[-2]:{fmt}}+"
    return labels

In [None]:
def _to_numeric_or_keep(col):
    """Return ``col`` converted by ``pd.to_numeric``, or ``col`` unchanged if conversion fails."""
    try:
        return pd.to_numeric(col)
    except (ValueError, TypeError):
        return col


# Convert every column that parses cleanly as numeric and leave the others untouched.
# This replaces ``errors='ignore'``, which pandas has deprecated in favour of catching
# the conversion error explicitly; the per-column result is the same.
biid = biid.apply(_to_numeric_or_keep)

In [None]:
# Identifier / bookkeeping columns that are left out of the per-column frequency tables.
excluded = {'accountnumber', 'p_lexid', 'p_inpacct', 'p_inpclnarchdt'}
keep = [col for col in biid.columns if col not in excluded]
print(len(keep))

In [18]:
# Build one frequency table per retained column, keyed by column name.
biv_all = {}
biv_client = {}  # not populated in this cell
for s in keep:
    if pd.api.types.is_string_dtype(biid[s].dtype):
        # String columns: tabulate the raw values.
        temp_biv = freq(biid[s], observed = True).fillna("n/a")
    elif biid[s].nunique() <=15:
        # Numeric columns with at most 15 distinct values: tabulate the raw values.
        # Fix: `observed` used to sit inside the subscript brackets (`biid[s, observed = True]`),
        # which is a SyntaxError and prevented the whole cell from running.
        temp_biv = freq(biid[s], observed = True).fillna("n/a")
    else:
        # Higher-cardinality numeric columns: bin on quantiles computed from values > -1 only,
        # so the negative codes do not distort the cut points.
        brks = (
                biid[s].pipe(lambda x: x[x.gt(-1)])
                .quantile([0, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99])
            )
        # A value that shows up at several quantiles gets an extra edge at value + 1,
        # giving that value a bin of its own once the edges are de-duplicated.
        dup = [x+1 for x in brks if brks.tolist().count(x)>1]
        brks = brks.tolist() + dup
        brks = np.unique(brks)

        # If every edge collapsed to 0, add 1 so that a [0, 1) bin exists.
        if brks.max() == 0:
            brks = np.append(brks, 1)
        brks.sort()
        # The top bin is open-ended.
        brks = np.append(brks, np.inf) 

        labs = left_closed_labels(brks, fmt=".0f")
        # -99999 / -99998 / -99997 are passed as exceptions rather than binned
        # (presumably special/missing codes — confirm against the data dictionary).
        fmt = make_format(
            cuts=brks, labels=labs, right=False, exceptions=[-99999, -99998, -99997]
        )
        temp_biv = freq(biid[s], format=fmt, observed = True).fillna("n/a")

    biv_all[s] = temp_biv
    
# Write every frequency table into a single workbook; each call targets the "biv" sheet.
wb = TableWriter(filename = "./_temp/temp_fraud.xlsx", options={'nan_inf_to_errors': True}, overwrite = True)
for table in biv_all.values():
    wb.write_table(table, sheetname="biv", conditional_fmt_cols=[1])
wb.close()
