In [1]:
import time
import pandas as pd
import numpy as np
import math

# internal tools
from dsgtools.reporting import make_format
from dsgtools.reporting import TableWriter
from dsgtools.reporting import freq
from dsgtools.reporting import bivariate

In [2]:
## Input data: one parquet file per client fraud sample, each in its own folder.
# NOTE(review): `read_adls` is not a stock pandas function; presumably it is attached to
# `pd` by the internal tooling imported above — confirm.
chase_dir = "Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Performance/Fraud Sample/Chase_11449/"
chase = pd.read_adls(chase_dir + "chase_stacked_final.parquet")
print(chase.shape)

tmobile_dir = "Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Performance/Fraud Sample/tmobile_10823/"
tmobile = pd.read_adls(tmobile_dir + "tmbile_file_to_use.parquet")
print(tmobile.shape)

stripe_dir = "Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Performance/Fraud Sample/stripe_11363/"
stripe = pd.read_adls(stripe_dir + "stripe_file_to_use.parquet")
print(stripe.shape)

fleector_dir = "Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Performance/Fraud Sample/fleector_11124/"
fleector = pd.read_adls(fleector_dir + "fleector_sample_to_use.parquet")
print(fleector.shape)

(640958, 36)
(114460, 39)
(75000, 22)
(83704, 92)


In [3]:
## Keep seleid, full address, phone, tax ID and the performance flag (the business-level information); the result might need de-duplication.

#### SBFE Inquiry

In [3]:
# Path of the SBFE inquiry parquet file handed to pd.read_adls.
path = "Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Data Modeling Clean/sbfe_seleid_append_deduped_cleaned_0501.parquet"
# pd.read_parquet is passed as the reader used for this file.
sbfe = pd.read_adls(path, reader = pd.read_parquet)
# (rows, columns) of the loaded frame.
print(sbfe.shape)

(79429086, 20)


In [4]:
# Assemble a single-line address: street, then city / state / first 5 zip characters,
# each prefixed with ", " and left out when that component is an empty string.
sbfe["full_addr"] = sbfe["addr"] + np.where(sbfe["city"] != "", ", " + sbfe["city"], "") + \
    np.where(sbfe["state"] != "", ", " + sbfe["state"], "") + np.where(sbfe["zip"] != "", ", " + sbfe["zip"].str.slice(0, 5), "")

# Normalise the address so it can be compared as a plain string.
# `regex` is stated explicitly on every call: the default of Series.str.replace differs
# between pandas versions, so leaving it implicit makes "." / "," / the pattern below
# mean different things depending on the installed release.
#   1. drop literal periods and commas;
#   2. drop every non-word character that is directly followed by a whitespace character
#      (both characters are removed) — raw string, so the backslashes reach the regex engine;
#   3. lower-case.
sbfe["full_addr_clean"] = (
    sbfe["full_addr"]
    .str.replace(".", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.replace(r"\W\s", "", regex=True)
    .str.lower()
)

  sbfe["full_addr_clean"] = sbfe['full_addr'].str.replace('.', '')
  sbfe["full_addr_clean"] = sbfe['full_addr_clean'].str.replace('\W\s', '')


In [5]:
# List the columns available on the SBFE frame after the address fields were added.
sbfe.columns

Index(['accountnumber_x', 'companyname', 'alternatecompanyname_x', 'addr',
       'city', 'state', 'zip', 'businessphone', 'taxidnumber', 'historydate_x',
       'sufficient_input', 'accountnumber_y', 'alternatecompanyname_y',
       'historydate_y', 'powid', 'proxid', 'seleid', 'orgid', 'ultid',
       'overallweight', 'full_addr', 'full_addr_clean'],
      dtype='object')

In [6]:
# Source column -> name used in the rest of the notebook. Only these columns are kept,
# in this order.
sbfe_column_map = {
    "seleid": "seleid_inquiry_sbfe",
    "historydate_x": "inquiry_date",
    "full_addr_clean": "full_addr_clean",
    "businessphone": "businessphone",
    "taxidnumber": "taxidnumber",
}
sbfe = sbfe[list(sbfe_column_map)]
sbfe.columns = list(sbfe_column_map.values())

In [7]:
# Cast the inquiry date to an integer, then parse that integer as a YYYYMMDD date into a
# separate datetime column used for the date comparisons later in the notebook.
sbfe["inquiry_date"] = sbfe["inquiry_date"].astype(int)
sbfe["inquiry_date_fmt"] = pd.to_datetime(sbfe["inquiry_date"], format='%Y%m%d')

In [8]:
# Cast the SELEID key to float.
# NOTE(review): the reason is not visible in this notebook; presumably it aligns the dtype
# with seleid columns in other frames — confirm.
sbfe["seleid_inquiry_sbfe"] = sbfe["seleid_inquiry_sbfe"].astype(float)

In [9]:
# Lower-case the cleaned address. The address-cleaning cell already finishes with the same
# .str.lower() call, so this repeats that step.
sbfe["full_addr_clean"] = sbfe["full_addr_clean"].str.lower()

In [10]:
# Preview the first rows of the trimmed SBFE frame.
sbfe.head()

Unnamed: 0,seleid_inquiry_sbfe,inquiry_date,full_addr_clean,businessphone,taxidnumber,inquiry_date_fmt
0,133615600000.0,20210701,115 golf course rd ste e logan ut 84321,4355353654,,2021-07-01
1,1330122000.0,20210701,3200 palm tree dr lithonia ga 30038,7708857033,,2021-07-01
2,906413600.0,20210701,1476 highway 159 e bellville tx 77418,9798653142,,2021-07-01
5,61367720.0,20210701,492 koller st san francisco ca 94110,6505550000,,2021-07-01
6,137145200000.0,20210701,107 lassiter lane bellville tx 77418,9798859397,,2021-07-01


#### Stripe 11363

In [11]:
# List the columns of the Stripe sample.
stripe.columns

Index(['Unnamed: 0', 'uniqueid', 'historydate', 'companyname',
       'alternatecompanyname', 'bus_streetaddress1', 'bus_streetaddress2',
       'bus_city', 'bus_state', 'bus_zip', 'bus_phone', 'bus_website',
       'firstname', 'lastname', 'streetaddress1', 'streetaddress2', 'city',
       'state', 'zip', 'email', 'ip_address', 'performance_indicator'],
      dtype='object')

In [10]:
# Business-level fields; two rows that agree on all of them count as the same business.
stripe_business_key = [
    'companyname', 'alternatecompanyname',
    'bus_streetaddress1', 'bus_streetaddress2',
    'bus_city', 'bus_state', 'bus_zip',
    'bus_phone', 'bus_website',
]

# Missing values are replaced by "" first, so a missing field and an empty field compare
# equal during de-duplication. The index is renumbered afterwards.
stripe = stripe.fillna("").drop_duplicates(subset = stripe_business_key, ignore_index = True)
print(stripe.shape)
print(stripe.uniqueid.is_unique)

(74443, 22)
True


#### TMobile 10823

In [14]:
# Shape of the subset of T-Mobile rows whose federal tax ID is an empty string.
tmobile[tmobile["federaltaxid"] == ""].shape

(0, 39)

In [13]:
# Display the tax-ID column; it is the key joined against SBFE `taxidnumber` further down.
tmobile.federaltaxid

114766    854082844
114767    753226669
114768    882585629
114769    474246581
114770    833402287
            ...    
257745    844827985
257746    842159679
257747    871318967
257748    471515674
257749    881785433
Name: federaltaxid, Length: 114460, dtype: object

In [15]:
# Take the first 9 characters of the application timestamp string and parse them with
# '%d%b%Y' (day, abbreviated month name, 4-digit year) to get the application date.
tmobile["credit_flag_cutoff_date"] = tmobile.appdatetime_gmt.str.slice(0, 9)
tmobile["credit_flag_cutoff_date_fmt"] = pd.to_datetime(tmobile["credit_flag_cutoff_date"], format='%d%b%Y')

In [16]:
# Keep only rows that have a fraud indicator (non-null `fraud_ind`).
tmobile = tmobile[tmobile["fraud_ind"].notnull()]
print(tmobile.shape)

(62417, 41)


In [17]:
# Frequency table of the `src` column for the remaining T-Mobile rows.
freq(tmobile.src)

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
src,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
march,21914.0,0.35109,21914.0,0.35109
retro,40503.0,0.64891,62417.0,1.0


In [29]:
# Pair every SBFE inquiry with the T-Mobile application(s) that share its tax ID
# (default inner join).
sbfe_for_tmobile = pd.merge(sbfe, tmobile, left_on = "taxidnumber", right_on = "federaltaxid")
print(sbfe_for_tmobile.shape)

# Keep only inquiries dated strictly before the application date.
inquired_before_application = sbfe_for_tmobile["inquiry_date_fmt"].lt(sbfe_for_tmobile["credit_flag_cutoff_date_fmt"])
sbfe_for_tmobile = sbfe_for_tmobile[inquired_before_application]
print(sbfe_for_tmobile.shape)

(20833, 51)
(13342, 51)


In [30]:
# Distinct SBFE addresses per application (crid_encr) among inquiries made within
# 6 / 3 / 1 month(s) and 1 week before the application date.
#
# The month windows were previously written as `timedelta / np.timedelta64(1, 'M')`.
# A month is not a fixed duration and newer pandas releases reject that unit, so the month
# length is spelled out here as the mean Gregorian month (365.2425 days / 12 = 2,629,746 s).
# NOTE(review): that is understood to be the length older pandas substituted for 'M';
# confirm the window counts still match the saved outputs.
mean_month = pd.Timedelta(seconds=2629746)
one_week = pd.Timedelta(weeks=1)

# Time between each inquiry and the application it was joined to (computed once).
inquiry_age = sbfe_for_tmobile["credit_flag_cutoff_date_fmt"] - sbfe_for_tmobile["inquiry_date_fmt"]

temp = sbfe_for_tmobile[inquiry_age <= 6 * mean_month].copy()
count_unique_06 = temp.groupby(by = ['crid_encr'])["full_addr_clean"].nunique().reset_index()
print(count_unique_06.shape)

temp = sbfe_for_tmobile[inquiry_age <= 3 * mean_month].copy()
count_unique_03 = temp.groupby(by = ['crid_encr'])["full_addr_clean"].nunique().reset_index()
print(count_unique_03.shape)

temp = sbfe_for_tmobile[inquiry_age <= 1 * mean_month].copy()
count_unique_01 = temp.groupby(by = ['crid_encr'])["full_addr_clean"].nunique().reset_index()
print(count_unique_01.shape)

temp = sbfe_for_tmobile[inquiry_age <= one_week].copy()
count_unique_1w = temp.groupby(by = ['crid_encr'])["full_addr_clean"].nunique().reset_index()
print(count_unique_1w.shape)

(6517, 2)
(4465, 2)
(2162, 2)
(656, 2)


In [31]:
# Name each window's count column, then left-join it onto the application-level frame.
# A left join keeps every application; those without inquiries in the window get NaN.
for window_counts, feature_name in (
    (count_unique_06, "addr_count_unique_06"),
    (count_unique_03, "addr_count_unique_03"),
    (count_unique_01, "addr_count_unique_01"),
    (count_unique_1w, "addr_count_unique_1w"),
):
    window_counts.columns = ['crid_encr', feature_name]
    tmobile = tmobile.merge(window_counts, on = "crid_encr", how = "left")
    print(tmobile.shape)

(62417, 46)
(62417, 47)
(62417, 48)
(62417, 49)


In [33]:
# Distinct SBFE business phones per application (crid_encr) among inquiries made within
# 6 / 3 / 1 month(s) and 1 week before the application date.
#
# The month windows were previously written as `timedelta / np.timedelta64(1, 'M')`.
# A month is not a fixed duration and newer pandas releases reject that unit, so the month
# length is spelled out here as the mean Gregorian month (365.2425 days / 12 = 2,629,746 s).
# NOTE(review): that is understood to be the length older pandas substituted for 'M';
# confirm the window counts still match the saved outputs.
mean_month = pd.Timedelta(seconds=2629746)
one_week = pd.Timedelta(weeks=1)

# Time between each inquiry and the application it was joined to (computed once).
inquiry_age = sbfe_for_tmobile["credit_flag_cutoff_date_fmt"] - sbfe_for_tmobile["inquiry_date_fmt"]

temp = sbfe_for_tmobile[inquiry_age <= 6 * mean_month].copy()
count_unique_06 = temp.groupby(by = ['crid_encr'])["businessphone"].nunique().reset_index()
print(count_unique_06.shape)

temp = sbfe_for_tmobile[inquiry_age <= 3 * mean_month].copy()
count_unique_03 = temp.groupby(by = ['crid_encr'])["businessphone"].nunique().reset_index()
print(count_unique_03.shape)

temp = sbfe_for_tmobile[inquiry_age <= 1 * mean_month].copy()
count_unique_01 = temp.groupby(by = ['crid_encr'])["businessphone"].nunique().reset_index()
print(count_unique_01.shape)

temp = sbfe_for_tmobile[inquiry_age <= one_week].copy()
count_unique_1w = temp.groupby(by = ['crid_encr'])["businessphone"].nunique().reset_index()
print(count_unique_1w.shape)

(6517, 2)
(4465, 2)
(2162, 2)
(656, 2)


In [34]:
# Name each window's phone-count column, then left-join it onto the application-level frame.
# A left join keeps every application; those without inquiries in the window get NaN.
for window_counts, feature_name in (
    (count_unique_06, "phn_count_unique_06"),
    (count_unique_03, "phn_count_unique_03"),
    (count_unique_01, "phn_count_unique_01"),
    (count_unique_1w, "phn_count_unique_1w"),
):
    window_counts.columns = ['crid_encr', feature_name]
    tmobile = tmobile.merge(window_counts, on = "crid_encr", how = "left")
    print(tmobile.shape)

(62417, 50)
(62417, 51)
(62417, 52)
(62417, 53)


In [35]:
# Applications with no matching inquiry in a window came out of the left joins as NaN;
# treat them as a count of 0.
for feature_name in (
    "addr_count_unique_06", "addr_count_unique_03", "addr_count_unique_01", "addr_count_unique_1w",
    "phn_count_unique_06", "phn_count_unique_03", "phn_count_unique_01", "phn_count_unique_1w",
):
    tmobile[feature_name] = tmobile[feature_name].fillna(0)

In [21]:
# sbfe_for_tmobile['inquiry_count_06']   = np.where((sbfe_for_tmobile["credit_flag_cutoff_date_fmt"] - sbfe_for_tmobile["inquiry_date_fmt"])/np.timedelta64(1, 'M') <= 6, 1, 0)
# sbfe_for_tmobile['inquiry_count_03']   = np.where((sbfe_for_tmobile["credit_flag_cutoff_date_fmt"] - sbfe_for_tmobile["inquiry_date_fmt"])/np.timedelta64(1, 'M') <= 3, 1, 0)
# sbfe_for_tmobile['inquiry_count_01']   = np.where((sbfe_for_tmobile["credit_flag_cutoff_date_fmt"] - sbfe_for_tmobile["inquiry_date_fmt"])/np.timedelta64(1, 'M') <= 1, 1, 0)
# sbfe_for_tmobile['inquiry_count_week'] = np.where((sbfe_for_tmobile["credit_flag_cutoff_date_fmt"] - sbfe_for_tmobile["inquiry_date_fmt"])/np.timedelta64(1, 'W') <= 1, 1, 0)

# sbfe_for_tmobile_roll = sbfe_for_tmobile.groupby(by = ['crid_encr'])["inquiry_count_06", "inquiry_count_03", "inquiry_count_01", "inquiry_count_week"].sum().reset_index()
# print(sbfe_for_tmobile_roll.shape)

(8047, 5)


  sbfe_for_tmobile_roll = sbfe_for_tmobile.groupby(by = ['crid_encr'])["inquiry_count_06", "inquiry_count_03", "inquiry_count_01", "inquiry_count_week"].sum().reset_index()


In [22]:
# tmobile = tmobile.merge(sbfe_for_tmobile_roll, on = "crid_encr", how = "left")
# print(tmobile.shape)

(62417, 45)


In [25]:
# tmobile['inquiry_count_06'] = np.where(tmobile["inquiry_count_06"].isnull(), 0,   tmobile["inquiry_count_06"])
# tmobile['inquiry_count_03'] = np.where(tmobile["inquiry_count_03"].isnull(), 0,   tmobile["inquiry_count_03"])
# tmobile['inquiry_count_01'] = np.where(tmobile["inquiry_count_01"].isnull(), 0,   tmobile["inquiry_count_01"])
# tmobile['inquiry_count_week']=np.where(tmobile["inquiry_count_week"].isnull(), 0, tmobile["inquiry_count_week"])

In [36]:
# Cast the fraud indicator to float; it is the target passed to bivariate() in the next cell.
tmobile["fraud_ind"] = tmobile["fraud_ind"].astype(float)

In [37]:
# Bucket edges applied to every count feature, with -1 listed as an exception value.
fmt = make_format(cuts = [-np.inf, 0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, np.inf], exceptions = [-1])

# keep = ['inquiry_count_06', 'inquiry_count_03', 'inquiry_count_01', 'inquiry_count_week']
keep = ['addr_count_unique_06', 'addr_count_unique_03', 'addr_count_unique_01', 'addr_count_unique_1w',
        'phn_count_unique_06', 'phn_count_unique_03', 'phn_count_unique_01', 'phn_count_unique_1w']

# One bivariate table (feature vs. fraud indicator) per feature, keyed by position in `keep`.
result = {
    position: bivariate(tmobile[feature], tmobile["fraud_ind"], format = fmt)
    for position, feature in enumerate(keep)
}

# Write every table to the "biv" sheet of the workbook.
wb = TableWriter(filename = "./_temp/tmobile.xlsx", options={'nan_inf_to_errors': True}, overwrite = True)
try:
    for table in result.values():
        wb.write_table(
            table,
            sheetname="biv",
            conditional_fmt_cols=[3],
        )
finally:
    # Close the writer even when a table fails to write, so the workbook is not left open.
    wb.close()

#### Chase

In [38]:
# Re-read the Chase sample. This is the same parquet file the input-data cell at the top of
# the notebook already loaded into `chase`, so the frame starts again from the file contents.
chase = pd.read_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Performance/Fraud Sample/Chase_11449/chase_stacked_final.parquet")
print(chase.shape)

(640958, 36)


In [39]:
# Parse `app_date` with '%d%b%Y' (day, abbreviated month name, 4-digit year); this is the
# application date that inquiries are compared against.
chase["credit_flag_cutoff_date_fmt"] = pd.to_datetime(chase["app_date"], format='%d%b%Y')

In [40]:
# List the columns of the Chase sample, including the parsed application date.
chase.columns

Index(['transaction_id', 'cust_first_nm', 'cust_mid_init_tx', 'cust_last_nm',
       'govt_issu_id_nb', 'line_1_ad', 'line_2_ad', 'city_nm', 'state_prov_cd',
       'pst_area_7_cd', 'pst_area_cd', 'channel', 'bus_name', 'app_date',
       'date_of_birth', 'decision', 'bizidscore', 'bd_score', 'bd_score1',
       'bd_score2', 'bd_score3', 'bus_addr1', 'bus_addr2', 'bus_city',
       'bus_state', 'bus_zip', 'bus_tax_id', 'bus_phone', 'bus_type',
       'home_phone', 'bad', 'seq', 'account', 'date', 'LN_booked', 'count',
       'credit_flag_cutoff_date_fmt'],
      dtype='object')

In [41]:
# Shape of the subset of Chase rows whose business tax ID is an empty string.
chase[chase.bus_tax_id == ""].shape

(295, 37)

In [42]:
# Drop rows with an empty business tax ID; the tax ID is the join key against SBFE below.
chase = chase[chase["bus_tax_id"] != ""]
print(chase.shape)

(640663, 37)


In [43]:
# Share of Chase rows kept after dropping empty tax IDs. Both numbers are typed in by hand
# from the shapes printed by the cells above, so they must be updated if the data changes.
640663/640958

0.9995397514345714

In [44]:
# Display the business tax-ID column used as the join key against SBFE `taxidnumber`.
chase["bus_tax_id"]

499427     821981929
837612     364099086
1018748    873995150
82631      881360963
37307      877827797
             ...    
156867     882571862
140946     574985128
62551      881982804
1225640    874818394
49725      881429212
Name: bus_tax_id, Length: 640663, dtype: object

In [19]:
# List the SBFE columns before joining to Chase.
# NOTE(review): the saved output of this cell shows only four columns, without
# full_addr_clean / businessphone, which the column-selection cell keeps and the cells
# below rely on — it looks stale; re-run to refresh.
sbfe.columns

Index(['seleid_inquiry_sbfe', 'inquiry_date', 'taxidnumber',
       'inquiry_date_fmt'],
      dtype='object')

In [46]:
# Pair every SBFE inquiry with the Chase application(s) that share its tax ID
# (default inner join).
sbfe_for_chase = pd.merge(sbfe, chase, left_on = "taxidnumber", right_on = "bus_tax_id")
print(sbfe_for_chase.shape)

# Keep only inquiries dated strictly before the application date.
inquired_before_application = sbfe_for_chase["inquiry_date_fmt"].lt(sbfe_for_chase["credit_flag_cutoff_date_fmt"])
sbfe_for_chase = sbfe_for_chase[inquired_before_application]
print(sbfe_for_chase.shape)

(85384, 43)
(51535, 43)


In [47]:
# Distinct SBFE addresses per Chase `account` among inquiries made within
# 6 / 3 / 1 month(s) and 1 week before the application date.
#
# The month windows were previously written as `timedelta / np.timedelta64(1, 'M')`.
# A month is not a fixed duration and newer pandas releases reject that unit, so the month
# length is spelled out here as the mean Gregorian month (365.2425 days / 12 = 2,629,746 s).
# NOTE(review): that is understood to be the length older pandas substituted for 'M';
# confirm the window counts still match the saved outputs.
mean_month = pd.Timedelta(seconds=2629746)
one_week = pd.Timedelta(weeks=1)

# Time between each inquiry and the application it was joined to (computed once).
inquiry_age = sbfe_for_chase["credit_flag_cutoff_date_fmt"] - sbfe_for_chase["inquiry_date_fmt"]

temp = sbfe_for_chase[inquiry_age <= 6 * mean_month].copy()
count_unique_06 = temp.groupby(by = ['account'])["full_addr_clean"].nunique().reset_index()
print(count_unique_06.shape)

temp = sbfe_for_chase[inquiry_age <= 3 * mean_month].copy()
count_unique_03 = temp.groupby(by = ['account'])["full_addr_clean"].nunique().reset_index()
print(count_unique_03.shape)

temp = sbfe_for_chase[inquiry_age <= 1 * mean_month].copy()
count_unique_01 = temp.groupby(by = ['account'])["full_addr_clean"].nunique().reset_index()
print(count_unique_01.shape)

temp = sbfe_for_chase[inquiry_age <= one_week].copy()
count_unique_1w = temp.groupby(by = ['account'])["full_addr_clean"].nunique().reset_index()
print(count_unique_1w.shape)

(19367, 2)
(13792, 2)
(7446, 2)
(2910, 2)


In [48]:
# Name each window's count column, then left-join it onto the Chase frame by `account`.
# A left join keeps every row of `chase`; rows without inquiries in the window get NaN.
for window_counts, feature_name in (
    (count_unique_06, "addr_count_unique_06"),
    (count_unique_03, "addr_count_unique_03"),
    (count_unique_01, "addr_count_unique_01"),
    (count_unique_1w, "addr_count_unique_1w"),
):
    window_counts.columns = ['account', feature_name]
    chase = chase.merge(window_counts, on = "account", how = "left")
    print(chase.shape)

(640663, 38)
(640663, 39)
(640663, 40)
(640663, 41)


In [49]:
# Distinct SBFE business phones per Chase `account` among inquiries made within
# 6 / 3 / 1 month(s) and 1 week before the application date.
#
# The month windows were previously written as `timedelta / np.timedelta64(1, 'M')`.
# A month is not a fixed duration and newer pandas releases reject that unit, so the month
# length is spelled out here as the mean Gregorian month (365.2425 days / 12 = 2,629,746 s).
# NOTE(review): that is understood to be the length older pandas substituted for 'M';
# confirm the window counts still match the saved outputs.
mean_month = pd.Timedelta(seconds=2629746)
one_week = pd.Timedelta(weeks=1)

# Time between each inquiry and the application it was joined to (computed once).
inquiry_age = sbfe_for_chase["credit_flag_cutoff_date_fmt"] - sbfe_for_chase["inquiry_date_fmt"]

temp = sbfe_for_chase[inquiry_age <= 6 * mean_month].copy()
count_unique_06 = temp.groupby(by = ['account'])["businessphone"].nunique().reset_index()
print(count_unique_06.shape)

temp = sbfe_for_chase[inquiry_age <= 3 * mean_month].copy()
count_unique_03 = temp.groupby(by = ['account'])["businessphone"].nunique().reset_index()
print(count_unique_03.shape)

temp = sbfe_for_chase[inquiry_age <= 1 * mean_month].copy()
count_unique_01 = temp.groupby(by = ['account'])["businessphone"].nunique().reset_index()
print(count_unique_01.shape)

temp = sbfe_for_chase[inquiry_age <= one_week].copy()
count_unique_1w = temp.groupby(by = ['account'])["businessphone"].nunique().reset_index()
print(count_unique_1w.shape)

(19367, 2)
(13792, 2)
(7446, 2)
(2910, 2)


In [50]:
# Name each window's phone-count column, then left-join it onto the Chase frame by `account`.
# A left join keeps every row of `chase`; rows without inquiries in the window get NaN.
for window_counts, feature_name in (
    (count_unique_06, "phn_count_unique_06"),
    (count_unique_03, "phn_count_unique_03"),
    (count_unique_01, "phn_count_unique_01"),
    (count_unique_1w, "phn_count_unique_1w"),
):
    window_counts.columns = ['account', feature_name]
    chase = chase.merge(window_counts, on = "account", how = "left")
    print(chase.shape)

(640663, 42)
(640663, 43)
(640663, 44)
(640663, 45)


In [51]:
# Rows with no matching inquiry in a window came out of the left joins as NaN;
# treat them as a count of 0.
for feature_name in (
    "addr_count_unique_06", "addr_count_unique_03", "addr_count_unique_01", "addr_count_unique_1w",
    "phn_count_unique_06", "phn_count_unique_03", "phn_count_unique_01", "phn_count_unique_1w",
):
    chase[feature_name] = chase[feature_name].fillna(0)

In [52]:
# Copy the `bad` flag into `fraud_ind` as float, giving Chase the same target column name
# the T-Mobile analysis uses.
chase["fraud_ind"] = chase["bad"].astype(float)

In [53]:
# Bucket edges applied to every count feature, with -1 listed as an exception value.
fmt = make_format(cuts = [-np.inf, 0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, np.inf], exceptions = [-1])

keep = ['addr_count_unique_06', 'addr_count_unique_03', 'addr_count_unique_01', 'addr_count_unique_1w',
        'phn_count_unique_06', 'phn_count_unique_03', 'phn_count_unique_01', 'phn_count_unique_1w']

# One bivariate table (feature cast to float vs. fraud indicator) per feature,
# keyed by position in `keep`.
result = {
    position: bivariate(chase[feature].astype(float), chase["fraud_ind"], format = fmt)
    for position, feature in enumerate(keep)
}

# Write every table to the "biv" sheet of the workbook.
wb = TableWriter(filename = "./_temp/chase.xlsx", options={'nan_inf_to_errors': True}, overwrite = True)
try:
    for table in result.values():
        wb.write_table(
            table,
            sheetname="biv",
            conditional_fmt_cols=[3],
        )
finally:
    # Close the writer even when a table fails to write, so the workbook is not left open.
    wb.close()

#### Fleector

In [54]:
# Re-read the Fleector sample. This is the same parquet file the input-data cell at the top
# of the notebook already loaded into `fleector`.
fleector = pd.read_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Performance/Fraud Sample/fleector_11124/fleector_sample_to_use.parquet")
print(fleector.shape)

(83704, 92)


In [55]:
# Parse `date_created` (YYYY-MM-DD) as the application date that inquiries are compared against.
fleector["credit_flag_cutoff_date_fmt"] = pd.to_datetime(fleector["date_created"], format='%Y-%m-%d')

In [56]:
# Row counts by calendar month: the first 7 characters of the stringified date are YYYY-MM.
freq(fleector["credit_flag_cutoff_date_fmt"].astype(str).str.slice(0, 7))

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
credit_flag_cutoff_date_fmt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01,15827.0,0.189083,15827.0,0.189083
2022-02,15512.0,0.18532,31339.0,0.374403
2022-03,19353.0,0.231208,50692.0,0.60561
2022-04,16822.0,0.20097,67514.0,0.80658
2022-05,16185.0,0.19336,83699.0,0.99994
2022-06,5.0,6e-05,83704.0,1.0


In [57]:
# Shape of the subset of Fleector rows whose federal ID is an empty string.
fleector[fleector.fed_id == ""].shape

(0, 93)

In [58]:
# Display the federal-ID column used as the join key against SBFE `taxidnumber`.
fleector.fed_id

6875     844759743
5628     872922146
19723    862131849
4981     842053436
17116    851178146
           ...    
49448    814826144
705      582165025
79073    881986215
61964    882490371
54909    862273900
Name: fed_id, Length: 83704, dtype: object

In [59]:
# Pair every SBFE inquiry with the Fleector application(s) that share its tax ID
# (default inner join).
sbfe_for_fleector = pd.merge(sbfe, fleector, left_on = "taxidnumber", right_on = "fed_id")
print(sbfe_for_fleector.shape)

# Keep only inquiries dated strictly before the application date.
inquired_before_application = sbfe_for_fleector["inquiry_date_fmt"].lt(sbfe_for_fleector["credit_flag_cutoff_date_fmt"])
sbfe_for_fleector = sbfe_for_fleector[inquired_before_application]
print(sbfe_for_fleector.shape)

(71826, 99)
(23759, 99)


In [62]:
# Check that `seq` is unique per row; it is the group-by and merge key for the features below.
fleector.seq.is_unique

True

In [63]:
# Distinct SBFE addresses per Fleector `seq` among inquiries made within
# 6 / 3 / 1 month(s) and 1 week before the application date.
#
# The month windows were previously written as `timedelta / np.timedelta64(1, 'M')`.
# A month is not a fixed duration and newer pandas releases reject that unit, so the month
# length is spelled out here as the mean Gregorian month (365.2425 days / 12 = 2,629,746 s).
# NOTE(review): that is understood to be the length older pandas substituted for 'M';
# confirm the window counts still match the saved outputs.
mean_month = pd.Timedelta(seconds=2629746)
one_week = pd.Timedelta(weeks=1)

# Time between each inquiry and the application it was joined to (computed once).
inquiry_age = sbfe_for_fleector["credit_flag_cutoff_date_fmt"] - sbfe_for_fleector["inquiry_date_fmt"]

temp = sbfe_for_fleector[inquiry_age <= 6 * mean_month].copy()
count_unique_06 = temp.groupby(by = ['seq'])["full_addr_clean"].nunique().reset_index()
print(count_unique_06.shape)

temp = sbfe_for_fleector[inquiry_age <= 3 * mean_month].copy()
count_unique_03 = temp.groupby(by = ['seq'])["full_addr_clean"].nunique().reset_index()
print(count_unique_03.shape)

temp = sbfe_for_fleector[inquiry_age <= 1 * mean_month].copy()
count_unique_01 = temp.groupby(by = ['seq'])["full_addr_clean"].nunique().reset_index()
print(count_unique_01.shape)

temp = sbfe_for_fleector[inquiry_age <= one_week].copy()
count_unique_1w = temp.groupby(by = ['seq'])["full_addr_clean"].nunique().reset_index()
print(count_unique_1w.shape)

(11793, 2)
(8135, 2)
(4080, 2)
(1572, 2)


In [66]:
# Name each window's count column, then left-join it onto the Fleector frame by `seq`.
# A left join keeps every row of `fleector`; rows without inquiries in the window get NaN.
for window_counts, feature_name in (
    (count_unique_06, "addr_count_unique_06"),
    (count_unique_03, "addr_count_unique_03"),
    (count_unique_01, "addr_count_unique_01"),
    (count_unique_1w, "addr_count_unique_1w"),
):
    window_counts.columns = ['seq', feature_name]
    fleector = fleector.merge(window_counts, on = "seq", how = "left")
    print(fleector.shape)

(83704, 94)
(83704, 95)
(83704, 96)
(83704, 97)


In [67]:
# Distinct SBFE business phones per Fleector `seq` among inquiries made within
# 6 / 3 / 1 month(s) and 1 week before the application date.
#
# The month windows were previously written as `timedelta / np.timedelta64(1, 'M')`.
# A month is not a fixed duration and newer pandas releases reject that unit, so the month
# length is spelled out here as the mean Gregorian month (365.2425 days / 12 = 2,629,746 s).
# NOTE(review): that is understood to be the length older pandas substituted for 'M';
# confirm the window counts still match the saved outputs.
mean_month = pd.Timedelta(seconds=2629746)
one_week = pd.Timedelta(weeks=1)

# Time between each inquiry and the application it was joined to (computed once).
inquiry_age = sbfe_for_fleector["credit_flag_cutoff_date_fmt"] - sbfe_for_fleector["inquiry_date_fmt"]

temp = sbfe_for_fleector[inquiry_age <= 6 * mean_month].copy()
count_unique_06 = temp.groupby(by = ['seq'])["businessphone"].nunique().reset_index()
print(count_unique_06.shape)

temp = sbfe_for_fleector[inquiry_age <= 3 * mean_month].copy()
count_unique_03 = temp.groupby(by = ['seq'])["businessphone"].nunique().reset_index()
print(count_unique_03.shape)

temp = sbfe_for_fleector[inquiry_age <= 1 * mean_month].copy()
count_unique_01 = temp.groupby(by = ['seq'])["businessphone"].nunique().reset_index()
print(count_unique_01.shape)

temp = sbfe_for_fleector[inquiry_age <= one_week].copy()
count_unique_1w = temp.groupby(by = ['seq'])["businessphone"].nunique().reset_index()
print(count_unique_1w.shape)

(11793, 2)
(8135, 2)
(4080, 2)
(1572, 2)


In [68]:
# Name each window's phone-count column, then left-join it onto the Fleector frame by `seq`.
# A left join keeps every row of `fleector`; rows without inquiries in the window get NaN.
for window_counts, feature_name in (
    (count_unique_06, "phn_count_unique_06"),
    (count_unique_03, "phn_count_unique_03"),
    (count_unique_01, "phn_count_unique_01"),
    (count_unique_1w, "phn_count_unique_1w"),
):
    window_counts.columns = ['seq', feature_name]
    fleector = fleector.merge(window_counts, on = "seq", how = "left")
    print(fleector.shape)

(83704, 98)
(83704, 99)
(83704, 100)
(83704, 101)


In [69]:
# Rows with no matching inquiry in a window came out of the left joins as NaN;
# treat them as a count of 0.
for feature_name in (
    "addr_count_unique_06", "addr_count_unique_03", "addr_count_unique_01", "addr_count_unique_1w",
    "phn_count_unique_06", "phn_count_unique_03", "phn_count_unique_01", "phn_count_unique_1w",
):
    fleector[feature_name] = fleector[feature_name].fillna(0)

In [71]:
# Copy `final_fraud_flag` into `fraud_ind` as float, giving Fleector the same target column
# name the other samples use.
fleector["fraud_ind"] = fleector["final_fraud_flag"].astype(float)

In [72]:
# Bucket edges applied to every count feature, with -1 listed as an exception value.
fmt = make_format(cuts = [-np.inf, 0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, np.inf], exceptions = [-1])

keep = ['addr_count_unique_06', 'addr_count_unique_03', 'addr_count_unique_01', 'addr_count_unique_1w',
        'phn_count_unique_06', 'phn_count_unique_03', 'phn_count_unique_01', 'phn_count_unique_1w']

# One bivariate table (feature cast to float vs. fraud indicator) per feature, keyed by
# position in `keep`; `acct_flag` is passed as the `groups` argument.
result = {
    position: bivariate(fleector[feature].astype(float), fleector["fraud_ind"], format = fmt, groups= fleector.acct_flag)
    for position, feature in enumerate(keep)
}

# Write every table to the "biv" sheet of the workbook.
wb = TableWriter(filename = "./_temp/fleector.xlsx", options={'nan_inf_to_errors': True}, overwrite = True)
try:
    for table in result.values():
        wb.write_table(
            table,
            sheetname="biv",
            conditional_fmt_cols=[3],
        )
finally:
    # Close the writer even when a table fails to write, so the workbook is not left open.
    wb.close()

#### Other

In [84]:
# Full state / territory name -> two-letter abbreviation.
# Insertion order is unchanged: the states interleave the first and second half of the
# alphabetical list (Alabama, Montana, Alaska, Nebraska, ...), shown here one pair per line.
statename_to_abbr = {
    # Federal district
    "District of Columbia": "DC",

    # The 50 states
    "Alabama": "AL", "Montana": "MT",
    "Alaska": "AK", "Nebraska": "NE",
    "Arizona": "AZ", "Nevada": "NV",
    "Arkansas": "AR", "New Hampshire": "NH",
    "California": "CA", "New Jersey": "NJ",
    "Colorado": "CO", "New Mexico": "NM",
    "Connecticut": "CT", "New York": "NY",
    "Delaware": "DE", "North Carolina": "NC",
    "Florida": "FL", "North Dakota": "ND",
    "Georgia": "GA", "Ohio": "OH",
    "Hawaii": "HI", "Oklahoma": "OK",
    "Idaho": "ID", "Oregon": "OR",
    "Illinois": "IL", "Pennsylvania": "PA",
    "Indiana": "IN", "Rhode Island": "RI",
    "Iowa": "IA", "South Carolina": "SC",
    "Kansas": "KS", "South Dakota": "SD",
    "Kentucky": "KY", "Tennessee": "TN",
    "Louisiana": "LA", "Texas": "TX",
    "Maine": "ME", "Utah": "UT",
    "Maryland": "MD", "Vermont": "VT",
    "Massachusetts": "MA", "Virginia": "VA",
    "Michigan": "MI", "Washington": "WA",
    "Minnesota": "MN", "West Virginia": "WV",
    "Mississippi": "MS", "Wisconsin": "WI",
    "Missouri": "MO", "Wyoming": "WY",

    # Territories
    "Puerto Rico": "PR", "Virgin Islands": "VI",
}