In [1]:
import time
import pandas as pd
import numpy as np
import math

# internal tools
from dsgtools.reporting import make_format
from dsgtools.reporting import TableWriter
from dsgtools.reporting import freq
from dsgtools.reporting import bivariate

In [38]:
## Input Data
# The four client performance samples sit under the same ADLS folder; only the
# client sub-folder and the file name change.
# NOTE(review): `pd.read_adls` is not a stock pandas function -- presumably it is
# registered by the internal tooling; confirm before running elsewhere.
fraud_sample_dir = "Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Performance/Fraud Sample/"

chase = pd.read_adls(fraud_sample_dir + "Chase_11449/" + "chase_stacked_final.parquet")
print(chase.shape)

# The file name is spelled "tmbile" in the stored path; keep it as-is.
tmobile = pd.read_adls(fraud_sample_dir + "tmobile_10823/" + "tmbile_file_to_use.parquet")
print(tmobile.shape)

stripe = pd.read_adls(fraud_sample_dir + "stripe_11363/" + "stripe_file_to_use.parquet")
print(stripe.shape)

fleector = pd.read_adls(fraud_sample_dir + "fleector_11124/" + "fleector_sample_to_use.parquet")
print(fleector.shape)

(640958, 36)
(114460, 39)
(75000, 22)
(83704, 92)


In [3]:
## keep seleid, full address, phone, taxid, performance flag >> business information >> might need to dedup

#### SBFE Inquiry

In [4]:
path = "Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Data Modeling Clean/sbfe_seleid_append_deduped_cleaned_0501.parquet"
sbfe = pd.read_adls(path, reader = pd.read_parquet)
print(sbfe.shape)

(79429086, 20)


In [5]:
sbfe.columns

Index(['accountnumber_x', 'companyname', 'alternatecompanyname_x', 'addr',
       'city', 'state', 'zip', 'businessphone', 'taxidnumber', 'historydate_x',
       'sufficient_input', 'accountnumber_y', 'alternatecompanyname_y',
       'historydate_y', 'powid', 'proxid', 'seleid', 'orgid', 'ultid',
       'overallweight'],
      dtype='object')

In [6]:
# Keep only the three fields used by the phone-based inquiry match and give
# them analysis names. Selecting and renaming in one chained expression returns
# a fresh DataFrame, so the column assignments in the following cells write to
# it directly instead of to a selection of the original frame (the pattern that
# triggers pandas' SettingWithCopyWarning).
sbfe = sbfe[["seleid", "historydate_x", "businessphone"]].rename(
    columns={
        "seleid": "seleid_inquiry_sbfe",
        "historydate_x": "inquiry_date",
        # "businessphone" keeps its name
    }
)

In [7]:
sbfe["inquiry_date"] = sbfe["inquiry_date"].astype(int)
sbfe["inquiry_date_fmt"] = pd.to_datetime(sbfe["inquiry_date"], format='%Y%m%d')

In [8]:
sbfe["seleid_inquiry_sbfe"] = sbfe["seleid_inquiry_sbfe"].astype(float)

In [9]:
sbfe.head()

Unnamed: 0,seleid_inquiry_sbfe,inquiry_date,businessphone,inquiry_date_fmt
0,133615600000.0,20210701,4355353654,2021-07-01
1,1330122000.0,20210701,7708857033,2021-07-01
2,906413600.0,20210701,9798653142,2021-07-01
5,61367720.0,20210701,6505550000,2021-07-01
6,137145200000.0,20210701,9798859397,2021-07-01


#### Stripe 11363

In [39]:
# Business-identity fields that define a duplicate Stripe application.
stripe_business_keys = [
    'companyname', 'alternatecompanyname',
    'bus_streetaddress1', 'bus_streetaddress2',
    'bus_city', 'bus_state', 'bus_zip',
    'bus_phone', 'bus_website',
]

# Replace missing values with empty strings, then keep the first row per
# distinct business and renumber the index from 0.
stripe = stripe.fillna("").drop_duplicates(subset=stripe_business_keys, ignore_index=True)

print(stripe.shape)
# Sanity check: one row per uniqueid after de-duplication.
print(stripe["uniqueid"].is_unique)

(74443, 22)
True


In [30]:
# Compile SELEIDS
# stripe_sba = pd.read_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Performance/Fraud Sample/stripe_11363/final_str_11363_20221025file_sba21_nonsbfe.csv.gz",
#                       compression = "gzip", reader = pd.read_csv)
# print(stripe_sba.shape)
# stripe_sba = stripe_sba.drop_duplicates(subset = ["uniqueid"], ignore_index = True) 
# print(stripe_sba.shape)
# keep = ['uniqueid', 'lnlexidsele',  'inquirycount03m', 'inquirycount12m', 'inquirycreditcount03m', 'inquirycreditcount12m',]
# stripe_sba = stripe_sba[keep]
# stripe = stripe.merge(stripe_sba, on = "uniqueid")
# print(stripe.shape)

(249000, 391)
(248318, 391)


In [40]:
stripe.columns

Index(['Unnamed: 0', 'uniqueid', 'historydate', 'companyname',
       'alternatecompanyname', 'bus_streetaddress1', 'bus_streetaddress2',
       'bus_city', 'bus_state', 'bus_zip', 'bus_phone', 'bus_website',
       'firstname', 'lastname', 'streetaddress1', 'streetaddress2', 'city',
       'state', 'zip', 'email', 'ip_address', 'performance_indicator'],
      dtype='object')

In [39]:
# Compile Full Address: np.where(stripe["bus_streetaddress2"] != "", ", " + stripe["bus_streetaddress2"], "") + 
# stripe["full_addr"] = stripe["bus_streetaddress1"] + \
#     np.where(stripe["bus_city"] != "", ", " + stripe["bus_city"], "") + \
#     np.where(stripe["bus_state"] != "", ", " + stripe["bus_state"], "") + np.where(stripe["bus_zip"] != "", ", " + stripe["bus_zip"].str.slice(0, 5), "")
# stripe["full_addr_clean"] = stripe['full_addr'].str.replace('.', '')
# stripe["full_addr_clean"] = stripe['full_addr_clean'].str.replace(',', '')
# stripe["full_addr_clean"] = stripe['full_addr_clean'].str.replace('\W\s', '')
# stripe["full_addr_clean"] = stripe["full_addr_clean"].str.lower()

  stripe["full_addr_clean"] = stripe['full_addr'].str.replace('.', '')
  stripe["full_addr_clean"] = stripe['full_addr_clean'].str.replace('\W\s', '')


In [41]:
stripe.bus_phone.isnull().sum()

0

In [42]:
stripe[["bus_phone"]] 

Unnamed: 0,bus_phone
0,+18322103210
1,+12139263811
2,+17246208909
3,
4,+19172720510
...,...
74438,+12159203332
74439,+19168622031
74440,+18773302677
74441,


In [43]:
stripe["len"] = stripe["bus_phone"].str.len()
freq(stripe["len"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
len,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,14817.0,0.199038,14817.0,0.199038
11,51.0,0.000685,14868.0,0.199723
12,59549.0,0.799927,74417.0,0.999651
13,23.0,0.000309,74440.0,0.99996
14,3.0,4e-05,74443.0,1.0


In [45]:
# Reduce Stripe business phones to bare digits so they can be joined to the
# SBFE `businessphone` field:
#   * blank stays blank
#   * anything longer than 12 characters is discarded (blanked)
#   * "+" prefixed, 12 chars  -> drop the first two characters ("+1")
#   * "+" prefixed, 11 chars  -> drop the first character ("+")
#   * everything else is passed through unchanged
# The "+" test is a literal match (regex=False); the previous pattern "\+" was
# written in a non-raw string, which is an invalid escape sequence in Python.
phone_raw = stripe["bus_phone"]
phone_len = stripe["len"]
has_plus = phone_raw.str.contains("+", regex=False)

stripe["phone_modify"] = np.where(
    phone_raw == "", "",
    np.where(
        phone_len > 12, "",
        np.where(
            has_plus & (phone_len == 12), phone_raw.str.slice(2, 12),
            np.where(has_plus & (phone_len == 11), phone_raw.str.slice(1, 11), phone_raw),
        ),
    ),
)
stripe["phone_modify"]

0        8322103210
1        2139263811
2        7246208909
3                  
4        9172720510
            ...    
74438    2159203332
74439    9168622031
74440    8773302677
74441              
74442              
Name: phone_modify, Length: 74443, dtype: object

In [33]:
freq(stripe.historydate.str.slice(0, 7))

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
historydate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01,24426.0,0.328117,24426.0,0.328117
2022-02,24209.0,0.325202,48635.0,0.653319
2022-03,25808.0,0.346681,74443.0,1.0


In [34]:
freq(stripe.performance_indicator)

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
performance_indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,73807.0,0.991457,73807.0,0.991457
True,636.0,0.008543,74443.0,1.0


In [46]:
# Binary target: 1 when performance_indicator is True, else 0.
# The column is cast to str before the comparison so the flag is set whether
# the column holds booleans or the strings "True"/"False"; a boolean column
# compared directly with the string "True" never matches and would leave
# every row at 0.
stripe["bad"] = np.where(stripe["performance_indicator"].astype(str) == "True", 1, 0)

In [47]:
stripe[stripe["phone_modify"] != ""].shape

(59600, 25)

In [48]:
stripe["credit_flag_cutoff_date"] = stripe["historydate"].str.slice(0, 10)
stripe["credit_flag_cutoff_date_fmt"] = pd.to_datetime(stripe["credit_flag_cutoff_date"], format='%Y-%m-%d')

In [49]:
stripe = stripe[stripe["phone_modify"] != ""]
print(stripe.shape)

(59600, 27)


In [50]:
# Pair every SBFE inquiry with the Stripe applications that share its phone number.
sbfe_for_stripe = sbfe.merge(stripe, left_on = "businessphone", right_on = "phone_modify")
print(sbfe_for_stripe.shape)

# Keep only inquiries dated strictly before the application's cutoff date.
# .copy() makes the filtered frame independent so the new columns below are
# not assigned onto a slice.
sbfe_for_stripe = sbfe_for_stripe[
    sbfe_for_stripe["inquiry_date_fmt"] < sbfe_for_stripe["credit_flag_cutoff_date_fmt"]
].copy()
print(sbfe_for_stripe.shape)

# Time elapsed between each inquiry and the cutoff date.
stripe_inquiry_age = sbfe_for_stripe["credit_flag_cutoff_date_fmt"] - sbfe_for_stripe["inquiry_date_fmt"]

# Average month (365.2425 / 12 days = 2,629,746 s). This is the length the
# former np.timedelta64(1, 'M') divisor stood for; pandas 2.x no longer
# accepts the ambiguous 'M' unit, so the value is spelled out explicitly.
avg_month = pd.Timedelta(seconds=2629746)
stripe_age_months = stripe_inquiry_age / avg_month

# 1/0 indicator per inquiry for each look-back window.
sbfe_for_stripe['inquiry_count_06']   = np.where(stripe_age_months <= 6, 1, 0)
sbfe_for_stripe['inquiry_count_03']   = np.where(stripe_age_months <= 3, 1, 0)
sbfe_for_stripe['inquiry_count_01']   = np.where(stripe_age_months <= 1, 1, 0)
sbfe_for_stripe['inquiry_count_week'] = np.where(stripe_inquiry_age / np.timedelta64(1, 'W') <= 1, 1, 0)

# Roll the indicators up to one row per application. The columns are selected
# with a list: selecting several columns with a bare tuple is deprecated and
# raises in pandas 2.x.
inquiry_count_cols = ["inquiry_count_06", "inquiry_count_03", "inquiry_count_01", "inquiry_count_week"]
sbfe_for_stripe_roll = sbfe_for_stripe.groupby(by = ['uniqueid'])[inquiry_count_cols].sum().reset_index()
print(sbfe_for_stripe_roll.shape)

(64631, 31)
(43488, 31)
(4634, 5)


  sbfe_for_stripe_roll = sbfe_for_stripe.groupby(by = ['uniqueid'])["inquiry_count_06", "inquiry_count_03", "inquiry_count_01", "inquiry_count_week"].sum().reset_index()


In [51]:
stripe = stripe.merge(sbfe_for_stripe_roll, on = "uniqueid", how = "left")
print(stripe.shape)

(59600, 31)


In [52]:
# Applications without any matched prior inquiry have NaN counts after the
# left merge; treat those as zero inquiries.
for count_col in ('inquiry_count_06', 'inquiry_count_03', 'inquiry_count_01', 'inquiry_count_week'):
    stripe[count_col] = stripe[count_col].fillna(0)

In [53]:
# Bucket boundaries for the inquiry counts; -1 is passed as an exception value.
fmt = make_format(cuts = [-np.inf, 0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, np.inf], exceptions = [-1])

# One bivariate table (inquiry-count bucket vs. bad flag) per look-back window,
# keyed by the window's position in `keep`.
result = dict()
keep = ["inquiry_count_06", 'inquiry_count_03', 'inquiry_count_01', 'inquiry_count_week']
for i, a in enumerate(keep):
    result[i] = bivariate(stripe[a], stripe["bad"], format = fmt)

# Write every table to the "biv" sheet of the workbook.
wb = TableWriter(filename = "./_temp/stripe.xlsx", options={'nan_inf_to_errors': True}, overwrite = True)
try:
    for k in result.keys():
        wb.write_table(
            result[k],
            sheetname="biv",
            conditional_fmt_cols=[3],
        )
finally:
    # Close the writer even if a table fails to write, so it is never left open.
    wb.close()

#### TMobile 10823

In [54]:
tmobile["credit_flag_cutoff_date"] = tmobile.appdatetime_gmt.str.slice(0, 9)
tmobile["credit_flag_cutoff_date_fmt"] = pd.to_datetime(tmobile["credit_flag_cutoff_date"], format='%d%b%Y')

In [55]:
freq(tmobile["credit_flag_cutoff_date_fmt"].astype(str).str.slice(0, 7))

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
credit_flag_cutoff_date_fmt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01,20510.0,0.179189,20510.0,0.179189
2022-02,22452.0,0.196156,42962.0,0.375345
2022-03,22846.0,0.199598,65808.0,0.574943
2022-04,14973.0,0.130814,80781.0,0.705757
2022-05,16271.0,0.142154,97052.0,0.847912
2022-06,17408.0,0.152088,114460.0,1.0


In [56]:
freq(tmobile.src)

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
src,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
march,33679.0,0.294243,33679.0,0.294243
retro,80781.0,0.705757,114460.0,1.0


In [57]:
freq(tmobile.fraud_ind)

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
fraud_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,58655.0,0.51245,58655.0,0.51245
1,3762.0,0.032867,62417.0,0.545317
Missing,52043.0,0.454683,114460.0,1.0


In [58]:
# Keep only applications whose fraud outcome is populated. The explicit copy
# makes the result an independent frame: the following cells add columns to
# `tmobile`, and doing that on a filtered selection triggers pandas'
# SettingWithCopyWarning.
tmobile = tmobile[tmobile["fraud_ind"].notnull()].copy()
print(tmobile.shape)

(62417, 41)


In [59]:
freq(tmobile.src)

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
src,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
march,21914.0,0.35109,21914.0,0.35109
retro,40503.0,0.64891,62417.0,1.0


In [60]:
tmobile["credit_flag_cutoff_date"] = tmobile["appdatetime_gmt"].str.slice(0, 9)
tmobile["credit_flag_cutoff_date_fmt"] = pd.to_datetime(tmobile["credit_flag_cutoff_date"], format='%d%b%Y')

In [126]:
tmobile.columns

Index(['crid_encr', 'appdatetime_gmt', 'businessname', 'business_contactphone',
       'business_addressline1', 'business_addressline2',
       'business_addresscity', 'business_addressstate',
       'business_addresspostalcode', 'federaltaxid', 'contactfirstname',
       'contactmiddlename', 'contactlastname', 'contact_contactphone',
       'contact_addressline1', 'contact_addressline2', 'contact_addresscity',
       'contact_addressstate', 'contact_addresspostalcode', 'contact_dob',
       'contact_idnum', 'contact_idtype', 'contact_idstate', 'contact_email',
       'application_channel', 'fraud_ind', 'fraud_tp_ind', 'fraud_fp_ind',
       'fpd_ind', 'any_wo_ind', 'activationflag', 'seq', 'account', 'date',
       'src', 'contact_dob_year', 'contact_dob_month', 'contact_dob_day',
       'contact_driverlicensenumber', 'credit_flag_cutoff_date',
       'credit_flag_cutoff_date_fmt', 'full_addr', 'full_addr_clean',
       'abbrev_state', 'inquiry_count_06', 'inquiry_count_03',
       'i

In [62]:
tmobile["len"] = tmobile["business_contactphone"].str.len()
freq(tmobile["len"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
len,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,62417.0,1.0,62417.0,1.0


In [65]:
tmobile[tmobile["business_contactphone"] != ""].shape

(62417, 42)

In [66]:
# Pair every SBFE inquiry with the T-Mobile applications that share its phone number.
sbfe_for_tmobile = sbfe.merge(tmobile, left_on = "businessphone", right_on = "business_contactphone")
print(sbfe_for_tmobile.shape)

# Keep only inquiries dated strictly before the application's cutoff date.
# .copy() makes the filtered frame independent so the new columns below are
# not assigned onto a slice.
sbfe_for_tmobile = sbfe_for_tmobile[
    sbfe_for_tmobile["inquiry_date_fmt"] < sbfe_for_tmobile["credit_flag_cutoff_date_fmt"]
].copy()
print(sbfe_for_tmobile.shape)

# Time elapsed between each inquiry and the cutoff date.
tmobile_inquiry_age = sbfe_for_tmobile["credit_flag_cutoff_date_fmt"] - sbfe_for_tmobile["inquiry_date_fmt"]

# Average month (365.2425 / 12 days = 2,629,746 s). This is the length the
# former np.timedelta64(1, 'M') divisor stood for; pandas 2.x no longer
# accepts the ambiguous 'M' unit, so the value is spelled out explicitly.
avg_month = pd.Timedelta(seconds=2629746)
tmobile_age_months = tmobile_inquiry_age / avg_month

# 1/0 indicator per inquiry for each look-back window.
sbfe_for_tmobile['inquiry_count_06']   = np.where(tmobile_age_months <= 6, 1, 0)
sbfe_for_tmobile['inquiry_count_03']   = np.where(tmobile_age_months <= 3, 1, 0)
sbfe_for_tmobile['inquiry_count_01']   = np.where(tmobile_age_months <= 1, 1, 0)
sbfe_for_tmobile['inquiry_count_week'] = np.where(tmobile_inquiry_age / np.timedelta64(1, 'W') <= 1, 1, 0)

# Roll the indicators up to one row per application. The columns are selected
# with a list: selecting several columns with a bare tuple is deprecated and
# raises in pandas 2.x.
inquiry_count_cols = ["inquiry_count_06", "inquiry_count_03", "inquiry_count_01", "inquiry_count_week"]
sbfe_for_tmobile_roll = sbfe_for_tmobile.groupby(by = ['crid_encr'])[inquiry_count_cols].sum().reset_index()
print(sbfe_for_tmobile_roll.shape)

(196414, 46)
(119582, 46)
(9666, 5)


  sbfe_for_tmobile_roll = sbfe_for_tmobile.groupby(by = ['crid_encr'])["inquiry_count_06", "inquiry_count_03", "inquiry_count_01", "inquiry_count_week"].sum().reset_index()


In [67]:
tmobile = tmobile.merge(sbfe_for_tmobile_roll, on = "crid_encr", how = "left")
print(tmobile.shape)

(62417, 46)


In [68]:
# Applications without any matched prior inquiry have NaN counts after the
# left merge; treat those as zero inquiries.
for count_col in ('inquiry_count_06', 'inquiry_count_03', 'inquiry_count_01', 'inquiry_count_week'):
    tmobile[count_col] = tmobile[count_col].fillna(0)

In [69]:
tmobile["fraud_ind"] = tmobile["fraud_ind"].astype(float)

In [70]:
# Bucket boundaries for the inquiry counts; -1 is passed as an exception value.
fmt = make_format(cuts = [-np.inf, 0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, np.inf], exceptions = [-1])

# One bivariate table (inquiry-count bucket vs. fraud indicator) per look-back
# window, keyed by the window's position in `keep`.
result = dict()
keep = ["inquiry_count_06", 'inquiry_count_03', 'inquiry_count_01', 'inquiry_count_week']
for i, a in enumerate(keep):
    result[i] = bivariate(tmobile[a], tmobile["fraud_ind"], format = fmt)

# Write every table to the "biv" sheet of the workbook.
wb = TableWriter(filename = "./_temp/tmobile.xlsx", options={'nan_inf_to_errors': True}, overwrite = True)
try:
    for k in result.keys():
        wb.write_table(
            result[k],
            sheetname="biv",
            conditional_fmt_cols=[3],
        )
finally:
    # Close the writer even if a table fails to write, so it is never left open.
    wb.close()

#### Chase

In [71]:
chase = pd.read_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Performance/Fraud Sample/Chase_11449/chase_stacked_final.parquet")
print(chase.shape)

(640958, 36)


In [72]:
chase["credit_flag_cutoff_date_fmt"] = pd.to_datetime(chase["app_date"], format='%d%b%Y')

In [95]:
chase.columns

Index(['transaction_id', 'cust_first_nm', 'cust_mid_init_tx', 'cust_last_nm',
       'govt_issu_id_nb', 'line_1_ad', 'line_2_ad', 'city_nm', 'state_prov_cd',
       'pst_area_7_cd', 'pst_area_cd', 'channel', 'bus_name', 'app_date',
       'date_of_birth', 'decision', 'bizidscore', 'bd_score', 'bd_score1',
       'bd_score2', 'bd_score3', 'bus_addr1', 'bus_addr2', 'bus_city',
       'bus_state', 'bus_zip', 'bus_tax_id', 'bus_phone', 'bus_type',
       'home_phone', 'bad', 'seq', 'account', 'date', 'LN_booked', 'count',
       'credit_flag_cutoff_date_fmt'],
      dtype='object')

In [73]:
chase["len"] = chase["bus_phone"].str.len()
freq(chase["len"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
len,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2663.0,0.004155,2663.0,0.004155
10,638295.0,0.995845,640958.0,1.0


In [75]:
chase = chase[chase["bus_phone"] != ""]
print(chase.shape)

(638295, 38)


In [76]:
sbfe.columns

Index(['seleid_inquiry_sbfe', 'inquiry_date', 'businessphone',
       'inquiry_date_fmt'],
      dtype='object')

In [77]:
# Pair every SBFE inquiry with the Chase applications that share its phone number.
sbfe_for_chase = sbfe.merge(chase, left_on = "businessphone", right_on = "bus_phone")
print(sbfe_for_chase.shape)

# Keep only inquiries dated strictly before the application's cutoff date.
# .copy() makes the filtered frame independent so the new columns below are
# not assigned onto a slice.
sbfe_for_chase = sbfe_for_chase[
    sbfe_for_chase["inquiry_date_fmt"] < sbfe_for_chase["credit_flag_cutoff_date_fmt"]
].copy()
print(sbfe_for_chase.shape)

# Time elapsed between each inquiry and the cutoff date.
chase_inquiry_age = sbfe_for_chase["credit_flag_cutoff_date_fmt"] - sbfe_for_chase["inquiry_date_fmt"]

# Average month (365.2425 / 12 days = 2,629,746 s). This is the length the
# former np.timedelta64(1, 'M') divisor stood for; pandas 2.x no longer
# accepts the ambiguous 'M' unit, so the value is spelled out explicitly.
avg_month = pd.Timedelta(seconds=2629746)
chase_age_months = chase_inquiry_age / avg_month

# 1/0 indicator per inquiry for each look-back window.
sbfe_for_chase['inquiry_count_06']   = np.where(chase_age_months <= 6, 1, 0)
sbfe_for_chase['inquiry_count_03']   = np.where(chase_age_months <= 3, 1, 0)
sbfe_for_chase['inquiry_count_01']   = np.where(chase_age_months <= 1, 1, 0)
sbfe_for_chase['inquiry_count_week'] = np.where(chase_inquiry_age / np.timedelta64(1, 'W') <= 1, 1, 0)

# Roll the indicators up to one row per account. The columns are selected
# with a list: selecting several columns with a bare tuple is deprecated and
# raises in pandas 2.x.
inquiry_count_cols = ["inquiry_count_06", "inquiry_count_03", "inquiry_count_01", "inquiry_count_week"]
sbfe_for_chase_roll = sbfe_for_chase.groupby(by = ['account'])[inquiry_count_cols].sum().reset_index()
print(sbfe_for_chase_roll.shape)

(883962, 42)
(678316, 42)
(71327, 5)


  sbfe_for_chase_roll = sbfe_for_chase.groupby(by = ['account'])["inquiry_count_06", "inquiry_count_03", "inquiry_count_01", "inquiry_count_week"].sum().reset_index()


In [78]:
chase = chase.merge(sbfe_for_chase_roll, on = "account", how = "left")
print(chase.shape)

(638295, 42)


In [79]:
# Accounts without any matched prior inquiry have NaN counts after the
# left merge; treat those as zero inquiries.
for count_col in ('inquiry_count_06', 'inquiry_count_03', 'inquiry_count_01', 'inquiry_count_week'):
    chase[count_col] = chase[count_col].fillna(0)

In [80]:
chase["fraud_ind"] = chase["bad"].astype(float)

In [81]:
# Bucket boundaries for the inquiry counts; -1 is passed as an exception value.
fmt = make_format(cuts = [-np.inf, 0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, np.inf], exceptions = [-1])

# One bivariate table (inquiry-count bucket vs. fraud indicator) per look-back
# window, keyed by the window's position in `keep`.
result = dict()
keep = ["inquiry_count_06", 'inquiry_count_03', 'inquiry_count_01', 'inquiry_count_week']
for i, a in enumerate(keep):
    result[i] = bivariate(chase[a].astype(float), chase["fraud_ind"], format = fmt)

# Write every table to the "biv" sheet of the workbook.
wb = TableWriter(filename = "./_temp/chase.xlsx", options={'nan_inf_to_errors': True}, overwrite = True)
try:
    for k in result.keys():
        wb.write_table(
            result[k],
            sheetname="biv",
            conditional_fmt_cols=[3],
        )
finally:
    # Close the writer even if a table fails to write, so it is never left open.
    wb.close()

#### Fleector

In [82]:
fleector = pd.read_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Performance/Fraud Sample/fleector_11124/fleector_sample_to_use.parquet")
print(fleector.shape)

(83704, 92)


In [83]:
fleector["credit_flag_cutoff_date_fmt"] = pd.to_datetime(fleector["date_created"], format='%Y-%m-%d')

In [84]:
freq(fleector["credit_flag_cutoff_date_fmt"].astype(str).str.slice(0, 7))

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
credit_flag_cutoff_date_fmt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01,15827.0,0.189083,15827.0,0.189083
2022-02,15512.0,0.18532,31339.0,0.374403
2022-03,19353.0,0.231208,50692.0,0.60561
2022-04,16822.0,0.20097,67514.0,0.80658
2022-05,16185.0,0.19336,83699.0,0.99994
2022-06,5.0,6e-05,83704.0,1.0


In [85]:
fleector.columns

Index(['seq', 'ats_id', 'open_date', 'open_mon', 'open_qtr',
       'writeoff_flag_12mo', 'writeoff_flag_18mo', 'wo_amount', 'wo_date',
       'wo_mon', 'wo_qtr', 'pmt_sum_12mo', 'pmt_count_12mo',
       'return_pmt_count_12mo', 'pmt_sum_18mo', 'pmt_count_18mo',
       'return_pmt_count_18mo', 'platform', 'lock_code', 'lock_reason',
       'revenue_12mo', 'revenue_18mo', 'fuel_revenue_12mo',
       'fuel_revenue_18mo', 'fee_revenue_12mo', 'fee_revenue_18mo',
       'account_code', 'limit_approved', 'last_pos_cred_limit', 'gallons_12mo',
       'gallons_18mo', 'roadster_twentile', 'ats_bill_cycle_group', 'app_flag',
       'acct_flag', 'app_mon', 'date_created', 'channel', 'team',
       'processor_cd', 'brand', 'product', 'portfolio', 'status_credit',
       'status_fraud', 'creditstatus', 'req_credit_limit', 'builder_pro_flag',
       'test_app', 'dup_match', 'approve_flag', 'cond_deposit_category',
       'secured_approval_flag', 'cust_name', 'fed_id', 'nmf_contact_email',
       'bu

In [111]:
# Pair SBFE inquiries with Fleector applications on the cleaned full address.
# NOTE(review): no visible cell creates a `full_addr_clean` column on either
# `sbfe` (reduced earlier to seleid / date / phone fields) or the freshly
# loaded `fleector` frame -- confirm where this key is built before relying on
# a fresh-kernel run of this cell.
sbfe_for_fleector = sbfe.merge(fleector, on = "full_addr_clean")
print(sbfe_for_fleector.shape)

# Keep only inquiries dated strictly before the application's cutoff date.
# .copy() makes the filtered frame independent so the new columns below are
# not assigned onto a slice.
sbfe_for_fleector = sbfe_for_fleector[
    sbfe_for_fleector["inquiry_date_fmt"] < sbfe_for_fleector["credit_flag_cutoff_date_fmt"]
].copy()
print(sbfe_for_fleector.shape)

# Time elapsed between each inquiry and the cutoff date.
fleector_inquiry_age = sbfe_for_fleector["credit_flag_cutoff_date_fmt"] - sbfe_for_fleector["inquiry_date_fmt"]

# Average month (365.2425 / 12 days = 2,629,746 s). This is the length the
# former np.timedelta64(1, 'M') divisor stood for; pandas 2.x no longer
# accepts the ambiguous 'M' unit, so the value is spelled out explicitly.
avg_month = pd.Timedelta(seconds=2629746)
fleector_age_months = fleector_inquiry_age / avg_month

# 1/0 indicator per inquiry for each look-back window.
sbfe_for_fleector['inquiry_count_06']   = np.where(fleector_age_months <= 6, 1, 0)
sbfe_for_fleector['inquiry_count_03']   = np.where(fleector_age_months <= 3, 1, 0)
sbfe_for_fleector['inquiry_count_01']   = np.where(fleector_age_months <= 1, 1, 0)
sbfe_for_fleector['inquiry_count_week'] = np.where(fleector_inquiry_age / np.timedelta64(1, 'W') <= 1, 1, 0)

# Roll the indicators up to one row per `seq`. The columns are selected
# with a list: selecting several columns with a bare tuple is deprecated and
# raises in pandas 2.x.
inquiry_count_cols = ["inquiry_count_06", "inquiry_count_03", "inquiry_count_01", "inquiry_count_week"]
sbfe_for_fleector_roll = sbfe_for_fleector.groupby(by = ['seq'])[inquiry_count_cols].sum().reset_index()
print(sbfe_for_fleector_roll.shape)

(176804, 98)
(89741, 98)
(14482, 5)


  sbfe_for_fleector_roll = sbfe_for_fleector.groupby(by = ['seq'])["inquiry_count_06", "inquiry_count_03", "inquiry_count_01", "inquiry_count_week"].sum().reset_index()


In [112]:
fleector = fleector.merge(sbfe_for_fleector_roll, on = "seq", how = "left")
print(fleector.shape)

(83704, 99)


In [113]:
# Rows without any matched prior inquiry have NaN counts after the
# left merge; treat those as zero inquiries.
for count_col in ('inquiry_count_06', 'inquiry_count_03', 'inquiry_count_01', 'inquiry_count_week'):
    fleector[count_col] = fleector[count_col].fillna(0)

In [114]:
fleector["fraud_ind"] = fleector["final_fraud_flag"].astype(float)

In [115]:
# Bucket boundaries for the inquiry counts; -1 is passed as an exception value.
fmt = make_format(cuts = [-np.inf, 0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, np.inf], exceptions = [-1])

# One bivariate table (inquiry-count bucket vs. fraud indicator) per look-back
# window, with `acct_flag` passed as the grouping variable; tables are keyed by
# the window's position in `keep`.
result = dict()
keep = ["inquiry_count_06", 'inquiry_count_03', 'inquiry_count_01', 'inquiry_count_week']
for i, a in enumerate(keep):
    result[i] = bivariate(fleector[a].astype(float), fleector["fraud_ind"], format = fmt, groups= fleector.acct_flag)

# Write every table to the "biv" sheet of the workbook.
wb = TableWriter(filename = "./_temp/fleector.xlsx", options={'nan_inf_to_errors': True}, overwrite = True)
try:
    for k in result.keys():
        wb.write_table(
            result[k],
            sheetname="biv",
            conditional_fmt_cols=[3],
        )
finally:
    # Close the writer even if a table fails to write, so it is never left open.
    wb.close()

#### Other

In [84]:
# Lookup from full US state / district / territory name to its two-letter
# postal abbreviation. Entries keep their original insertion order
# (two per line below).
statename_to_abbr = {
    # Federal district
    "District of Columbia": "DC",

    # States
    "Alabama": "AL",        "Montana": "MT",
    "Alaska": "AK",         "Nebraska": "NE",
    "Arizona": "AZ",        "Nevada": "NV",
    "Arkansas": "AR",       "New Hampshire": "NH",
    "California": "CA",     "New Jersey": "NJ",
    "Colorado": "CO",       "New Mexico": "NM",
    "Connecticut": "CT",    "New York": "NY",
    "Delaware": "DE",       "North Carolina": "NC",
    "Florida": "FL",        "North Dakota": "ND",
    "Georgia": "GA",        "Ohio": "OH",
    "Hawaii": "HI",         "Oklahoma": "OK",
    "Idaho": "ID",          "Oregon": "OR",
    "Illinois": "IL",       "Pennsylvania": "PA",
    "Indiana": "IN",        "Rhode Island": "RI",
    "Iowa": "IA",           "South Carolina": "SC",
    "Kansas": "KS",         "South Dakota": "SD",
    "Kentucky": "KY",       "Tennessee": "TN",
    "Louisiana": "LA",      "Texas": "TX",
    "Maine": "ME",          "Utah": "UT",
    "Maryland": "MD",       "Vermont": "VT",
    "Massachusetts": "MA",  "Virginia": "VA",
    "Michigan": "MI",       "Washington": "WA",
    "Minnesota": "MN",      "West Virginia": "WV",
    "Mississippi": "MS",    "Wisconsin": "WI",
    "Missouri": "MO",       "Wyoming": "WY",

    # Territories
    "Puerto Rico": "PR",
    "Virgin Islands": "VI",
}