In [1]:
import time
import pandas as pd
import numpy as np
import math

# internal tools
from dsgtools.reporting import make_format
from dsgtools.reporting import TableWriter
from dsgtools.reporting import freq
from dsgtools.reporting import bivariate

In [2]:
# ## Workaround for data import (currently disabled): clear the proxy environment variables
# import os
# os.environ["HTTP_PROXY"] = ""
# os.environ["HTTPS_PROXY"] = ""
# os.environ["NO_PROXY"] = ""

In [3]:
## Input Data
# The four client fraud samples share one parent folder; each load echoes its (rows, columns) shape.
# NOTE(review): pd.read_adls is not a stock pandas reader - presumably registered by the internal tooling; confirm.
fraud_sample_root = "Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Performance/Fraud Sample/"

chase = pd.read_adls(fraud_sample_root + "Chase_11449/chase_stacked_final.parquet")
print(chase.shape)

# The file name below ("tmbile_...") is spelled exactly as the stored path spells it.
tmobile = pd.read_adls(fraud_sample_root + "tmobile_10823/tmbile_file_to_use.parquet")
print(tmobile.shape)

stripe = pd.read_adls(fraud_sample_root + "stripe_11363/stripe_file_to_use.parquet")
print(stripe.shape)

fleector = pd.read_adls(fraud_sample_root + "fleector_11124/fleector_sample_to_use.parquet")
print(fleector.shape)

(640958, 36)
(114460, 39)
(75000, 22)
(83704, 92)


In [4]:
## Keep seleid, full address, phone, tax ID and performance flag (business information); may need to dedup

#### SBFE Inquiry

In [5]:
# Load the SBFE inquiry extract with the parquet reader and echo its (rows, columns) shape.
path = (
    "Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/"
    "Data Modeling Clean/sbfe_seleid_append_deduped_cleaned_0501.parquet"
)
sbfe = pd.read_adls(path, reader=pd.read_parquet)
print(sbfe.shape)

(79429086, 20)


In [6]:
# Build one address string "addr, city, state, zip5". Empty city/state/zip parts are
# skipped, and only the first five characters of the ZIP are kept.
# NOTE(review): a NaN (rather than "") in any part propagates and makes the whole
# address NaN - confirm the inputs are blank-filled upstream.
sbfe["full_addr"] = (
    sbfe["addr"]
    + np.where(sbfe["city"] != "", ", " + sbfe["city"], "")
    + np.where(sbfe["state"] != "", ", " + sbfe["state"], "")
    + np.where(sbfe["zip"] != "", ", " + sbfe["zip"].str.slice(0, 5), "")
)

# Normalised matching key: drop periods and commas, remove every
# "<non-word character><whitespace>" pair, then lowercase.
# `regex` is passed explicitly on each step because the default of Series.str.replace
# changed from True to False in pandas 2.0: without it the \W\s step would silently
# turn into a literal (no-op) replacement. The pattern is a raw string because
# "\W" / "\s" are invalid escape sequences in a normal string literal.
sbfe["full_addr_clean"] = (
    sbfe["full_addr"]
    .str.replace(".", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.replace(r"\W\s", "", regex=True)
    .str.lower()
)

  sbfe["full_addr_clean"] = sbfe['full_addr'].str.replace('.', '')
  sbfe["full_addr_clean"] = sbfe['full_addr_clean'].str.replace('\W\s', '')


In [7]:
# Display the column names currently present on sbfe.
sbfe.columns

Index(['accountnumber_x', 'companyname', 'alternatecompanyname_x', 'addr',
       'city', 'state', 'zip', 'businessphone', 'taxidnumber', 'historydate_x',
       'sufficient_input', 'accountnumber_y', 'alternatecompanyname_y',
       'historydate_y', 'powid', 'proxid', 'seleid', 'orgid', 'ultid',
       'overallweight', 'full_addr', 'full_addr_clean'],
      dtype='object')

In [8]:
# Keep only the columns needed for matching and give the id/date columns their
# inquiry-specific names (source column -> new name; the last three are unchanged).
sbfe_column_names = {
    "seleid": "seleid_inquiry_sbfe",
    "historydate_x": "inquiry_date",
    "full_addr_clean": "full_addr_clean",
    'businessphone': 'businessphone',
    'taxidnumber': 'taxidnumber',
}
sbfe = sbfe[list(sbfe_column_names)]
sbfe.columns = list(sbfe_column_names.values())

In [9]:
# Store the inquiry date as an integer and keep a parsed datetime copy next to it.
# The '%Y%m%d' format means the integer is read as YYYYMMDD (e.g. 20210701).
sbfe["inquiry_date"] = sbfe["inquiry_date"].astype(int)
sbfe["inquiry_date_fmt"] = pd.to_datetime(sbfe["inquiry_date"], format='%Y%m%d')

In [10]:
# Cast the SBFE seleid to float.
# NOTE(review): the reason for float (rather than an integer/string id) is not visible here - confirm.
sbfe["seleid_inquiry_sbfe"] = sbfe["seleid_inquiry_sbfe"].astype(float)

In [11]:
# Lowercase the cleaned address (idempotent: re-applying it to already-lowercase text changes nothing).
sbfe["full_addr_clean"] = sbfe["full_addr_clean"].str.lower()

In [12]:
# Preview the first rows of the narrowed SBFE inquiry frame.
sbfe.head()

Unnamed: 0,seleid_inquiry_sbfe,inquiry_date,full_addr_clean,businessphone,taxidnumber,inquiry_date_fmt
0,133615600000.0,20210701,115 golf course rd ste e logan ut 84321,4355353654,,2021-07-01
1,1330122000.0,20210701,3200 palm tree dr lithonia ga 30038,7708857033,,2021-07-01
2,906413600.0,20210701,1476 highway 159 e bellville tx 77418,9798653142,,2021-07-01
5,61367720.0,20210701,492 koller st san francisco ca 94110,6505550000,,2021-07-01
6,137145200000.0,20210701,107 lassiter lane bellville tx 77418,9798859397,,2021-07-01


#### Stripe 11363

In [13]:
# Replace missing values with "" and then collapse rows that share the same business
# identity (names, address parts, phone and website); the first row of each group is kept.
business_identity_cols = [
    'companyname', 'alternatecompanyname',
    'bus_streetaddress1', 'bus_streetaddress2',
    'bus_city', 'bus_state', 'bus_zip',
    'bus_phone', 'bus_website',
]
stripe = stripe.fillna("").drop_duplicates(subset=business_identity_cols, ignore_index=True)
print(stripe.shape)
# Report whether uniqueid still identifies exactly one row each.
print(stripe.uniqueid.is_unique)

(74443, 22)
True


In [14]:
# Count null phone values. When stripe has already been through fillna(""), missing
# phones are empty strings, so this counts 0; blanks appear as length 0 instead.
stripe.bus_phone.isnull().sum()

0

In [15]:
# Eyeball the raw phone format (the displayed values carry a leading "+" and country digit).
stripe[["bus_phone"]] 

Unnamed: 0,bus_phone
0,+18322103210
1,+12139263811
2,+17246208909
3,
4,+19172720510
...,...
74438,+12159203332
74439,+19168622031
74440,+18773302677
74441,


In [16]:
# Character length of each raw phone string, and its frequency table (0 = blank phone).
stripe["len"] = stripe["bus_phone"].str.len()
freq(stripe["len"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
len,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,14817.0,0.199038,14817.0,0.199038
11,51.0,0.000685,14868.0,0.199723
12,59549.0,0.799927,74417.0,0.999651
13,23.0,0.000309,74440.0,0.99996
14,3.0,4e-05,74443.0,1.0


In [17]:
# Normalise the Stripe phone so it can be joined to sbfe["businessphone"]:
#   - blank phone                 -> ""
#   - longer than 12 characters   -> "" (discarded)
#   - contains "+" and length 12  -> characters 2..11 (drops the first two, e.g. "+1")
#   - contains "+" and length 11  -> characters 1..10 (drops the first one, e.g. "+")
#   - anything else               -> kept unchanged
# Conditions are evaluated in this order; the first match wins.
phone = stripe["bus_phone"]
phone_len = stripe["len"]
# Raw string: "\+" in a normal literal is an invalid escape sequence (a warning on
# recent Python versions). na=False keeps the mask strictly boolean.
has_plus = phone.str.contains(r"\+", na=False)

stripe["phone_modify"] = np.select(
    [
        phone == "",
        phone_len > 12,
        has_plus & (phone_len == 12),
        has_plus & (phone_len == 11),
    ],
    [
        "",
        "",
        phone.str.slice(2, 12),
        phone.str.slice(1, 11),
    ],
    default=phone,
)
stripe["phone_modify"]

0        8322103210
1        2139263811
2        7246208909
3                  
4        9172720510
            ...    
74438    2159203332
74439    9168622031
74440    8773302677
74441              
74442              
Name: phone_modify, Length: 74443, dtype: object

In [19]:
# Binary target: 1 where performance_indicator equals the string "True", else 0.
# NOTE(review): this is a string comparison - if performance_indicator is a real
# boolean column, nothing will equal "True" and every row becomes 0; confirm the dtype.
stripe["bad"] = np.where(stripe.performance_indicator == "True", 1, 0)

In [20]:
# Shape of the subset of records that still have a phone after normalisation.
stripe[stripe["phone_modify"] != ""].shape

(59600, 25)

In [21]:
# Share of Stripe records with a usable phone number, computed from the data rather
# than from hand-copied row counts (which go stale whenever the input changes).
(stripe["phone_modify"] != "").mean()

0.8006125491987158

In [22]:
# Cutoff date for each Stripe record: the first 10 characters of historydate,
# parsed as YYYY-MM-DD.
stripe["credit_flag_cutoff_date"] = stripe["historydate"].str.slice(0, 10)
stripe["credit_flag_cutoff_date_fmt"] = pd.to_datetime(stripe["credit_flag_cutoff_date"], format='%Y-%m-%d')

In [23]:
# Drop records without a normalised phone: they cannot be joined to SBFE by phone number.
stripe = stripe[stripe["phone_modify"] != ""]
print(stripe.shape)

(59600, 27)


In [24]:
# Pair every SBFE inquiry with the Stripe records that share its phone number (inner join).
sbfe_for_stripe = pd.merge(sbfe, stripe, left_on="businessphone", right_on="phone_modify")
print(sbfe_for_stripe.shape)
# Keep only inquiries dated strictly before the record's cutoff date.
sbfe_for_stripe = sbfe_for_stripe.loc[
    lambda pairs: pairs["inquiry_date_fmt"] < pairs["credit_flag_cutoff_date_fmt"]
]
print(sbfe_for_stripe.shape)

(64631, 33)
(43488, 33)


In [26]:
# Per Stripe record (uniqueid): number of distinct SBFE addresses inquired from the same
# phone within 6 months / 3 months / 1 month / 1 week before the cutoff date.
#
# The look-back is expressed in days. Dividing a timedelta by np.timedelta64(1, 'M') is
# rejected by pandas >= 2.0 because month units are ambiguous. 30.436875 days
# (365.2425 / 12) is the average month length the old month-unit conversion used, so
# for whole-day dates the same rows are selected as before.
AVG_DAYS_PER_MONTH = 30.436875
lookback_days = {
    "06": 6 * AVG_DAYS_PER_MONTH,
    "03": 3 * AVG_DAYS_PER_MONTH,
    "01": 1 * AVG_DAYS_PER_MONTH,
    "1w": 7,
}

# Days elapsed between each inquiry and the matching record's cutoff date.
days_before_cutoff = (
    sbfe_for_stripe["credit_flag_cutoff_date_fmt"] - sbfe_for_stripe["inquiry_date_fmt"]
) / pd.Timedelta(days=1)

addr_counts = {}
for window, max_days in lookback_days.items():
    in_window = sbfe_for_stripe[days_before_cutoff <= max_days]
    addr_counts[window] = in_window.groupby(by=["uniqueid"])["full_addr_clean"].nunique().reset_index()
    print(addr_counts[window].shape)

# Names expected by the cell that renames and merges these frames.
count_unique_06 = addr_counts["06"]
count_unique_03 = addr_counts["03"]
count_unique_01 = addr_counts["01"]
count_unique_1w = addr_counts["1w"]

(4403, 2)
(3834, 2)
(3085, 2)
(1030, 2)


In [27]:
# Label each window's count column, then left-join the counts onto stripe by uniqueid.
# Records with no inquiry inside a window come back as NaN for that column.
addr_frames = {
    "06": count_unique_06,
    "03": count_unique_03,
    "01": count_unique_01,
    "1w": count_unique_1w,
}
for window, counts in addr_frames.items():
    counts.columns = ['uniqueid', f"addr_count_unique_{window}"]

for counts in addr_frames.values():
    stripe = stripe.merge(counts, on="uniqueid", how="left")
    print(stripe.shape)

(59600, 28)
(59600, 29)
(59600, 30)
(59600, 31)


In [28]:
# Per Stripe record (uniqueid): number of distinct SBFE tax ids inquired from the same
# phone within 6 months / 3 months / 1 month / 1 week before the cutoff date.
#
# The look-back is expressed in days. Dividing a timedelta by np.timedelta64(1, 'M') is
# rejected by pandas >= 2.0 because month units are ambiguous. 30.436875 days
# (365.2425 / 12) is the average month length the old month-unit conversion used, so
# for whole-day dates the same rows are selected as before.
AVG_DAYS_PER_MONTH = 30.436875
lookback_days = {
    "06": 6 * AVG_DAYS_PER_MONTH,
    "03": 3 * AVG_DAYS_PER_MONTH,
    "01": 1 * AVG_DAYS_PER_MONTH,
    "1w": 7,
}

# Days elapsed between each inquiry and the matching record's cutoff date.
days_before_cutoff = (
    sbfe_for_stripe["credit_flag_cutoff_date_fmt"] - sbfe_for_stripe["inquiry_date_fmt"]
) / pd.Timedelta(days=1)

taxid_counts = {}
for window, max_days in lookback_days.items():
    in_window = sbfe_for_stripe[days_before_cutoff <= max_days]
    taxid_counts[window] = in_window.groupby(by=["uniqueid"])["taxidnumber"].nunique().reset_index()
    print(taxid_counts[window].shape)

# Names expected by the cell that renames and merges these frames.
count_unique_06 = taxid_counts["06"]
count_unique_03 = taxid_counts["03"]
count_unique_01 = taxid_counts["01"]
count_unique_1w = taxid_counts["1w"]

(4403, 2)
(3834, 2)
(3085, 2)
(1030, 2)


In [29]:
# Label each window's tax-id count column, then left-join the counts onto stripe by uniqueid.
# Records with no inquiry inside a window come back as NaN for that column.
taxid_frames = {
    "06": count_unique_06,
    "03": count_unique_03,
    "01": count_unique_01,
    "1w": count_unique_1w,
}
for window, counts in taxid_frames.items():
    counts.columns = ['uniqueid', f"taxid_count_unique_{window}"]

for counts in taxid_frames.values():
    stripe = stripe.merge(counts, on="uniqueid", how="left")
    print(stripe.shape)

(59600, 32)
(59600, 33)
(59600, 34)
(59600, 35)


In [30]:
# A missing count means the left join found no inquiry for that record in that window,
# so it is recorded as zero.
for feature in ("addr", "taxid"):
    for window in ("06", "03", "01", "1w"):
        count_col = f"{feature}_count_unique_{window}"
        stripe[count_col] = stripe[count_col].fillna(0)

In [31]:
# Bivariate table of every count feature against the Stripe "bad" flag, all written to
# the "biv" sheet of ./_temp/stripe.xlsx.
# NOTE(review): make_format / bivariate / TableWriter are internal helpers; the bin cuts,
# the -1 exception value and the sheet-append behaviour are passed through as-is - confirm
# their semantics in dsgtools. The ./_temp directory is assumed to exist.
fmt = make_format(cuts = [-np.inf, 0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, np.inf], exceptions = [-1])
keep = [
    'addr_count_unique_06', 'addr_count_unique_03', 'addr_count_unique_01', 'addr_count_unique_1w',
    'taxid_count_unique_06', 'taxid_count_unique_03', 'taxid_count_unique_01', 'taxid_count_unique_1w',
]
result = {
    position: bivariate(stripe[feature], stripe["bad"], format = fmt)
    for position, feature in enumerate(keep)
}

wb = TableWriter(filename = "./_temp/stripe.xlsx", options={'nan_inf_to_errors': True}, overwrite = True)
for table in result.values():
    wb.write_table(table, sheetname="biv", conditional_fmt_cols=[3])
wb.close()

#### TMobile 10823

In [32]:
# Cutoff date for each T-Mobile application: the first 9 characters of appdatetime_gmt,
# parsed with '%d%b%Y' (day, abbreviated month name, 4-digit year).
tmobile["credit_flag_cutoff_date"] = tmobile.appdatetime_gmt.str.slice(0, 9)
tmobile["credit_flag_cutoff_date_fmt"] = pd.to_datetime(tmobile["credit_flag_cutoff_date"], format='%d%b%Y')

In [33]:
# Distribution of applications by cutoff month (first 7 characters of the date string, YYYY-MM).
freq(tmobile["credit_flag_cutoff_date_fmt"].astype(str).str.slice(0, 7))

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
credit_flag_cutoff_date_fmt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01,20510.0,0.179189,20510.0,0.179189
2022-02,22452.0,0.196156,42962.0,0.375345
2022-03,22846.0,0.199598,65808.0,0.574943
2022-04,14973.0,0.130814,80781.0,0.705757
2022-05,16271.0,0.142154,97052.0,0.847912
2022-06,17408.0,0.152088,114460.0,1.0


In [34]:
# Frequency of the src column over the full T-Mobile sample.
freq(tmobile.src)

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
src,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
march,33679.0,0.294243,33679.0,0.294243
retro,80781.0,0.705757,114460.0,1.0


In [35]:
# Frequency of the fraud indicator, including how many records have it missing.
freq(tmobile.fraud_ind)

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
fraud_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,58655.0,0.51245,58655.0,0.51245
1,3762.0,0.032867,62417.0,0.545317
Missing,52043.0,0.454683,114460.0,1.0


In [36]:
# Keep only records that carry a fraud label; unlabelled rows cannot be used as performance data.
tmobile = tmobile[tmobile["fraud_ind"].notnull()]
print(tmobile.shape)

(62417, 41)


In [37]:
# Frequency of the src column again, now on the labelled records only.
freq(tmobile.src)

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
src,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
march,21914.0,0.35109,21914.0,0.35109
retro,40503.0,0.64891,62417.0,1.0


In [38]:
# Derive the cutoff date columns from appdatetime_gmt (first 9 characters, '%d%b%Y').
# NOTE(review): the same two columns are derived the same way earlier in this section,
# so this cell looks redundant - confirm before removing.
tmobile["credit_flag_cutoff_date"] = tmobile["appdatetime_gmt"].str.slice(0, 9)
tmobile["credit_flag_cutoff_date_fmt"] = pd.to_datetime(tmobile["credit_flag_cutoff_date"], format='%d%b%Y')

In [39]:
# Character length of each contact phone, and its frequency table.
tmobile["len"] = tmobile["business_contactphone"].str.len()
freq(tmobile["len"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
len,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,62417.0,1.0,62417.0,1.0


In [41]:
# Shape of the subset of records whose contact phone is not blank.
tmobile[tmobile["business_contactphone"] != ""].shape

(62417, 42)

In [42]:
# Pair every SBFE inquiry with the T-Mobile records that share its phone number (inner join).
sbfe_for_tmobile = pd.merge(sbfe, tmobile, left_on="businessphone", right_on="business_contactphone")
print(sbfe_for_tmobile.shape)
# Keep only inquiries dated strictly before the record's cutoff date.
sbfe_for_tmobile = sbfe_for_tmobile.loc[
    lambda pairs: pairs["inquiry_date_fmt"] < pairs["credit_flag_cutoff_date_fmt"]
]
print(sbfe_for_tmobile.shape)

(196414, 48)
(119582, 48)


In [43]:
# Per T-Mobile record (crid_encr): number of distinct SBFE addresses inquired from the
# same phone within 6 months / 3 months / 1 month / 1 week before the cutoff date.
#
# The look-back is expressed in days. Dividing a timedelta by np.timedelta64(1, 'M') is
# rejected by pandas >= 2.0 because month units are ambiguous. 30.436875 days
# (365.2425 / 12) is the average month length the old month-unit conversion used, so
# for whole-day dates the same rows are selected as before.
AVG_DAYS_PER_MONTH = 30.436875
lookback_days = {
    "06": 6 * AVG_DAYS_PER_MONTH,
    "03": 3 * AVG_DAYS_PER_MONTH,
    "01": 1 * AVG_DAYS_PER_MONTH,
    "1w": 7,
}

# Days elapsed between each inquiry and the matching record's cutoff date.
days_before_cutoff = (
    sbfe_for_tmobile["credit_flag_cutoff_date_fmt"] - sbfe_for_tmobile["inquiry_date_fmt"]
) / pd.Timedelta(days=1)

addr_counts = {}
for window, max_days in lookback_days.items():
    in_window = sbfe_for_tmobile[days_before_cutoff <= max_days]
    addr_counts[window] = in_window.groupby(by=["crid_encr"])["full_addr_clean"].nunique().reset_index()
    print(addr_counts[window].shape)

# Names expected by the cell that renames and merges these frames.
count_unique_06 = addr_counts["06"]
count_unique_03 = addr_counts["03"]
count_unique_01 = addr_counts["01"]
count_unique_1w = addr_counts["1w"]

(8953, 2)
(7329, 2)
(4783, 2)
(1505, 2)


In [44]:
# Label each window's count column, then left-join the counts onto tmobile by crid_encr.
# Records with no inquiry inside a window come back as NaN for that column.
addr_frames = {
    "06": count_unique_06,
    "03": count_unique_03,
    "01": count_unique_01,
    "1w": count_unique_1w,
}
for window, counts in addr_frames.items():
    counts.columns = ['crid_encr', f"addr_count_unique_{window}"]

for counts in addr_frames.values():
    tmobile = tmobile.merge(counts, on="crid_encr", how="left")
    print(tmobile.shape)

(62417, 43)
(62417, 44)
(62417, 45)
(62417, 46)


In [45]:
# Per T-Mobile record (crid_encr): number of distinct SBFE tax ids inquired from the
# same phone within 6 months / 3 months / 1 month / 1 week before the cutoff date.
#
# The look-back is expressed in days. Dividing a timedelta by np.timedelta64(1, 'M') is
# rejected by pandas >= 2.0 because month units are ambiguous. 30.436875 days
# (365.2425 / 12) is the average month length the old month-unit conversion used, so
# for whole-day dates the same rows are selected as before.
AVG_DAYS_PER_MONTH = 30.436875
lookback_days = {
    "06": 6 * AVG_DAYS_PER_MONTH,
    "03": 3 * AVG_DAYS_PER_MONTH,
    "01": 1 * AVG_DAYS_PER_MONTH,
    "1w": 7,
}

# Days elapsed between each inquiry and the matching record's cutoff date.
days_before_cutoff = (
    sbfe_for_tmobile["credit_flag_cutoff_date_fmt"] - sbfe_for_tmobile["inquiry_date_fmt"]
) / pd.Timedelta(days=1)

taxid_counts = {}
for window, max_days in lookback_days.items():
    in_window = sbfe_for_tmobile[days_before_cutoff <= max_days]
    taxid_counts[window] = in_window.groupby(by=["crid_encr"])["taxidnumber"].nunique().reset_index()
    print(taxid_counts[window].shape)

# Names expected by the cell that renames and merges these frames.
count_unique_06 = taxid_counts["06"]
count_unique_03 = taxid_counts["03"]
count_unique_01 = taxid_counts["01"]
count_unique_1w = taxid_counts["1w"]

(8953, 2)
(7329, 2)
(4783, 2)
(1505, 2)


In [46]:
# Label each window's tax-id count column, then left-join the counts onto tmobile by crid_encr.
# Records with no inquiry inside a window come back as NaN for that column.
taxid_frames = {
    "06": count_unique_06,
    "03": count_unique_03,
    "01": count_unique_01,
    "1w": count_unique_1w,
}
for window, counts in taxid_frames.items():
    counts.columns = ['crid_encr', f"taxid_count_unique_{window}"]

for counts in taxid_frames.values():
    tmobile = tmobile.merge(counts, on="crid_encr", how="left")
    print(tmobile.shape)

(62417, 47)
(62417, 48)
(62417, 49)
(62417, 50)


In [47]:
# A missing count means the left join found no inquiry for that record in that window,
# so it is recorded as zero.
for feature in ("addr", "taxid"):
    for window in ("06", "03", "01", "1w"):
        count_col = f"{feature}_count_unique_{window}"
        tmobile[count_col] = tmobile[count_col].fillna(0)

In [48]:
# Make the fraud indicator numeric (float) before it is passed to bivariate as the target.
tmobile["fraud_ind"] = tmobile["fraud_ind"].astype(float)

In [49]:
# Bivariate table of every count feature against the T-Mobile fraud indicator, all written
# to the "biv" sheet of ./_temp/tmobile.xlsx.
# NOTE(review): make_format / bivariate / TableWriter are internal helpers; the bin cuts,
# the -1 exception value and the sheet-append behaviour are passed through as-is - confirm
# their semantics in dsgtools. The ./_temp directory is assumed to exist.
fmt = make_format(cuts = [-np.inf, 0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, np.inf], exceptions = [-1])
keep = [
    'addr_count_unique_06', 'addr_count_unique_03', 'addr_count_unique_01', 'addr_count_unique_1w',
    'taxid_count_unique_06', 'taxid_count_unique_03', 'taxid_count_unique_01', 'taxid_count_unique_1w',
]
result = {
    position: bivariate(tmobile[feature], tmobile["fraud_ind"], format = fmt)
    for position, feature in enumerate(keep)
}

wb = TableWriter(filename = "./_temp/tmobile.xlsx", options={'nan_inf_to_errors': True}, overwrite = True)
for table in result.values():
    wb.write_table(table, sheetname="biv", conditional_fmt_cols=[3])
wb.close()

#### Chase

In [50]:
# Load the Chase fraud sample and echo its (rows, columns) shape.
# NOTE(review): the same file is also read in the input-data cell at the top of the
# notebook; this re-read resets chase to the raw file - confirm that is intended.
chase = pd.read_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Performance/Fraud Sample/Chase_11449/chase_stacked_final.parquet")
print(chase.shape)

(640958, 36)


In [51]:
# Cutoff date for each Chase application: app_date parsed with '%d%b%Y'
# (day, abbreviated month name, 4-digit year).
chase["credit_flag_cutoff_date_fmt"] = pd.to_datetime(chase["app_date"], format='%d%b%Y')

In [52]:
# Display the column names available on chase.
chase.columns

Index(['transaction_id', 'cust_first_nm', 'cust_mid_init_tx', 'cust_last_nm',
       'govt_issu_id_nb', 'line_1_ad', 'line_2_ad', 'city_nm', 'state_prov_cd',
       'pst_area_7_cd', 'pst_area_cd', 'channel', 'bus_name', 'app_date',
       'date_of_birth', 'decision', 'bizidscore', 'bd_score', 'bd_score1',
       'bd_score2', 'bd_score3', 'bus_addr1', 'bus_addr2', 'bus_city',
       'bus_state', 'bus_zip', 'bus_tax_id', 'bus_phone', 'bus_type',
       'home_phone', 'bad', 'seq', 'account', 'date', 'LN_booked', 'count',
       'credit_flag_cutoff_date_fmt'],
      dtype='object')

In [53]:
# Character length of each business phone, and its frequency table (0 = blank phone).
chase["len"] = chase["bus_phone"].str.len()
freq(chase["len"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
len,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2663.0,0.004155,2663.0,0.004155
10,638295.0,0.995845,640958.0,1.0


In [55]:
# Drop records with a blank business phone: they cannot be joined to SBFE by phone number.
chase = chase[chase["bus_phone"] != ""]
print(chase.shape)

(638295, 38)


In [56]:
# Re-check which SBFE columns are available before the join.
sbfe.columns

Index(['seleid_inquiry_sbfe', 'inquiry_date', 'full_addr_clean',
       'businessphone', 'taxidnumber', 'inquiry_date_fmt'],
      dtype='object')

In [57]:
# Pair every SBFE inquiry with the Chase records that share its phone number (inner join).
sbfe_for_chase = pd.merge(sbfe, chase, left_on="businessphone", right_on="bus_phone")
print(sbfe_for_chase.shape)
# Keep only inquiries dated strictly before the record's cutoff date.
sbfe_for_chase = sbfe_for_chase.loc[
    lambda pairs: pairs["inquiry_date_fmt"] < pairs["credit_flag_cutoff_date_fmt"]
]
print(sbfe_for_chase.shape)

(883962, 44)
(678316, 44)


In [58]:
# Per Chase record (account): number of distinct SBFE addresses inquired from the same
# phone within 6 months / 3 months / 1 month / 1 week before the cutoff date.
#
# The look-back is expressed in days. Dividing a timedelta by np.timedelta64(1, 'M') is
# rejected by pandas >= 2.0 because month units are ambiguous. 30.436875 days
# (365.2425 / 12) is the average month length the old month-unit conversion used, so
# for whole-day dates the same rows are selected as before.
AVG_DAYS_PER_MONTH = 30.436875
lookback_days = {
    "06": 6 * AVG_DAYS_PER_MONTH,
    "03": 3 * AVG_DAYS_PER_MONTH,
    "01": 1 * AVG_DAYS_PER_MONTH,
    "1w": 7,
}

# Days elapsed between each inquiry and the matching record's cutoff date.
days_before_cutoff = (
    sbfe_for_chase["credit_flag_cutoff_date_fmt"] - sbfe_for_chase["inquiry_date_fmt"]
) / pd.Timedelta(days=1)

addr_counts = {}
for window, max_days in lookback_days.items():
    in_window = sbfe_for_chase[days_before_cutoff <= max_days]
    addr_counts[window] = in_window.groupby(by=["account"])["full_addr_clean"].nunique().reset_index()
    print(addr_counts[window].shape)

# Names expected by the cell that renames and merges these frames.
count_unique_06 = addr_counts["06"]
count_unique_03 = addr_counts["03"]
count_unique_01 = addr_counts["01"]
count_unique_1w = addr_counts["1w"]

(65728, 2)
(58146, 2)
(43288, 2)
(13729, 2)


In [59]:
# Label each window's count column, then left-join the counts onto chase by account.
# Records with no inquiry inside a window come back as NaN for that column.
addr_frames = {
    "06": count_unique_06,
    "03": count_unique_03,
    "01": count_unique_01,
    "1w": count_unique_1w,
}
for window, counts in addr_frames.items():
    counts.columns = ['account', f"addr_count_unique_{window}"]

for counts in addr_frames.values():
    chase = chase.merge(counts, on="account", how="left")
    print(chase.shape)

(638295, 39)
(638295, 40)
(638295, 41)
(638295, 42)


In [60]:
# Per Chase record (account): number of distinct SBFE tax ids inquired from the same
# phone within 6 months / 3 months / 1 month / 1 week before the cutoff date.
#
# The look-back is expressed in days. Dividing a timedelta by np.timedelta64(1, 'M') is
# rejected by pandas >= 2.0 because month units are ambiguous. 30.436875 days
# (365.2425 / 12) is the average month length the old month-unit conversion used, so
# for whole-day dates the same rows are selected as before.
AVG_DAYS_PER_MONTH = 30.436875
lookback_days = {
    "06": 6 * AVG_DAYS_PER_MONTH,
    "03": 3 * AVG_DAYS_PER_MONTH,
    "01": 1 * AVG_DAYS_PER_MONTH,
    "1w": 7,
}

# Days elapsed between each inquiry and the matching record's cutoff date.
days_before_cutoff = (
    sbfe_for_chase["credit_flag_cutoff_date_fmt"] - sbfe_for_chase["inquiry_date_fmt"]
) / pd.Timedelta(days=1)

taxid_counts = {}
for window, max_days in lookback_days.items():
    in_window = sbfe_for_chase[days_before_cutoff <= max_days]
    taxid_counts[window] = in_window.groupby(by=["account"])["taxidnumber"].nunique().reset_index()
    print(taxid_counts[window].shape)

# Names expected by the cell that renames and merges these frames.
count_unique_06 = taxid_counts["06"]
count_unique_03 = taxid_counts["03"]
count_unique_01 = taxid_counts["01"]
count_unique_1w = taxid_counts["1w"]

(65728, 2)
(58146, 2)
(43288, 2)
(13729, 2)


In [61]:
# Label each window's tax-id count column, then left-join the counts onto chase by account.
# Records with no inquiry inside a window come back as NaN for that column.
taxid_frames = {
    "06": count_unique_06,
    "03": count_unique_03,
    "01": count_unique_01,
    "1w": count_unique_1w,
}
for window, counts in taxid_frames.items():
    counts.columns = ['account', f"taxid_count_unique_{window}"]

for counts in taxid_frames.values():
    chase = chase.merge(counts, on="account", how="left")
    print(chase.shape)

(638295, 43)
(638295, 44)
(638295, 45)
(638295, 46)


In [62]:
# A missing count means the left join found no inquiry for that record in that window,
# so it is recorded as zero.
for feature in ("addr", "taxid"):
    for window in ("06", "03", "01", "1w"):
        count_col = f"{feature}_count_unique_{window}"
        chase[count_col] = chase[count_col].fillna(0)

In [63]:
# Numeric (float) copy of the "bad" flag, stored as fraud_ind and used as the bivariate target.
chase["fraud_ind"] = chase["bad"].astype(float)

In [64]:
# Bivariate table of every count feature (cast to float) against the Chase fraud indicator,
# all written to the "biv" sheet of ./_temp/chase.xlsx.
# NOTE(review): make_format / bivariate / TableWriter are internal helpers; the bin cuts,
# the -1 exception value and the sheet-append behaviour are passed through as-is - confirm
# their semantics in dsgtools. The ./_temp directory is assumed to exist.
fmt = make_format(cuts = [-np.inf, 0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, np.inf], exceptions = [-1])
keep = [
    'addr_count_unique_06', 'addr_count_unique_03', 'addr_count_unique_01', 'addr_count_unique_1w',
    'taxid_count_unique_06', 'taxid_count_unique_03', 'taxid_count_unique_01', 'taxid_count_unique_1w',
]
result = {
    position: bivariate(chase[feature].astype(float), chase["fraud_ind"], format = fmt)
    for position, feature in enumerate(keep)
}

wb = TableWriter(filename = "./_temp/chase.xlsx", options={'nan_inf_to_errors': True}, overwrite = True)
for table in result.values():
    wb.write_table(table, sheetname="biv", conditional_fmt_cols=[3])
wb.close()

#### Fleector

In [13]:
# Load the Fleector fraud sample and echo its (rows, columns) shape.
# NOTE(review): the same file is also read in the input-data cell at the top of the notebook.
fleector = pd.read_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Performance/Fraud Sample/fleector_11124/fleector_sample_to_use.parquet")
print(fleector.shape)

(83704, 92)


In [14]:
# Cutoff date for each Fleector application: date_created parsed as YYYY-MM-DD.
fleector["credit_flag_cutoff_date_fmt"] = pd.to_datetime(fleector["date_created"], format='%Y-%m-%d')

In [15]:
# Distribution of applications by cutoff month (first 7 characters of the date string, YYYY-MM).
freq(fleector["credit_flag_cutoff_date_fmt"].astype(str).str.slice(0, 7))

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
credit_flag_cutoff_date_fmt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01,15827.0,0.189083,15827.0,0.189083
2022-02,15512.0,0.18532,31339.0,0.374403
2022-03,19353.0,0.231208,50692.0,0.60561
2022-04,16822.0,0.20097,67514.0,0.80658
2022-05,16185.0,0.19336,83699.0,0.99994
2022-06,5.0,6e-05,83704.0,1.0


In [16]:
# Display the column names available on fleector.
fleector.columns

Index(['seq', 'ats_id', 'open_date', 'open_mon', 'open_qtr',
       'writeoff_flag_12mo', 'writeoff_flag_18mo', 'wo_amount', 'wo_date',
       'wo_mon', 'wo_qtr', 'pmt_sum_12mo', 'pmt_count_12mo',
       'return_pmt_count_12mo', 'pmt_sum_18mo', 'pmt_count_18mo',
       'return_pmt_count_18mo', 'platform', 'lock_code', 'lock_reason',
       'revenue_12mo', 'revenue_18mo', 'fuel_revenue_12mo',
       'fuel_revenue_18mo', 'fee_revenue_12mo', 'fee_revenue_18mo',
       'account_code', 'limit_approved', 'last_pos_cred_limit', 'gallons_12mo',
       'gallons_18mo', 'roadster_twentile', 'ats_bill_cycle_group', 'app_flag',
       'acct_flag', 'app_mon', 'date_created', 'channel', 'team',
       'processor_cd', 'brand', 'product', 'portfolio', 'status_credit',
       'status_fraud', 'creditstatus', 'req_credit_limit', 'builder_pro_flag',
       'test_app', 'dup_match', 'approve_flag', 'cond_deposit_category',
       'secured_approval_flag', 'cust_name', 'fed_id', 'nmf_contact_email',
       'bu

#### Other

In [84]:
# Lookup from a full US state / territory name to its two-letter postal abbreviation.
# Built from (name, abbreviation) pairs; entry order is District of Columbia first,
# then the 50 states, then the two territories.
statename_to_abbr = dict([
    # Other
    ("District of Columbia", "DC"),

    # States
    ("Alabama", "AL"),
    ("Montana", "MT"),
    ("Alaska", "AK"),
    ("Nebraska", "NE"),
    ("Arizona", "AZ"),
    ("Nevada", "NV"),
    ("Arkansas", "AR"),
    ("New Hampshire", "NH"),
    ("California", "CA"),
    ("New Jersey", "NJ"),
    ("Colorado", "CO"),
    ("New Mexico", "NM"),
    ("Connecticut", "CT"),
    ("New York", "NY"),
    ("Delaware", "DE"),
    ("North Carolina", "NC"),
    ("Florida", "FL"),
    ("North Dakota", "ND"),
    ("Georgia", "GA"),
    ("Ohio", "OH"),
    ("Hawaii", "HI"),
    ("Oklahoma", "OK"),
    ("Idaho", "ID"),
    ("Oregon", "OR"),
    ("Illinois", "IL"),
    ("Pennsylvania", "PA"),
    ("Indiana", "IN"),
    ("Rhode Island", "RI"),
    ("Iowa", "IA"),
    ("South Carolina", "SC"),
    ("Kansas", "KS"),
    ("South Dakota", "SD"),
    ("Kentucky", "KY"),
    ("Tennessee", "TN"),
    ("Louisiana", "LA"),
    ("Texas", "TX"),
    ("Maine", "ME"),
    ("Utah", "UT"),
    ("Maryland", "MD"),
    ("Vermont", "VT"),
    ("Massachusetts", "MA"),
    ("Virginia", "VA"),
    ("Michigan", "MI"),
    ("Washington", "WA"),
    ("Minnesota", "MN"),
    ("West Virginia", "WV"),
    ("Mississippi", "MS"),
    ("Wisconsin", "WI"),
    ("Missouri", "MO"),
    ("Wyoming", "WY"),

    # Territories
    ("Puerto Rico", "PR"),
    ("Virgin Islands", "VI"),
])