In [1]:
import time
import pandas as pd
import numpy as np
import math

# internal tools
from dsgtools.reporting import make_format
from dsgtools.reporting import TableWriter
from dsgtools.reporting import freq
from dsgtools.reporting import bivariate

In [2]:
## SBA attributes: Analytics/Personal Folders/dobsti01/2023 Model Revalidation/11657/ReRun SBA Fix/

In [3]:
# input performance data:
path = "Analytics/RnD Projects/Product RnD/Business/LexisNexis 11657 (2023 Model revalidation)/input/origination_extract_12mth_w_perf_202101_202112.parquet"
input_12m = pd.read_adls(path, reader = pd.read_parquet)
print(input_12m.shape)

(3272574, 52)


In [4]:
freq(input_12m.AnalysisCreditBad)

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
AnalysisCreditBad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,3173726.0,0.969795,3173726.0,0.969795
1,98848.0,0.030205,3272574.0,1.0


In [5]:
freq(input_12m.InsufficientBusiness)

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
InsufficientBusiness,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,3272035.0,0.999835,3272035.0,0.999835
1,539.0,0.000165,3272574.0,1.0


In [6]:
input_12m.columns

Index(['sbfe_contributor_number', 'contract_account_number',
       'account_type_reported', 'ultid', 'orgid', 'seleid', 'proxid', 'powid',
       'ProjectName', 'UniqueIdentifier', 'ArchiveDate', 'BusinessName',
       'BusinessStreetAddress', 'BusinessCity', 'BusinessState',
       'BusinessZipcode', 'BusinessPhone', 'BusinessTIN',
       'AlternateBusinessName', 'InsufficientBusiness', 'RepFirstName',
       'RepLastName', 'RepStreetAddress', 'RepCity', 'RepState', 'RepZipcode',
       'RepPhone', 'RepSSN', 'RepDOB', 'RepEmail', 'InsufficientRep', 'Sales',
       'EmployeeCount', 'SIC', 'NAICS', 'Approved', 'ApprovedNotFunded',
       'Declined', 'AnalysisCreditBad', 'AnalysisFraud', 'CreditDPD',
       'ChargeOff', 'ChargeOffAmount', 'UndefinedFraud', 'FirstPayDefault',
       'FirstPartyFraud', 'ThirdPartyFraud', 'FraudLossAmount',
       'ClientScore1Name', 'ClientScore1', 'ClientScore2Name', 'ClientScore2'],
      dtype='object')

In [7]:
input_12m["full_addr"] = input_12m["BusinessStreetAddress"] + np.where(input_12m["BusinessCity"] != "", ", " + input_12m["BusinessCity"], "") + \
    np.where(input_12m["BusinessState"] != "", ", " + input_12m["BusinessState"], "") + np.where(input_12m["BusinessZipcode"] != "", ", " + input_12m["BusinessZipcode"].str.slice(0, 5), "")
input_12m[["full_addr"]].head()

Unnamed: 0,full_addr
0,"12 TERRY DR STE 203, NEWTOWN, PA, 18940"
1,"2333 S COLUMBINE ST, DENVER, CO, 80210"
4,"1500 PICARDY CIR, CLEARWATER, FL, 33755"
5,"3146 VIA POINCIANA APT 404, LAKE WORTH, FL, 33467"
6,"2802 LIPSCOMB ST, MELBOURNE, FL, 32901"


In [8]:
input_12m["full_addr_clean"] = input_12m['full_addr'].str.replace('\W\s\,', '')
input_12m.loc[(input_12m["full_addr_clean"]!=input_12m["full_addr"]) & (input_12m["full_addr"].notnull()), ["full_addr_clean", "full_addr"]].head()

  input_12m["full_addr_clean"] = input_12m['full_addr'].str.replace('\W\s\,', '')


Unnamed: 0,full_addr_clean,full_addr


In [11]:
input_12m = input_12m[["UniqueIdentifier", "account_type_reported", "ArchiveDate", "AnalysisCreditBad", 
                       "full_addr_clean", 'BusinessPhone', 'BusinessTIN']]
print(input_12m.shape)

(3272574, 7)


In [10]:
#### import attributes
path = "Analytics/RnD Projects/Product RnD/Business/LexisNexis 11657 (2023 Model revalidation)/Processing/"

In [11]:
use = ['accountnumber', 'historydateyyyymm', 'seleid', 
       'businessrecordtimeoldest','sbfehitindex', 'sbfetimeoldest', 'inquirycount03m', 'inquirycount12m', 
       'inquirycreditcount03m', 'inquirycreditcount12m', 'inquiryhighriskcount03m', 'inquiryhighriskcount12m', 'inquiryothercount03m', 'inquiryothercount12m', 
       'inquiryconsumeraddress', 'inquiryconsumerphone', 'inquiryconsumeraddressssn', 'model1score', "model2score"]

part1 = "ln_11657_bus_rep1_input_0320_pt1_sba_v21_sbfe_busshell_31_w20230320-165230-3.csv.gz"
part2 = "ln_11657_bus_rep1_input_0320_pt1_sba_v21_sbfe_busshell_31_w20230320-223541.csv.gz"
part3 = "ln_11657_bus_rep1_input_0320_pt2_sba_v21_sbfe_busshell_31_rew20230320-223558.csv.gz"
part4 = "ln_11657_bus_rep1_input_0320_pt2_sba_v21_sbfe_busshell_31_w20230320-165245.csv.gz"
part5 = "ln_11657_bus_rep1_input_0320_pt3_sba_v21_sbfe_busshell_31_w20230320-165501.csv.gz"
part6 = "ln_11657_bus_rep1_input_0320_pt4_sba_v21_sbfe_busshell_31_w20230320-165514.csv.gz"
part7 = "ln_11657_bus_rep1_input_0320_pt5_sba_v21_sbfe_busshell_31_w20230320-165523.csv.gz"


part1_df = pd.read_adls(path + part1, reader = pd.read_csv, compression = "gzip", encoding = 'iso-8859-1', usecols = use)
part2_df = pd.read_adls(path + part2, reader = pd.read_csv, compression = "gzip", encoding = 'iso-8859-1', usecols = use)
part3_df = pd.read_adls(path + part3, reader = pd.read_csv, compression = "gzip", encoding = 'iso-8859-1', usecols = use)
part4_df = pd.read_adls(path + part4, reader = pd.read_csv, compression = "gzip", encoding = 'iso-8859-1', usecols = use)
part5_df = pd.read_adls(path + part5, reader = pd.read_csv, compression = "gzip", encoding = 'iso-8859-1', usecols = use)
part6_df = pd.read_adls(path + part6, reader = pd.read_csv, compression = "gzip", encoding = 'iso-8859-1', usecols = use)
part7_df = pd.read_adls(path + part7, reader = pd.read_csv, compression = "gzip", encoding = 'iso-8859-1', usecols = use)

stacked = pd.concat([part1_df, part2_df, part3_df, part4_df, part5_df, part6_df, part7_df], ignore_index = True)
print(stacked.shape)

(3272574, 19)


In [12]:
stacked.head()

Unnamed: 0,accountnumber,historydateyyyymm,seleid,businessrecordtimeoldest,inquirycount03m,inquirycount12m,inquirycreditcount03m,inquirycreditcount12m,inquiryhighriskcount03m,inquiryhighriskcount12m,inquiryothercount03m,inquiryothercount12m,inquiryconsumeraddress,inquiryconsumerphone,inquiryconsumeraddressssn,sbfehitindex,sbfetimeoldest,model1score,model2score
0,SBFEAccountOriginations0000001,202112,29,1233.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,-1.0,3.0,142.0,0,683
1,SBFEAccountOriginations0000002,202104,50,228.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,3.0,163.0,0,738
2,SBFEAccountOriginations0000005,202109,149,295.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,230.0,679,640
3,SBFEAccountOriginations0000006,202103,206,148.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3.0,84.0,0,606
4,SBFEAccountOriginations0000007,202102,391,124.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,3.0,121.0,0,807


In [13]:
input_12m = input_12m.merge(stacked, left_on = "UniqueIdentifier", right_on = "accountnumber")
print(input_12m.shape)

(3272574, 26)


In [15]:
input_12m.to_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Performance/SBFE/input_12m_w_perf_attri.parquet", overwrite=True)

In [2]:
input_12m = pd.read_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Performance/SBFE/input_12m_w_perf_attri.parquet")
fmt = make_format(cuts = [-np.inf, -1, 0, np.inf])
freq(input_12m.seleid, format = fmt, observed = True)

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
seleid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5016.0,0.001533,5016.0,0.001533
1+,3267558.0,0.998467,3272574.0,1.0


In [3]:
input_12m.columns

Index(['UniqueIdentifier', 'account_type_reported', 'ArchiveDate',
       'AnalysisCreditBad', 'full_addr_clean', 'BusinessPhone', 'BusinessTIN',
       'accountnumber', 'historydateyyyymm', 'seleid',
       'businessrecordtimeoldest', 'inquirycount03m', 'inquirycount12m',
       'inquirycreditcount03m', 'inquirycreditcount12m',
       'inquiryhighriskcount03m', 'inquiryhighriskcount12m',
       'inquiryothercount03m', 'inquiryothercount12m',
       'inquiryconsumeraddress', 'inquiryconsumerphone',
       'inquiryconsumeraddressssn', 'sbfehitindex', 'sbfetimeoldest',
       'model1score', 'model2score'],
      dtype='object')

In [4]:
fmt = make_format(cuts = [-np.inf, 0, 500, np.inf], exceptions = [0, 200, 222])
freq(input_12m.model2score, format = fmt, observed = True)

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
model2score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
501+,2461050.0,0.752023,2461050.0,0.752023
0,939.0,0.000287,2461989.0,0.75231
222,810585.0,0.24769,3272574.0,1.0


In [5]:
bivariate("ArchiveDate", "AnalysisCreditBad", df = input_12m)

tag,AnalysisCreditBad,AnalysisCreditBad,AnalysisCreditBad,AnalysisCreditBad,AnalysisCreditBad,AnalysisCreditBad
stats,N,PctN,Sum,Mean,WoE,IV
ArchiveDate,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
202101,301755.0,0.092207,4556.0,0.015098,-0.708884,0.033709
202102,397448.0,0.121448,5530.0,0.013914,-0.791791,0.053481
202103,408901.0,0.124948,6845.0,0.01674,-0.604,0.034691
202104,310799.0,0.094971,7326.0,0.023572,-0.25479,0.00548
202105,240389.0,0.073456,7533.0,0.031337,0.037946,0.000108
202106,233503.0,0.071351,8697.0,0.037246,0.216814,0.003718
202107,228185.0,0.069726,9488.0,0.04158,0.331414,0.008974
202108,219631.0,0.067113,9597.0,0.043696,0.383254,0.011846
202109,241325.0,0.073742,10520.0,0.043593,0.380778,0.012833
202110,232326.0,0.070992,9957.0,0.042858,0.363011,0.011131


In [6]:
print(input_12m.BusinessTIN.isnull().sum())
print(input_12m[input_12m.BusinessTIN == ""].shape)

1684704
(0, 26)


In [7]:
## keep
input_12m = input_12m[input_12m.ArchiveDate.isin(['202110', "202111", "202112"])]
print(input_12m.shape)
# input_12m = input_12m[(input_12m.seleid !=0) & ((input_12m.seleid.notnull()))]
# input_12m = input_12m[input_12m.full_addr_clean.notnull()]
# input_12m = input_12m[input_12m.BusinessTIN.notnull()]
print(input_12m.shape)

(690638, 26)
(690638, 26)


In [8]:
print(300199-690638)
print((300199-690638)/690638)

-390439
-0.5653308969387726


In [9]:
input_12m.historydateyyyymm.value_counts()

202111    238467
202110    232326
202112    219845
Name: historydateyyyymm, dtype: int64

In [10]:
input_12m.accountnumber.is_unique

True

In [11]:
input_12m.shape

(690638, 26)

### Import SBFE Inquiry Data

In [12]:
path = "Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Data Modeling Clean/sbfe_seleid_append_deduped_cleaned_0501.parquet"
sbfe = pd.read_adls(path, reader = pd.read_parquet)
print(sbfe.shape)

(79429086, 20)


In [13]:
sbfe.seleid = sbfe.seleid.astype(int)

In [37]:
sbfe["full_addr"] = sbfe["addr"] + np.where(sbfe["city"] != "", ", " + sbfe["city"], "") + \
    np.where(sbfe["state"] != "", ", " + sbfe["state"], "") + np.where(sbfe["zip"] != "", ", " + sbfe["zip"].str.slice(0, 5), "")
sbfe[["full_addr"]].head()

Unnamed: 0,full_addr
0,"115 GOLF COURSE RD STE E, LOGAN, UT, 84321"
1,"3200 PALM TREE DR, LITHONIA, GA, 30038"
2,"1476 HIGHWAY 159 E, BELLVILLE, TX, 77418"
5,"492 KOLLER ST, SAN FRANCISCO, CA, 94110"
6,"107 LASSITER LANE, BELLVILLE, TX, 77418"


In [38]:
sbfe["full_addr_clean"] = sbfe['full_addr'].str.replace('\W\s\,', '')
sbfe.loc[sbfe["full_addr_clean"]!=sbfe["full_addr"], ["full_addr_clean", "full_addr"]].head()

  sbfe["full_addr_clean"] = sbfe['full_addr'].str.replace('\W\s\,', '')


Unnamed: 0,full_addr_clean,full_addr
2156707,"11175 LAKEVIEW AV 1963, LUCERNE VALLEY, CA, 9...","11175 LAKEVIEW AV. , 1963, LUCERNE VALLEY, CA..."
4907592,"500 MS-12 MS 39759, STARKVILLE, MS, 39759","500 MS-12, , MS 39759, STARKVILLE, MS, 39759"
4935389,"500 MS-12 MS 39759, STARKVILLE, MS, 39759","500 MS-12, , MS 39759, STARKVILLE, MS, 39759"
7966573,"6521 SALTSBURG RD PA 15235, PITTSBURGH, PA, 15235","6521 SALTSBURG RD, , PA 15235, PITTSBURGH, PA,..."
15990890,"11175 LAKEVIEW AV 1963, LUCERNE VALLEY, CA, 9...","11175 LAKEVIEW AV. , 1963, LUCERNE VALLEY, CA..."


In [39]:
sbfe["len"] = sbfe["businessphone"].str.len()
freq(sbfe["len"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
len,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,21962890.0,0.2765094,21962890.0,0.276509
1,103.0,1.296754e-06,21962993.0,0.276511
2,60.0,7.553908e-07,21963053.0,0.276511
3,211.0,2.656458e-06,21963264.0,0.276514
4,19281.0,0.0002427448,21982545.0,0.276757
5,35.0,4.406446e-07,21982580.0,0.276757
6,3734.0,4.701049e-05,21986314.0,0.276804
7,752844.0,0.00947819,22739158.0,0.286283
8,18743.0,0.0002359715,22757901.0,0.286518
9,106831.0,0.001344986,22864732.0,0.287863


In [40]:
input_12m["len"] = input_12m["BusinessPhone"].str.len()
freq(input_12m["len"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
len,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,2.0,3e-06,2.0,3e-06
6.0,1.0,1e-06,3.0,4e-06
7.0,28.0,4.1e-05,31.0,4.5e-05
8.0,4.0,6e-06,35.0,5.1e-05
9.0,30.0,4.3e-05,65.0,9.4e-05
10.0,427263.0,0.61865,427328.0,0.618744
Missing,263310.0,0.381256,690638.0,1.0


In [14]:
temp = input_12m[["seleid", "ArchiveDate", "accountnumber"]] # , "full_addr_clean", 'BusinessPhone', 'BusinessTIN', 
# temp = temp[temp["len"] == 10]
# print(temp.shape)
sbfe = sbfe.merge(temp, on = "seleid")
# sbfe["full_addr_clean"] = sbfe["full_addr_clean"].str.lower()
# temp["full_addr_clean"] = temp["full_addr_clean"].str.lower()
# sbfe = sbfe.merge(temp, on = "full_addr_clean")
# sbfe = sbfe.merge(temp, left_on = "businessphone", right_on = "BusinessPhone")
# sbfe = sbfe.merge(temp, left_on = "taxidnumber", right_on = "BusinessTIN")

In [15]:
print(sbfe.shape)

(3303127, 22)


In [16]:
sbfe["credit_flag_cutoff_date"] = sbfe["ArchiveDate"] + "01"
sbfe["credit_flag_cutoff_date"] = sbfe["credit_flag_cutoff_date"].astype(int)

In [17]:
sbfe["historydate_x"] = sbfe["historydate_x"].astype(int)
sbfe = sbfe[sbfe["historydate_x"] < sbfe["credit_flag_cutoff_date"]]
print(sbfe.shape)

(768210, 23)


In [18]:
sbfe["credit_flag_cutoff_date_fmt"] = pd.to_datetime(sbfe["credit_flag_cutoff_date"], format='%Y%m%d')
sbfe["inquiry_date_fmt"] = pd.to_datetime(sbfe["historydate_x"], format='%Y%m%d')

In [19]:
sbfe['inquiry_count_03'] = np.where((sbfe["credit_flag_cutoff_date_fmt"] - sbfe["inquiry_date_fmt"])/np.timedelta64(1, 'M') <= 3, 1, 0)
sbfe['inquiry_count_01'] = np.where((sbfe["credit_flag_cutoff_date_fmt"] - sbfe["inquiry_date_fmt"])/np.timedelta64(1, 'M') <= 1, 1, 0)
sbfe['inquiry_count_week'] =  np.where((sbfe["credit_flag_cutoff_date_fmt"] - sbfe["inquiry_date_fmt"])/np.timedelta64(1, 'W') <= 1, 1, 0)
sbfe[["credit_flag_cutoff_date_fmt", "inquiry_date_fmt", "inquiry_count_03", "inquiry_count_01", "inquiry_count_week"]].tail()

Unnamed: 0,credit_flag_cutoff_date_fmt,inquiry_date_fmt,inquiry_count_03,inquiry_count_01,inquiry_count_week
3249954,2021-12-01,2021-11-30,1,1,1
3249957,2021-12-01,2021-11-30,1,1,1
3249958,2021-12-01,2021-11-30,1,1,1
3249962,2021-12-01,2021-11-30,1,1,1
3249978,2021-12-01,2021-11-30,1,1,1


In [20]:
sbfe_roll = sbfe.groupby(by = ['accountnumber'])["inquiry_count_03", "inquiry_count_01", "inquiry_count_week"].sum().reset_index()
print(sbfe_roll.shape)

  sbfe_roll = sbfe.groupby(by = ['accountnumber'])["inquiry_count_03", "inquiry_count_01", "inquiry_count_week"].sum().reset_index()


(157963, 4)


In [22]:
sbfe_roll[sbfe_roll["inquiry_count_03"] < 1]

Unnamed: 0,accountnumber,inquiry_count_03,inquiry_count_01,inquiry_count_week
6,SBFEAccountOriginations0000171,0,0,0
14,SBFEAccountOriginations0000287,0,0,0
21,SBFEAccountOriginations0000341,0,0,0
28,SBFEAccountOriginations0000370,0,0,0
30,SBFEAccountOriginations0000401,0,0,0
...,...,...,...,...
157923,SBFEAccountOriginations3605375,0,0,0
157951,SBFEAccountOriginations3613112,0,0,0
157954,SBFEAccountOriginations3617126,0,0,0
157957,SBFEAccountOriginations3617326,0,0,0


In [21]:
input_12m = input_12m.merge(sbfe_roll, on = "accountnumber", how = "left")
print(input_12m.shape)

(690638, 29)


In [49]:
input_12m['inquiry_count_03'] = np.where(input_12m["inquiry_count_03"].isnull(), 0, input_12m["inquiry_count_03"])
input_12m['inquiry_count_01'] = np.where(input_12m["inquiry_count_01"].isnull(), 0, input_12m["inquiry_count_01"])
input_12m['inquiry_count_week']=np.where(input_12m["inquiry_count_week"].isnull(), 0, input_12m["inquiry_count_week"])

In [52]:
fmt1 = make_format(cuts = [-np.inf, 0, 500, 550, 600, 650, 700, 750, 800, 850, 900, np.inf], exceptions = [0, 100, 200, 222])
fmt = make_format(cuts = [-np.inf, 0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, np.inf], exceptions = [-1])
freq("model1score", "inquiry_count_03", df = input_12m, format = [fmt1, fmt], with_stats = False, observed = True)

Unnamed: 0_level_0,inquiry_count_03,inquiry_count_03,inquiry_count_03,inquiry_count_03,inquiry_count_03,inquiry_count_03,inquiry_count_03,inquiry_count_03,inquiry_count_03,inquiry_count_03
Unnamed: 0_level_1,<= 0,1,2,3,4,5,6-10,11-20,21-30,31-40
Unnamed: 0_level_2,Count,Count,Count,Count,Count,Count,Count,Count,Count,Count
model1score,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
501-550,492.0,183.0,72.0,41.0,25.0,11.0,46.0,2.0,0.0,0.0
551-600,2915.0,620.0,215.0,104.0,53.0,30.0,132.0,9.0,0.0,0.0
601-650,18857.0,2409.0,730.0,353.0,167.0,83.0,495.0,36.0,1.0,0.0
651-700,77823.0,5365.0,1610.0,872.0,291.0,162.0,1690.0,57.0,0.0,0.0
701-750,127958.0,5776.0,2488.0,1868.0,456.0,275.0,4239.0,131.0,1.0,0.0
751-800,44186.0,3158.0,3116.0,2796.0,581.0,355.0,6599.0,186.0,3.0,0.0
801-850,8768.0,1128.0,1247.0,1305.0,340.0,196.0,4272.0,129.0,1.0,0.0
851-900,1118.0,248.0,159.0,236.0,90.0,55.0,1023.0,81.0,0.0,1.0
0,258486.0,28916.0,12344.0,10794.0,3724.0,2093.0,29407.0,1389.0,9.0,0.0
200,1267.0,141.0,63.0,64.0,13.0,3.0,194.0,4.0,1.0,0.0


In [23]:
pd.set_option('display.max_columns', None)
input_12m.describe()

Unnamed: 0,AnalysisCreditBad,historydateyyyymm,seleid,businessrecordtimeoldest,inquirycount03m,inquirycount12m,inquirycreditcount03m,inquirycreditcount12m,inquiryhighriskcount03m,inquiryhighriskcount12m,inquiryothercount03m,inquiryothercount12m,inquiryconsumeraddress,inquiryconsumerphone,inquiryconsumeraddressssn,sbfehitindex,sbfetimeoldest,model1score,model2score,inquiry_count_03,inquiry_count_01,inquiry_count_week
count,300199.0,300199.0,300199.0,300075.0,300075.0,300075.0,300075.0,300075.0,300075.0,300075.0,300075.0,300075.0,300075.0,300075.0,300075.0,300075.0,300075.0,300199.0,300199.0,300199.0,300199.0,300199.0
mean,0.020903,202110.988354,98110740000.0,98.591036,-0.162529,-0.000676,-0.210557,-0.1739,-0.230246,-0.22474,-0.184804,-0.065114,0.704877,0.110816,0.002173,1.591909,-10.435564,515.447626,628.278312,0.115946,0.070933,0.032025
std,0.143059,0.814792,58117900000.0,204.167546,0.691705,1.333441,0.490481,0.62861,0.426363,0.4464,0.602555,1.112926,0.456099,0.833708,0.046563,1.191527,124.731505,327.186507,211.18465,0.453338,0.314865,0.196947
min,0.0,202110.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,0.0,0.0,-99.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,202110.0,38152760000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,1.0,-99.0,0.0,659.0,0.0,0.0,0.0
50%,0.0,202111.0,136210100000.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-99.0,693.0,717.0,0.0,0.0,0.0
75%,0.0,202112.0,138300100000.0,114.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3.0,56.0,740.0,754.0,0.0,0.0,0.0
max,1.0,202112.0,139821300000.0,1463.0,61.0,112.0,32.0,63.0,12.0,16.0,29.0,80.0,1.0,1.0,1.0,3.0,600.0,900.0,900.0,25.0,18.0,7.0


In [24]:
fmt = make_format(cuts = [-np.inf, 0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, np.inf], exceptions = [-1])
result = dict()
# keep = ['inquirycount03m', 'inquirycount12m', 'inquirycreditcount03m', 'inquirycreditcount12m', 'inquiry_count_03', 'inquiry_count_01', 'inquiry_count_week']
keep = ['inquiry_count_03', 'inquiry_count_01', 'inquiry_count_week']
for i, a in enumerate(keep):
    result[i] = bivariate(input_12m[a], input_12m["AnalysisCreditBad"], format = fmt)

wb = TableWriter(filename = "./_temp/sbfe_orig.xlsx", options={'nan_inf_to_errors': True}, overwrite = True)
for k in result.keys():
    wb.write_table(
        result[k],
        sheetname="biv",
        conditional_fmt_cols=[3],
    )
wb.close()