In [1]:
import time
import pandas as pd
import numpy as np
import math

# internal tools
from dsgtools.reporting import make_format
from dsgtools.reporting import TableWriter
from dsgtools.reporting import freq
from dsgtools.reporting import bivariate

Matplotlib is building the font cache; this may take a moment.


# Sample

In [4]:
# Read the 12-month origination extract with every column loaded as a string.
extract_12_2021 = pd.read_adls(
    'Analytics/Personal Folders/liuwei01/Prj2023/origination_extract_Joe/origination_extract_12M_202101_202112.csv',
    reader=pd.read_csv,
    dtype=str,
    encoding='iso-8859-1',
)
print(extract_12_2021.shape)

# Keep rows where AnalysisCreditBad is populated and both "insufficient"
# indicators equal the string "0" (columns are strings because of dtype=str).
has_credit_flag = extract_12_2021["AnalysisCreditBad"].notnull()
business_ok = extract_12_2021["InsufficientBusiness"] == "0"
rep_ok = extract_12_2021["InsufficientRep"] == "0"
extract_12_2021 = extract_12_2021[has_credit_flag & business_ok & rep_ok]
print(extract_12_2021.shape)

(3618335, 52)
(1375730, 52)


In [5]:
# Tabulate the three columns used by the row filter on the retained rows
# (cross=False is forwarded to the internal freq helper unchanged).
filter_columns = ("InsufficientBusiness", "InsufficientRep", "AnalysisCreditBad")
freq(*filter_columns, df=extract_12_2021, cross=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Count,Pct,Cuml Count,Cuml Pct
InsufficientBusiness,InsufficientRep,AnalysisCreditBad,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0,1352144.0,0.982856,1352144.0,0.982856
0,0,1,23586.0,0.017144,1375730.0,1.0


In [9]:
# Bucket seleid with a single cut point at 0 and tabulate the buckets.
# seleid was read as a string, so it is cast to float before bucketing.
fmt = make_format(cuts=[-np.inf, 0, np.inf])
seleid_numeric = extract_12_2021["seleid"].astype(float)
freq(seleid_numeric, format=fmt)

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
seleid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<= 0,0.0,0.0,0.0,0.0
1+,1375730.0,1.0,1375730.0,1.0
Missing,0.0,0.0,1375730.0,1.0


In [6]:
# with SELEID
# Display the column names of the filtered extract so the available input
# fields (including the seleid link ID) can be checked before sampling.
extract_12_2021.columns

Index(['sbfe_contributor_number', 'contract_account_number',
       'account_type_reported', 'ultid', 'orgid', 'seleid', 'proxid', 'powid',
       'ProjectName', 'UniqueIdentifier', 'ArchiveDate', 'BusinessName',
       'BusinessStreetAddress', 'BusinessCity', 'BusinessState',
       'BusinessZipcode', 'BusinessPhone', 'BusinessTIN',
       'AlternateBusinessName', 'InsufficientBusiness', 'RepFirstName',
       'RepLastName', 'RepStreetAddress', 'RepCity', 'RepState', 'RepZipcode',
       'RepPhone', 'RepSSN', 'RepDOB', 'RepEmail', 'InsufficientRep', 'Sales',
       'EmployeeCount', 'SIC', 'NAICS', 'Approved', 'ApprovedNotFunded',
       'Declined', 'AnalysisCreditBad', 'AnalysisFraud', 'CreditDPD',
       'ChargeOff', 'ChargeOffAmount', 'UndefinedFraud', 'FirstPayDefault',
       'FirstPartyFraud', 'ThirdPartyFraud', 'FraudLossAmount',
       'ClientScore1Name', 'ClientScore1', 'ClientScore2Name', 'ClientScore2'],
      dtype='object')

In [17]:
# Draw 100,000 rows at random from the filtered extract; random_state is fixed
# so the same rows are selected on every run.
sample_100k = extract_12_2021.sample(random_state=1, n=100_000)
print(sample_100k.shape)

(100000, 52)


In [18]:
# Persist the sampled rows (all extract columns) to ADLS as parquet.
sample_parquet_path = 'Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/AIR_1120_sample_100k.parquet'
sample_100k.to_adls(sample_parquet_path)

In [19]:
# Derive a row key from the frame's index label (label + 1) and confirm that
# the resulting key has no duplicates.
unique_ids = sample_100k.index + 1
sample_100k["unique_id"] = unique_ids
sample_100k["unique_id"].is_unique

True

In [20]:
# Columns carried into the submission file, grouped by source; the final
# order is: key, business fields, representative fields, date and link IDs.
business_fields = ['BusinessName', 'AlternateBusinessName', 'BusinessStreetAddress', 'BusinessCity',
                   'BusinessState', 'BusinessZipcode', 'BusinessPhone', 'BusinessTIN']
rep_fields = ['RepFirstName', 'RepLastName', 'RepStreetAddress', 'RepCity', 'RepState',
              'RepZipcode', 'RepSSN', 'RepDOB', 'RepEmail', 'RepPhone']
date_and_link_fields = ['ArchiveDate', 'powid', 'proxid', 'seleid', 'orgid', 'ultid']

keep = ['unique_id'] + business_fields + rep_fields + date_and_link_fields
sample_100k = sample_100k[keep]

In [21]:
# Map the extract's column names onto the names used in the output layout.
# 'BusinessPhone' maps to itself and is listed only for completeness.
column_renames = {
    # key
    'unique_id': "Accountnumber",
    # business fields
    'BusinessName': "CompanyName",
    'AlternateBusinessName': "AlternateCompanyName",
    'BusinessStreetAddress': "Addr",
    'BusinessCity': "City",
    'BusinessState': "State",
    'BusinessZipcode': 'Zip',
    'BusinessPhone': 'BusinessPhone',
    'BusinessTIN': 'TaxIdNumber',
    # representative fields
    'RepFirstName': 'RepresentativeFirstName',
    'RepLastName': 'RepresentativeLastName',
    'RepStreetAddress': 'RepresentativeAddr',
    'RepCity': "RepresentativeCity",
    'RepState': 'RepresentativeState',
    'RepZipcode': 'RepresentativeZip',
    'RepSSN': 'RepresentativeSSN',
    'RepDOB': 'RepresentativeDOB',
    'RepEmail': 'RepresentativeEmailAddress',
    'RepPhone': 'RepresentativeHomePhone',
    # date and link IDs
    'ArchiveDate': 'HistoryDate',
    'powid': "PowID",
    'proxid': "ProxID",
    'seleid': "SeleID",
    'orgid': "OrgID",
    'ultid': "UltID",
}
sample_100k = sample_100k.rename(columns=column_renames)

In [22]:
# Layout fields that have no source column in the extract: add each one as an
# all-None column so the output file still carries the full set of headers.
placeholder_columns = [
    "BusinessIPAddress",
    "RepresentativeMiddleName",
    "RepresentativeNameSuffix",
    "RepresentativeAge",
    "RepresentativeDLNumber",
    "RepresentativeDLState",
    "RepresentativeFormerLastName",
    "SIC_Code",
    "NAIC_Code",
]
for column_name in placeholder_columns:
    sample_100k.loc[:, column_name] = None

In [23]:
# Final column order of the output file, assembled from logical groups.
company_part = ['Accountnumber', 'CompanyName', 'AlternateCompanyName', 'Addr', 'City', 'State', 'Zip',
                'BusinessPhone', 'TaxIdNumber', 'BusinessIPAddress']
representative_part = ['RepresentativeFirstName', 'RepresentativeMiddleName', 'RepresentativeLastName',
                       'RepresentativeNameSuffix', 'RepresentativeAddr', 'RepresentativeCity',
                       'RepresentativeState', 'RepresentativeZip', 'RepresentativeSSN', 'RepresentativeDOB',
                       'RepresentativeAge', 'RepresentativeDLNumber', 'RepresentativeDLState',
                       'RepresentativeHomePhone', 'RepresentativeEmailAddress', 'RepresentativeFormerLastName']
trailing_part = ['HistoryDate', 'PowID', 'ProxID', 'SeleID', 'OrgID', 'UltID', 'SIC_Code', 'NAIC_Code']

order = company_part + representative_part + trailing_part
sample_100k = sample_100k[order]

In [24]:
# Write the reshaped sample to ADLS as CSV without the index column,
# replacing any file already at that path.
sample_csv_path = "Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/AIR_1120_sample_100k.csv"
sample_100k.to_adls(sample_csv_path, index=False, overwrite=True)

# Analysis

In [2]:
# Result-file columns that must be parsed as text rather than inferred;
# used as the `dtype` mapping when the result CSVs are read.
string_columns = [
    'lnlexidsele',
    'outbestbusname',
    'outbestbusstreetaddr',
    'outbestbuscity',
    'outbestbusstate',
    'outbestbuszip',
    'outbestbustin',
    'outbestbusphone',
    'outbestnaics',
    'outbestsic',
    'liennewesttype',
    'judgmentnewesttype',
    'inputaddrtype',
]
string_list = dict.fromkeys(string_columns, str)

In [9]:
## bad data 0922 (kept for reference only, not loaded)
# 1601: 'Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/AIR_1120 (sbfe_bug)/data/0922_air_1120_business_shell_v31_baseline_results_w20230921-173734.csv.gz'
# 1601: 'Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/AIR_1120 (sbfe_bug)/data/0922_test_air_1120_business_shell_v31_baseline_results_w20230921-202044.csv.gz'

## 2105 test:
# The baseline run is delivered as two files that are stacked into one frame;
# the test run is a single file. All three share the same folder.
results_folder = 'Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/AIR_1120 (sbfe_bug)/data/'
baseline_path_1 = results_folder + '0922_2105_air_1120_business_shell_v31_baseline_results_w20230922-071955.csv'
baseline_path_2 = results_folder + '0922_2105_air_1120_business_shell_v31_baseline_results_w20230922-104035.csv'
test_path = results_folder + '0922_2105_air_1120_business_shell_v31_test_results_w20230922-105754.csv'

# Reader options shared by every result file.
read_options = dict(
    dtype=string_list,
    reader=pd.read_csv,
    verbose=False,
    low_memory=True,
    encoding='iso-8859-1',
)

baseline_1 = pd.read_adls(baseline_path_1, **read_options)
print(baseline_1.shape)

baseline_2 = pd.read_adls(baseline_path_2, **read_options)
print(baseline_2.shape)

# Stack the two baseline parts and renumber the rows from 0.
baseline = pd.concat([baseline_1, baseline_2], ignore_index=True)
print(baseline.shape)

test = pd.read_adls(test_path, **read_options)
print(test.shape)


## 1601 test:
from dsgtools import azure
# Disabled alternative input: the 1601 run uses a single baseline file and a
# single test file, read with the same options straight into `baseline` / `test`.
# baseline_path = 'Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/AIR_1120 (sbfe_bug)/data/0926_air_1120_business_shell_v31_baseline_results_w20230924-085143.csv'
# test_path = 'Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/AIR_1120 (sbfe_bug)/data/0926_air_1120_business_shell_v31_test_results_w20230924-123242.csv'
# baseline = pd.read_adls(baseline_path, **read_options)
# print(baseline.shape)
# test = pd.read_adls(test_path, **read_options)
# print(test.shape)

  data = reader(file, **kwargs)


(95575, 2304)
(4421, 2304)
(99996, 2304)


  data = reader(file, **kwargs)


(100000, 2304)


In [13]:
# Key columns plus the utilization attributes for the 84M and 24M windows.
# Names are lowercased before selecting from the loaded frames.
util_families = ["Revolving", "Line", "OELine", "Card"]
util_windows = ["84M", "24M"]
keep = ["accountnumber", "seleid"] + [
    ("SBFEUtil" + family + window).lower()
    for window in util_windows
    for family in util_families
]

# Independent copies so later edits cannot touch the full frames.
temp_base = baseline[keep].copy()
temp_test = test[keep].copy()

In [14]:
# Pair baseline (_x) and test (_y) rows on accountnumber.
merged = temp_base.merge(temp_test, on="accountnumber")
print(merged.shape)

# Flag a row when any utilization attribute differs between the two runs.
# NOTE(review): `!=` treats NaN on both sides as a difference - confirm that
# missing values cannot occur in these columns.
compared_attributes = [
    "sbfeutilrevolving84m", "sbfeutilline84m", "sbfeutiloeline84m", "sbfeutilcard84m",
    "sbfeutilrevolving24m", "sbfeutilline24m", "sbfeutiloeline24m", "sbfeutilcard24m",
]
any_difference = pd.Series(False, index=merged.index)
for attribute in compared_attributes:
    any_difference = any_difference | (merged[attribute + "_x"] != merged[attribute + "_y"])

merged[any_difference].shape

(99996, 19)


(104, 19)

In [None]:
# Export the account numbers of every merged row where at least one of the
# compared utilization attributes differs between baseline (_x) and test (_y).
export_attributes = [
    "sbfeutilrevolving84m", "sbfeutilline84m", "sbfeutiloeline84m", "sbfeutilcard84m",
    "sbfeutilrevolving24m", "sbfeutilline24m", "sbfeutiloeline24m", "sbfeutilcard24m",
]
rows_to_export = pd.Series(False, index=merged.index)
for attribute in export_attributes:
    rows_to_export = rows_to_export | (merged[attribute + "_x"] != merged[attribute + "_y"])

merged.loc[rows_to_export, 'accountnumber'].to_csv('./_temp/sample_for_pri_1120.csv', index = False)

In [7]:
# Check that each account resolved to the same seleid in the baseline and
# test runs.
#
# Baseline-side lookup: accountnumber -> baseline seleid (as `bs_seleid`).
# rename() returns a new frame, so no column labels are assigned on a slice
# of `baseline`.
bs_seleid = baseline[["accountnumber", "seleid"]].rename(columns={"seleid": "bs_seleid"})

# Drop the helper columns first if a previous run of this cell already added
# them; otherwise a re-run would produce bs_seleid_x / bs_seleid_y.
test = test.drop(columns=["bs_seleid", "seleid_diff"], errors="ignore")
test = test.merge(bs_seleid, on="accountnumber")
print(test.shape)

# Compare within the merged frame so both values belong to the same
# accountnumber. The previous version compared against the separate
# `bs_seleid` frame, which pairs rows by index label rather than by account
# (and raises if the two frames are not identically labelled).
# NOTE(review): a seleid that is missing on both sides counts as a difference
# because NaN == NaN is False - confirm that is the intended treatment.
test["seleid_diff"] = np.where(test["seleid"] == test["bs_seleid"], 0, 1)
freq(test["seleid_diff"])

(99982, 2305)


Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
seleid_diff,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,99982.0,1.0,99982.0,1.0


In [8]:
# Columns that are not treated as "rv" attributes: identifiers, the ten model
# output slots (name, score, rc1-rc6 each) and the run bookkeeping fields.
id_columns = ['accountnumber', 'historydateyyyymm', 'bus_company_name', 'powid', 'proxid', 'seleid',
              'orgid', 'ultid', 'lnlexidsele']
model_output_columns = []
for slot in range(1, 11):
    prefix = 'model' + str(slot)
    model_output_columns += [prefix + 'name', prefix + 'score']
    model_output_columns += [prefix + 'rc' + str(rc) for rc in range(1, 7)]
exclude = id_columns + model_output_columns + ['errorcode', 'time_ms']

# Attribute families: anything with "sbfe" in its name, and every other
# column that is not excluded above.
sbfe_attr = [col for col in baseline.columns if "sbfe" in col]
rv_attr = [col for col in baseline.columns if col not in sbfe_attr and col not in exclude]
model = ["model1score", "model2score"]

# Per-column list of exception values, keyed in baseline column order.
# The branch order gives model scores precedence over the attribute families
# should a name ever fall into more than one group.
# NOTE(review): the value lists are presumably special / no-data codes for
# each family - confirm against the attribute documentation.
exception_dict = dict()
for col in baseline.columns:
    if col in model:
        exception_dict[col] = [0, 100, 200, 222]
    elif col in rv_attr:
        exception_dict[col] = [-1]
    elif col in sbfe_attr:
        exception_dict[col] = [-99, -98, -97]

In [9]:
from dsgtools.reporting import ImpactAnalysisReport

# Score buckets shared by both model scores; 0/100/200/222 are passed as
# exception values so they are kept out of the numeric cuts.
score_cuts = [-np.inf, 500, 550, 600, 625, 650, 675, 700, 725, 750, 775, 800, 825, 850, 900, np.inf]
fmt_score = make_format(cuts=score_cuts, exceptions=[0, 100, 200, 222])

# Compare baseline vs test, matching rows on accountnumber.
cr = ImpactAnalysisReport(
    baseline,
    test,
    keys="accountnumber",
    score_formats={score_name: fmt_score for score_name in ("model1score", "model2score")},
    score_columns=["model1score", "model2score", "sbfehitindex"],
    exceptions=exception_dict,
    dataframe_names=("baseline", "test"),
)

# NOTE(review): the file name says 1601 while the active load cell points at
# the 2105 result files - confirm the name matches the data being compared.
report_path = "./_temp/air1120_1601_data-comparison.xlsx"
cr.to_xlsx(report_path, overwrite=True)

<dsgtools.reporting.tablewriter.TableWriter at 0x7ff9113d57e0>

<Figure size 640x480 with 0 Axes>

In [10]:
# Confirm which model name was returned in the first model slot of the baseline run.
freq(baseline["model1name"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
model1name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SBBM1601_0_0,99982.0,1.0,99982.0,1.0


In [None]:
## 2105 examples for tradeline pull:

In [4]:
# Show the example accounts from the test run with the revolving-utilization
# attributes that are being checked.
example_accounts = [76047, 698578, 751924]
example_columns = ["accountnumber", "historydateyyyymm", "seleid", "sbfeaccountcount",
                   "sbfeutilrevolving03m", "sbfeutilrevolving06m", "sbfeutilrevolving12m"]
test.loc[test["accountnumber"].isin(example_accounts), example_columns]

Unnamed: 0,accountnumber,historydateyyyymm,seleid,sbfeaccountcount,sbfeutilrevolving03m,sbfeutilrevolving06m,sbfeutilrevolving12m
8421,751924,202109,140763572,7.0,7.0,27.0,-97.0
33964,698578,202110,129612605,7.0,59.0,7.0,5.0
37938,76047,202108,13445417,10.0,83.0,86.0,100.0


In [5]:
# Same example accounts and attributes, taken from the baseline run for a
# side-by-side comparison with the test run.
baseline_example_accounts = [76047, 698578, 751924]
baseline_example_columns = ["accountnumber", "historydateyyyymm", "seleid", "sbfeaccountcount",
                            "sbfeutilrevolving03m", "sbfeutilrevolving06m", "sbfeutilrevolving12m"]
baseline.loc[baseline["accountnumber"].isin(baseline_example_accounts), baseline_example_columns]

Unnamed: 0,accountnumber,historydateyyyymm,seleid,sbfeaccountcount,sbfeutilrevolving03m,sbfeutilrevolving06m,sbfeutilrevolving12m
8061,751924,202109,140763572,7.0,7.0,27.0,-97.0
32529,698578,202110,129612605,7.0,59.0,7.0,56.0
36346,76047,202108,13445417,10.0,83.0,86.0,100.0


In [None]:
### performance & attributes
# Account key plus every SBFE utilization attribute (four families x seven
# look-back windows). The earlier draft of this list did not parse: some
# names were unquoted, the missing commas would have fused the quoted names
# into one string, and the key was misspelled as "accountsnumber".
# NOTE(review): other cells lowercase these names before selecting from the
# loaded frames - apply the same `.lower()` step before using this list.
keep = [
    "accountnumber",
    # revolving
    'SBFEUtilRevolving03M', 'SBFEUtilRevolving06M', 'SBFEUtilRevolving12M', 'SBFEUtilRevolving24M',
    'SBFEUtilRevolving36M', 'SBFEUtilRevolving60M', 'SBFEUtilRevolving84M',
    # line
    'SBFEUtilLine03M', 'SBFEUtilLine06M', 'SBFEUtilLine12M', 'SBFEUtilLine24M',
    'SBFEUtilLine36M', 'SBFEUtilLine60M', 'SBFEUtilLine84M',
    # card
    'SBFEUtilCard03M', 'SBFEUtilCard06M', 'SBFEUtilCard12M', 'SBFEUtilCard24M',
    'SBFEUtilCard36M', 'SBFEUtilCard60M', 'SBFEUtilCard84M',
    # open-ended line
    'SBFEUtilOELine03M', 'SBFEUtilOELine06M', 'SBFEUtilOELine12M', 'SBFEUtilOELine24M',
    'SBFEUtilOELine36M', 'SBFEUtilOELine60M', 'SBFEUtilOELine84M',
]
