In [1]:
# Libraries needed: 
import time
import pickle
import pandas as pd
import numpy as np
import math

# internal tools
from dsgtools.reporting import make_format
from dsgtools.reporting import TableWriter
from dsgtools.reporting import freq
from dsgtools.reporting import bivariate
from dsgtools.reporting import col_summary
from dsgtools import azure

Matplotlib is building the font cache; this may take a moment.


## Sample preparation

#### SBFE Sample

In [4]:
# Load the SBFE customer-input extract; dtype=str keeps every column as text.
path = "Analytics/Personal Folders/liuwei01/2023/ARMBS_ticket/2390_SOS/combined_prod_liuwei01_37119_customer_input_W20230221-140008.csv"
SBFE_combined = pd.read_adls(
    path,
    reader=pd.read_csv,
    encoding='iso-8859-1',
    dtype=str,
)
print(SBFE_combined.shape)

(981928, 42)


In [5]:
# Per row, count how many of the four core business fields are null,
# then tabulate the distribution of that count.
SBFE_combined["blank_ct"] = (
    SBFE_combined[['businessname', 'businessaddress', 'businesscity', 'businessstate']]
    .isna()
    .sum(axis=1)
)
freq(SBFE_combined["blank_ct"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
blank_ct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,891481.0,0.907888,891481.0,0.907888
1,33132.0,0.033742,924613.0,0.94163
2,41404.0,0.042166,966017.0,0.983796
3,3845.0,0.003916,969862.0,0.987712
4,12066.0,0.012288,981928.0,1.0


In [6]:
# Keep only fully populated rows and the input columns of interest, then
# retain one record per (name, address, city, state): sorting by dateadded
# last and keeping "last" selects the record with the greatest dateadded.
keep = ['transactionid', 'dateadded', 'businessname', 'businessaddress', 'businesscity','businessstate', 'businesszip', 'businessphone',]
SBFE_combined = (
    SBFE_combined.loc[SBFE_combined["blank_ct"] == 0, keep]
    .sort_values(by=['businessname', 'businessaddress', 'businesscity','businessstate', 'dateadded'])
    .drop_duplicates(
        subset=['businessname', 'businessaddress', 'businesscity','businessstate'],
        keep="last",
        ignore_index=True,
    )
)
print(SBFE_combined.shape)

(795958, 8)


In [7]:
# Distribution of records by the first six characters of dateadded (YYYYMM in the output).
freq(SBFE_combined["dateadded"].astype(str).str[:6])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
dateadded,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
202210,167259.0,0.210135,167259.0,0.210135
202211,167168.0,0.210021,334427.0,0.420157
202212,158105.0,0.198635,492532.0,0.618791
202301,187094.0,0.235055,679626.0,0.853847
202302,116332.0,0.146153,795958.0,1.0


In [17]:
# Drop rows whose dateadded starts with "202210", then draw a reproducible
# 500k sample without replacement (fixed random_state).
SBFE_combined = SBFE_combined[
    SBFE_combined["dateadded"].astype(str).str.slice(0, 6) != "202210"
]
temp_sample = SBFE_combined.sample(
    n=500_000,
    replace=False,
    random_state=0,
    ignore_index=True,
)
print(temp_sample.shape)

(500000, 8)


In [18]:
# Add an empty FEIN column and tag every row with its source.
temp_sample = temp_sample.assign(
    business_fein="",
    source="SBFE_combined",
)

#### BIID Sample

In [9]:
# Load the BIID input extract; dtype=str keeps every column as text.
path = "Analytics/Personal Folders/liuwei01/2023/ARMBS_ticket/2390_SOS/prod_liuwei01_37109_biid2_input_W20230221-133008.csv"
biid = pd.read_adls(
    path,
    reader=pd.read_csv,
    encoding='iso-8859-1',
    dtype=str,
)
print(biid.shape)

(7411276, 93)


In [11]:
# Select the BIID input fields and rename them to the SBFE column names so
# the two samples can be stacked later.
keep = ['transaction_id', 'datetime', 'incompanyname', 'incompanystreetaddress', 'incompanycity','incompanystate', 'incompanyzip5', 'incompanyphone', "incompanyfein"]
# .copy() detaches the subset from the wide raw frame. Without it, the column
# rename and the blank_ct assignment below write to a slice of the original
# DataFrame, which raises pandas' SettingWithCopyWarning.
biid = biid[keep].copy()
biid.columns = ['transactionid', 'dateadded', 'businessname', 'businessaddress', 'businesscity','businessstate', 'businesszip', 'businessphone', "business_fein"]
# Per row, count how many of the four core business fields are null.
biid["blank_ct"] = biid[['businessname', 'businessaddress', 'businesscity', 'businessstate']].isnull().sum(axis = 1)
freq(biid["blank_ct"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  biid["blank_ct"] = biid[['businessname', 'businessaddress', 'businesscity', 'businessstate']].isnull().sum(axis = 1)


Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
blank_ct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,6978629.0,0.9416231,6978629.0,0.941623
1,13883.0,0.001873227,6992512.0,0.943496
2,7477.0,0.001008868,6999989.0,0.944505
3,5.0,6.746477e-07,6999994.0,0.944506
4,411282.0,0.05549409,7411276.0,1.0


In [12]:
# Keep fully populated rows, then retain one record per
# (name, address, city, state): the one with the greatest dateadded.
biid = (
    biid[biid["blank_ct"] == 0]
    .sort_values(by=['businessname', 'businessaddress', 'businesscity','businessstate', 'dateadded'])
    .drop_duplicates(
        subset=['businessname', 'businessaddress', 'businesscity','businessstate'],
        keep="last",
        ignore_index=True,
    )
)
print(biid.shape)

(5616978, 10)


In [13]:
# Distribution of records by the first six characters of dateadded (YYYYMM in the output).
freq(biid["dateadded"].astype(str).str[:6])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
dateadded,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
202210,1178829.0,0.209869,1178829.0,0.209869
202211,1143592.0,0.203596,2322421.0,0.413464
202212,1140410.0,0.203029,3462831.0,0.616494
202301,1308767.0,0.233002,4771598.0,0.849496
202302,845380.0,0.150504,5616978.0,1.0


In [16]:
# Restrict to rows whose dateadded starts with "202302", then draw a
# reproducible 500k sample without replacement.
biid = biid[biid["dateadded"].astype(str).str.slice(0, 6) == "202302"]
temp_sample_biid = biid.sample(
    n=500_000,
    replace=False,
    random_state=0,
    ignore_index=True,
)
print(temp_sample_biid.shape)
# The source tag is added after the shape is printed, so the printed width excludes it.
temp_sample_biid["source"] = "IIDBv2"

(500000, 10)


#### Final Sample

In [19]:
# Stack the SBFE and BIID samples into one frame and confirm the split by source.
# NOTE(review): `blank_ct` is only present on the BIID sample, so it is carried
# into `final` (null for the SBFE rows) -- confirm it is wanted downstream.
final = pd.concat(
    [temp_sample, temp_sample_biid],
    ignore_index=True,
)
print(final.shape)
freq(final["source"])

(1000000, 11)


Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
IIDBv2,500000.0,0.5,500000.0,0.5
SBFE_combined,500000.0,0.5,1000000.0,1.0


In [22]:
# Write the combined sample to ADLS as parquet, replacing any existing file.
final.to_adls(
    "Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/ARMBS_2390/sbfe_combined_biid_1mil_2390_input.parquet",
    format='.parquet',
    overwrite=True,
)

## Bullet 2 for engineering - get BIPIDs

In [2]:
# Author's note: there was an error in this round of the corp-key pull.
path = "Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/ARMBS_2390/wl_0405_ln_11605_20230318_busv31_credit_2390_corp_key.csv"
corpkey = pd.read_adls(
    path,
    reader=pd.read_csv,
    encoding='iso-8859-1',
    dtype=str,
)
print(corpkey.shape)

(163476, 263)


In [4]:
# Reduce the corp-key extract to its distinct seleid values.
corpkey = corpkey[['seleid']].drop_duplicates(subset=["seleid"])
print(corpkey.shape)

# NOTE(review): `shell` is only loaded in a cell further down this notebook,
# so this cell fails on a fresh top-to-bottom run unless that load is moved up.
# Outer-join the shell records to the corp-key seleids so unmatched rows on
# either side are kept.
shell_temp = shell[["account", "id_seleid", "sos_inc_filing_count"]].copy()
merged = shell_temp.merge(
    corpkey,
    how="outer",
    left_on="id_seleid",
    right_on="seleid",
)
print(merged.shape)

(83911, 1)
(1000001, 4)


In [18]:
# Inspect the shell record for a single account.
shell.loc[shell["account"] == "172893681R469623"]

Unnamed: 0,account,seq,id_powid,id_proxid,id_seleid,id_orgid,id_ultid,id_seleid_change_flag,id_seleid_change_code,id_weight,...,be_b2bfltrecflagbymonsum24mc,be_b2bmatrecflagbymonsum24mc,be_b2bopsrecflagbymonsum24mc,be_b2bothrecflagbymonsum24mc,be_b2bbalvol24mc,be_b2bcarrbalvol24mc,be_b2bfltbalvol24mc,be_b2bmatbalvol24mc,be_b2bopsbalvol24mc,be_b2bothbalvol24mc
999997,172893681R469623,0,122167816483,133686899179,147421848,147421848,147421848,0,0,89,...,-99998,-99998,-99998,6,0,-99998,-99998,-99998,-99998,0


Bad pipe message: %s [b"])f\xd6\xc2\xa5\x1b+\xdd\x02\xe3\x86j^\x00 \xeby \xb2\x0fh\xd2Z\xcc~\x9b\x84\x7f\xa6\x1fE\x80W\xae\xfd%Z=;\xe4.\x0c\xf2 y/\x1a\xd8'\xd0\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff", b'']
Bad pipe message: %s [b'a\xe8\x000<\x82\x83\xf7\xf3"\xac\xa98%\xc4\x1e\xech\x00\x00']
Bad pipe message: %s [b",\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff"]
Bad pipe message: %s [b'']
Bad pipe message: %s [b"L2\x9c\x07\xa2]\xed\xd6y\xf6\x8d\x94y2\xa0 \xee]\x00\x00\xa6\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0

In [6]:
# Cast the filing count to float so it can be compared numerically.
merged["sos_inc_filing_count"] = merged["sos_inc_filing_count"].astype(float)

In [7]:
# Shell rows with a non-"0" seleid and at least one SOS filing that found no
# match in the corp-key extract.
merged[
    (merged["id_seleid"] != "0")
    & merged["seleid"].isna()
    & (merged["sos_inc_filing_count"] >= 1)
]

Unnamed: 0,account,id_seleid,sos_inc_filing_count,seleid
252528,172682391R21658,138092217243,1.0,
252529,172682291R11106,138813434501,1.0,
252530,172566641R1475,135061959143,1.0,
252531,172682141R9172,139774123066,1.0,
252532,172682151R7080,139774123066,1.0,
...,...,...,...,...
999992,172893611R291940,135369937239,1.0,
999993,172893591R264249,139774073733,1.0,
999996,172943481R80934,1259286060,1.0,
999997,172887041R10349,62310678,1.0,


In [7]:
# Preview the first five corp-key rows.
corpkey.head(5)

Unnamed: 0,ultid,orgid,seleid,proxid,powid,empid,dotid,ultscore,orgscore,selescore,...,corp_prep_addr1_last_line,append_addr2_rawaid,append_addr2_aceaid,corp_prep_addr2_line1,corp_prep_addr2_last_line,append_ra_rawaid,append_ra_aceaid,ra_prep_addr_line1,ra_prep_addr_last_line,fp
0,2578,2578,2578,0,0,0,0,91,91,91,...,,0,0,,,0,0,,,0
1,2809241,2809241,2809241,0,0,0,0,100,100,100,...,,0,0,,,1070373674799,108359336663,2242 CRESTLINE LOOP,"NORTH LAS VEGAS, NV 89030",0
2,2809241,2809241,2809241,0,0,0,0,100,100,100,...,,0,0,,,1070373674799,108359336663,2242 CRESTLINE LOOP,"NORTH LAS VEGAS, NV 89030",0
3,2809241,2809241,2809241,0,0,0,0,100,100,100,...,,0,0,,,1070373674799,108359336663,2242 CRESTLINE LOOP,"NORTH LAS VEGAS, NV 89030",0
4,2809241,2809241,2809241,0,0,0,0,100,100,100,...,,0,0,,,1070373674799,108359336663,2242 CRESTLINE LOOP,"NORTH LAS VEGAS, NV 89030",0


In [2]:
# Load the full gzip-compressed business shell; dtype=str keeps every column as text.
path = "Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/ARMBS_2390/ln_11605_20230318_busv31_credit_w20230320-120136_sas_layout_busshell.csv.gz"
shell = pd.read_adls(
    path,
    reader=pd.read_csv,
    compression="gzip",
    encoding='iso-8859-1',
    dtype=str,
)
print(shell.shape)

(1000000, 3115)


In [6]:
# Check that `account` uniquely identifies a shell row.
shell["account"].is_unique

True

In [4]:
# Total shell rows minus the "<= 0" and "-1" counts from the
# sos_inc_filing_count frequency table.
# NOTE(review): the 3 "Missing" rows in that table are not subtracted here.
1_000_000 - 172_589.0 - 338_565.0

488846.0

In [3]:
# Cast the filing count to float and tabulate it in buckets; -1 is passed as
# an exception value and is listed as its own row in the output.
shell["sos_inc_filing_count"] = shell["sos_inc_filing_count"].astype(float)
fmt = make_format(
    cuts=[-np.inf, 0, 1, 2, 3, 4, 5, np.inf],
    exceptions=[-1],
)
freq(shell["sos_inc_filing_count"], format=[fmt])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
sos_inc_filing_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<= 0,172589.0,0.172589,172589.0,0.172589
1,396937.0,0.396937,569526.0,0.569526
2,61691.0,0.061691,631217.0,0.631217
3,14038.0,0.014038,645255.0,0.645255
4,5464.0,0.005464,650719.0,0.650719
5,2572.0,0.002572,653291.0,0.653291
6+,8141.0,0.008141,661432.0,0.661432
-1,338565.0,0.338565,999997.0,0.999997
Missing,3.0,3e-06,1000000.0,1.0


Bad pipe message: %s [b'x\x9aY\xc0*\x01t\xc0d\xcf\x93*\xc4VE}\xf6\xb2 \xf2[\xa3\xd6\xda\x922\x194\xf2s\x9b\x8e\xbe\x1f\x1eb*ritv\x17\x9d\x08\x97\xc9MV\x14\x8bR\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d']
Bad pipe message: %s [b'PT\x0fY(\xf4\xa9\xbe\xa3\x06\xb7\x0c\xc9)\x13\xd8\xdb', b'\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0']
Bad pipe message: %s [b"V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1"]
Bad pipe message: %s [b"p\xb2m\xdaW\xa8\x9aM\xc3Y\x1fsw\xd8:\x01\x98f\x00\x00\xa6\xc0,\xc00\x00\xa3\x00\x9f\

In [14]:
# Bucket id_seleid into "<= 0" vs "1+" for the frequency table.
# The float cast is applied to a temporary Series only. Overwriting
# shell["id_seleid"] in place would make the CSV export of `shell` further
# down this section write the IDs as floats (with a trailing ".0") instead
# of the original strings when the notebook is run top to bottom.
fmt = make_format(cuts = [-np.inf, 0, np.inf])
freq(shell["id_seleid"].astype(float), format = [fmt])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
id_seleid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<= 0,252527.0,0.252527,252527.0,0.252527
1+,747470.0,0.74747,999997.0,0.999997
Missing,3.0,3e-06,1000000.0,1.0


In [7]:
# Distribution of the seleid change flag.
freq(shell["id_seleid_change_flag"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
id_seleid_change_flag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,999997.0,0.999997,999997.0,0.999997
Missing,3.0,3e-06,1000000.0,1.0


In [8]:
# Distribution of the seleid change code.
freq(shell["id_seleid_change_code"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
id_seleid_change_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1000000.0,1.0,1000000.0,1.0


In [9]:
# Distribution of id_truebiz.
freq(shell["id_truebiz"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
id_truebiz,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,252527.0,0.252527,252527.0,0.252527
0,86038.0,0.086038,338565.0,0.338565
1,661432.0,0.661432,999997.0,0.999997
Missing,3.0,3e-06,1000000.0,1.0


In [13]:
# Export the account, linking IDs, truebiz flag and history date columns as CSV (no index column).
path = "Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/ARMBS_2390/ln_11605_20230318_busv31_credit_w20230320-120136_for_Heberton_2390.csv"
shell[
    [
        'account', 'id_powid', 'id_proxid', 'id_seleid', 'id_orgid',
        'id_ultid', 'id_truebiz', 'history_date', 'history_datetime',
    ]
].to_adls(path, index=False, format=".csv")

In [11]:
# Eyeball the history_datetime values.
shell["history_datetime"]

0         20230318
1         20230318
2         20230318
3         20230318
4         20230318
            ...   
999995    20230318
999996    20230318
999997    20230318
999998    20230318
999999    20230318
Name: history_datetime, Length: 1000000, dtype: object

## Analysis

In [2]:
# Reload the business shell, reading only the ID, truebiz, history date and
# SOS incorporation-filing columns listed in `keep`.
path = "Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/ARMBS_2390/ln_11605_20230318_busv31_credit_w20230320-120136_sas_layout_busshell.csv.gz"
keep = [
    'account', 'id_powid', 'id_proxid', 'id_seleid', 'id_orgid', 'id_ultid',
    'id_truebiz', 'history_datetime',
    'sos_inc_filing_count', 'sos_inc_filing_firstseen', 'sos_inc_filing_lastseen',
]
shell = pd.read_adls(
    path,
    reader=pd.read_csv,
    compression="gzip",
    encoding='iso-8859-1',
    dtype=str,
    usecols=keep,
)
print(shell.shape)

(1000000, 11)


In [3]:
# Cast the filing count to float, then tabulate it with finer upper buckets
# (6-10, 11-50, 51+ in the output); observed=True is passed through to freq.
shell["sos_inc_filing_count"] = shell["sos_inc_filing_count"].astype(float)
fmt = make_format(
    cuts=[-np.inf, -1, 0, 1, 2, 3, 4, 5, 10, 50, np.inf],
    exceptions=[-1],
)
freq("sos_inc_filing_count", df=shell, format=[fmt], observed=True)

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
sos_inc_filing_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,172589.0,0.172589,172589.0,0.172589
1,396937.0,0.396937,569526.0,0.569526
2,61691.0,0.061691,631217.0,0.631217
3,14038.0,0.014038,645255.0,0.645255
4,5464.0,0.005464,650719.0,0.650719
5,2572.0,0.002572,653291.0,0.653291
6-10,4240.0,0.00424,657531.0,0.657531
11-50,3655.0,0.003655,661186.0,0.661186
51+,246.0,0.000246,661432.0,0.661432
-1,338565.0,0.338565,999997.0,0.999997


In [4]:
# Number of distinct id_seleid values among rows with at least one SOS filing.
len(
    shell.loc[shell["sos_inc_filing_count"] > 0, "id_seleid"].unique()
)

428845

#### BR Key

In [5]:
# Load the BR key extract; dtype=str keeps every column as text.
path = "Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/ARMBS_2390/0410_ln_11605_20230318_busv31_credit_2390 (BR) rachel_edit.csv"
BR_key = pd.read_adls(
    path,
    reader=pd.read_csv,
    encoding='iso-8859-1',
    dtype=str,
    quotechar='"',
)
print(BR_key.shape)

(395474, 500)


In [6]:
# Write the per-column null counts of BR_key to a scratch workbook.
# NOTE(review): the sheet name "fair_lending" does not describe this content --
# confirm whether it was carried over from another notebook.
wb = TableWriter(filename = "./_temp/test1.xlsx", options={'nan_inf_to_errors': True}, overwrite = True)

try:
    wb.write_table(
        pd.DataFrame(BR_key.isnull().sum()),
        sheetname="fair_lending",
    )
finally:
    # Always close the writer, even if write_table raises, so the workbook
    # is not left open.
    wb.close()

#### Corp Key

In [5]:
# Load the (modified) corp key extract; dtype=str keeps every column as text.
path = "Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/ARMBS_2390/0412_ln_11605_20230318_busv31_credit_2390_corp_key (modified).csv"
corp_key = pd.read_adls(
    path,
    reader=pd.read_csv,
    encoding='iso-8859-1',
    dtype=str,
)
print(corp_key.shape)

(1668884, 266)


In [6]:
# Write the per-column null counts of corp_key to a scratch workbook.
# NOTE(review): the sheet name "fair_lending" does not describe this content --
# confirm whether it was carried over from another notebook.
wb = TableWriter(filename = "./_temp/test.xlsx", options={'nan_inf_to_errors': True}, overwrite = True)

try:
    wb.write_table(
        pd.DataFrame(corp_key.isnull().sum()),
        sheetname="fair_lending",
    )
finally:
    # Always close the writer, even if write_table raises, so the workbook
    # is not left open.
    wb.close()

In [7]:
# List the columns currently held in `shell`.
shell.columns

Index(['account', 'id_powid', 'id_proxid', 'id_seleid', 'id_orgid', 'id_ultid',
       'id_truebiz', 'history_datetime', 'sos_inc_filing_count',
       'sos_inc_filing_firstseen', 'sos_inc_filing_lastseen'],
      dtype='object')

In [8]:
# Build one row per positive seleid from the shell, keeping the row with the
# greatest history_datetime, then keep only seleids with at least one SOS
# filing. Shapes are printed after each narrowing step.
shell["id_seleid"] = shell["id_seleid"].astype(float)
seleid_input = (
    shell.loc[
        shell["id_seleid"] > 0,
        ["id_seleid", "history_datetime", "sos_inc_filing_count"],
    ]
    .sort_values(by=['id_seleid', "history_datetime"])
)
print(seleid_input.shape)
seleid_input = seleid_input.drop_duplicates(
    subset=["id_seleid"],
    keep="last",
    ignore_index=True,
)
print(seleid_input.shape)
seleid_input = seleid_input[seleid_input["sos_inc_filing_count"] > 0]
print(seleid_input.shape)

(747470, 3)
(666666, 3)
(428845, 3)


In [9]:
# One row per seleid from the corp key: drop rows without an incorporation
# date, then keep the row that sorts last on corp_inc_date.
seleid_corp = corp_key[["seleid", "corp_inc_state", "corp_inc_date"]].copy()
seleid_corp = (
    seleid_corp[seleid_corp["corp_inc_date"].notna()]
    .sort_values(by=['seleid', "corp_inc_date"])
)
print(seleid_corp.shape)
seleid_corp = seleid_corp.drop_duplicates(
    subset=["seleid"],
    keep="last",
    ignore_index=True,
)
# Cast to float so the join key has the same dtype as the shell-side id_seleid.
seleid_corp["seleid"] = seleid_corp["seleid"].astype(float)
# Prefix every column with "corp_". Columns that already start with "corp_"
# end up doubled (e.g. "corp_corp_inc_state").
seleid_corp.columns = [f"corp_{col}" for col in seleid_corp.columns]
print(seleid_corp.shape)

(1252473, 3)
(378150, 3)


In [10]:
# Outer-join shell seleids to corp-key seleids so that unmatched keys on
# either side are retained.
merged = seleid_input.merge(
    seleid_corp,
    how="outer",
    left_on="id_seleid",
    right_on="corp_seleid",
)
print(merged.shape)

(436070, 6)


In [11]:
# Shape of the rows present only on the corp-key side of the join.
merged[merged["corp_seleid"].notna() & merged["id_seleid"].isna()].shape

(7225, 6)

In [12]:
# Shape of the rows present only on the shell side of the join.
merged[merged["corp_seleid"].isna() & merged["id_seleid"].notna()].shape

(57920, 6)

In [32]:
# Inspect the shell row(s) for one specific seleid.
shell.loc[shell["id_seleid"] == 139820665622]

Unnamed: 0,account,id_powid,id_proxid,id_seleid,id_orgid,id_ultid,id_truebiz,history_datetime,sos_inc_filing_count,sos_inc_filing_firstseen,sos_inc_filing_lastseen
62,172682431R19865,139820665622,139820665622,139820665622,139820665622,139820665622,0,20230318,-1,-1,-1


In [14]:
# Look up seleid "1611" in the raw corp key (seleid is stored as text there).
corp_key.loc[corp_key["seleid"] == "1611"]

Unnamed: 0,ultid,orgid,seleid,proxid,powid,empid,dotid,ultscore,orgscore,selescore,...,corp_prep_addr1_last_line,append_addr2_rawaid,append_addr2_aceaid,corp_prep_addr2_line1,corp_prep_addr2_last_line,append_ra_rawaid,append_ra_aceaid,ra_prep_addr_line1,ra_prep_addr_last_line,fp


In [13]:
# Render floats without decimals so the float seleids display as whole numbers.
# This changes the pandas display option for the rest of the session.
pd.set_option('display.float_format', lambda value: '%.f' % value)
# Rows present only on the shell side of the join.
merged[merged["corp_seleid"].isna() & merged["id_seleid"].notna()]

Unnamed: 0,id_seleid,history_datetime,sos_inc_filing_count,corp_seleid,corp_corp_inc_state,corp_corp_inc_date
4,1611,20230318,1,,,
9,5745,20230318,2,,,
13,9951,20230318,7,,,
14,10994,20230318,1,,,
22,31774,20230318,15,,,
...,...,...,...,...,...,...
428819,139872310404,20230318,1,,,
428821,139872313501,20230318,1,,,
428825,139872388708,20230318,1,,,
428833,139872499826,20230318,1,,,


In [27]:
# Spot-check shell rows for a handful of seleids that had no corp-key match.
shell.loc[
    shell["id_seleid"].isin([1611, 5745, 9951, 10994, 31774, 139872310404])
]

Unnamed: 0,account,id_powid,id_proxid,id_seleid,id_orgid,id_ultid,id_truebiz,history_datetime,sos_inc_filing_count,sos_inc_filing_firstseen,sos_inc_filing_lastseen
45348,172649371R974,0,0,1611,1611,1611,1,20230318,1,20060623,20060623
92642,172943491R18638,31774,31774,31774,31774,31774,1,20230318,15,19660429,20050104
250568,172514275R264849,98671666,98671666,9951,9951,9951,1,20230318,7,19410505,20140714
424744,172582181R4509,139054402060,139872310404,139872310404,139054402060,139054402060,1,20230318,1,20230120,20230120
551355,172864121R1623,3806193480,3806193480,5745,5745,5745,1,20230318,2,20080520,20080520
730938,172564611R1304,122141018657,122141018657,10994,10994,10994,1,20230318,1,20070119,20070119
889674,172911231R5054,98671666,98671666,9951,9951,9951,1,20230318,7,19410505,20140714


In [13]:
# Distinct seleids from the BR key, with the column renamed to br_seleid.
seleid_br = (
    BR_key[["seleid"]]
    .copy()
    .drop_duplicates(subset=["seleid"], ignore_index=True)
)
seleid_br.columns = ["br_seleid"]
print(seleid_br.shape)

(272992, 1)


In [6]:
## merge
seleid_corp = corp_key[["seleid"]].copy()
seleid_corp = seleid_corp.drop_duplicates(subset = ["seleid"], ignore_index = True)
seleid_br = BR_key[["seleid"]].copy()
seleid_br = seleid_br.drop_duplicates(subset = ["seleid"], ignore_index = True)
seleid_br.columns = ["br_seleid"]
temp = seleid_corp.merge(seleid_br, how = "outer", left_on = "seleid", right_on = "br_seleid")
print(temp.shape)

(443885, 2)


In [12]:
# Shape of the corp-key seleid frame.
seleid_corp.shape

(390792, 1)

In [13]:
# Shape of the BR-key seleid frame.
seleid_br.shape

(272992, 1)

In [9]:
# Shape of the rows whose seleid appears in the BR key but not in the corp key.
temp[temp["seleid"].isna() & temp["br_seleid"].notna()].shape

(53093, 2)

In [11]:
# Rows whose seleid appears in the BR key but not in the corp key.
temp[temp["seleid"].isna() & temp["br_seleid"].notna()]

Unnamed: 0,seleid,br_seleid
390792,,41836
390793,,51093
390794,,62295
390795,,72767
390796,,119702
...,...,...
443880,,139872616150
443881,,139872675284
443882,,139875518813
443883,,139875621151


In [10]:
# Shape of the rows whose seleid appears in the corp key but not in the BR key.
temp[temp["seleid"].notna() & temp["br_seleid"].isna()].shape

(170893, 2)