In [1]:
import time
import pandas as pd
import numpy as np
import math

# internal tools
from dsgtools.reporting import make_format
from dsgtools.reporting import TableWriter
from dsgtools.reporting import freq
from dsgtools.reporting import bivariate

Matplotlib is building the font cache; this may take a moment.


#### How does the input look?

In [2]:
# Load the filtered input file from ADLS. Every column is read as text
# (dtype=str), so no numeric type inference is applied.
# NOTE(review): pd.read_adls is not a stock pandas function — presumably it is
# registered by the internal tooling; confirm.
input_csv_path = "Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Data Modeling Clean/final_filtered_remove_dt_for_dt_modeling.csv"
input_df = pd.read_adls(input_csv_path, reader=pd.read_csv, dtype=str)
print(input_df.shape)

(7971880, 13)


In [3]:
# Load the file with the appended IDs; all columns are kept as text and the
# file is decoded as ISO-8859-1.
path = "Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/ECL_seleid_append/0417_air166_final_with_ids_appended.csv"
seleid = pd.read_adls(
    path,
    reader=pd.read_csv,
    encoding='iso-8859-1',
    dtype=str,
)
print(seleid.shape)
# row count observed on the recorded run: 17,311,232

(17311232, 16)


In [4]:
# Number of missing values per column, shown as a one-column table.
input_df.isna().sum().to_frame()

Unnamed: 0,0
AccountNumber,0
CompanyName,0
AlternateCompanyName,7971880
Addr,0
City,221202
State,220546
Zip,460864
BusinessPhone,2437061
TaxIdNumber,7512130
HistoryDate,0


In [5]:
# Normalise both frames before matching: missing values become empty strings
# and surrounding whitespace is removed so the join keys compare exactly.
# Both frames are read with dtype=str, so after fillna("") every cell is a
# string; the vectorised .str.strip() therefore gives the same result as the
# former per-cell applymap lambda, which is slow on frames of this size and is
# deprecated in recent pandas releases.
input_df = input_df.fillna("").apply(lambda col: col.str.strip())
seleid = seleid.fillna("").apply(lambda col: col.str.strip())

# Keep one seleid row per distinct combination of the matching keys.
seleid = seleid.drop_duplicates(subset = ['companyname', 'alternatecompanyname', 'addr', 'city',
       'state', 'zip', 'businessphone', 'taxidnumber',])
print(seleid.shape)

(17306745, 16)


In [6]:
# Lower-case the input column names so they line up with the seleid file's
# column names used as merge keys.
input_df.columns = input_df.columns.str.lower()
input_df.columns

Index(['accountnumber', 'companyname', 'alternatecompanyname', 'addr', 'city',
       'state', 'zip', 'businessphone', 'taxidnumber', 'historydate',
       'sufficient_input', 'count_x', 'count_y'],
      dtype='object')

In [7]:
# Left-join the appended IDs onto the input rows using the full set of
# name/address/phone/tax-id fields as the key.
match_keys = [
    'companyname',
    'alternatecompanyname',
    'addr',
    'city',
    'state',
    'zip',
    'businessphone',
    'taxidnumber',
]
input_df = pd.merge(input_df, seleid, how="left", on=match_keys)
print(input_df.shape)

(7971880, 21)


In [8]:
# Rows that found no partner in the seleid file (right-hand columns are null).
input_df.loc[input_df["accountnumber_y"].isna()].shape

(7808, 21)

In [9]:
# Spot-check one company name from the unmatched rows directly in the seleid file.
seleid.loc[seleid["companyname"].eq("![CDATA[JMI JUDGE II INC]]>")]

Unnamed: 0,accountnumber,companyname,alternatecompanyname,addr,city,state,zip,businessphone,taxidnumber,historydate,powid,proxid,seleid,orgid,ultid,overallweight
6059188,AAA000000102910446,![CDATA[JMI JUDGE II INC]]>,,2877 EAST CHARLESTON BOULEVARD,SUITE 100 #100,LA,,,452772832,20220310,223951517,223951517,223951517,223951517,223951517,70


In [10]:
# First few input rows that did not match any seleid record.
input_df.loc[input_df["accountnumber_y"].isna()].head()

Unnamed: 0,accountnumber_x,companyname,alternatecompanyname,addr,city,state,zip,businessphone,taxidnumber,historydate_x,...,count_x,count_y,accountnumber_y,historydate_y,powid,proxid,seleid,orgid,ultid,overallweight
457,AAA000000102755325,![CDATA[JMI JUDGE II INC]]>,,2877 EAST CHARLESTON BOULEVARD,SUITE 100 #100,LA,VEGAS,,452772832.0,20220223,...,1,3,,,,,,,,
458,AAA000000102755360,![CDATA[JMI JUDGE II INC]]>,,2877 EAST CHARLESTON BOULEVARD,SUITE 100 #100,LA,VEGAS,,452772832.0,20220223,...,1,3,,,,,,,,
459,AAA000000102910446,![CDATA[JMI JUDGE II INC]]>,,2877 EAST CHARLESTON BOULEVARD,SUITE 100 #100,LA,VEGAS,,452772832.0,20220310,...,1,3,,,,,,,,
994,AAA000000079317032,"""FUTURE NETWORK CERT"", GESELLSCHAFT ZUR MULTID...",,KAISERSTRAÃÂE 14/2,WIEN,AT,01070,1522363637.0,,20220302,...,1,1,,,,,,,,
2225,AAA000000090182637,037634039,,1150 W STATE ROAD 436,FOREST CITY,NC,28043,0.0,,20220412,...,1,1,,,,,,,,


In [11]:
# Column names after the merge; columns present in both frames but not used as
# keys carry the _x (input) / _y (seleid file) suffixes.
input_df.columns

Index(['accountnumber_x', 'companyname', 'alternatecompanyname', 'addr',
       'city', 'state', 'zip', 'businessphone', 'taxidnumber', 'historydate_x',
       'sufficient_input', 'count_x', 'count_y', 'accountnumber_y',
       'historydate_y', 'powid', 'proxid', 'seleid', 'orgid', 'ultid',
       'overallweight'],
      dtype='object')

In [13]:
# Drop the rows without a seleid match, then persist the account-number ->
# seleid mapping for the BIID section below.
has_match = input_df["accountnumber_y"].notna()
input_df = input_df.loc[has_match]
print(input_df.shape)
input_df.loc[:, ["accountnumber_x", "seleid"]].to_parquet("./_temp/profile_seleid.parquet")

(7964072, 21)


#### BIID

In [2]:
# Base ADLS folder for the modeling data.
# NOTE(review): the value ends with "/" and the read cell that follows prepends
# "/_temp/", which yields a double slash in the final path — confirm the storage
# layer tolerates it.
path = "Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Data Modeling Clean/"

In [3]:
# Read the BIID parquet stored under the modeling folder's _temp directory.
biid_parquet_path = path + "/_temp/" + "biid_all.parquet"
biid = pd.read_adls(biid_parquet_path, reader=pd.read_parquet)
print(biid.shape)

(7971880, 347)


In [4]:
# Distribution of the BIID seleid, split at zero by the cut points below.
sign_cuts = [-np.inf, 0, np.inf]
fmt = make_format(cuts=sign_cuts)
freq(biid["seleid"].astype(float), observed=True, format=[fmt])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
seleid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<= 0,554690.0,0.069581,554690.0,0.069581
1+,7416688.0,0.930356,7971378.0,0.999937
Missing,502.0,6.3e-05,7971880.0,1.0


In [5]:
# Reload the account-number -> seleid mapping saved in the first section and
# give its two columns the names used for the BIID comparison.
input_df = pd.read_parquet("./_temp/profile_seleid.parquet")
print(input_df.shape)
input_df = input_df.set_axis(["accountnumber", "ecl_seleid"], axis=1)

(7964072, 2)


In [6]:
# Inner-join the ECL seleid onto the BIID rows by account number.
biid = pd.merge(biid, input_df, left_on="acctno", right_on="accountnumber")
print(biid.shape)

(7964072, 349)


In [7]:
# Rows where the BIID seleid differs from the ECL-appended seleid.
# NOTE(review): both columns are only cast to float in the next cell, so at
# this point a dtype difference may itself count as a mismatch — confirm.
biid.loc[biid["seleid"].ne(biid["ecl_seleid"])].shape

(523357, 349)

In [8]:
# Cast both seleid columns to float and tabulate them jointly, each split at
# zero by the same format.
fmt = make_format(cuts=[-np.inf, 0, np.inf])
for id_col in ("ecl_seleid", "seleid"):
    biid[id_col] = biid[id_col].astype(float)
freq(
    biid["ecl_seleid"],
    biid["seleid"],
    cross=False,
    observed=True,
    format=[fmt, fmt],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Count,Pct,Cuml Count,Cuml Pct
ecl_seleid,seleid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<= 0,<= 0,101040.0,0.012687,101040.0,0.012687
<= 0,1+,68627.0,0.008617,169667.0,0.021304
<= 0,Missing,372.0,4.7e-05,170039.0,0.021351
1+,<= 0,448742.0,0.056346,618781.0,0.077697
1+,1+,7345183.0,0.92229,7963964.0,0.999986
1+,Missing,108.0,1.4e-05,7964072.0,1.0


In [9]:
# Keep rows whose BIID seleid is non-zero and equal to the ECL seleid, then
# count how many rows share each (seleid, historydate) pair.
seleid_nonzero = biid["seleid"] != 0
seleid_agrees = biid["seleid"] == biid["ecl_seleid"]
biid = biid[seleid_nonzero & seleid_agrees]
print(biid.shape)
# Constant 1 per row; later cells sum this column again per seleid.
biid["count"] = 1
biid_roll = (
    biid.groupby(["seleid", "historydate"])
    .size()
    .reset_index(name='count')
)
biid_roll["count"].describe()

(7339675, 349)


count    6.636477e+06
mean     1.105960e+00
std      8.518872e-01
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      4.440000e+02
Name: count, dtype: float64

In [10]:
# Bucket the per-(seleid, historydate) row counts for display.
count_cuts = [-np.inf, -1, 0, 1, 2, 3, 4, 5, 10, 50, 100, np.inf]
fmt = make_format(cuts=count_cuts)
freq("count", df=biid_roll, format=fmt, observed=True)

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,6147397.0,0.926304,6147397.0,0.926304
2,431015.0,0.064946,6578412.0,0.991251
3,31555.0,0.004755,6609967.0,0.996005
4,10322.0,0.001555,6620289.0,0.997561
5,3977.0,0.000599,6624266.0,0.99816
6-10,6833.0,0.00103,6631099.0,0.99919
11-50,5254.0,0.000792,6636353.0,0.999981
51-100,83.0,1.3e-05,6636436.0,0.999994
101+,41.0,6e-06,6636477.0,1.0


In [11]:
# Row/column count before the de-duplication on (seleid, historydate) that follows.
print(biid.shape)

(7339675, 350)


In [12]:
# Keep the first row for every (seleid, historydate) pair and renumber the index.
dedupe_keys = ["seleid", "historydate"]
biid = biid.drop_duplicates(subset=dedupe_keys, ignore_index=True)
print(biid.shape)

(6636477, 350)


In [13]:
## Merge the raw input fields back in, then flag which of them were populated.
input_df = pd.read_adls(
    "Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Data Modeling Clean/final_filtered_remove_dt_for_dt_modeling.csv",
    reader=pd.read_csv,
    dtype=str,
)
print(input_df.shape)
biid = biid.merge(input_df, left_on="acctno", right_on="AccountNumber")
print(biid.shape)

# Input column -> name of its 0/1 "populated" flag (0 when null or empty string).
# Insertion order matches the order in which the flag columns are created.
populated_flags = {
    "CompanyName": "CompanyName_flag",
    "AlternateCompanyName": "AlternateCompanyName_flag",
    "BusinessPhone": "BusinessPhone_flag",
    "TaxIdNumber": "TaxIdNumber_flag",
    "Addr": "addr_flag",
    "City": "City_flag",
    "State": "State_flag",
    "Zip": "Zip_flag",
}
for source_col, flag_col in populated_flags.items():
    is_blank = biid[source_col].isnull() | (biid[source_col] == "")
    biid[flag_col] = np.where(is_blank, 0, 1)

(7971880, 13)
(6636477, 363)


In [17]:
# Show every column (this display option stays set for the rest of the session)
# and look at two rows whose input city was blank.
pd.set_option('display.max_columns', None)
biid.loc[biid["City_flag"].eq(0)].head(2)

Unnamed: 0,acctno,historydate,transaction_id,numbervalidauthrepsinput,vercmpy,veraddr,vercity,verstate,verzip,verphone,verfein,cnamematchflag,addrmatchflag,citymatchflag,statematchflag,zipmatchflag,phonematchflag,feinmatchflag,bestcompanyname,bestaddr,bestcity,beststate,bestzip,bestzip4,bestphone,bestfein,ultid,orgid,seleid,proxid,powid,bvi,bvi_desc,bus_ri_1,bus_ri_desc_1,bus_ri_2,bus_ri_desc_2,bus_ri_3,bus_ri_desc_3,bus_ri_4,bus_ri_desc_4,bus_ri_5,bus_ri_desc_5,bus_ri_6,bus_ri_desc_6,bus_ri_7,bus_ri_desc_7,bus_ri_8,bus_ri_desc_8,residential_bus_indicator,residential_bus_desc,phone_verification,phone_ver_desc,bureau_verification,bureau_ver_desc,govt_reg_verification,govt_reg_ver_desc,pubrec_filings_verification,pubrec_filings_ver_desc,bus_directories_verification,bus_directories_ver_desc,bus_phone_match_company_1,bus_phone_match_prim_range_1,bus_phone_match_predir_1,bus_phone_match_prim_name_1,bus_phone_match_suffix_1,bus_phone_match_postdir_1,bus_phone_match_unit_desig_1,bus_phone_match_sec_range_1,bus_phone_match_addr_1,bus_phone_match_city_1,bus_phone_match_state_1,bus_phone_match_zip_1,bus_phone_match_zip4_1,bus_phone_match_seleid_1,bus_phone_match_company_2,bus_phone_match_prim_range_2,bus_phone_match_predir_2,bus_phone_match_prim_name_2,bus_phone_match_suffix_2,bus_phone_match_postdir_2,bus_phone_match_unit_desig_2,bus_phone_match_sec_range_2,bus_phone_match_addr_2,bus_phone_match_city_2,bus_phone_match_state_2,bus_phone_match_zip_2,bus_phone_match_zip4_2,bus_phone_match_seleid_2,bus_phone_match_company_3,bus_phone_match_prim_range_3,bus_phone_match_predir_3,bus_phone_match_prim_name_3,bus_phone_match_suffix_3,bus_phone_match_postdir_3,bus_phone_match_unit_desig_3,bus_phone_match_sec_range_3,bus_phone_match_addr_3,bus_phone_match_city_3,bus_phone_match_state_3,bus_phone_match_zip_3,bus_phone_match_zip4_3,bus_phone_match_seleid_3,bus_addr_match_phone_1,bus_addr_match_phone_2,bus_addr_match_phone_3,bus_fein_match_company_1,bus_fein_match_prim_range_1,bus_fein_ma
tch_predir_1,bus_fein_match_prim_name_1,bus_fein_match_suffix_1,bus_fein_match_postdir_1,bus_fein_match_unit_desig_1,bus_fein_match_sec_range_1,bus_fein_match_addr_1,bus_fein_match_city_1,bus_fein_match_state_1,bus_fein_match_zip_1,bus_fein_match_zip4_1,bus_fein_match_seleid_1,bus_fein_match_company_2,bus_fein_match_prim_range_2,bus_fein_match_predir_2,bus_fein_match_prim_name_2,bus_fein_match_suffix_2,bus_fein_match_postdir_2,bus_fein_match_unit_desig_2,bus_fein_match_sec_range_2,bus_fein_match_addr_2,bus_fein_match_city_2,bus_fein_match_state_2,bus_fein_match_zip_2,bus_fein_match_zip4_2,bus_fein_match_seleid_2,bus_fein_match_company_3,bus_fein_match_prim_range_3,bus_fein_match_predir_3,bus_fein_match_prim_name_3,bus_fein_match_suffix_3,bus_fein_match_postdir_3,bus_fein_match_unit_desig_3,bus_fein_match_sec_range_3,bus_fein_match_addr_3,bus_fein_match_city_3,bus_fein_match_state_3,bus_fein_match_zip_3,bus_fein_match_zip4_3,bus_fein_match_seleid_3,bus_ofac_table_1,bus_ofac_program_1,bus_ofac_record_number_1,bus_ofac_companyname_1,bus_ofac_firstname_1,bus_ofac_lastname_1,bus_ofac_address_1,bus_ofac_city_1,bus_ofac_state_1,bus_ofac_zip_1,bus_ofac_country_1,bus_ofac_entity_name_1,bus_ofac_sequence_1,bus_ofac_table_2,bus_ofac_program_2,bus_ofac_record_number_2,bus_ofac_companyname_2,bus_ofac_firstname_2,bus_ofac_lastname_2,bus_ofac_address_2,bus_ofac_city_2,bus_ofac_state_2,bus_ofac_zip_2,bus_ofac_country_2,bus_ofac_entity_name_2,bus_ofac_sequence_2,bus_ofac_table_3,bus_ofac_program_3,bus_ofac_record_number_3,bus_ofac_companyname_3,bus_ofac_firstname_3,bus_ofac_lastname_3,bus_ofac_address_3,bus_ofac_city_3,bus_ofac_state_3,bus_ofac_zip_3,bus_ofac_country_3,bus_ofac_entity_name_3,bus_ofac_sequence_3,bus_ofac_table_4,bus_ofac_program_4,bus_ofac_record_number_4,bus_ofac_companyname_4,bus_ofac_firstname_4,bus_ofac_lastname_4,bus_ofac_address_4,bus_ofac_city_4,bus_ofac_state_4,bus_ofac_zip_4,bus_ofac_country_4,bus_ofac_entity_name_4,bus_ofac_sequence_4,bus_ofac_table_5,bus_o
fac_program_5,bus_ofac_record_number_5,bus_ofac_companyname_5,bus_ofac_firstname_5,bus_ofac_lastname_5,bus_ofac_address_5,bus_ofac_city_5,bus_ofac_state_5,bus_ofac_zip_5,bus_ofac_country_5,bus_ofac_entity_name_5,bus_ofac_sequence_5,bus_ofac_table_6,bus_ofac_program_6,bus_ofac_record_number_6,bus_ofac_companyname_6,bus_ofac_firstname_6,bus_ofac_lastname_6,bus_ofac_address_6,bus_ofac_city_6,bus_ofac_state_6,bus_ofac_zip_6,bus_ofac_country_6,bus_ofac_entity_name_6,bus_ofac_sequence_6,bus_ofac_table_7,bus_ofac_program_7,bus_ofac_record_number_7,bus_ofac_companyname_7,bus_ofac_firstname_7,bus_ofac_lastname_7,bus_ofac_address_7,bus_ofac_city_7,bus_ofac_state_7,bus_ofac_zip_7,bus_ofac_country_7,bus_ofac_entity_name_7,bus_ofac_sequence_7,bus_watchlist_table_1,bus_watchlist_program_1,bus_watchlist_record_number_1,bus_watchlist_companyname_1,bus_watchlist_firstname_1,bus_watchlist_lastname_1,bus_watchlist_address_1,bus_watchlist_city_1,bus_watchlist_state_1,bus_watchlist_zip_1,bus_watchlist_country_1,bus_watchlist_entity_name_1,bus_watchlist_sequence_1,bus_watchlist_table_2,bus_watchlist_program_2,bus_watchlist_record_number_2,bus_watchlist_companyname_2,bus_watchlist_firstname_2,bus_watchlist_lastname_2,bus_watchlist_address_2,bus_watchlist_city_2,bus_watchlist_state_2,bus_watchlist_zip_2,bus_watchlist_country_2,bus_watchlist_entity_name_2,bus_watchlist_sequence_2,bus_watchlist_table_3,bus_watchlist_program_3,bus_watchlist_record_number_3,bus_watchlist_companyname_3,bus_watchlist_firstname_3,bus_watchlist_lastname_3,bus_watchlist_address_3,bus_watchlist_city_3,bus_watchlist_state_3,bus_watchlist_zip_3,bus_watchlist_country_3,bus_watchlist_entity_name_3,bus_watchlist_sequence_3,bus_watchlist_table_4,bus_watchlist_program_4,bus_watchlist_record_number_4,bus_watchlist_companyname_4,bus_watchlist_firstname_4,bus_watchlist_lastname_4,bus_watchlist_address_4,bus_watchlist_city_4,bus_watchlist_state_4,bus_watchlist_zip_4,bus_watchlist_country_4,bus_watchlist_entity_name_4,bus_watch
list_sequence_4,bus_watchlist_table_5,bus_watchlist_program_5,bus_watchlist_record_number_5,bus_watchlist_companyname_5,bus_watchlist_firstname_5,bus_watchlist_lastname_5,bus_watchlist_address_5,bus_watchlist_city_5,bus_watchlist_state_5,bus_watchlist_zip_5,bus_watchlist_country_5,bus_watchlist_entity_name_5,bus_watchlist_sequence_5,bus_watchlist_table_6,bus_watchlist_program_6,bus_watchlist_record_number_6,bus_watchlist_companyname_6,bus_watchlist_firstname_6,bus_watchlist_lastname_6,bus_watchlist_address_6,bus_watchlist_city_6,bus_watchlist_state_6,bus_watchlist_zip_6,bus_watchlist_country_6,bus_watchlist_entity_name_6,bus_watchlist_sequence_6,bus_watchlist_table_7,bus_watchlist_program_7,bus_watchlist_record_number_7,bus_watchlist_companyname_7,bus_watchlist_firstname_7,bus_watchlist_lastname_7,bus_watchlist_address_7,bus_watchlist_city_7,bus_watchlist_state_7,bus_watchlist_zip_7,bus_watchlist_country_7,bus_watchlist_entity_name_7,bus_watchlist_sequence_7,ln_status,sos_status,sos_filing_name,time_on_sos,sic,sic_desc,naics,naics_desc,bus_firstseen_yyyy,time_on_publicrecord,bus_description,bus_county,parent_seleid,parent_best_bus_name,time_on_sbfe,last_seen_sbfe,count_of_trades_sbfe,accountnumber,ecl_seleid,count,AccountNumber,CompanyName,AlternateCompanyName,Addr,City,State,Zip,BusinessPhone,TaxIdNumber,HistoryDate,sufficient_input,count_x,count_y,CompanyName_flag,AlternateCompanyName_flag,BusinessPhone_flag,TaxIdNumber_flag,addr_flag,City_flag,State_flag,Zip_flag
214,AAA000000107273618,20210909,0,0,OYO GEOSPACE CORP,7007 PINEMONT DR,HOUSTON,TX,77040,,,1,1,1,1,1,0,0,GEOSPACE TECHNOLOGIES CORPORATION,7007 PINEMONT DR,HOUSTON,TX,77040,6601,7139864444,760447780,51223511,51223511,58931540.0,58931543,51223511,50,Input business name and address verified on mu...,53,The input business phone was missing or incomp...,54,The input business TIN was missing or incomplete,0,,0,,0,,0,,0,,0,,0,Commercial Address,1,Input Business Address verified,2,Input Business Name and Address verified,2,Input Business Name and Address verified,2,Input Business Name and Address verified,2,Input Business Name and Address verified,,,,,,,,,,,,,,0,,,,,,,,,,,,,,0,,,,,,,,,,,,,,0,,,,,,,,,,,,,,,,,0,,,,,,,,,,,,,,0,,,,,,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ACTIVE,ACTIVE,GEOSPACE TECHNOLOGIES CORPORATION,286,3829,"MEASURING & CONTROLLING DEVICES, NEC",334515,INSTRUMENT MANUFACTURING FOR MEASURING AND TES...,1977,536,,HARRIS,0,,,,,AAA000000107273618,58931540.0,1,AAA000000107273618,OYO GEOSPACE CORP.,,7007 PINEMONT,,,77040,,,20210909,0,1,1,1,0,0,0,1,0,0,1
238,AAA000000106539729,20210801,0,0,ROMAN GARCIA,923 N LOOP 1604 E,SAN ANTONIO,TX,78232,,,1,1,1,1,1,0,0,THE BRIDAL CONNECTION,923 N LOOP 1604 E STE 106,SAN ANTONIO,TX,78232,1386,2109674696,463636649,64786651720,64786651720,64786650000.0,64786651720,64786651720,50,Input business name and address verified on mu...,47,The input business name may have been miskeyed,53,The input business phone was missing or incomp...,54,The input business TIN was missing or incomplete,0,,0,,0,,0,,0,,0,Commercial Address,1,Input Business Address verified,2,Input Business Name and Address verified,2,Input Business Name and Address verified,1,Input Business Address verified,1,Input Business Address verified,,,,,,,,,,,,,,0,,,,,,,,,,,,,,0,,,,,,,,,,,,,,0,,,,,,,,,,,,,,,,,0,,,,,,,,,,,,,,0,,,,,,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ACTIVE,UNKNOWN,,0,5621,WOMEN'S CLOTHING STORES,812990,ALL OTHER PERSONAL SERVICES,1999,263,,BEXAR,0,,,,,AAA000000106539729,64786650000.0,1,AAA000000106539729,ROMAN GARCIA,,923 N. LOOP 1604 E,,,78232,,,20210801,0,1,1,1,0,0,0,1,0,0,1


In [18]:
# 0/1 flag per ver* column: 1 when the value is not null.
# NOTE(review): unlike the input flags, empty strings count as populated here —
# confirm that is intended.
for ver_col in ("vercmpy", "veraddr", "vercity", "verstate", "verzip", "verphone", "verfein"):
    biid[f"{ver_col}_flag"] = np.where(biid[ver_col].isnull(), 0, 1)

In [19]:
# 0/1 flag per best* column: 1 when the value is not null.
best_cols = [
    "bestcompanyname",
    "bestaddr",
    "bestcity",
    "beststate",
    "bestzip",
    "bestphone",
    "bestfein",
]
for best_col in best_cols:
    biid[f"{best_col}_flag"] = np.where(biid[best_col].isnull(), 0, 1)

In [27]:
# Joint frequency of the three tax-id flags: supplied on input, best*, ver*.
fein_flag_cols = ["TaxIdNumber_flag", "bestfein_flag", "verfein_flag"]
freq(*fein_flag_cols, df=biid, cross=False, observed=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Count,Pct,Cuml Count,Cuml Pct
TaxIdNumber_flag,bestfein_flag,verfein_flag,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0,3697962.0,0.557218,3697962.0,0.557218
0,1,0,2665539.0,0.40165,6363501.0,0.958867
1,0,0,153366.0,0.02311,6516867.0,0.981977
1,0,1,5389.0,0.000812,6522256.0,0.982789
1,1,0,25235.0,0.003802,6547491.0,0.986591
1,1,1,88986.0,0.013409,6636477.0,1.0


In [20]:
# Cross each "input field populated" flag with the flag of the matching best*
# column. To compare against the ver* columns instead, swap the second element
# of each pair for the corresponding ver*_flag column.
flag_pairs = [
    ("CompanyName_flag", "bestcompanyname_flag"),
    ("addr_flag", "bestaddr_flag"),
    ("City_flag", "bestcity_flag"),
    ("State_flag", "beststate_flag"),
    ("Zip_flag", "bestzip_flag"),
    ("BusinessPhone_flag", "bestphone_flag"),
    ("TaxIdNumber_flag", "bestfein_flag"),
]
# Keys are "1".."7", in the order of the pairs above.
result = {
    str(position): bivariate(input_flag, best_flag, df=biid)
    for position, (input_flag, best_flag) in enumerate(flag_pairs, start=1)
}

# Write every table to the same "biv" sheet of one workbook.
wb = TableWriter(filename="./_temp/sources.xlsx", options={'nan_inf_to_errors': True}, overwrite=True)
for table in result.values():
    wb.write_table(table, sheetname="biv", conditional_fmt_cols=[3])
wb.close()

In [24]:
# Total of the per-row "count" column for each seleid, then summary statistics.
count_per_seleid = biid.groupby("seleid")["count"].sum()
biid_roll = count_per_seleid.reset_index(name='count')
biid_roll["count"].describe()

count    1.195630e+06
mean     5.550611e+00
std      6.809864e+00
min      1.000000e+00
25%      1.000000e+00
50%      2.000000e+00
75%      9.000000e+00
max      1.950000e+02
Name: count, dtype: float64

In [25]:
# Bucketed distribution of the per-seleid counts.
# NOTE(review): `fmt` is whatever make_format result was assigned most recently
# in the session; this cell does not define it itself.
freq("count", df = biid_roll, format = fmt, observed = True)

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,489896.0,0.409739,489896.0,0.409739
2,211556.0,0.176941,701452.0,0.58668
3,53289.0,0.04457,754741.0,0.63125
4,27586.0,0.023072,782327.0,0.654322
5,19905.0,0.016648,802232.0,0.67097
6-10,196898.0,0.164681,999130.0,0.835651
11-50,196486.0,0.164337,1195616.0,0.999988
51-100,12.0,1e-05,1195628.0,0.999998
101+,2.0,2e-06,1195630.0,1.0


In [26]:
# Show the seleid(s) with the largest count. The maximum is computed from the
# data rather than hard-coded (it was 195 on the recorded run), so the cell
# stays correct when the input changes.
biid_roll[biid_roll["count"] == biid_roll["count"].max()]

Unnamed: 0,seleid,count
184649,60937705.0,195


In [27]:
# Make the join key float on both sides, then attach the per-seleid count to
# every BIID row.
# NOTE(review): both frames carry a "count" column, so the merge renames them
# with the default _x/_y suffixes; if biid already holds count_x/count_y from
# the input file at this point, those names collide — confirm the run order.
biid_roll = biid_roll.assign(seleid=biid_roll["seleid"].astype(float))
biid = biid.assign(seleid=biid["seleid"].astype(float))
biid = pd.merge(biid, biid_roll, on="seleid")
print(biid.shape)

(6636477, 351)


In [13]:
def left_closed_labels(b, include_max=False, fmt=".3f", step=1):
    """Build text labels for left-closed bins defined by the break points ``b``.

    Each pair of consecutive breaks ``(b[i], b[i + 1])`` yields one label:
    the single value ``b[i]`` when the bin is exactly one ``step`` wide,
    otherwise the range ``"<b[i]>-<b[i + 1] - step>"``.

    Parameters
    ----------
    b : sequence of numbers
        Break points in ascending order (list or NumPy array).
    include_max : bool, default False
        When False, the last label is replaced by ``"<b[-2]>+"`` so the final
        bin reads as open-ended.
    fmt : str, default ".3f"
        Format spec applied to every number in a label.
    step : number, default 1
        Granularity of the data; subtracted from a bin's right break to get
        its inclusive upper bound.

    Returns
    -------
    list of str
        One label per bin (``len(b) - 1`` items). Fewer than two break points
        define no bin, so an empty list is returned instead of raising
        ``IndexError``.
    """
    if len(b) < 2:
        return []

    labels = []
    for lower, next_break in zip(b[:-1], b[1:]):
        upper = next_break - step
        if lower == upper:
            labels.append(f"{lower:{fmt}}")
        else:
            labels.append(f"{lower:{fmt}}-{upper:{fmt}}")

    if not include_max:
        # The last bin is open-ended: show only its lower bound followed by "+".
        labels[-1] = f"{b[-2]:{fmt}}+"
    return labels

In [14]:
def _to_numeric_or_keep(col):
    """Return ``col`` converted by ``pd.to_numeric``, or ``col`` unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(col)
    except (ValueError, TypeError):
        return col


# Convert every column that parses as numeric and leave the others as they are.
# This is the explicit form of errors='ignore', which pandas has deprecated.
biid = biid.apply(_to_numeric_or_keep)

In [33]:
# Univariate frequency table for every variable in biid_keep_list.
# NOTE(review): biid_keep_list is assigned in a cell that appears further down
# this file, so that cell has to be executed before this one.
biv_all = {}
biv_client = {}
quantile_levels = [0, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]
for s in biid_keep_list:
    column = biid[s]
    if pd.api.types.is_string_dtype(column.dtype) or column.nunique() <= 11:
        # Text and low-cardinality variables are tabulated on their raw values.
        biv_all[s] = freq(column).fillna("n/a")
        continue

    # Otherwise bin on quantiles of the values greater than -1.
    quantile_values = column[column.gt(-1)].quantile(quantile_levels).tolist()
    # Every quantile value that occurs more than once contributes value + 1 as
    # an additional cut point.
    bumped = [q + 1 for q in quantile_values if quantile_values.count(q) > 1]
    brks = np.unique(quantile_values + bumped)
    if brks.max() == 0:
        brks = np.append(brks, 1)
    brks = np.append(np.sort(brks), np.inf)

    labs = left_closed_labels(brks, fmt=".0f")
    fmt = make_format(
        cuts=brks,
        labels=labs,
        right=False,
        exceptions=[-99999, -99998, -99997],
    )
    biv_all[s] = freq(column, format=fmt).fillna("n/a")

# Write every table to the same "biv" sheet of one workbook.
wb = TableWriter(filename="./_temp/temp_biid.xlsx", options={'nan_inf_to_errors': True}, overwrite=True)
for table in biv_all.values():
    wb.write_table(table, sheetname="biv", conditional_fmt_cols=[1])
wb.close()


In [17]:
# Lower-case the variable names so they match the lower-case biid column names.
# NOTE(review): the list itself is assigned in the cell that follows in this
# file, so that cell must be run first.
biid_keep_list = list(map(str.lower, biid_keep_list))

In [16]:
# Variables to profile. Names are written in lower case, exactly as they appear
# in biid.columns, so the list is usable as soon as it is defined and does not
# depend on a separate lower-casing cell being run first.
biid_keep_list = [
'numbervalidauthrepsinput',
'cnamematchflag',
'addrmatchflag',
'citymatchflag',
'statematchflag',
'zipmatchflag',
'phonematchflag',
'feinmatchflag',
'bvi',
'residential_bus_indicator',
'residential_bus_desc',
'phone_verification',
'phone_ver_desc',
'bureau_verification',
'bureau_ver_desc',
'govt_reg_verification',
'govt_reg_ver_desc',
'pubrec_filings_verification',
'pubrec_filings_ver_desc',
'bus_directories_verification',

'ln_status',
'sos_status',
'time_on_sos',
'sic_desc',
'naics_desc',
'bus_firstseen_yyyy',
'time_on_publicrecord',
'time_on_sbfe',
'last_seen_sbfe',
'count_of_trades_sbfe',
]

In [22]:
# List the biid column names (used here to locate the count_x / count_y columns).
biid.columns

Index(['acctno', 'historydate', 'transaction_id', 'numbervalidauthrepsinput',
       'vercmpy', 'veraddr', 'vercity', 'verstate', 'verzip', 'verphone',
       ...
       'time_on_publicrecord', 'bus_description', 'bus_county',
       'parent_seleid', 'parent_best_bus_name', 'time_on_sbfe',
       'last_seen_sbfe', 'count_of_trades_sbfe', 'count_x', 'count_y'],
      dtype='object', length=349)

In [24]:
# Summary statistics of the count_y column.
biid["count_y"].describe()

count    6.704804e+06
mean     1.382143e+01
std      8.553096e+00
min      1.000000e+00
25%      8.000000e+00
50%      1.100000e+01
75%      2.300000e+01
max      1.950000e+02
Name: count_y, dtype: float64

In [34]:
## Create the flag: 1 when count_y is above 25, otherwise 0.
exceeds_threshold = biid["count_y"].astype(int) > 25
biid["many_inquiry"] = exceeds_threshold.astype(int)
freq(biid["many_inquiry"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
many_inquiry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,6553508.0,0.987498,6553508.0,0.987498
1,82969.0,0.012502,6636477.0,1.0


In [31]:
# Bivariate table of every variable in biid_keep_list against many_inquiry.
biv_all = {}
biv_client = {}
quantile_levels = [0, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]
for s in biid_keep_list:
    column = biid[s]
    if pd.api.types.is_string_dtype(column.dtype) or column.nunique() <= 11:
        # Text and low-cardinality variables are tabulated on their raw values.
        biv_all[s] = bivariate(column, biid["many_inquiry"]).fillna("n/a")
        continue

    # Otherwise bin on quantiles of the values greater than -1.
    quantile_values = column[column.gt(-1)].quantile(quantile_levels).tolist()
    # Every quantile value that occurs more than once contributes value + 1 as
    # an additional cut point.
    bumped = [q + 1 for q in quantile_values if quantile_values.count(q) > 1]
    brks = np.unique(quantile_values + bumped)
    if brks.max() == 0:
        brks = np.append(brks, 1)
    brks = np.append(np.sort(brks), np.inf)

    labs = left_closed_labels(brks, fmt=".0f")
    fmt = make_format(
        cuts=brks,
        labels=labs,
        right=False,
        exceptions=[-99999, -99998, -99997],
    )
    biv_all[s] = bivariate(column, biid["many_inquiry"], format=fmt).fillna("n/a")

# Write every table to the same "biv" sheet of one workbook.
wb = TableWriter(filename="./_temp/temp_biid_biv.xlsx", options={'nan_inf_to_errors': True}, overwrite=True)
for table in biv_all.values():
    wb.write_table(table, sheetname="biv", conditional_fmt_cols=[3])
wb.close()