In [2]:
import time
import pandas as pd
import numpy as np
import math

# internal tools
from dsgtools.reporting import make_format
from dsgtools.reporting import TableWriter
from dsgtools.reporting import freq
from dsgtools.reporting import bivariate

## Additional cleaning

In [3]:
## dedupped data
data = pd.read_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Data Modeling Clean/final_filtered_remove_dt_dedup_for_ECL.csv",
                    reader = pd.read_csv, dtype = str)
print(data.shape)

(17311232, 10)


In [5]:
data.columns

Index(['AccountNumber', 'CompanyName', 'AlternateCompanyName', 'Addr', 'City',
       'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', 'HistoryDate'],
      dtype='object')

In [9]:
data["weird_name"] = np.where(data['CompanyName'].str.contains("CDATA") | data['CompanyName'].str.contains(",!.; -@!%^&*), 1, 0)
freq(data["weird_name"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
weird_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,17301945.0,0.999464,17301945.0,0.999464
1,9287.0,0.000536,17311232.0,1.0


In [13]:
data.iloc[15000:15005]

Unnamed: 0,AccountNumber,CompanyName,AlternateCompanyName,Addr,City,State,Zip,BusinessPhone,TaxIdNumber,HistoryDate,weird_name
15000,AAA000000103049078,1 OF ONE LOGISTICS LLC,,1008 S NEBRASKA STREET,MARION,IN,46953,,872055608.0,20220323,0
15001,AAA000000097149727,1 OF ONE LOGISTICS LLC,,1008 S NEBRASKA STREET,MARION,IN,46953,7656614089.0,,20220509,0
15002,AAA000000090463360,1 OF ONE LOGISTICS LLC,,1008 S NEBRASKA STREET,MARION,IN,469532125,7656614089.0,,20220418,0
15003,AAA000000082985045,"1 OF ONE LOGISTICS, LLC",,1408 S WESTERN AVE 1007,MARION,IN,469531540,7656614089.0,,20220323,0
15004,AAA000000105713415,1 OF THE OFF BEATS,,1534 SELWYN AVE # 2G,,,10457,,,20210801,0


In [None]:
df['E'] = re.sub(r'\W+', '', df['B'].str)

## Input

In [2]:
path = "Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Data Modeling Clean/goodfile_e_total.csv.gz"
data = pd.read_adls(path, reader = pd.read_csv, compression = "gzip", dtype = str)
print(data.shape)

(101176653, 27)


In [3]:
data = data.drop(axis = 0, index = [0])
print(data.shape)

(101176652, 27)


In [4]:
data.columns

Index(['account', 'first_name', 'alternateCompanyName', 'new_address',
       'new_city', 'new_state', 'new_zip', 'new_phone1', 'new_ssn',
       'businessIPAddress', 'representativeFirstName',
       'representativeMiddleName', 'representativeLastName',
       'representativeNameSuffix', 'representativeAddr', 'representativeCity',
       'representativeState', 'representativeZip', 'representativeSSN',
       'representativeDOB', 'representativeAge', 'representativeDLNumber',
       'representativeDLState', 'representativeHomePhone',
       'representativeEmailAddress', 'representativeFormerLastName',
       'final_date'],
      dtype='object')

In [5]:
column_name = ['AccountNumber', 'CompanyName', 'AlternateCompanyName', 'Addr', 'City', 'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', 'BusinessIPAddress', 
'RepresentativeFirstName', 'RepresentativeMiddleName', 'RepresentativeLastName', 'RepresentativeNameSuffix', 'RepresentativeAddr', 'RepresentativeCity', 
'RepresentativeState', 'RepresentativeZip', 'RepresentativeSSN', 'RepresentativeDOB', 'RepresentativeAge', 'RepresentativeDLNumber', 'RepresentativeDLState', 
'RepresentativeHomePhone', 'RepresentativeEmailAddress', 'RepresentativeFormerLastName', 'HistoryDate']
data.columns = column_name

In [6]:
## keep columns needed for Engineer processing
data = data[['AccountNumber', 'CompanyName', 'AlternateCompanyName', 'Addr', 'City', 'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', 'HistoryDate']]
data = data.fillna("")

In [7]:
data.loc[~((data.BusinessPhone.str.isnumeric())| (data.BusinessPhone ==""))]

Unnamed: 0,AccountNumber,CompanyName,AlternateCompanyName,Addr,City,State,Zip,BusinessPhone,TaxIdNumber,HistoryDate
1,AAA000000000000005,MOUNTAIN VIEW,,725606465,840,87,,AZZAAZFAABF,,


In [11]:
data.loc[~((data.TaxIdNumber.str.isnumeric())| (data.TaxIdNumber ==""))].head(5)

Unnamed: 0,AccountNumber,CompanyName,AlternateCompanyName,Addr,City,State,Zip,BusinessPhone,TaxIdNumber,HistoryDate
89148995,AAA000000101121833,CONNICK AND CONNICK LLC,,3421 NO CAUSEWAY BLVD STE 408,METAIRIE,LA,70002.0,,W+HBR67PVA8,20210802
89248641,AAA000000101224590,WVCZVZCRDDHMMVWMQY+BAK8EDHH4VR8J+CP0GW4ECCPOIG...,,NCX0WMBNTCMSXL6YKMUYGQXAQVXG+2JBW/EAXDBW6/ICOV...,PRIMARY>>,CU,,,SQN7HSEL2OS,20210813
89248642,AAA000000101224591,W4GXM8E/ULJLMW/4WGW9PDOJEEERB9LSEBO+X5DSEETXHI...,,AW/HIUKMT33TXYFYDMBZSF9Q34FQS6SUMPDULJND5QXYSG...,PRIMARY>>,CU,,,EXWCDUU5FH4,20210813
89248643,AAA000000101224593,YA8/TDOEE+VMZT2VY302SR4/SB/6QO5KMLZKGC+GD4G0QG...,,G8LF2+LLNKQ3T7G2/ETEZ+SR6V8BILPQBRCVYHPT3VCGN7...,PRIMARY>>,CU,,,XEJESBOLCVX,20210813
89248644,AAA000000101224594,6GKWMXQ335W0FW+FQLQJTIOLOXXUOL+PTL1UXU6FHQ1ULH...,,XD0RVT2XQ9KIH49PLE9AUAUSJN/RN+ACJGYID87TGFWXKO...,PRIMARY>>,CU,,,AEZSA0KO8SF,20210813


In [12]:
data.loc[~(((~data.State.str.isnumeric())| (data.State =="")))].head(5)

Unnamed: 0,AccountNumber,CompanyName,AlternateCompanyName,Addr,City,State,Zip,BusinessPhone,TaxIdNumber,HistoryDate
1,AAA000000000000005,MOUNTAIN VIEW,,725606465,840,87,,AZZAAZFAABF,,
88945472,AAA000000100912819,"INTELLIGENT LIGHTING DESIGN, LLC",,1300 W 9TH ST,CLEVELAND,44,,,851674199.0,20210706.0
88986425,AAA000000100954820,"BF HOMES, LLC : BF HOMES, LLC",,BF HOMES,LLC,99,DUNGAN RD,,833629737.0,20210712.0
89074171,AAA000000101044961,RHETT FONTENOT CONSTRUCTION LLC,,640 JONQUIL DR,LAKE CHARLES,70,,,,20210722.0
89100384,AAA000000101071800,"![CDATA[JARM ONE, A NV LLC]]>",,4815,4831,49,CRAIG RD,,451162986.0,20210726.0


In [13]:
data.loc[~(((~data.City.str.isnumeric())| (data.City =="")))].head(5)

Unnamed: 0,AccountNumber,CompanyName,AlternateCompanyName,Addr,City,State,Zip,BusinessPhone,TaxIdNumber,HistoryDate
1,AAA000000000000005,MOUNTAIN VIEW,,725606465,840,87,,AZZAAZFAABF,,
216387,AAA000000000443032,SUZANNE WHITE,,0,0,ME,,2079233235,,20210528.0
905009,AAA000000003264640,SUZANNE WHITE,,0,0,ME,,2079233235,,20210603.0
14817496,AAA000000018929645,USSHER INVENTIONS (PTY) LTD,,3 SIGMA RD,1401,ZA,1401.0,118251100,,20210716.0
30143365,AAA000000035769903,SUZANNE WHITE,,0,0,ME,,2079233235,,20211008.0


In [14]:
data.loc[~(((~data.Addr.str.isnumeric())| (data.Addr =="")))].head(5)

Unnamed: 0,AccountNumber,CompanyName,AlternateCompanyName,Addr,City,State,Zip,BusinessPhone,TaxIdNumber,HistoryDate
4445085,AAA000000007552034,EVA POLINI,,737,BEVERLY HILLS,CA,90210,3102772226,,20210630
7660961,AAA000000011292284,EVA POLINI,,737,BEVERLY HILLS,CA,90210,3102772226,,20210703
10691061,AAA000000014685537,K B SALES CO. INC.,,520,NORTH SALT LAKE,UT,84054,8012962868,,20210705
12558205,AAA000000016585444,K B SALES CO. INC.,,520,NORTH SALT LAKE,UT,84054,8012962868,,20210709
15657694,AAA000000019775614,K B SALES CO. INC.,,520,NORTH SALT LAKE,UT,84054,8012962868,,20210719


In [15]:
data.loc[~(((~data.CompanyName.str.isnumeric())| (data.CompanyName =="")))].head(5)

Unnamed: 0,AccountNumber,CompanyName,AlternateCompanyName,Addr,City,State,Zip,BusinessPhone,TaxIdNumber,HistoryDate
12,AAA000000000030674,1977,,203 MEADOWLINK STREET,HOUSTON,TX,770374305,8322303319,,20191125
7031,AAA000000000037736,1952,,10304 INDIAN MOUND DRIVE,NEW PORT RICHEY,FL,346543522,7272437770,,20191125
7033,AAA000000000037738,1958,,978 TRILLIUM TRAIL,OSHKOSH,WI,549047670,9204108070,,20191125
7034,AAA000000000037739,1958,,978 TRILLIUM TRAIL,OSHKOSH,WI,549047670,9204108070,,20191125
7035,AAA000000000037740,1961,,10 WELLINGTON COURT,SAYREVILLE,NJ,88721370,6093841717,,20191125


In [19]:
data.loc[~((data.CompanyName.str.contains("[a-zA-Z0-9]")) | (data.CompanyName !="") | (data.CompanyName !=" "))].head(5)

Unnamed: 0,AccountNumber,CompanyName,AlternateCompanyName,Addr,City,State,Zip,BusinessPhone,TaxIdNumber,HistoryDate


In [21]:
data = data[data.HistoryDate != ""]
print(data.shape)
data = data[data.CompanyName.str.strip() != ""]
print(data.shape)
## remove Phone with not numbers
data = data.loc[(data.BusinessPhone.str.isnumeric())| (data.BusinessPhone =="")]
print(data.shape)
## remove TaxIdNumber with not numbers
data = data.loc[(data.TaxIdNumber.str.isnumeric())| (data.TaxIdNumber =="")]
print(data.shape)
## remove addr, city, state without character
data = data.loc[(~data.State.str.isnumeric())| (data.State =="")]
print(data.shape)
data = data.loc[(~data.City.str.isnumeric())| (data.City =="")]
print(data.shape)
data = data.loc[(~data.Addr.str.isnumeric())| (data.Addr =="")]
print(data.shape)


(101176651, 10)
(101174569, 10)
(101174569, 10)
(101154745, 10)
(101154692, 10)
(101152182, 10)
(101152150, 10)


In [22]:
data.head()

Unnamed: 0,AccountNumber,CompanyName,AlternateCompanyName,Addr,City,State,Zip,BusinessPhone,TaxIdNumber,HistoryDate
2,AAA000000000030664,1 BODY STRENGTH AND CONDI TIONING LLC,,2160 LONG BEACH BOULEVARD,LONG BEACH,CA,908064807,5623307372.0,,20191125
3,AAA000000000030665,1 EIGHTY LABS,,701 5TH AVENUE STE 4200,SEATTLE,WA,981047047,2062627302.0,,20191125
4,AAA000000000030666,1 SPARTAN TECHNOLOGY SOLUTISPARTAN TECHNOLOGY S,,125 VENTURE BOULEVARD,SPARTANBURG,SC,293063817,8645871386.0,,20191125
5,AAA000000000030667,1 SPARTAN TECHNOLOGY SOLUTISPARTAN TECHNOLOGY S,,125 VENTURE BOULEVARD,SPARTANBURG,SC,293063817,8645871386.0,,20191125
6,AAA000000000030668,1013 ATHLETICS LLC,,726 FOSTER AVENUE,BENSENVILLE,IL,60106,,,20191125


In [25]:
print(data[data['AccountNumber'].str.strip() == ""].shape)
print(data[data['CompanyName'].str.strip() == ""].shape)
print(data[data['AlternateCompanyName'].str.strip() == ""].shape)
print(data[data['Addr'].str.strip() == ""].shape)
print(data[data['City'].str.strip() == ""].shape)
print(data[data['State'].str.strip() == ""].shape)
print(data[data['Zip'].str.strip() == ""].shape)
print(data[data['BusinessPhone'].str.strip() == ""].shape)
print(data[data['TaxIdNumber'].str.strip() == ""].shape)
print(data[data['HistoryDate'].str.strip() == ""].shape)

(0, 10)
(0, 10)
(101152150, 10)
(3, 10)
(4204895, 10)
(4198817, 10)
(5361736, 10)
(31250392, 10)
(95845741, 10)
(0, 10)


In [27]:
data["sufficient_input"] = np.where(data["CompanyName"].isnull() |
                              (data["Addr"].isnull() | data["City"].isnull() | data["State"].isnull()) &
                              (data["Addr"].isnull() | data["Zip"].isnull()), 1, 0)
freq(data["sufficient_input"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
sufficient_input,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,101152150.0,1.0,101152150.0,1.0


In [28]:
data.dtypes

AccountNumber           object
CompanyName             object
AlternateCompanyName    object
Addr                    object
City                    object
State                   object
Zip                     object
BusinessPhone           object
TaxIdNumber             object
HistoryDate             object
sufficient_input         int64
dtype: object

In [30]:
data.to_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Data Modeling Clean/goodfile_futher_filtered.parquet", 
             format = '.parquet', overwrite = True)

## Dedup by BII for Engineering

In [31]:
data.columns

Index(['AccountNumber', 'CompanyName', 'AlternateCompanyName', 'Addr', 'City',
       'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', 'HistoryDate',
       'sufficient_input'],
      dtype='object')

In [34]:
data["count"] = 1
dedup_bii =data.groupby(by = ['CompanyName', 'AlternateCompanyName', 'Addr', 'City', 'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', ])['count'].sum().reset_index(name='count')
freq(dedup_bii["count"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,9218466.0,4.661482e-01,9218466.0,0.466148
2,3391872.0,1.715161e-01,12610338.0,0.637664
3,898848.0,4.545185e-02,13509186.0,0.683116
4,606188.0,3.065298e-02,14115374.0,0.713769
5,319605.0,1.616140e-02,14434979.0,0.729930
...,...,...,...,...
45946,1.0,5.056679e-08,19775823.0,1.000000
54648,1.0,5.056679e-08,19775824.0,1.000000
66412,1.0,5.056679e-08,19775825.0,1.000000
91744,1.0,5.056679e-08,19775826.0,1.000000


In [36]:
fmt = make_format(cuts = [-np.inf, 0, 1, 2, 3, 4, 5, 10, 100, 1000, np.inf])
freq(dedup_bii["count"], format = [fmt])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<= 0,0.0,0.0,0.0,0.0
1,9218466.0,0.466148,9218466.0,0.466148
2,3391872.0,0.171516,12610338.0,0.637664
3,898848.0,0.045452,13509186.0,0.683116
4,606188.0,0.030653,14115374.0,0.713769
5,319605.0,0.016161,14434979.0,0.72993
6-10,1319742.0,0.066735,15754721.0,0.796666
11-100,4020233.0,0.20329,19774954.0,0.999956
101-1000,837.0,4.2e-05,19775791.0,0.999998
1001+,36.0,2e-06,19775827.0,1.0


In [35]:
dedup_bii[dedup_bii["count"] == 554803]

Unnamed: 0,CompanyName,AlternateCompanyName,Addr,City,State,Zip,BusinessPhone,TaxIdNumber,count
7325786,"GORMAN MANUFACTURING COMPANY, INC.",,492 KOLLER ST,SAN FRANCISCO,CA,94110,6505550000,,554803


In [37]:
data = data.sort_values(by = ['CompanyName', 'AlternateCompanyName', 'Addr', 'City', 'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', "HistoryDate"])

In [41]:
dedup_bii = data.drop_duplicates(subset=['CompanyName', 'AlternateCompanyName', 'Addr', 'City', 'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', ], 
                                 keep = "last", ignore_index = True)
print(dedup_bii.shape)

(19775827, 12)


In [42]:
dedup_bii.columns

Index(['AccountNumber', 'CompanyName', 'AlternateCompanyName', 'Addr', 'City',
       'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', 'HistoryDate',
       'sufficient_input', 'count'],
      dtype='object')

In [44]:
dedup_bii[['AccountNumber', 'CompanyName', 'AlternateCompanyName', 'Addr', 'City',
       'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', 'HistoryDate',]].to_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Data Modeling Clean/goodfile_futher_filtered_dedup_for_ECL.csv", 
             format = '.csv', overwrite = True, index = False)

In [None]:
dedup_bii[['AccountNumber', 'CompanyName', 'AlternateCompanyName', 'Addr', 'City',
       'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', 'HistoryDate',]].sample(n=1_000_000, random_state=1234, ignore_index = True).to_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Data Modeling Clean/goodfile_futher_filtered_dedup_for_ECL_1mil_sample.csv", 
             format = '.csv', overwrite = True)

#### Further remove records - By time

In [2]:
data = pd.read_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Data Modeling Clean/goodfile_futher_filtered.parquet", reader = pd.read_parquet)
print(data.shape)

(101152150, 11)


In [3]:
freq(data.HistoryDate.str.slice(0,6))

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
HistoryDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
201911,18374.0,0.000182,18374.0,0.000182
202105,322253.0,0.003186,340627.0,0.003367
202106,7956225.0,0.078656,8296852.0,0.082023
202107,10580351.0,0.104598,18877203.0,0.186622
202108,4961686.0,0.049052,23838889.0,0.235674
202109,7551680.0,0.074657,31390569.0,0.31033
202110,8729360.0,0.086299,40119929.0,0.39663
202111,7075626.0,0.06995,47195555.0,0.46658
202112,7874424.0,0.077847,55069979.0,0.544427
202201,10880052.0,0.107561,65950031.0,0.651988


In [4]:
data = data[~data.HistoryDate.str. slice(0, 6).isin(["201911", "202105", "202106"])]
print(data.shape)

(92855298, 11)


In [5]:
temp = pd.read_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Data/temp_stacked_datetime_for_merge.parquet")
print(temp.shape)

(106154115, 2)


In [6]:
data = data.merge(temp, on = "AccountNumber")
print(data.shape)

(92855298, 12)


In [7]:
data.head()

Unnamed: 0,AccountNumber,CompanyName,AlternateCompanyName,Addr,City,State,Zip,BusinessPhone,TaxIdNumber,HistoryDate,sufficient_input,HistoryDateTime
0,AAA000000010112675,"FAMILY SOLUTIONS COUNSELING, LLC",,115 GOLF COURSE RD STE E,LOGAN,UT,843215934,4355353654,,20210701,0,2021-07-01T00:00:00
1,AAA000000010112676,GLORIOUS GIFT BASKETS,,3200 PALM TREE DR,LITHONIA,GA,300382361,7708857033,,20210701,0,2021-07-01T00:00:15
2,AAA000000010112677,LIESCHESKI FOUNDATION REPAIR,,1476 HIGHWAY 159 E,BELLVILLE,TX,774185805,9798653142,,20210701,0,2021-07-01T00:00:21
3,AAA000000010112678,"BEIJING YDZL TECHNOLOGY CO., LTD",,"ROOM 310, FLOOR 3, BUILDING 1-4, SOUTH SIDE BE...",BEIJING,CN,100000,1064362748,,20210701,0,2021-07-01T00:00:22
4,AAA000000010112679,LIESCHESKI FOUNDATION REPAIR,,1476 HIGHWAY 159 E,BELLVILLE,TX,774185805,9798653142,,20210701,0,2021-07-01T00:00:23


In [8]:
data.to_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Data Modeling Clean/final_cleaned_filtered_input.parquet",
             format = ".parquet")

In [28]:
data.columns

Index(['AccountNumber', 'CompanyName', 'AlternateCompanyName', 'Addr', 'City',
       'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', 'HistoryDate',
       'sufficient_input', 'count'],
      dtype='object')

In [29]:
print(data[data['AccountNumber'].str.strip() == ""].shape)
print(data[data['CompanyName'].str.strip() == ""].shape)
print(data[data['AlternateCompanyName'].str.strip() == ""].shape)
print(data[data['Addr'].str.strip() == ""].shape)
print(data[data['City'].str.strip() == ""].shape)
print(data[data['State'].str.strip() == ""].shape)
print(data[data['Zip'].str.strip() == ""].shape)
print(data[data['BusinessPhone'].str.strip() == ""].shape)
print(data[data['TaxIdNumber'].str.strip() == ""].shape)
print(data[data['HistoryDate'].str.strip() == ""].shape)

(0, 12)
(0, 12)
(92855298, 12)
(3, 12)
(2552188, 12)
(2545788, 12)
(5359102, 12)
(28167156, 12)
(87548889, 12)
(0, 12)


In [None]:
data["count"] = 1
dedup_bii =data.groupby(by = ['CompanyName', 'AlternateCompanyName', 'Addr', 'City', 'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', ])['count'].sum().reset_index(name='count')
fmt = make_format(cuts = [-np.inf, 0, 1, 2, 3, 4, 5, 10, 100, 1000, np.inf])
freq(dedup_bii["count"], format = [fmt])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<= 0,0.0,0.0,0.0,0.0
1,7425137.0,0.42892,7425137.0,0.42892
2,3061059.0,0.176825,10486196.0,0.605745
3,810805.0,0.046837,11297001.0,0.652582
4,505334.0,0.029191,11802335.0,0.681773
5,288999.0,0.016694,12091334.0,0.698468
6-10,2736862.0,0.158097,14828196.0,0.856565
11-100,2482181.0,0.143386,17310377.0,0.999951
101-1000,819.0,4.7e-05,17311196.0,0.999998
1001+,36.0,2e-06,17311232.0,1.0


In [None]:
dedup_bii["count"].describe()

count    1.731123e+07
mean     5.363876e+00
std      1.282771e+02
min      1.000000e+00
25%      1.000000e+00
50%      2.000000e+00
75%      8.000000e+00
max      5.139480e+05
Name: count, dtype: float64

In [None]:
dedup_bii[dedup_bii["count"] == 513948]

Unnamed: 0,CompanyName,AlternateCompanyName,Addr,City,State,Zip,BusinessPhone,TaxIdNumber,count
6417399,"GORMAN MANUFACTURING COMPANY, INC.",,492 KOLLER ST,SAN FRANCISCO,CA,94110,6505550000,,513948


In [30]:
data = data.sort_values(by = ['CompanyName', 'AlternateCompanyName', 'Addr', 'City', 'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', "HistoryDate"])
dedup_bii = data.drop_duplicates(subset=['CompanyName', 'AlternateCompanyName', 'Addr', 'City', 'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', ], 
                                 keep = "last", ignore_index = True)
print(dedup_bii.shape)

(17311232, 12)


In [36]:
dedup_bii[['AccountNumber', 'CompanyName', 'AlternateCompanyName', 'Addr', 'City',
       'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', 'HistoryDate',]].to_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Data Modeling Clean/final_filtered_remove_dt_dedup_for_ECL.csv", 
             format = '.csv', overwrite = True, index = False)

## Select 5-10Mil sample

In [2]:
data = pd.read_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Data Modeling Clean/goodfile_futher_filtered.parquet", reader = pd.read_parquet)
print(data.shape)

(101152150, 11)


In [37]:
dedup_bii =data.groupby(by = ['CompanyName', 'AlternateCompanyName', 'Addr', 'City', 'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', ])['count'].sum().reset_index(name='count')
fmt = make_format(cuts = [-np.inf, 0, 1, 2, 3, 4, 5, 10, 100, 1000, np.inf])
freq(dedup_bii["count"], format = [fmt])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<= 0,0.0,0.0,0.0,0.0
1,7425137.0,0.42892,7425137.0,0.42892
2,3061059.0,0.176825,10486196.0,0.605745
3,810805.0,0.046837,11297001.0,0.652582
4,505334.0,0.029191,11802335.0,0.681773
5,288999.0,0.016694,12091334.0,0.698468
6-10,2736862.0,0.158097,14828196.0,0.856565
11-100,2482181.0,0.143386,17310377.0,0.999951
101-1000,819.0,4.7e-05,17311196.0,0.999998
1001+,36.0,2e-06,17311232.0,1.0


In [38]:
sample = dedup_bii.sample(n=1_500_000, random_state=1234, ignore_index = True)
print(sample["count"].describe())
freq(sample["count"], format = [fmt])

count    1.500000e+06
mean     5.314587e+00
std      6.979322e+00
min      1.000000e+00
25%      1.000000e+00
50%      2.000000e+00
75%      8.000000e+00
max      1.003000e+03
Name: count, dtype: float64


Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<= 0,0.0,0.0,0.0,0.0
1,642817.0,0.4285447,642817.0,0.428545
2,265138.0,0.1767587,907955.0,0.605303
3,70782.0,0.047188,978737.0,0.652491
4,43771.0,0.02918067,1022508.0,0.681672
5,25083.0,0.016722,1047591.0,0.698394
6-10,237373.0,0.1582487,1284964.0,0.856643
11-100,214964.0,0.1433093,1499928.0,0.999952
101-1000,71.0,4.733333e-05,1499999.0,0.999999
1001+,1.0,6.666667e-07,1500000.0,1.0


In [39]:
## merge back to full input data:
temp = data.merge(sample, on = ['CompanyName', 'AlternateCompanyName', 'Addr', 'City', 'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', ])
print(temp.shape)

(7971880, 13)


In [40]:
freq(temp.HistoryDate.str.slice(0,6))

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
HistoryDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
202107,908157.0,0.11392,908157.0,0.11392
202108,422458.0,0.052994,1330615.0,0.166914
202109,648204.0,0.081311,1978819.0,0.248225
202110,750634.0,0.09416,2729453.0,0.342385
202111,606733.0,0.076109,3336186.0,0.418494
202112,676377.0,0.084845,4012563.0,0.50334
202201,935557.0,0.117357,4948120.0,0.620697
202202,337750.0,0.042368,5285870.0,0.663064
202203,1317915.0,0.16532,6603785.0,0.828385
202204,468558.0,0.058776,7072343.0,0.887161


In [48]:
temp.to_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Data Modeling Clean/final_filtered_remove_dt_for_dt_modeling.csv", 
             format = '.csv', overwrite = True, index = False)

In [28]:
temp = pd.read_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Data Modeling Clean/final_filtered_remove_dt_for_dt_modeling.csv", 
             reader = pd.read_csv)
print(temp.shape)

(7971880, 13)


In [29]:
temp.columns

Index(['AccountNumber', 'CompanyName', 'AlternateCompanyName', 'Addr', 'City',
       'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', 'HistoryDate',
       'sufficient_input', 'count_x', 'count_y'],
      dtype='object')

In [30]:
temp.count_x.describe()

count    7971880.0
mean           1.0
std            0.0
min            1.0
25%            1.0
50%            1.0
75%            1.0
max            1.0
Name: count_x, dtype: float64

In [33]:
temp = temp.fillna("")

In [34]:
dedup_bii =temp.groupby(by = ['CompanyName', 'AlternateCompanyName', 'Addr', 'City', 'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', "HistoryDate"])['count_x'].sum().reset_index(name='count_x')
print(dedup_bii.shape)
fmt = make_format(cuts = [-np.inf, 0, 1, 2, 3, 4, 5, 10, 100, 1000, np.inf])
freq(dedup_bii["count_x"], format = [fmt])

(7213105, 10)


Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
count_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<= 0,0.0,0.0,0.0,0.0
1,6689196.0,0.927367,6689196.0,0.927367
2,459160.0,0.063656,7148356.0,0.991023
3,36265.0,0.005028,7184621.0,0.996051
4,10755.0,0.001491,7195376.0,0.997542
5,4329.0,0.0006,7199705.0,0.998142
6-10,7504.0,0.00104,7207209.0,0.999183
11-100,5852.0,0.000811,7213061.0,0.999994
101-1000,44.0,6e-06,7213105.0,1.0
1001+,0.0,0.0,7213105.0,1.0


In [35]:
dedup_bii['count_x'] = 1
dedup_bii2 =dedup_bii.groupby(by = ['CompanyName', 'AlternateCompanyName', 'Addr', 'City', 'State', 'Zip', 'BusinessPhone', 'TaxIdNumber'])['count_x'].sum().reset_index(name='count_x')
print(dedup_bii2.shape)
fmt = make_format(cuts = [-np.inf, 0, 1, 2, 3, 4, 5, 10, 100, 1000, np.inf])
freq(dedup_bii2["count_x"], format = [fmt])

(1500000, 9)


Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
count_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<= 0,0.0,0.0,0.0,0.0
1,722777.0,0.4818513,722777.0,0.481851
2,252620.0,0.1684133,975397.0,0.650265
3,54926.0,0.03661733,1030323.0,0.686882
4,28978.0,0.01931867,1059301.0,0.706201
5,21594.0,0.014396,1080895.0,0.720597
6-10,216738.0,0.144492,1297633.0,0.865089
11-100,202366.0,0.1349107,1499999.0,0.999999
101-1000,1.0,6.666667e-07,1500000.0,1.0
1001+,0.0,0.0,1500000.0,1.0


In [36]:
dedup_bii2["count_x"].describe()

count    1.500000e+06
mean     4.808737e+00
std      6.341019e+00
min      1.000000e+00
25%      1.000000e+00
50%      2.000000e+00
75%      7.000000e+00
max      1.950000e+02
Name: count_x, dtype: float64

In [37]:
dedup_bii2[dedup_bii2["count_x"] == 195]

Unnamed: 0,CompanyName,AlternateCompanyName,Addr,City,State,Zip,BusinessPhone,TaxIdNumber,count_x
556304,GOOGLE LLC,,1600 AMPHITHEATRE PKWY,MOUNTAIN VIEW,CA,940431351,6502530000.0,,195


## EDA

In [39]:
# data.to_adls("Analytics/RnD Projects/Product RnD/Business/LexisNexis 9999 (SBFE Inquiry POC)/Data Modeling Clean/final_cleaned_filtered_input.parquet",
#              format = ".parquet")
data.columns

Index(['AccountNumber', 'CompanyName', 'AlternateCompanyName', 'Addr', 'City',
       'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', 'HistoryDate',
       'sufficient_input', 'HistoryDateTime'],
      dtype='object')

In [46]:
freq(data["HistoryDate"].str.slice(0,6))

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
HistoryDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
202107,10580351.0,0.113945,10580351.0,0.113945
202108,4961686.0,0.053435,15542037.0,0.167379
202109,7551680.0,0.081327,23093717.0,0.248707
202110,8729360.0,0.09401,31823077.0,0.342717
202111,7075626.0,0.076201,38898703.0,0.418917
202112,7874424.0,0.084803,46773127.0,0.503721
202201,10880052.0,0.117172,57653179.0,0.620893
202202,3967399.0,0.042727,61620578.0,0.663619
202203,15275783.0,0.164512,76896361.0,0.828131
202204,5498433.0,0.059215,82394794.0,0.887346


In [44]:
data["hour"] = data["HistoryDateTime"].str.slice(11, 13)
freq(data["hour"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,7562908.0,0.081448,7562908.0,0.081448
1,9302182.0,0.100179,16865090.0,0.181628
2,5233410.0,0.056361,22098500.0,0.237989
3,4123220.0,0.044405,26221720.0,0.282393
4,1154576.0,0.012434,27376296.0,0.294828
5,7638569.0,0.082263,35014865.0,0.377091
6,6143345.0,0.06616,41158210.0,0.443251
7,2473964.0,0.026643,43632174.0,0.469894
8,5915128.0,0.063703,49547302.0,0.533597
9,13721959.0,0.147778,63269261.0,0.681375


In [48]:
data["count"] = 1

In [49]:
## group by timestamp
stacked_roll_1 =data.groupby(by = ['CompanyName', 'AlternateCompanyName', 'Addr', 'City', 'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', 'HistoryDateTime',])['count'].sum().reset_index(name='count')
freq("count", df = stacked_roll_1, format = [fmt])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<= 0,0.0,0.0,0.0,0.0
1,85048628.0,0.9565666,85048628.0,0.956567
2,3811264.0,0.04286639,88859892.0,0.999433
3,35571.0,0.0004000774,88895463.0,0.999833
4,9447.0,0.0001062532,88904910.0,0.999939
5,2701.0,3.037893e-05,88907611.0,0.99997
6-10,2204.0,2.478903e-05,88909815.0,0.999994
11-100,486.0,5.466183e-06,88910301.0,1.0
101-1000,4.0,4.498916e-08,88910305.0,1.0
1001+,0.0,0.0,88910305.0,1.0


In [53]:
stacked_roll_1[stacked_roll_1["count"] == 117]

Unnamed: 0,CompanyName,AlternateCompanyName,Addr,City,State,Zip,BusinessPhone,TaxIdNumber,HistoryDateTime,count
31892382,GIRL SCOUTS OF GREATER CHICAGO,,650 LAKEVIEW PARKWAY,VERNON HILLS,IL,60061,7244215,,2022-03-01T02:25:13,117


In [51]:
stacked_roll_1["count"].describe()

count    8.891030e+07
mean     1.044370e+00
std      2.208217e-01
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.170000e+02
Name: count, dtype: float64

In [60]:
## group by hour
stacked_roll_1 =data.groupby(by = ['CompanyName', 'AlternateCompanyName', 'Addr', 'City', 'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', 'HistoryDate', "hour"])['count'].sum().reset_index(name='count')
print(stacked_roll_1["count"].describe())
freq("count", df = stacked_roll_1, format = [fmt])

count    8.403028e+07
mean     1.105022e+00
std      1.440820e+00
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      2.114000e+03
Name: count, dtype: float64


Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<= 0,0.0,0.0,0.0,0.0
1,78473087.0,0.9338668,78473087.0,0.933867
2,4843583.0,0.05764092,83316670.0,0.991508
3,385850.0,0.004591797,83702520.0,0.996099
4,102552.0,0.001220417,83805072.0,0.99732
5,42372.0,0.0005042468,83847444.0,0.997824
6-10,92081.0,0.001095807,83939525.0,0.99892
11-100,90444.0,0.001076326,84029969.0,0.999996
101-1000,289.0,3.439236e-06,84030258.0,1.0
1001+,23.0,2.737109e-07,84030281.0,1.0


In [61]:
stacked_roll_1[stacked_roll_1["CompanyName"] == "ISTCMSTEST322"]

Unnamed: 0,CompanyName,AlternateCompanyName,Addr,City,State,Zip,BusinessPhone,TaxIdNumber,HistoryDate,hour,count
37031244,ISTCMSTEST322,,915 E BRETT ST,INGLEWOOD,CA,,0,,20210701,10,4
37031245,ISTCMSTEST322,,915 E BRETT ST,INGLEWOOD,CA,,0,,20210716,8,95
37031246,ISTCMSTEST322,,915 E BRETT ST,INGLEWOOD,CA,,0,,20210716,9,636
37031247,ISTCMSTEST322,,915 E BRETT ST,INGLEWOOD,CA,,0,,20210716,10,658
37031248,ISTCMSTEST322,,915 E BRETT ST,INGLEWOOD,CA,,0,,20210716,11,534
37031249,ISTCMSTEST322,,915 E BRETT ST,INGLEWOOD,CA,,0,,20210716,12,559
37031250,ISTCMSTEST322,,915 E BRETT ST,INGLEWOOD,CA,,0,,20210716,13,588
37031251,ISTCMSTEST322,,915 E BRETT ST,INGLEWOOD,CA,,0,,20210716,14,580
37031252,ISTCMSTEST322,,915 E BRETT ST,INGLEWOOD,CA,,0,,20210716,15,162
37031253,ISTCMSTEST322,,915 E BRETT ST,INGLEWOOD,CA,,0,,20210722,11,293


In [55]:
stacked_roll_1[stacked_roll_1["count"] == 2114]

Unnamed: 0,CompanyName,AlternateCompanyName,Addr,City,State,Zip,BusinessPhone,TaxIdNumber,HistoryDate,hour,count
30271541,GIRL SCOUTS OF GREATER CHICAGO,,650 LAKEVIEW PARKWAY,VERNON HILLS,IL,60061,7244215,,20220301,2,2114


In [56]:
## group by date
stacked_roll_1 =data.groupby(by = ['CompanyName', 'AlternateCompanyName', 'Addr', 'City', 'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', 'HistoryDate'])['count'].sum().reset_index(name='count')
print(stacked_roll_1["count"].describe())
freq("count", df = stacked_roll_1, format = [fmt])

count    8.319994e+07
mean     1.116050e+00
std      4.446888e+00
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.898300e+04
Name: count, dtype: float64


Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<= 0,0.0,0.0,0.0,0.0
1,77155338.0,0.927348,77155338.0,0.927348
2,5292931.0,0.063617,82448269.0,0.990965
3,418393.0,0.005029,82866662.0,0.995994
4,125097.0,0.001504,82991759.0,0.997498
5,49797.0,0.000599,83041556.0,0.998096
6-10,87000.0,0.001046,83128556.0,0.999142
11-100,69085.0,0.00083,83197641.0,0.999972
101-1000,1977.0,2.4e-05,83199618.0,0.999996
1001+,323.0,4e-06,83199941.0,1.0


In [58]:
stacked_roll_1[stacked_roll_1["CompanyName"] == "GIRL SCOUTS OF GREATER CHICAGO"]

Unnamed: 0,CompanyName,AlternateCompanyName,Addr,City,State,Zip,BusinessPhone,TaxIdNumber,HistoryDate,count
29960592,GIRL SCOUTS OF GREATER CHICAGO,,1005 W 178TH ST,HOMEWOOD,IL,60430,8475730500,,20220301,46
29960593,GIRL SCOUTS OF GREATER CHICAGO,,1551 SPENCER ROAD,JOLIET,IL,604338591,3129126373,,20220301,119
29960594,GIRL SCOUTS OF GREATER CHICAGO,,20 S CLARK ST LBBY 2,CHICAGO,IL,606031809,3124162500,,20220316,1
29960595,GIRL SCOUTS OF GREATER CHICAGO,,3155 71ST ST,WOODRIDGE,IL,60517,3124162500,,20220301,58
29960596,GIRL SCOUTS OF GREATER CHICAGO,,650 LAKEVIEW PARKWAY,VERNON HILLS,IL,60061,7244215,,20211223,1
29960597,GIRL SCOUTS OF GREATER CHICAGO,,650 LAKEVIEW PARKWAY,VERNON HILLS,IL,60061,7244215,,20220301,2114
29960598,GIRL SCOUTS OF GREATER CHICAGO,,8699 BROADWAY,MERRILLVILLE,IN,464107012,3124162500,,20220301,53


In [59]:
stacked_roll_1[stacked_roll_1["count"] == 18983]

Unnamed: 0,CompanyName,AlternateCompanyName,Addr,City,State,Zip,BusinessPhone,TaxIdNumber,HistoryDate,count
36642797,ISTCMSTEST322,,915 E BRETT ST,INGLEWOOD,CA,,0,,20210723,18983


In [None]:
## group by date
stacked_roll_1 =data.groupby(by = ['CompanyName', 'AlternateCompanyName', 'Addr', 'City', 'State', 'Zip', 'BusinessPhone', 'TaxIdNumber', 'HistoryDate'])['count'].sum().reset_index(name='count')
print(stacked_roll_1["count"].describe())
freq("count", df = stacked_roll_1, format = [fmt])