In [3]:
# Libraries needed: 
import time
import pickle
import pandas as pd
import numpy as np
import math

# internal tools
from dsgtools.reporting import make_format
from dsgtools.reporting import TableWriter
from dsgtools.reporting import freq
from dsgtools.reporting import bivariate
from dsgtools.reporting import col_summary
from dsgtools import azure

## Sample preparation

#### SBFE Sample

In [4]:
path = "Analytics/Personal Folders/liuwei01/2023/ARMBS_ticket/2390_SOS/combined_prod_liuwei01_37119_customer_input_W20230221-140008.csv"
SBFE_combined = pd.read_adls(path, reader = pd.read_csv,  encoding='iso-8859-1', dtype = str)
print(SBFE_combined.shape)

(981928, 42)


In [5]:
SBFE_combined["blank_ct"] = SBFE_combined[['businessname', 'businessaddress', 'businesscity', 'businessstate']].isnull().sum(axis = 1)
freq(SBFE_combined["blank_ct"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
blank_ct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,891481.0,0.907888,891481.0,0.907888
1,33132.0,0.033742,924613.0,0.94163
2,41404.0,0.042166,966017.0,0.983796
3,3845.0,0.003916,969862.0,0.987712
4,12066.0,0.012288,981928.0,1.0


In [6]:
SBFE_combined = SBFE_combined[SBFE_combined["blank_ct"] == 0]
keep = ['transactionid', 'dateadded', 'businessname', 'businessaddress', 'businesscity','businessstate', 'businesszip', 'businessphone',]
SBFE_combined = SBFE_combined[keep]
SBFE_combined = SBFE_combined.sort_values(by = ['businessname', 'businessaddress', 'businesscity','businessstate', 'dateadded'])
SBFE_combined = SBFE_combined.drop_duplicates(subset = ['businessname', 'businessaddress', 'businesscity','businessstate'], 
                                              keep = "last", ignore_index = True)
print(SBFE_combined.shape)

(795958, 8)


In [7]:
freq(SBFE_combined["dateadded"].astype(str).str.slice(0, 6))

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
dateadded,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
202210,167259.0,0.210135,167259.0,0.210135
202211,167168.0,0.210021,334427.0,0.420157
202212,158105.0,0.198635,492532.0,0.618791
202301,187094.0,0.235055,679626.0,0.853847
202302,116332.0,0.146153,795958.0,1.0


In [17]:
SBFE_combined = SBFE_combined[SBFE_combined["dateadded"].astype(str).str.slice(0, 6) != "202210"]
temp_sample = SBFE_combined.sample(500_000, replace = False, random_state = 0, ignore_index = True)
print(temp_sample.shape)

(500000, 8)


In [18]:
temp_sample["business_fein"] = ""
temp_sample["source"] = "SBFE_combined"

#### BIID Sample

In [9]:
path = "Analytics/Personal Folders/liuwei01/2023/ARMBS_ticket/2390_SOS/prod_liuwei01_37109_biid2_input_W20230221-133008.csv"
biid = pd.read_adls(path, reader = pd.read_csv,  encoding='iso-8859-1', dtype = str)
print(biid.shape)

(7411276, 93)


In [11]:
keep = ['transaction_id', 'datetime', 'incompanyname', 'incompanystreetaddress', 'incompanycity','incompanystate', 'incompanyzip5', 'incompanyphone', "incompanyfein"]
biid = biid[keep]
biid.columns = ['transactionid', 'dateadded', 'businessname', 'businessaddress', 'businesscity','businessstate', 'businesszip', 'businessphone', "business_fein"]
biid["blank_ct"] = biid[['businessname', 'businessaddress', 'businesscity', 'businessstate']].isnull().sum(axis = 1)
freq(biid["blank_ct"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  biid["blank_ct"] = biid[['businessname', 'businessaddress', 'businesscity', 'businessstate']].isnull().sum(axis = 1)


Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
blank_ct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,6978629.0,0.9416231,6978629.0,0.941623
1,13883.0,0.001873227,6992512.0,0.943496
2,7477.0,0.001008868,6999989.0,0.944505
3,5.0,6.746477e-07,6999994.0,0.944506
4,411282.0,0.05549409,7411276.0,1.0


In [12]:
biid = biid[biid["blank_ct"] == 0]
biid = biid.sort_values(by = ['businessname', 'businessaddress', 'businesscity','businessstate', 'dateadded'])
biid = biid.drop_duplicates(subset = ['businessname', 'businessaddress', 'businesscity','businessstate'], keep = "last", ignore_index = True)
print(biid.shape)

(5616978, 10)


In [13]:
freq(biid["dateadded"].astype(str).str.slice(0, 6))

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
dateadded,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
202210,1178829.0,0.209869,1178829.0,0.209869
202211,1143592.0,0.203596,2322421.0,0.413464
202212,1140410.0,0.203029,3462831.0,0.616494
202301,1308767.0,0.233002,4771598.0,0.849496
202302,845380.0,0.150504,5616978.0,1.0


In [16]:
biid = biid[biid["dateadded"].astype(str).str.slice(0, 6) == "202302"]
temp_sample_biid = biid.sample(500_000, replace = False, random_state = 0, ignore_index = True)
print(temp_sample_biid.shape)
temp_sample_biid["source"] = "IIDBv2"

(500000, 10)


#### Final Sample

In [19]:
final = pd.concat([temp_sample, temp_sample_biid], ignore_index= True)
print(final.shape)
freq(final.source)

(1000000, 11)


Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
IIDBv2,500000.0,0.5,500000.0,0.5
SBFE_combined,500000.0,0.5,1000000.0,1.0


In [22]:
final.to_adls("Analytics/RnD Projects/Product RnD/Business/_ARMBS_Tickets/2023/ARMBS_2390/sbfe_combined_biid_1mil_2390_input.parquet", format = '.parquet', overwrite = True)