In [2]:
import pandas as pd
import numpy as np
import sqlite3
from tqdm.notebook import tqdm
import os 

In [2]:
# Only a subset of the NPPES npidata columns is needed. Two small helper CSVs
# each hold one list of column names in their first row: the columns to read
# from the npidata file, and the columns to drop again after processing.
save_cols = pd.read_csv(
    './NPPES_Data_Dissemination_February_2023/new_columns.csv',
    header=None,
)
save_cols_list = save_cols.iloc[0].tolist()

drop_cols = pd.read_csv(
    './NPPES_Data_Dissemination_February_2023/drop_cols.csv',
    header=None,
)
drop_cols_list = drop_cols.iloc[0].tolist()

In [3]:
# Create a function to deal with the leading zeroes that are missing from zip codes

def zip_zeroes(x):
    """Normalise a postal code to a 5-character, zero-padded ZIP string.

    Parameters
    ----------
    x : str, int, float or missing value
        Raw postal code. Values longer than five characters are treated as
        ZIP+4 codes.

    Returns
    -------
    str or float
        * more than 5 characters: left-padded to 9 characters, then the first
          5 characters are returned;
        * 1 to 5 characters: left-padded with zeroes to 5 characters;
        * missing input (``None``, NaN, ``pd.NA``, an empty/blank string, or
          the literal text ``'nan'``): ``np.nan``.
    """
    # Missing values must stay missing instead of crashing on len().
    if pd.isna(x):
        return np.nan

    # A ZIP column parsed as float (e.g. 2101.0) should pad like the integer 2101.
    if isinstance(x, float) and x.is_integer():
        x = int(x)

    x = str(x).strip()

    # ``Series.astype(str)`` turns NaN into the text 'nan'; without this check
    # such values would come back as the bogus ZIP code '00nan'.
    if not x or x.lower() == 'nan':
        return np.nan

    if len(x) > 5:
        return x.zfill(9)[:5]
    return x.zfill(5)

In [4]:
def create_database(save_cols_list, drop_cols_list):
    """Build ``./data/hop_db.sqlite`` from the four source CSV files.

    Every CSV is streamed in chunks and appended to its table:

    * ``hop``         - DocGraph Hop Teaming 2018 rows with
      ``transaction_count >= 25`` and ``average_day_wait < 90``.
    * ``npidata``     - NPPES provider rows with one derived ``taxonomy_code``
      column, 5-character postal codes and simplified column names.
    * ``taxonomy``    - NUCC taxonomy rows, ``code`` renamed to
      ``taxonomy_code``.
    * ``zip_to_cbsa`` - ZIP-to-CBSA rows with ``zip`` zero-padded to 5 digits.

    Parameters
    ----------
    save_cols_list : list
        Column names to read from the NPPES npidata file (``usecols``). The
        code below accesses the 15 ``Healthcare Provider Primary Taxonomy
        Switch_N`` / ``Healthcare Provider Taxonomy Code_N`` columns and the
        practice-location postal code column, so they must be included.
    drop_cols_list : list
        Columns dropped from each NPPES chunk once ``taxonomy_code`` exists.

    Notes
    -----
    Tables are written with ``if_exists='append'``, so calling this against an
    already populated database adds the rows a second time.

    The connection is always closed. If the build fails and the database file
    did not exist before this call, the partial file is deleted so that an
    "only build when the file is missing" check does not mistake it for a
    finished database.
    """
    db_path = './data/hop_db.sqlite'
    zip_col = 'Provider Business Practice Location Address Postal Code'

    # Record this before connecting: sqlite3.connect() creates the file.
    db_existed = os.path.isfile(db_path)
    db = sqlite3.connect(db_path)
    build_finished = False

    try:
        ##### Table 1
        for chunk in tqdm(pd.read_csv('./DocGraph_Hop_Teaming_2018_Commercial/DocGraph_Hop_Teaming_2018.csv', chunksize=100000)):
            # Prefiltering accidental referrals
            chunk = chunk[(chunk['transaction_count'] >= 25) & (chunk['average_day_wait'] < 90)]
            chunk.to_sql('hop', db, if_exists='append', index=False)

        ##### Table 2
        for chunk in tqdm(pd.read_csv('./NPPES_Data_Dissemination_February_2023/npidata_pfile_20050523-20230212.csv', dtype=str, chunksize=100000, usecols=save_cols_list)):

            # For providers that have indicated a primary taxonomy code, pull
            # that code into a new column. The column starts as object dtype
            # because it receives strings; starting from a float NaN column
            # forces an incompatible-dtype upcast on the first assignment.
            chunk['taxonomy_code'] = pd.Series(np.nan, index=chunk.index, dtype=object)

            # Later slots overwrite earlier ones when several are flagged 'Y'.
            for n in range(1, 16):
                is_primary = chunk[f'Healthcare Provider Primary Taxonomy Switch_{n}'] == 'Y'
                chunk.loc[is_primary, 'taxonomy_code'] = chunk[f'Healthcare Provider Taxonomy Code_{n}']

            # For providers that do not indicate a primary taxonomy code, pull
            # the code from the first taxonomy column
            chunk.loc[chunk['taxonomy_code'].isna(), 'taxonomy_code'] = chunk['Healthcare Provider Taxonomy Code_1']

            # Drop the columns no longer needed
            chunk = chunk.drop(columns=drop_cols_list)

            # Clean up the zip code column: restore missing leading zeroes and
            # cut 9-digit entries down to 5. The file is read with dtype=str,
            # so present values are already strings; na_action='ignore' keeps
            # missing codes missing instead of converting them to the text
            # 'nan' (which would be stored as '00nan').
            chunk[zip_col] = chunk[zip_col].map(zip_zeroes, na_action='ignore')

            # Clean up the column names
            chunk.columns = [x.lower()
                     .replace('provider ', '')
                     .replace('business ', '')
                     .replace('practice ', '')
                     .replace(' text', '')
                     .replace(' (legal name)', '')
                     .replace(' ', '_') for x in chunk.columns]

            # append to provider table
            chunk.to_sql('npidata', db, if_exists='append', index=False)

        # 2 auxiliary tables from other links

        ##### Table 3
        for chunk in tqdm(pd.read_csv('./data/nucc_taxonomy_230.csv', encoding='unicode_escape', chunksize=1000)):
            # make column names consistent with formatting of other tables
            chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]
            # make the taxonomy code column name match the taxonomy code
            # column name in the provider table
            chunk = chunk.rename(columns={'code': 'taxonomy_code'})
            chunk.to_sql('taxonomy', db, if_exists='append', index=False)

        ##### Table 4
        for chunk in tqdm(pd.read_csv('./data/ZIP_CBSA.csv', chunksize=10000)):
            # simplify column names
            chunk = chunk.rename(columns={'usps_zip_pref_city': 'city',
                                          'usps_zip_pref_state': 'state'})
            # get the leading zeroes back in place
            chunk['zip'] = chunk['zip'].astype(str).str.zfill(5)
            chunk.to_sql('zip_to_cbsa', db, if_exists='append', index=False)

        build_finished = True
    finally:
        db.close()
        # Remove a half-built database, but never a file that existed before
        # this call.
        if not build_finished and not db_existed and os.path.isfile(db_path):
            os.remove(db_path)

In [5]:

# The database is large, so it is only built when the file is not there yet.
if not os.path.isfile('./data/hop_db.sqlite'):
    create_database(save_cols_list=save_cols_list, drop_cols_list=drop_cols_list)


0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [3]:
db = sqlite3.connect('./data/hop_db.sqlite')

##### Table 5, the community data from the neo4j algorithm
# This cell is not protected by the "build only once" check, so it runs on
# every execution of the notebook. The first chunk therefore replaces any
# existing 'npi_community' table and the remaining chunks append to it;
# appending every chunk would duplicate all rows on each re-run.
try:
    for chunk_number, chunk in enumerate(tqdm(pd.read_csv('./data/npi_community.csv', chunksize=1000))):
        chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]
        chunk.to_sql('npi_community', db,
                     if_exists='replace' if chunk_number == 0 else 'append',
                     index=False)
finally:
    # Close the connection even if reading or writing a chunk fails.
    db.close()

0it [00:00, ?it/s]