In [15]:
import pandas as pd
import sqlite3
import numpy as np
from tqdm.notebook import tqdm
import os.path
from os import path


# 1 DocGraph_Hop_Teaming_2018

## 1.1 Read sample hop data

In [2]:
# Sample: read the first 10,000 rows of the hop file, then keep only rows with
# at least 25 transactions and an average wait of less than 90 days.
hop = (
    pd.read_csv('../data/DocGraph_Hop_Teaming_2018.csv', nrows=10000)
    .loc[lambda rows: (rows['transaction_count'] >= 25) & (rows['average_day_wait'] < 90)]
)


In [4]:
# Display the filtered hop sample.
hop

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
38,1508205808,1730166224,48,64,14.156,35.859
89,1508147810,1730166844,41,72,30.556,30.859
90,1508061151,1730166851,42,78,30.974,27.043
91,1508000332,1730166851,41,73,32.877,29.256
99,1508845850,1730166943,23,61,9.361,36.323
...,...,...,...,...,...,...
9889,1508848029,1730475542,48,73,29.425,56.513
9904,1508882432,1730475757,49,53,38.151,60.661
9939,1508097759,1730476391,37,51,46.039,52.430
9940,1508008228,1730476391,19,51,25.314,40.784


## 1.2 Create hop_db database and load hop table

In [22]:
# Load the full hop file into the `hop` table of hop_db.sqlite, 10,000 rows at a time,
# keeping only rows with >= 25 transactions and an average wait under 90 days.
# NOTE(review): the load only runs when the database file already exists, and
# `if_exists='append'` adds the rows again on every re-run -- confirm both are intended.
if path.exists("../data/hop_db.sqlite"):
    db = sqlite3.connect('../data/hop_db.sqlite')
    try:
        # `with db` commits on success and rolls back on error; it does NOT close the connection.
        with db:
            for chunk in tqdm(pd.read_csv('../data/DocGraph_Hop_Teaming_2018.csv', chunksize = 10000)):
                # Each comparison needs its own parentheses: `&` binds tighter than `<`,
                # so `(a >= 25) & b < 90` would be evaluated as `((a >= 25) & b) < 90`.
                chunk = chunk[(chunk['transaction_count'] >= 25) & (chunk['average_day_wait'] < 90)]
                chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]      # Clean up the column names
                chunk.to_sql('hop', db, if_exists = 'append', index = False)
    finally:
        db.close()

0it [00:00, ?it/s]

# 2 Read NPPES file and load npi table  

## 2.1 List the columns to read from the NPI file

In [4]:
# Columns to read from the NPPES file: identity and practice-location fields,
# followed by the 15 taxonomy codes and then the 15 primary-taxonomy switches.
result_cols = [
    'NPI',
    'Entity Type Code',
    'Provider Organization Name (Legal Business Name)',
    'Provider Last Name (Legal Name)',
    'Provider First Name',
    'Provider Middle Name',
    'Provider Name Prefix Text',
    'Provider Name Suffix Text',
    'Provider Credential Text',
    'Provider First Line Business Practice Location Address',
    'Provider Second Line Business Practice Location Address',
    'Provider Business Practice Location Address City Name',
    'Provider Business Practice Location Address State Name',
    'Provider Business Practice Location Address Postal Code',
    *[f'Healthcare Provider Taxonomy Code_{slot}' for slot in range(1, 16)],
    *[f'Healthcare Provider Primary Taxonomy Switch_{slot}' for slot in range(1, 16)],
]

## 2.2 Read sample data

In [8]:
# Preview the first 1,000 rows of the NPPES file, limited to the columns in result_cols.
df = pd.read_csv(
    '../NPPES_Data_Dissemination_February_2023/npidata_pfile_20050523-20230212.csv',
    usecols=result_cols,
    nrows=1000,
)
df

Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider Credential Text,Provider First Line Business Practice Location Address,...,Healthcare Provider Taxonomy Code_11,Healthcare Provider Primary Taxonomy Switch_11,Healthcare Provider Taxonomy Code_12,Healthcare Provider Primary Taxonomy Switch_12,Healthcare Provider Taxonomy Code_13,Healthcare Provider Primary Taxonomy Switch_13,Healthcare Provider Taxonomy Code_14,Healthcare Provider Primary Taxonomy Switch_14,Healthcare Provider Taxonomy Code_15,Healthcare Provider Primary Taxonomy Switch_15
0,1740284231,,,,,,,,,,...,,,,,,,,,,
1,1346245800,,,,,,,,,,...,,,,,,,,,,
2,1487650776,,,,,,,,,,...,,,,,,,,,,
3,1033113022,,,,,,,,,,...,,,,,,,,,,
4,1043216138,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1306849922,1.0,,PENRY,DAVID,V,,,APRN,"NP RESOURCES, LLC.",...,,,,,,,,,,
996,1124021746,1.0,,COTTER,JOHN,BURLEY,DR.,,MD,1270 PEACH ST,...,,,,,,,,,,
997,1851394472,1.0,,PETERS,DAVID,S,,,MD,112 S NORTHWEST HWY,...,,,,,,,,,,
998,1487657003,1.0,,CULALA,RYAN,MICHAEL,,,PHARM.D.,11521 NE 128TH ST,...,,,,,,,,,,


In [9]:
# Helper that restores the leading zeroes missing from zip codes
# and trims 9-digit (ZIP+4) values down to the 5-digit zip.

def zip_zeroes(x):
    """Normalize a postal code to a 5-character, zero-padded string.

    Parameters
    ----------
    x : str, number or None
        Raw postal code. Non-string values are converted with ``str``.

    Returns
    -------
    str or float
        * more than 5 characters: left-padded with zeroes to 9 characters,
          then the first 5 characters are returned;
        * 1-5 characters: left-padded with zeroes to 5 characters;
        * missing input (``None``, NaN, an empty/blank string, or the text
          ``'nan'`` / ``'None'`` / ``'<NA>'`` that ``astype(str)`` produces for
          missing values): ``np.nan``.
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return np.nan

    x = str(x).strip()
    # Without this check a stringified NaN would be padded into the bogus zip '00nan'.
    if not x or x.lower() in ('nan', 'none', '<na>'):
        return np.nan

    if len(x) > 5:
        return x.zfill(9)[:5]
    return x.zfill(5)

In [16]:

# Open a connection to the hop_db SQLite database file.
db = sqlite3.connect('../data/hop_db.sqlite')

In [17]:
# Create the providers (`npi`) table from the NPPES data and append it to the database.
# Relies on `db` (open SQLite connection), `result_cols` and `zip_zeroes` from earlier cells.
# NOTE: `if_exists='append'` means re-running this cell adds every row a second time.
postal_col = 'Provider Business Practice Location Address Postal Code'

for chunk in tqdm(pd.read_csv('../NPPES_Data_Dissemination_February_2023/npidata_pfile_20050523-20230212.csv', 
                              usecols = result_cols,
                              # Read postal codes as text. dtypes are inferred per chunk, so an
                              # all-numeric chunk with gaps would otherwise become float and
                              # turn into strings like '12345.0'.
                              dtype = {postal_col: str},
                              chunksize = 100000,
                              low_memory = False)):
    
    # For providers that have indicated a primary taxonomy code, pull that code into a new column.
    # The column starts as object dtype so that string codes can be assigned into it.
    # If several switches are 'Y', the highest-numbered one wins.
    chunk['taxonomy_code'] = pd.Series(np.nan, index = chunk.index, dtype = object)
    for n in range(1, 16):
        chunk.loc[chunk[f'Healthcare Provider Primary Taxonomy Switch_{n}'] == 'Y', 
                       'taxonomy_code'] = chunk[f'Healthcare Provider Taxonomy Code_{n}']
        
    # For providers that do not indicate a primary taxonomy code, pull the code from the first taxonomy column
    chunk.loc[chunk['taxonomy_code'].isna(), 'taxonomy_code'] = chunk['Healthcare Provider Taxonomy Code_1']
    
    # Drop the 15 taxonomy code / primary switch column pairs, which are no longer needed
    chunk = chunk.drop(columns = [f'Healthcare Provider {kind}_{slot}'
                                  for slot in range(1, 16)
                                  for kind in ('Taxonomy Code', 'Primary Taxonomy Switch')])
     
    # Clean up the zip code column: put missing leading zeroes back and cut 9-digit entries down to 5.
    # na_action='ignore' leaves missing postal codes as NaN instead of stringifying them to 'nan'.
    chunk[postal_col] = chunk[postal_col].map(zip_zeroes, na_action = 'ignore')
    
    # Clean up the column names
    chunk.columns = [x.lower()
                     .replace('provider ', '')
                     .replace('business ', '')
                     .replace('practice ', '')
                     .replace(' text', '')
                     .replace(' (legal name)', '')
                     .replace(' ', '_') for x in chunk.columns]                  
   
    # append to provider table
    chunk.to_sql('npi', db, if_exists = 'append', index = False)   

0it [00:00, ?it/s]

In [25]:
# Close the database connection.
db.close()

# 3 Taxonomy file

## 3.1 Sample read

In [23]:
# Read the NUCC taxonomy CSV into a DataFrame.
tax_deets = pd.read_csv('../data/nucc_taxonomy_230.csv')


## 3.2 Load table taxonomy

In [30]:
# Load the NUCC taxonomy file into the `taxonomy` table of hop_db.sqlite.
# The cell opens (and closes) its own connection so it does not depend on a `db`
# left over from another cell.
# NOTE: `if_exists='append'` means re-running this cell adds the rows a second time.
db = sqlite3.connect('../data/hop_db.sqlite')
try:
    # `with db` commits on success and rolls back on error; it does NOT close the connection.
    with db:
        for chunk in tqdm(pd.read_csv('../data/nucc_taxonomy_230.csv', chunksize = 10000)):
            chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]      # Clean up the column names
            # make the taxonomy code column name match the taxonomy code column name in the provider table
            chunk = chunk.rename(columns = {'code' : 'taxonomy_code'})

            chunk.to_sql('taxonomy', db, if_exists = 'append', index = False)
finally:
    db.close()

0it [00:00, ?it/s]

# 4 ZIP_CBSA file

## 4.1 Read sample ZIP_CBSA

In [32]:
# Read the ZIP_CBSA Excel file into a DataFrame.
df = pd.read_excel('../data/ZIP_CBSA_122021.xlsx')

In [33]:
# Display the ZIP_CBSA data.
df

Unnamed: 0,zip,cbsa,usps_zip_pref_city,usps_zip_pref_state,res_ratio,bus_ratio,oth_ratio,tot_ratio
0,683,41900,SAN GERMAN,PR,0.999842,1.0,1.0,0.999855
1,683,32420,SAN GERMAN,PR,0.000158,0.0,0.0,0.000145
2,923,41980,SAN JUAN,PR,1.000000,1.0,1.0,1.000000
3,1010,44140,BRIMFIELD,MA,0.976896,1.0,1.0,0.977816
4,1010,49340,BRIMFIELD,MA,0.023104,0.0,0.0,0.022184
...,...,...,...,...,...,...,...,...
47479,60684,16980,CHICAGO,IL,0.000000,1.0,0.0,1.000000
47480,33945,15980,PINELAND,FL,0.000000,0.0,1.0,1.000000
47481,78144,99999,PANNA MARIA,TX,0.000000,1.0,0.0,1.000000
47482,12257,10580,ALBANY,NY,0.000000,1.0,0.0,1.000000


## 4.2 Load zip_cbsa table

In [41]:
# Load the ZIP_CBSA workbook into the `zip_cbsa` table of hop_db.sqlite.
# The `with` block commits (or rolls back) the transaction; it does not close `db`.
with sqlite3.connect('../data/hop_db.sqlite') as db:
    df_zip = (
        pd.read_excel('../data/ZIP_CBSA_122021.xlsx')
        .rename(columns={'usps_zip_pref_city': 'city', 'usps_zip_pref_state': 'state'})
        # zip is converted to text and left-padded to 5 characters to restore leading zeroes
        .assign(zip=lambda frame: frame['zip'].astype(str).str.zfill(5))
    )
    df_zip.to_sql('zip_cbsa', db, if_exists='append', index=False)