## Import Libraries

In [1]:
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm
import numpy as np

## Hop Teaming Data

In [2]:
# Read only the first 100,000 rows of the hop teaming CSV as a sample to explore
hop_sample = pd.read_csv('../data/DocGraph_Hop_Teaming_2018.csv', nrows = 100000)

In [3]:
# Summarize the sample: column names, non-null counts, dtypes and memory usage
hop_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   from_npi           100000 non-null  int64  
 1   to_npi             100000 non-null  int64  
 2   patient_count      100000 non-null  int64  
 3   transaction_count  100000 non-null  int64  
 4   average_day_wait   100000 non-null  float64
 5   std_day_wait       100000 non-null  float64
dtypes: float64(2), int64(4)
memory usage: 4.6 MB


In [4]:
# Try out the pre-filter on the sample: keep rows with at least 25 transactions
# and an average_day_wait below 90 (the frame itself is not modified)
enough_transactions = hop_sample['transaction_count'] >= 25
short_wait = hop_sample['average_day_wait'] < 90
hop_sample.loc[enough_transactions & short_wait]

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1508062167,1730166109,350,370,53.922,72.612
1,1508065640,1730166109,25,25,49.800,55.006
6,1508076001,1730166109,46,48,72.625,83.263
7,1508085911,1730166125,58,67,23.925,43.923
10,1508167040,1730166125,51,51,28.196,52.876
...,...,...,...,...,...,...
99986,1508803701,1780756486,46,95,20.600,30.382
99994,1508027491,1780756924,38,39,55.359,61.435
99995,1508859679,1780756957,52,58,60.121,66.038
99997,1508849605,1780756957,25,39,29.513,34.073


# Open (or create) the SQLite database file that holds every table built in this notebook
db = sqlite3.connect('../data/hop_db.sqlite')

# Stream the full hop teaming CSV in 10,000-row chunks and load it into the 'hop' table
hop_reader = pd.read_csv('../data/DocGraph_Hop_Teaming_2018.csv', chunksize = 10000)
for hop_chunk in tqdm(hop_reader):
    # Pre-filter likely "accidental" referrals: require at least 25 transactions
    # and an average_day_wait below 90
    enough_transactions = hop_chunk['transaction_count'] >= 25
    short_wait = hop_chunk['average_day_wait'] < 90
    kept_rows = hop_chunk.loc[enough_transactions & short_wait]
    # Append the surviving rows to the 'hop' table
    kept_rows.to_sql('hop', db, if_exists = 'append', index = False)

## NPI/NPPES Data

In [6]:
# Columns needed for our project: provider identity and practice-location fields,
# followed by the taxonomy code / primary-switch pair for each of the 15 taxonomy slots
_identity_and_location_cols = [
    'NPI',
    'Entity Type Code',
    'Provider Organization Name (Legal Business Name)',
    'Provider Last Name (Legal Name)',
    'Provider First Name',
    'Provider Middle Name',
    'Provider Name Prefix Text',
    'Provider Name Suffix Text',
    'Provider Credential Text',
    'Provider First Line Business Practice Location Address',
    'Provider Second Line Business Practice Location Address',
    'Provider Business Practice Location Address City Name',
    'Provider Business Practice Location Address State Name',
    'Provider Business Practice Location Address Postal Code',
]

# Code_1, Switch_1, Code_2, Switch_2, ... Code_15, Switch_15 (same order as the file header pairs)
_taxonomy_slot_cols = [
    column
    for slot in range(1, 16)
    for column in (f'Healthcare Provider Taxonomy Code_{slot}',
                   f'Healthcare Provider Primary Taxonomy Switch_{slot}')
]

select_cols = _identity_and_location_cols + _taxonomy_slot_cols

In [33]:
# Pull a sample of the data to explore.
# Only the columns in select_cols are read; file lines 1-599 are skipped (line 0, the
# header, is kept) and the next 100,000 rows are loaded.
npi_sample = pd.read_csv('../data/npidata_pfile_20050523-20230212.csv',
                         usecols = select_cols,
                         skiprows = range(1, 600),
                         nrows = 100000)

  npi_sample = pd.read_csv('../data/npidata_pfile_20050523-20230212.csv',


In [34]:
# Summarize the sample: column names, non-null counts and dtypes
npi_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 44 columns):
 #   Column                                                   Non-Null Count   Dtype  
---  ------                                                   --------------   -----  
 0   NPI                                                      100000 non-null  int64  
 1   Entity Type Code                                         100000 non-null  int64  
 2   Provider Organization Name (Legal Business Name)         0 non-null       float64
 3   Provider Last Name (Legal Name)                          99998 non-null   object 
 4   Provider First Name                                      100000 non-null  object 
 5   Provider Middle Name                                     77258 non-null   object 
 6   Provider Name Prefix Text                                56006 non-null   object 
 7   Provider Name Suffix Text                                3123 non-null    object 
 8   Provider Creden

In [35]:
# How many providers don't indicate a primary taxonomy code?
# Per the author's note, a switch value of 'X' marks "not indicated" (NOTE(review): confirm
# against the NPPES data dictionary). Count each value of the first switch column.
npi_sample['Healthcare Provider Primary Taxonomy Switch_1'].value_counts()
# Author's observation from looking at a few chunks: 'X' appears to be around 4% of rows

Y    94361
X     4935
N      704
Name: Healthcare Provider Primary Taxonomy Switch_1, dtype: int64

In [36]:
# For providers that have indicated a primary taxonomy code, pull that code into a new column.
# The column starts as object dtype (all NaN) so the string codes can be written into it
# directly; starting from a float64 NaN column forces an incompatible-dtype upcast, which
# recent pandas versions warn about and plan to disallow.
npi_sample['taxonomy_code'] = pd.Series(np.nan, index = npi_sample.index, dtype = object)

for n in range(1, 16):
    # Rows whose n-th switch is 'Y' take the n-th taxonomy code.
    # If more than one slot is flagged 'Y', the highest-numbered slot wins (last write).
    is_primary = npi_sample[f'Healthcare Provider Primary Taxonomy Switch_{n}'] == 'Y'
    npi_sample.loc[is_primary, 'taxonomy_code'] = npi_sample[f'Healthcare Provider Taxonomy Code_{n}']

In [37]:
# Providers with no slot flagged as primary still have an empty taxonomy_code;
# fall back to whatever is in the first taxonomy code column for those rows
no_primary_flag = npi_sample['taxonomy_code'].isna()
first_slot_code = npi_sample['Healthcare Provider Taxonomy Code_1']
npi_sample.loc[no_primary_flag, 'taxonomy_code'] = first_slot_code

In [38]:
# The 15 code/switch column pairs are redundant once taxonomy_code is populated;
# build their names instead of listing all 30 by hand, then drop them
taxonomy_slot_cols = [
    f'Healthcare Provider {field}_{slot}'
    for slot in range(1, 16)
    for field in ('Taxonomy Code', 'Primary Taxonomy Switch')
]
npi_sample = npi_sample.drop(columns = taxonomy_slot_cols)

In [39]:
# Create a function to deal with the leading zeroes that are missing from zip codes

def zip_zeroes(x):
    """Normalize a postal code to a zero-padded 5-character string.

    Parameters
    ----------
    x : str or number or None
        Raw postal code. Usually the result of ``Series.astype(str)``, so it may be
        missing its leading zeroes, be a 9-digit ZIP+4, be the text ``'nan'`` for a
        missing value, or carry a trailing ``'.0'`` if the column was parsed as float.

    Returns
    -------
    str or float
        The first five characters of the code after restoring leading zeroes, or
        ``np.nan`` when the input is empty or represents a missing value.
    """
    # Genuine missing values never reach the string logic
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return np.nan

    text = str(x).strip()

    # A zip column parsed as float stringifies as e.g. '70131234.0'; drop the artifact
    if text.endswith('.0'):
        text = text[:-2]

    # astype(str) renders missing values as text; treat those (and empty strings) as missing
    if not text or text.lower() in ('nan', 'none', '<na>'):
        return np.nan

    if len(text) > 5:
        # Longer than 5: assume a 9-digit code that may have lost leading zeroes; keep the first 5
        return text.zfill(9)[:5]
    # 5 or fewer characters: restore leading zeroes
    return text.zfill(5)

In [40]:
# apply function to the zip code column.
# astype(str) turns a missing value into the literal text 'nan', so rows that were
# missing before the conversion are masked back to NaN afterwards.
postal_col = 'Provider Business Practice Location Address Postal Code'
raw_postal = npi_sample[postal_col]
npi_sample[postal_col] = raw_postal.astype(str).apply(zip_zeroes).where(raw_postal.notna())

In [41]:
# Shorten the column names: lowercase them, strip the repetitive words, then snake_case.
# The replacements are applied in this exact order for every column name.
name_replacements = (
    ('provider ', ''),
    ('business ', ''),
    ('practice ', ''),
    (' text', ''),
    (' (legal name)', ''),
    (' ', '_'),
)

tidy_names = []
for original_name in npi_sample.columns:
    tidy = original_name.lower()
    for fragment, substitute in name_replacements:
        tidy = tidy.replace(fragment, substitute)
    tidy_names.append(tidy)

npi_sample.columns = tidy_names

In [42]:
# Display the cleaned sample to check the renamed columns, zip codes and taxonomy_code
npi_sample

Unnamed: 0,npi,entity_type_code,organization_name,last_name,first_name,middle_name,name_prefix,name_suffix,credential,first_line_location_address,second_line_location_address,location_address_city_name,location_address_state_name,location_address_postal_code,taxonomy_code
0,1629071188,1,,HANNA,RUBA,H.,,,MD,1033 ROUTE 46,,CLIFTON,NJ,07013,208000000X
1,1447253901,1,,KATTNER,PAUL,FREDERICK,DR.,,"DDS, MS",1300 GOLF RD,,WAUKEGAN,IL,60087,1223X0400X
2,1265435721,1,,LAZARIS,EVYENIA,M.,,,MD,1033 ROUTE 46,,CLIFTON,NJ,07013,208000000X
3,1891798351,1,,MEDITERRANEO,SUSAN,,,,MD,1033 ROUTE 46,,CLIFTON,NJ,07013,208000000X
4,1700889268,1,,NIZIOL,JOHN,A.,,,MD,1033 ROUTE 46,,CLIFTON,NJ,07013,208000000X
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1790723765,1,,LOMBARI-NELLE,MARIANNE,,,,LPC,701 PHILLIPS PL,,HUNTSVILLE,AR,72740,101YP2500X
99996,1508804576,1,,CHIAPPA,JULIE,R,MRS.,,CPNP,3400 CIVIC CENTER BLVD,,PHILADELPHIA,PA,19104,363LP0200X
99997,1780622753,1,,DEHART,THOMAS,ELI,MR.,,,4815 N ASSEMBLY ST,,SPOKANE,WA,99205,225100000X
99998,1841238763,1,,BALZLI,HELEN,M,,,PT,13111 HOOPER RD,,BATON ROUGE,LA,70818,225100000X


In [43]:
# create the providers table from npi/nppes data and add it to the database
postal_col = 'Provider Business Practice Location Address Postal Code'

# Names of the 15 taxonomy code / primary-switch column pairs, dropped once taxonomy_code is built
taxonomy_slot_cols = [f'Healthcare Provider {field}_{slot}'
                      for slot in range(1, 16)
                      for field in ('Taxonomy Code', 'Primary Taxonomy Switch')]

for chunk in tqdm(pd.read_csv('../data/npidata_pfile_20050523-20230212.csv', 
                              usecols = select_cols,
                              # Read postal codes as text: dtype inference happens per chunk, so a
                              # chunk with missing values could otherwise be parsed as floats
                              # ('70131234.0') and leading zeroes could be lost
                              dtype = {postal_col: str},
                              chunksize = 100000,
                              low_memory = False)):
    
    # For providers that have indicated a primary taxonomy code, pull that code into a new column.
    # Start from an object-dtype column so string codes can be written without an
    # incompatible-dtype upcast from float64 (deprecated in recent pandas).
    chunk['taxonomy_code'] = pd.Series(np.nan, index = chunk.index, dtype = object)
    for n in range(1, 16):
        # If several slots are flagged 'Y', the highest-numbered slot wins (last write)
        is_primary = chunk[f'Healthcare Provider Primary Taxonomy Switch_{n}'] == 'Y'
        chunk.loc[is_primary, 'taxonomy_code'] = chunk[f'Healthcare Provider Taxonomy Code_{n}']
        
    # For providers that do not indicate a primary taxonomy code, pull the code from the first taxonomy column
    chunk.loc[chunk['taxonomy_code'].isna(), 'taxonomy_code'] = chunk['Healthcare Provider Taxonomy Code_1']
    
    # Drop the columns no longer needed
    chunk = chunk.drop(columns = taxonomy_slot_cols)
     
    # clean up zip code column by putting missing leading zeroes back and getting the 9-digit entries down to 5.
    # astype(str) turns missing values into the text 'nan', so mask those rows back to NaN afterwards.
    raw_postal = chunk[postal_col]
    chunk[postal_col] = raw_postal.astype(str).apply(zip_zeroes).where(raw_postal.notna())
    
    # Clean up the column names (replacement order matters: 'business ' is removed before ' (legal name)')
    chunk.columns = [x.lower()
                     .replace('provider ', '')
                     .replace('business ', '')
                     .replace('practice ', '')
                     .replace(' text', '')
                     .replace(' (legal name)', '')
                     .replace(' ', '_') for x in chunk.columns]                  
   
    # append to provider table
    chunk.to_sql('npi', db, if_exists = 'append', index = False)

0it [00:00, ?it/s]

## Taxonomy Details

In [None]:
# Load the taxonomy details CSV into a DataFrame
tax_deets = pd.read_csv('../data/nucc_taxonomy_230.csv')

In [None]:
# Summarize the taxonomy table: column names, non-null counts and dtypes
tax_deets.info()

In [None]:
# Bring the column names in line with the other tables (lowercase, underscores),
# then rename 'code' so it matches the taxonomy_code column of the provider table
tax_deets = (
    tax_deets
    .rename(columns = lambda name: name.lower().replace(' ', '_'))
    .rename(columns = {'code' : 'taxonomy_code'})
)

In [None]:
# Display the taxonomy table to check the renamed columns
tax_deets

In [None]:
# create the taxonomy table and add it to the database
# (if_exists = 'append' adds rows to an existing 'taxonomy' table instead of replacing it,
# so re-running this cell inserts the rows again)
tax_deets.to_sql('taxonomy', db, if_exists = 'append', index = False)

## Zip Code/CBSA Data

In [None]:
# Load the ZIP-to-CBSA workbook into a DataFrame
zip_cbsa = pd.read_excel('../data/ZIP_CBSA_122021.xlsx')

In [None]:
# Summarize the raw table: column names, non-null counts and dtypes
zip_cbsa.info()

In [None]:
# Shorten the preferred city/state column names and rebuild 'zip' as
# 5-character text so the leading zeroes are back in place
zip_cbsa = (
    zip_cbsa
    .rename(columns = {'usps_zip_pref_city' : 'city',
                       'usps_zip_pref_state' : 'state'})
    .assign(zip = lambda frame: frame['zip'].astype(str).str.zfill(5))
)

In [None]:
# Re-check the table after cleaning: 'zip' should now be an object (string) column
zip_cbsa.info()

In [None]:
# create the zip_cbsa table and add it to the database
# (if_exists = 'append' adds rows to an existing 'zip_cbsa' table instead of replacing it,
# so re-running this cell inserts the rows again)
zip_cbsa.to_sql('zip_cbsa', db, if_exists = 'append', index = False)

In [None]:
# All tables are loaded: flush any pending transaction and release the SQLite file
# so the connection is not left open when the notebook finishes
db.commit()
db.close()