In [2]:
import pandas as pd
import numpy as np
import censusgeocode as cg
import time
from datetime import datetime

import geopandas as gpd
import os

In [2]:
from requests.exceptions import ConnectionError

### Data loading functions

1. Clinic/Center - Amputee: 261QA0900X
2. Orthotist: 222Z00000X
3. Prosthetist: 224P00000X
4. Prosthetic/Orthotic Supplier:335E00000X

In [3]:
keep_col = ['NPI','Entity Type Code','Provider Organization Name (Legal Business Name)',
            'NPI Deactivation Reason Code','NPI Deactivation Date','NPI Reactivation Date',
            'Provider First Line Business Practice Location Address',
            'Provider Business Practice Location Address City Name',
            'Provider Business Practice Location Address State Name',
            'Provider Business Practice Location Address Postal Code',
            'Provider Business Practice Location Address Telephone Number']

taxon_codes = ['Healthcare Provider Taxonomy Code_' + str(i+1) for i in range(15)]
taxonswitch_codes = ['Healthcare Provider Primary Taxonomy Switch_' + str(i+1) for i in range(15)]
keep_col += taxon_codes
keep_col += taxonswitch_codes
community_pharm = ['261QA0900X','222Z00000X', '224P00000X','335E00000X']
npi_csv = 'npidata_pfile_20050523-20230212.csv'

def csv_chunks(file,chunk_size,keep_cols,row_sub):
    header_fields = list(pd.read_csv(npi_csv, nrows=1))
    header_locs = [header_fields.index(i) for i in keep_cols]
    skip = 1
    it_n = 0
    sub_n = 0
    ret_chunk = chunk_size
    fin_li_dat = []
    while ret_chunk == chunk_size:
        file_chunk = pd.read_csv(file, usecols=header_locs, skiprows=skip, 
                     nrows=chunk_size, names=header_fields, dtype='str')
        sub_dat = row_sub(file_chunk)
        fin_li_dat.append( sub_dat.copy() )
        skip += chunk_size
        it_n += 1
        sub_n += sub_dat.shape[0]
        print(f'Grabbed iter {it_n} total sub n so far {sub_n}')
        ret_chunk = file_chunk.shape[0]
    fin_dat = pd.concat(fin_li_dat, axis=0)
    return fin_dat

### Geo spliting functions

In [4]:
end_str = [' STE', ' SUITE', ' BLDG', ' TOWER', ', #', ' UNIT',
           ' APT', ' BUILDING',',', '#']

def clean_add(address):
    add_new = address.upper()
    for su in end_str:
        sf = address.find(su)
        if sf > -1:
            add_new = add_new[0:sf]
    add_new = add_new.replace('.','')
    add_new = add_new.strip()
    return add_new

In [17]:
def split_geo(df, add, city, state, zipcode, chunk_size):
    df_new = df.copy()
    df_new.reset_index(inplace=True)
    splits = np.ceil(df_new.shape[0]/chunk_size)
    chunk_li = np.array_split(df_new['index'], splits)
    res_li = []
    pick_fi = []
    for i,c in enumerate(chunk_li):
        # Grab data, export to csv
        sub_data = df_new.loc[c, ['index',add,city,state,zipcode]]
        return sub_data
        sub_data.to_csv('temp_geo.csv',header=False,index=False)
        # Geo the results and turn back into df
        print(f'Geocoding round {int(i)+1} of {int(splits)}, {datetime.now()}')
        result = cg.addressbatch('temp_geo.csv')
        names = list(result[0].keys())
        res_zl = []
        for r in result:
            res_zl.append(list(r.values()))
        res_df = pd.DataFrame(res_zl, columns=names)
        res_li.append(res_df.copy())
        time.sleep(10) #sleep 10 seconds to not get cutoff from request
    final_df = pd.concat(res_li)
    final_df.rename(columns={'id':'row'}, inplace=True)
    final_df.reset_index(inplace=True, drop=True)
    # Clean up csv file
    os.remove('temp_geo.csv')
    return final_df

### California

In [5]:
def sub_rows(data):
    ec = data['Entity Type Code'] == "2"
    st = data['Provider Business Practice Location Address State Name'] == 'CA'
    ta = data[taxon_codes].isin(community_pharm).any(axis=1)
    ac = data['NPI Deactivation Reason Code'].isna()
    all_together = ec & st & ta & ac 
    sub = data[all_together]
    return sub

print( datetime.now() )
pharm_tx = csv_chunks(npi_csv, chunk_size=1000000, keep_cols=keep_col, row_sub=sub_rows)
print( datetime.now() )
ph_tx_ca = pharm_tx

2023-04-22 14:31:07.603287
Grabbed iter 1 total sub n so far 67
Grabbed iter 2 total sub n so far 191
Grabbed iter 3 total sub n so far 345
Grabbed iter 4 total sub n so far 448
Grabbed iter 5 total sub n so far 529
Grabbed iter 6 total sub n so far 664
Grabbed iter 7 total sub n so far 761
Grabbed iter 8 total sub n so far 846
2023-04-22 14:33:09.696174


### Arizona

In [6]:
def sub_rows(data):
    ec = data['Entity Type Code'] == "2"
    st = data['Provider Business Practice Location Address State Name'] == 'AZ'
    ta = data[taxon_codes].isin(community_pharm).any(axis=1)
    ac = data['NPI Deactivation Reason Code'].isna()
    all_together = ec & st & ta & ac 
    sub = data[all_together]
    return sub

print( datetime.now() )
pharm_tx = csv_chunks(npi_csv, chunk_size=1000000, keep_cols=keep_col, row_sub=sub_rows)
print( datetime.now() )
ph_tx_az = pharm_tx

2023-04-22 14:33:09.700057
Grabbed iter 1 total sub n so far 7
Grabbed iter 2 total sub n so far 35
Grabbed iter 3 total sub n so far 52
Grabbed iter 4 total sub n so far 77
Grabbed iter 5 total sub n so far 101
Grabbed iter 6 total sub n so far 142
Grabbed iter 7 total sub n so far 178
Grabbed iter 8 total sub n so far 202
2023-04-22 14:35:17.888237


### Nevada

In [7]:
def sub_rows(data):
    ec = data['Entity Type Code'] == "2"
    st = data['Provider Business Practice Location Address State Name'] == 'NV'
    ta = data[taxon_codes].isin(community_pharm).any(axis=1)
    ac = data['NPI Deactivation Reason Code'].isna()
    all_together = ec & st & ta & ac 
    sub = data[all_together]
    return sub

print( datetime.now() )
pharm_tx = csv_chunks(npi_csv, chunk_size=1000000, keep_cols=keep_col, row_sub=sub_rows)
print( datetime.now() )

ph_tx_nv = pharm_tx

2023-04-22 14:35:17.892054
Grabbed iter 1 total sub n so far 4
Grabbed iter 2 total sub n so far 18
Grabbed iter 3 total sub n so far 28
Grabbed iter 4 total sub n so far 36
Grabbed iter 5 total sub n so far 46
Grabbed iter 6 total sub n so far 62
Grabbed iter 7 total sub n so far 78
Grabbed iter 8 total sub n so far 102
2023-04-22 14:37:29.528111


### Texas

In [8]:
def sub_rows(data):
    ec = data['Entity Type Code'] == "2"
    st = data['Provider Business Practice Location Address State Name'] == 'TX'
    ta = data[taxon_codes].isin(community_pharm).any(axis=1)
    ac = data['NPI Deactivation Reason Code'].isna()
    all_together = ec & st & ta & ac 
    sub = data[all_together]
    return sub

print( datetime.now() )
pharm_tx = csv_chunks(npi_csv, chunk_size=1000000, keep_cols=keep_col, row_sub=sub_rows)
print( datetime.now() )

ph_tx_tx = pharm_tx

2023-04-22 14:37:29.532856
Grabbed iter 1 total sub n so far 55
Grabbed iter 2 total sub n so far 189
Grabbed iter 3 total sub n so far 283
Grabbed iter 4 total sub n so far 361
Grabbed iter 5 total sub n so far 442
Grabbed iter 6 total sub n so far 572
Grabbed iter 7 total sub n so far 667
Grabbed iter 8 total sub n so far 748
2023-04-22 14:39:38.061640


### Washington

In [9]:
def sub_rows(data):
    ec = data['Entity Type Code'] == "2"
    st = data['Provider Business Practice Location Address State Name'] == 'WA'
    ta = data[taxon_codes].isin(community_pharm).any(axis=1)
    ac = data['NPI Deactivation Reason Code'].isna()
    all_together = ec & st & ta & ac 
    sub = data[all_together]
    return sub

print( datetime.now() )
pharm_tx = csv_chunks(npi_csv, chunk_size=1000000, keep_cols=keep_col, row_sub=sub_rows)
print( datetime.now() )

ph_tx_wa = pharm_tx

2023-04-22 14:39:38.064735
Grabbed iter 1 total sub n so far 15
Grabbed iter 2 total sub n so far 55
Grabbed iter 3 total sub n so far 93
Grabbed iter 4 total sub n so far 108
Grabbed iter 5 total sub n so far 127
Grabbed iter 6 total sub n so far 154
Grabbed iter 7 total sub n so far 192
Grabbed iter 8 total sub n so far 206
2023-04-22 14:41:50.031955


### Oregon

In [10]:
def sub_rows(data):
    ec = data['Entity Type Code'] == "2"
    st = data['Provider Business Practice Location Address State Name'] == 'OR'
    ta = data[taxon_codes].isin(community_pharm).any(axis=1)
    ac = data['NPI Deactivation Reason Code'].isna()
    all_together = ec & st & ta & ac 
    sub = data[all_together]
    return sub

print( datetime.now() )
pharm_tx = csv_chunks(npi_csv, chunk_size=1000000, keep_cols=keep_col, row_sub=sub_rows)
print( datetime.now() )

ph_tx_or = pharm_tx

2023-04-22 14:41:50.035190
Grabbed iter 1 total sub n so far 3
Grabbed iter 2 total sub n so far 22
Grabbed iter 3 total sub n so far 47
Grabbed iter 4 total sub n so far 62
Grabbed iter 5 total sub n so far 78
Grabbed iter 6 total sub n so far 90
Grabbed iter 7 total sub n so far 113
Grabbed iter 8 total sub n so far 124
2023-04-22 14:43:59.060028


### Illinios

In [11]:
def sub_rows(data):
    ec = data['Entity Type Code'] == "2"
    st = data['Provider Business Practice Location Address State Name'] == 'IL'
    ta = data[taxon_codes].isin(community_pharm).any(axis=1)
    ac = data['NPI Deactivation Reason Code'].isna()
    all_together = ec & st & ta & ac 
    sub = data[all_together]
    return sub

print( datetime.now() )
pharm_tx = csv_chunks(npi_csv, chunk_size=1000000, keep_cols=keep_col, row_sub=sub_rows)
print( datetime.now() )

ph_tx_il = pharm_tx

2023-04-22 14:43:59.063375
Grabbed iter 1 total sub n so far 29
Grabbed iter 2 total sub n so far 82
Grabbed iter 3 total sub n so far 114
Grabbed iter 4 total sub n so far 137
Grabbed iter 5 total sub n so far 190
Grabbed iter 6 total sub n so far 263
Grabbed iter 7 total sub n so far 329
Grabbed iter 8 total sub n so far 352
2023-04-22 14:46:07.400974


### Save csv

In [12]:
ph_tx = pd.concat([ph_tx_ca, ph_tx_az, ph_tx_nv, ph_tx_tx, ph_tx_wa, ph_tx_or, ph_tx_il])

In [13]:
len(ph_tx)

2580

In [14]:
# ph_tx.to_csv('statesDf.csv',index=False)

### version 2 states

In [4]:
# ph_txx = pd.read_csv('statesDf.csv')

### New York

In [10]:
def sub_rows(data):
    ec = data['Entity Type Code'] == "2"
    st = data['Provider Business Practice Location Address State Name'] == 'NY'
    ta = data[taxon_codes].isin(community_pharm).any(axis=1)
    ac = data['NPI Deactivation Reason Code'].isna()
    all_together = ec & st & ta & ac 
    sub = data[all_together]
    return sub

print( datetime.now() )
pharm_tx = csv_chunks(npi_csv, chunk_size=1000000, keep_cols=keep_col, row_sub=sub_rows)
print( datetime.now() )
ph_tx_ny = pharm_tx

2023-05-01 03:28:47.906011
Grabbed iter 1 total sub n so far 56
Grabbed iter 2 total sub n so far 168
Grabbed iter 3 total sub n so far 238
Grabbed iter 4 total sub n so far 314
Grabbed iter 5 total sub n so far 367
Grabbed iter 6 total sub n so far 459
Grabbed iter 7 total sub n so far 513
Grabbed iter 8 total sub n so far 571
2023-05-01 03:30:58.649080


### Florida

In [11]:
def sub_rows(data):
    ec = data['Entity Type Code'] == "2"
    st = data['Provider Business Practice Location Address State Name'] == 'FL'
    ta = data[taxon_codes].isin(community_pharm).any(axis=1)
    ac = data['NPI Deactivation Reason Code'].isna()
    all_together = ec & st & ta & ac 
    sub = data[all_together]
    return sub

print( datetime.now() )
pharm_tx = csv_chunks(npi_csv, chunk_size=1000000, keep_cols=keep_col, row_sub=sub_rows)
print( datetime.now() )
ph_tx_fl = pharm_tx

2023-05-01 03:30:58.652396
Grabbed iter 1 total sub n so far 53
Grabbed iter 2 total sub n so far 167
Grabbed iter 3 total sub n so far 252
Grabbed iter 4 total sub n so far 349
Grabbed iter 5 total sub n so far 401
Grabbed iter 6 total sub n so far 580
Grabbed iter 7 total sub n so far 644
Grabbed iter 8 total sub n so far 729
2023-05-01 03:33:12.308394


### Massachusates

In [12]:
def sub_rows(data):
    ec = data['Entity Type Code'] == "2"
    st = data['Provider Business Practice Location Address State Name'] == 'MA'
    ta = data[taxon_codes].isin(community_pharm).any(axis=1)
    ac = data['NPI Deactivation Reason Code'].isna()
    all_together = ec & st & ta & ac 
    sub = data[all_together]
    return sub

print( datetime.now() )
pharm_tx = csv_chunks(npi_csv, chunk_size=1000000, keep_cols=keep_col, row_sub=sub_rows)
print( datetime.now() )
ph_tx_ma = pharm_tx

2023-05-01 03:33:12.311955
Grabbed iter 1 total sub n so far 26
Grabbed iter 2 total sub n so far 66
Grabbed iter 3 total sub n so far 83
Grabbed iter 4 total sub n so far 96
Grabbed iter 5 total sub n so far 115
Grabbed iter 6 total sub n so far 145
Grabbed iter 7 total sub n so far 163
Grabbed iter 8 total sub n so far 170
2023-05-01 03:35:24.225522


### PA

In [13]:
def sub_rows(data):
    ec = data['Entity Type Code'] == "2"
    st = data['Provider Business Practice Location Address State Name'] == 'PA'
    ta = data[taxon_codes].isin(community_pharm).any(axis=1)
    ac = data['NPI Deactivation Reason Code'].isna()
    all_together = ec & st & ta & ac 
    sub = data[all_together]
    return sub

print( datetime.now() )
pharm_tx = csv_chunks(npi_csv, chunk_size=1000000, keep_cols=keep_col, row_sub=sub_rows)
print( datetime.now() )
ph_tx_pa = pharm_tx

2023-05-01 03:35:24.230022
Grabbed iter 1 total sub n so far 40
Grabbed iter 2 total sub n so far 105
Grabbed iter 3 total sub n so far 173
Grabbed iter 4 total sub n so far 247
Grabbed iter 5 total sub n so far 295
Grabbed iter 6 total sub n so far 362
Grabbed iter 7 total sub n so far 423
Grabbed iter 8 total sub n so far 480
2023-05-01 03:37:35.468427


### Ohio

In [14]:
def sub_rows(data):
    ec = data['Entity Type Code'] == "2"
    st = data['Provider Business Practice Location Address State Name'] == 'OH'
    ta = data[taxon_codes].isin(community_pharm).any(axis=1)
    ac = data['NPI Deactivation Reason Code'].isna()
    all_together = ec & st & ta & ac 
    sub = data[all_together]
    return sub

print( datetime.now() )
pharm_tx = csv_chunks(npi_csv, chunk_size=1000000, keep_cols=keep_col, row_sub=sub_rows)
print( datetime.now() )
ph_tx_oh = pharm_tx

2023-05-01 03:37:35.471603
Grabbed iter 1 total sub n so far 30
Grabbed iter 2 total sub n so far 118
Grabbed iter 3 total sub n so far 173
Grabbed iter 4 total sub n so far 220
Grabbed iter 5 total sub n so far 252
Grabbed iter 6 total sub n so far 343
Grabbed iter 7 total sub n so far 392
Grabbed iter 8 total sub n so far 413
2023-05-01 03:39:40.909724


### MI

In [15]:
def sub_rows(data):
    ec = data['Entity Type Code'] == "2"
    st = data['Provider Business Practice Location Address State Name'] == 'MI'
    ta = data[taxon_codes].isin(community_pharm).any(axis=1)
    ac = data['NPI Deactivation Reason Code'].isna()
    all_together = ec & st & ta & ac 
    sub = data[all_together]
    return sub

print( datetime.now() )
pharm_tx = csv_chunks(npi_csv, chunk_size=1000000, keep_cols=keep_col, row_sub=sub_rows)
print( datetime.now() )
ph_tx_mi = pharm_tx

2023-05-01 03:39:40.913203
Grabbed iter 1 total sub n so far 26
Grabbed iter 2 total sub n so far 111
Grabbed iter 3 total sub n so far 203
Grabbed iter 4 total sub n so far 238
Grabbed iter 5 total sub n so far 291
Grabbed iter 6 total sub n so far 399
Grabbed iter 7 total sub n so far 446
Grabbed iter 8 total sub n so far 500
2023-05-01 03:41:48.849697


### New dataset

In [20]:
ph_tx = pd.concat([ph_txx, ph_tx_ny, ph_tx_fl, ph_tx_ma, ph_tx_pa, ph_tx_oh, ph_tx_mi])

In [23]:
len(ph_tx)

5443

### Data Cleaning - 1.filter p&o

In [24]:
# ph_tx = pd.read_csv('statesDf.csv')

In [25]:
ph_tx['Provider Business Practice Location Address Postal Code'] = ph_tx['Provider Business Practice Location Address Postal Code'].str[0:5]
ph_tx['Zip5'] = ph_tx['Provider Business Practice Location Address Postal Code'].str[0:5]
ph_tx['Address'] = ph_tx['Provider First Line Business Practice Location Address'].apply(clean_add)

ph_tx.rename(columns={'Provider Business Practice Location Address City Name':'City',
                      'Provider Business Practice Location Address State Name':'State2'},
             inplace=True)
ph_tx = ph_tx[-ph_tx['Provider Organization Name (Legal Business Name)'].str.contains("NORDSTROM")].reset_index(drop = True)
ph_tx.head(2)

Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider First Line Business Practice Location Address,City,State2,Provider Business Practice Location Address Postal Code,Provider Business Practice Location Address Telephone Number,NPI Deactivation Reason Code,NPI Deactivation Date,...,Healthcare Provider Taxonomy Code_12,Healthcare Provider Primary Taxonomy Switch_12,Healthcare Provider Taxonomy Code_13,Healthcare Provider Primary Taxonomy Switch_13,Healthcare Provider Taxonomy Code_14,Healthcare Provider Primary Taxonomy Switch_14,Healthcare Provider Taxonomy Code_15,Healthcare Provider Primary Taxonomy Switch_15,Zip5,Address
0,1548468614,2,DIMENSION PROSTHETICS & ORTHOTICS,33374 DOWE AVE,UNION CITY,CA,,5103243400,,,...,,,,,,,,,,33374 DOWE AVE
1,1952507303,2,"SOUND BALANCE AUDIOLOGY, INC",2420 VISTA WAY,OCEANSIDE,CA,,7607217417,,,...,,,,,,,,,,2420 VISTA WAY


In [26]:
newc = []
for n in range(len(ph_tx)):
    for i in range(15):
        if ph_tx.iloc[n]['Healthcare Provider Primary Taxonomy Switch_' + str(i+1)] == 'Y':
            v = ph_tx.iloc[n]['Healthcare Provider Taxonomy Code_' + str(i+1)]
    newc.append(v)

In [27]:
dec_codes = ['NPI Deactivation Reason Code','NPI Deactivation Date','NPI Reactivation Date']
ph_tx = ph_tx.drop(columns=taxon_codes+taxonswitch_codes+dec_codes).reset_index(drop=True)
ph_tx['taxonomy'] = newc
# ph_tx = ph_tx[ph_tx['taxonomy'].isin(community_pharm)]

In [28]:
ph_tx2 = ph_tx[ph_tx['taxonomy'].str.contains("261QA0900X")]

In [29]:
ph_tx = ph_tx[ph_tx['Provider Organization Name (Legal Business Name)'].str.contains("ORTHO|PROS|P&O|LIMB")].reset_index(drop = True)

In [30]:
# ph_tx2

In [31]:
len(ph_tx)

2371

In [32]:
ph_tx_3 = pd.concat([ph_tx, ph_tx2]).drop_duplicates()

In [33]:
ph_tx_3.shape

(2380, 11)

In [34]:
ph_tx_3 = ph_tx_3.sort_values(by=['State2'])

In [35]:
ph_tx_3.to_excel('list_states.xlsx',index=False)

### Data Cleaning - 2.same location

In [36]:
len(ph_tx_3)

2380

In [37]:
len(ph_tx_3["Address"].unique())

2206

In [38]:
ph_tx_4 = ph_tx_3[~ph_tx_3.duplicated(subset=['Address'], keep="last")].reset_index(drop = True)

### Geo coding

In [39]:
geo_pharm = split_geo(ph_tx_4, add='Address', city='City', state='State2', zipcode='Zip5', chunk_size=500)
print(geo_pharm['match'].value_counts())

Geocoding round 1 of 5, 2023-05-01 03:43:07.015001
Geocoding round 2 of 5, 2023-05-01 03:43:18.492517
Geocoding round 3 of 5, 2023-05-01 03:43:49.099931
Geocoding round 4 of 5, 2023-05-01 03:44:06.003275
Geocoding round 5 of 5, 2023-05-01 03:44:18.589385
True     1955
False     251
Name: match, dtype: int64


In [40]:
geo_pharm['rowN'] = geo_pharm['row'].astype(int)
gp2 = geo_pharm.sort_values(by='rowN').reset_index(drop=True)

kg = ['address','match','lat','lon']
kd = ['NPI',
      'Provider Organization Name (Legal Business Name)',
      'Provider Business Practice Location Address Telephone Number',
      'City','State2','Zip5']
final_pharm = pd.concat([ph_tx_4[kd], gp2[kg]], axis=1)

final_pharm.rename(columns={'Provider Organization Name (Legal Business Name)':'Name',
                      'Provider Business Practice Location Address Telephone Number':'Telephone'}, inplace=True)
final_pharm.head(2)

Unnamed: 0,NPI,Name,Telephone,City,State2,Zip5,address,match,lat,lon
0,1225228265,HANGER PROSTHETICS & ORTHOTICS WEST INC,9287759280,PRESCOTT VALLEY,AZ,,"3173 N WINDSONG DR, PRESCOTT VALLEY, AZ,",True,34.58709,-112.326802
1,1477627354,HANGER PROSTHETICS & ORTHOTICS WEST INC,9283411965,YUMA,AZ,,"1025 W 24TH ST, YUMA, AZ,",True,32.68395,-114.63089


### Gjson

In [41]:
hosp_data = final_pharm
hosp_data = hosp_data[hosp_data['match']].copy()
hosp_data.reset_index(inplace=True, drop=True)
hosp_data.head(2)

Unnamed: 0,NPI,Name,Telephone,City,State2,Zip5,address,match,lat,lon
0,1225228265,HANGER PROSTHETICS & ORTHOTICS WEST INC,9287759280,PRESCOTT VALLEY,AZ,,"3173 N WINDSONG DR, PRESCOTT VALLEY, AZ,",True,34.58709,-112.326802
1,1477627354,HANGER PROSTHETICS & ORTHOTICS WEST INC,9283411965,YUMA,AZ,,"1025 W 24TH ST, YUMA, AZ,",True,32.68395,-114.63089


In [42]:
hosp_geo = gpd.GeoDataFrame(hosp_data, geometry=gpd.points_from_xy(hosp_data.lon, hosp_data.lat), crs="EPSG:4326")

In [43]:
cali_counties = gpd.read_file(r'tl_2019_us_county/tl_2019_us_county.shp')
cali_outline = cali_counties.dissolve('STATEFP')
cali_proj = cali_outline.to_crs('EPSG:5070')
print(cali_outline.crs)

EPSG:4269


In [44]:
def dissolve_buff(point_df,d,resolution):
    bu = point_df.buffer(d,resolution)
    geodf = gpd.GeoDataFrame(geometry=bu)
    geodf['Const'] = 0
    single = geodf.dissolve('Const')
    return single[['geometry']]

In [45]:
def dist_cont(point_df,dist_list,outside,buff_res):
    if point_df.crs != outside.crs:
        print('Point df and Outside df are not the same CRS')
        return None
    # Making outside area out dissolved object
    out_cop = outside[['geometry']].copy()
    out_cop['Constant'] = 1
    out_cop = out_cop.dissolve('Constant')
    # Make sure points are inside area
    inside = point_df.within(out_cop['geometry'][1])
    point_cop = point_df[inside].copy()
    point_cop = point_df.copy()
    point_cop['Constant'] = 1 #Constant for dissolve
    point_cop = point_cop[['Constant','geometry']].copy()
    res_buffers = []
    for i,d in enumerate(dist_list):
        print(f'Doing buffer {d}')
        if i == 0:
            res = dissolve_buff(point_cop, d, buff_res)
            res_buffers.append(res.copy())
        else:
            res_new = dissolve_buff(point_cop, d, buff_res)
            res_buffonly = gpd.overlay(res_new, res, how='difference')
            res = res_new.copy()
            res_buffers.append( res_buffonly.copy() )
    # Now take the difference with the larger area
    print('Working on leftover difference now')
    leftover = gpd.overlay(out_cop, res, how='difference')
    res_buffers.append(leftover)
    for i,d in enumerate(dist_list):
        res_buffers[i]['Distance'] = str(d)
    res_buffers[-1]['Distance'] = 'Outside'
    # New geopandas DF
    comb_df = pd.concat(res_buffers)
    comb_df.reset_index(inplace=True, drop=True)
    return comb_df

In [46]:
hos_proj = hosp_geo.to_crs('EPSG:5070') #'epsg:4269'

dist_met = [2000, 4000, 8000, 16000] #, 32000
buff_city = dist_cont(hos_proj, dist_met, cali_proj, buff_res=100)

Doing buffer 2000
Doing buffer 4000
Doing buffer 8000
Doing buffer 16000
Working on leftover difference now


In [47]:
#Now making folium plot
buff_map = buff_city.to_crs('EPSG:4326')
kv = list(hosp_geo)[1:10]

In [48]:
#"fill": "#00aa22",
#"fill-opacity": 0.5

cols = ['#f1eef6',
'#d7b5d8',
'#df65b0',
'#dd1c77',
'#980043']

buff_map['fill'] = cols
buff_map['fill-opacity'] = 0.35

#os.chdir(r'D:\Dropbox\Dropbox\PublicCode_Git\Blog_Code')

### Save gjson

In [49]:
buff_map.to_file('Buffers_States.geojson', driver='GeoJSON')
hosp_geo.to_file('Hosp_States.geojson', driver='GeoJSON')

### Combined

In [56]:
states = ("AL","AK","AZ","AR","CA","CO","CT","DE","DC","FL","GA","HI","ID",
          "IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO",
          "MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA",
          "RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY")

In [57]:
def sub_rows(data, state):
    ec = data['Entity Type Code'] == "2"
    st = data['Provider Business Practice Location Address State Name'] == state
    ta = data[taxon_codes].isin(community_pharm).any(axis=1)
    ac = data['NPI Deactivation Reason Code'].isna()
    all_together = ec & st & ta & ac 
    sub = data[all_together]
    return sub

def csv_chunks(file,chunk_size,keep_cols,state,row_sub):
    header_fields = list(pd.read_csv(npi_csv, nrows=1))
    header_locs = [header_fields.index(i) for i in keep_cols]
    skip = 1
    it_n = 0
    sub_n = 0
    ret_chunk = chunk_size
    fin_li_dat = []
    while ret_chunk == chunk_size:
        file_chunk = pd.read_csv(file, usecols=header_locs, skiprows=skip, 
                     nrows=chunk_size, names=header_fields, dtype='str')
        sub_dat = row_sub(file_chunk, state)
        fin_li_dat.append( sub_dat.copy() )
        skip += chunk_size
        it_n += 1
        sub_n += sub_dat.shape[0]
        print(f'Grabbed iter {it_n} total sub n so far {sub_n}')
        ret_chunk = file_chunk.shape[0]
    fin_dat = pd.concat(fin_li_dat, axis=0)
    return fin_dat

In [63]:
ph_txs = pd.DataFrame()

In [81]:
dfs = []

for s in states:
    print(s)
    print(datetime.now())
    pharm_tx = csv_chunks(npi_csv, chunk_size=1000000, keep_cols=keep_col, state = s, row_sub=sub_rows)
    print(datetime.now()) 
    dfs.append(pharm_tx)


AL
2023-05-01 04:41:56.244558
Grabbed iter 1 total sub n so far 6
Grabbed iter 2 total sub n so far 36
Grabbed iter 3 total sub n so far 65
Grabbed iter 4 total sub n so far 86
Grabbed iter 5 total sub n so far 97
Grabbed iter 6 total sub n so far 125
Grabbed iter 7 total sub n so far 152
Grabbed iter 8 total sub n so far 163
2023-05-01 04:43:58.082195
AK
2023-05-01 04:43:58.082253
Grabbed iter 1 total sub n so far 2
Grabbed iter 2 total sub n so far 6
Grabbed iter 3 total sub n so far 11
Grabbed iter 4 total sub n so far 11
Grabbed iter 5 total sub n so far 11
Grabbed iter 6 total sub n so far 13
Grabbed iter 7 total sub n so far 17
Grabbed iter 8 total sub n so far 17
2023-05-01 04:46:02.169932
AZ
2023-05-01 04:46:02.169959
Grabbed iter 1 total sub n so far 7
Grabbed iter 2 total sub n so far 35
Grabbed iter 3 total sub n so far 52
Grabbed iter 4 total sub n so far 77
Grabbed iter 5 total sub n so far 101
Grabbed iter 6 total sub n so far 142
Grabbed iter 7 total sub n so far 178
Gra

In [82]:
combined = pd.concat(dfs)

In [83]:
len(combined)

10230

In [9]:
combined['Provider Business Practice Location Address Postal Code'] = combined['Provider Business Practice Location Address Postal Code'].str[0:5]
combined['Zip5'] = combined['Provider Business Practice Location Address Postal Code'].str[0:5]
combined['Address'] = combined['Provider First Line Business Practice Location Address'].apply(clean_add)

combined.rename(columns={'Provider Business Practice Location Address City Name':'City',
                      'Provider Business Practice Location Address State Name':'State2'},
             inplace=True)
combined = combined[-combined['Provider Organization Name (Legal Business Name)'].str.contains("NORDSTROM")].reset_index(drop = True)
combined.head(2)

Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider First Line Business Practice Location Address,City,State2,Provider Business Practice Location Address Postal Code,Provider Business Practice Location Address Telephone Number,Zip5,Address,taxonomy
0,1447443551,2,"ALABAMA PROSTHETICS & ORTHOTICS, INC",101 MEDICAL CENTER CT,PRATTVILLE,AL,36066,3343620000.0,36066,101 MEDICAL CENTER CT,335E00000X
1,1831369404,2,"ORTHOPRO, INC",1812 28TH AVE S,BIRMINGHAM,AL,35209,2058799000.0,35209,1812 28TH AVE S,335E00000X


In [10]:
newc = []
for n in range(len(combined)):
    for i in range(15):
        if combined.iloc[n]['Healthcare Provider Primary Taxonomy Switch_' + str(i+1)] == 'Y':
            v = combined.iloc[n]['Healthcare Provider Taxonomy Code_' + str(i+1)]
    newc.append(v)

KeyError: 'Healthcare Provider Primary Taxonomy Switch_1'

In [86]:
dec_codes = ['NPI Deactivation Reason Code','NPI Deactivation Date','NPI Reactivation Date']
combined = combined.drop(columns=taxon_codes+taxonswitch_codes+dec_codes).reset_index(drop=True)
combined['taxonomy'] = newc
# ph_tx = ph_tx[ph_tx['taxonomy'].isin(community_pharm)]

In [92]:
# combined.to_csv('combinedDf.csv',index=False)

In [3]:
combined = pd.read_csv('combinedDf.csv')

In [5]:
combined2 = combined[combined['taxonomy'].str.contains("261QA0900X")]

In [6]:
combined = combined[combined['Provider Organization Name (Legal Business Name)'].str.contains("ORTHO|PROS|P&O|LIMB")].reset_index(drop = True)
len(combined)

4375

In [7]:
combined_3 = pd.concat([combined, combined2]).drop_duplicates()
combined_3.shape

(4375, 11)

In [8]:
combined_3 = combined_3.sort_values(by=['State2'])
len(combined_3["Address"].unique())

4062

In [9]:
combined_4 = combined_3[~combined_3.duplicated(subset=['Address'], keep="last")].reset_index(drop = True)

In [12]:
# combined_4.to_excel('us_clinics.xlsx',index=False)

In [23]:
# sam = geo_pharm
# geo_pharm = pd.DataFrame()

In [24]:
combined_5 = pd.DataFrame([combined_4.index, combined_4.Address, combined_4.City, combined_4.State2, combined_4.Zip5]).transpose()
combined_5.columns = ['index', 'Address', 'City', 'State2', 'Zip5']
combined_5

Unnamed: 0,index,Address,City,State2,Zip5
0,0,1405 KELLUM ST,FAIRBANKS,AK,99701
1,1,3719 E MERIDIAN LOOP,WASILLA,AK,99654
2,2,3400 LATOUCHE ST,ANCHORAGE,AK,99508
3,3,44604,SOLDOTNA,AK,99669
4,4,5701 LAKE OTIS PKWY,ANCHORAGE,AK,99507
...,...,...,...,...,...
4057,4057,135 S WASHINGTON ST,CASPER,WY,82601
4058,4058,611 E CARLSON STREET,CHEYENNE,WY,82009
4059,4059,240 W 9TH ST,CASPER,WY,82601
4060,4060,720 LINDSAY LN,CODY,WY,82414


In [25]:
combined_5.to_csv('cgDf.csv',index=False)

In [26]:
k = cg.addressbatch('cgDf.csv')

In [27]:
geo_pharm = pd.DataFrame(k, columns=k[0].keys())

In [30]:
print(geo_pharm['match'].value_counts())

True     3557
False     506
Name: match, dtype: int64


In [33]:
# geo_pharm['rowN'] = geo_pharm['row'].astype(int)
# gp2 = geo_pharm.sort_values(by='rowN').reset_index(drop=True)

kg = ['address','match','lat','lon']
kd = ['NPI',
      'Provider Organization Name (Legal Business Name)',
      'Provider Business Practice Location Address Telephone Number',
      'City','State2','Zip5']
final_pharm = pd.concat([combined_4[kd], geo_pharm[kg]], axis=1)

final_pharm.rename(columns={'Provider Organization Name (Legal Business Name)':'Name',
                      'Provider Business Practice Location Address Telephone Number':'Telephone'}, inplace=True)
final_pharm.head(2)

Unnamed: 0,NPI,Name,Telephone,City,State2,Zip5,address,match,lat,lon
0,1689920000.0,ALCHEMY ORTHOTICS AND PROSTHETICS,9075621000.0,FAIRBANKS,AK,99701.0,"15301 SPECTRUM DR, ADDISON, TX, 75001",True,32.960742,-96.824101
1,1952445000.0,NORTHERN ORTHOPEDICS INC,9073574000.0,WASILLA,AK,99654.0,"123 PROFESSIONAL PARK DR, MOORESVILLE, NC, 28117",True,35.557215,-80.854539


In [34]:
hosp_data = final_pharm
hosp_data = hosp_data[hosp_data['match']].copy()
hosp_data.reset_index(inplace=True, drop=True)
hosp_data.head(2)

Unnamed: 0,NPI,Name,Telephone,City,State2,Zip5,address,match,lat,lon
0,1689920000.0,ALCHEMY ORTHOTICS AND PROSTHETICS,9075621000.0,FAIRBANKS,AK,99701.0,"15301 SPECTRUM DR, ADDISON, TX, 75001",True,32.960742,-96.824101
1,1952445000.0,NORTHERN ORTHOPEDICS INC,9073574000.0,WASILLA,AK,99654.0,"123 PROFESSIONAL PARK DR, MOORESVILLE, NC, 28117",True,35.557215,-80.854539


In [35]:
hosp_geo = gpd.GeoDataFrame(hosp_data, geometry=gpd.points_from_xy(hosp_data.lon, hosp_data.lat), crs="EPSG:4326")

In [36]:
cali_counties = gpd.read_file(r'tl_2019_us_county/tl_2019_us_county.shp')
cali_outline = cali_counties.dissolve('STATEFP')
cali_proj = cali_outline.to_crs('EPSG:5070')
print(cali_outline.crs)

EPSG:4269


In [37]:
def dissolve_buff(point_df,d,resolution):
    bu = point_df.buffer(d,resolution)
    geodf = gpd.GeoDataFrame(geometry=bu)
    geodf['Const'] = 0
    single = geodf.dissolve('Const')
    return single[['geometry']]

In [38]:
def dist_cont(point_df,dist_list,outside,buff_res):
    if point_df.crs != outside.crs:
        print('Point df and Outside df are not the same CRS')
        return None
    # Making outside area out dissolved object
    out_cop = outside[['geometry']].copy()
    out_cop['Constant'] = 1
    out_cop = out_cop.dissolve('Constant')
    # Make sure points are inside area
    inside = point_df.within(out_cop['geometry'][1])
    point_cop = point_df[inside].copy()
    point_cop = point_df.copy()
    point_cop['Constant'] = 1 #Constant for dissolve
    point_cop = point_cop[['Constant','geometry']].copy()
    res_buffers = []
    for i,d in enumerate(dist_list):
        print(f'Doing buffer {d}')
        if i == 0:
            res = dissolve_buff(point_cop, d, buff_res)
            res_buffers.append(res.copy())
        else:
            res_new = dissolve_buff(point_cop, d, buff_res)
            res_buffonly = gpd.overlay(res_new, res, how='difference')
            res = res_new.copy()
            res_buffers.append( res_buffonly.copy() )
    # Now take the difference with the larger area
    print('Working on leftover difference now')
    leftover = gpd.overlay(out_cop, res, how='difference')
    res_buffers.append(leftover)
    for i,d in enumerate(dist_list):
        res_buffers[i]['Distance'] = str(d)
    res_buffers[-1]['Distance'] = 'Outside'
    # New geopandas DF
    comb_df = pd.concat(res_buffers)
    comb_df.reset_index(inplace=True, drop=True)
    return comb_df

In [39]:
hos_proj = hosp_geo.to_crs('EPSG:5070') #'epsg:4269'

dist_met = [2000, 4000, 8000, 16000] #, 32000
buff_city = dist_cont(hos_proj, dist_met, cali_proj, buff_res=100)

Doing buffer 2000
Doing buffer 4000
Doing buffer 8000
Doing buffer 16000
Working on leftover difference now


In [40]:
#Now making folium plot
buff_map = buff_city.to_crs('EPSG:4326')
kv = list(hosp_geo)[1:10]

In [41]:
#"fill": "#00aa22",
#"fill-opacity": 0.5

cols = ['#f1eef6',
'#d7b5d8',
'#df65b0',
'#dd1c77',
'#980043']

buff_map['fill'] = cols
buff_map['fill-opacity'] = 0.35

#os.chdir(r'D:\Dropbox\Dropbox\PublicCode_Git\Blog_Code')

In [42]:
buff_map.to_file('Combined_buff.geojson', driver='GeoJSON')
hosp_geo.to_file('Combined_States.geojson', driver='GeoJSON')