In [1]:
# pip install censusgeocode

In [2]:
import pandas as pd
import numpy as np
import censusgeocode as cg
import time
from datetime import datetime
import os

In [3]:
# Columns to retain from the NPI file: identifiers, provider names,
# deactivation/reactivation fields, and the practice-location address/phone.

keep_col = [
    'NPI',
    'Entity Type Code',
    'Provider Organization Name (Legal Business Name)',
    'Provider Last Name (Legal Name)',
    'Provider First Name',
    'Provider Middle Name',
    'Provider Name Prefix Text',
    'Provider Name Suffix Text',
    'NPI Deactivation Reason Code',
    'NPI Deactivation Date',
    'NPI Reactivation Date',
    'Provider First Line Business Practice Location Address',
    'Provider Business Practice Location Address City Name',
    'Provider Business Practice Location Address State Name',
    'Provider Business Practice Location Address Postal Code',
    'Provider Business Practice Location Address Telephone Number',
]

Taxonomy codes used to select providers:

1. Prosthetics Case Management: 1744P3200X
2. Orthotic Fitter: 225000000X
3. Orthotist: 222Z00000X
4. Prosthetist: 224P00000X
5. Prosthetic/Orthotic Supplier: 335E00000X

In [4]:
# Header names of the 15 taxonomy-code columns (suffixes _1 .. _15).
taxon_codes = [f'Healthcare Provider Taxonomy Code_{i}' for i in range(1, 16)]
# Add the taxonomy columns to the columns to load. Only names not already
# present are appended, so re-running this cell does not duplicate them.
keep_col = keep_col + [col for col in taxon_codes if col not in keep_col]
# Taxonomy codes to select. The variable name is historical; the codes are:
#   1744P3200X  Prosthetics Case Management
#   225000000X  Orthotic Fitter
#   222Z00000X  Orthotist
#   224P00000X  Prosthetist
#   335E00000X  Prosthetic/Orthotic Supplier
community_pharm = ['1744P3200X','225000000X','222Z00000X','224P00000X','335E00000X']
npi_csv = 'npidata_pfile_20050523-20230212.csv' #Newer files will prob change the name

In [5]:
# Row filter applied to each chunk of the NPI file
def sub_rows(data):
    """Return the rows of `data` in California that carry a wanted taxonomy code.

    A row is kept when its practice-location state equals 'CA' and at least
    one of the `taxon_codes` columns holds a value listed in
    `community_pharm`. Entity type and deactivation status are not used as
    filters.
    """
    state_col = 'Provider Business Practice Location Address State Name'
    in_state = data[state_col].eq('CA')
    has_code = data[taxon_codes].isin(community_pharm).any(axis=1)
    return data[in_state & has_code]

In [6]:
def csv_chunks(file,chunk_size,keep_cols,row_sub):
    """Read a large CSV in chunks, keeping only selected columns and rows.

    Parameters
    ----------
    file : str or path-like
        CSV file to read; its first line must be the header.
    chunk_size : int
        Number of data rows loaded per chunk.
    keep_cols : list of str
        Header names of the columns to load. Every column is read as a string.
    row_sub : callable
        Called with each chunk (a DataFrame) and expected to return the
        subset of rows to keep.

    Returns
    -------
    pandas.DataFrame
        The kept rows of all chunks stacked in file order. Index labels are
        the data-row positions within the file.

    Raises
    ------
    ValueError
        Raised by pandas when a name in `keep_cols` is not in the header.
    """
    kept = []
    n_kept = 0
    # Passing chunksize turns read_csv into an iterator that walks the file
    # once. This reads the header from `file` itself (not a global) and
    # avoids re-scanning all earlier rows on every iteration.
    reader = pd.read_csv(file, usecols=keep_cols, dtype='str', chunksize=chunk_size)
    for it_n, file_chunk in enumerate(reader, start=1):
        sub_dat = row_sub(file_chunk)
        kept.append(sub_dat.copy())
        n_kept += sub_dat.shape[0]
        print(f'Grabbed iter {it_n} total sub n so far {n_kept}')
    if not kept:
        # Defensive: the reader produced no chunk at all.
        return pd.DataFrame(columns=list(keep_cols))
    return pd.concat(kept, axis=0)

In [7]:
# Run the chunked extraction over the full NPI file, printing a timestamp
# before and after (takes a few minutes).
print(datetime.now())
pharm_tx = csv_chunks(
    file=npi_csv,
    chunk_size=1_000_000,
    keep_cols=keep_col,
    row_sub=sub_rows,
)
print(datetime.now())

2023-03-13 10:18:46.932071
Grabbed iter 1 total sub n so far 325
Grabbed iter 2 total sub n so far 563
Grabbed iter 3 total sub n so far 823
Grabbed iter 4 total sub n so far 1013
Grabbed iter 5 total sub n so far 1196
Grabbed iter 6 total sub n so far 1381
Grabbed iter 7 total sub n so far 1525
Grabbed iter 8 total sub n so far 1639
2023-03-13 10:20:54.377445


In [8]:
# The taxonomy columns were only needed for filtering: drop them and
# renumber the remaining rows 0..n-1.
ph_tx = pharm_tx.drop(labels=taxon_codes, axis=1).reset_index(drop=True)

In [9]:
# Display the filtered provider table (taxonomy columns already dropped).
ph_tx

Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider First Line Business Practice Location Address,Provider Business Practice Location Address City Name,Provider Business Practice Location Address State Name,Provider Business Practice Location Address Postal Code,Provider Business Practice Location Address Telephone Number,NPI Deactivation Reason Code,NPI Deactivation Date,NPI Reactivation Date
0,1427033703,1,,DELA ROSA,RUSSELL,ANDRADA,MR.,,4860 Y ST,SACRAMENTO,CA,958172307,9167346732,,,
1,1184609844,1,,PINTO,DAVID,JOHN,MR.,,4860 Y ST,SACRAMENTO,CA,958172307,9167346732,,,
2,1609834274,1,,YAGDJIS,DIMITRI,PETER,MR.,,16122 COVELLO ST,VAN NUYS,CA,914062910,8189885414,,,
3,1821042219,1,,SCHAEPPER,JOHANNES,,MR.,,519 N SMITH AVE,CORONA,CA,928806911,9515820153,,,
4,1679513204,1,,VITALE,SANDY,L,,,130 REGIS ST,TURLOCK,CA,953821129,2096349021,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1634,1942865605,2,"ACTIVE LIFE, LLC",,,,,,16008 KAMANA RD STE 202A,APPLE VALLEY,CA,923071376,7605156311,,,
1635,1952609661,2,"ACTIVE LIFE, LLC",,,,,,7910 FROST ST STE 320,SAN DIEGO,CA,92123,6194886196,,,
1636,1952765711,2,"ACTIVE LIFE, LLC",,,,,,1700 E CESAR E CHAVEZ AVE STE 3750,LOS ANGELES,CA,90033,3233528319,,,
1637,1912339474,2,"ALEXANDER PROSTHETICS & ORTHOTICS, INC",,,,,,660 E REGENT ST,INGLEWOOD,CA,903011415,3106749179,,,


In [10]:
# Markers that introduce a secondary-unit designator (suite, building,
# unit, ...); clean_add cuts the address off where one of these is found.
end_str = [
    ' STE',
    ' SUITE',
    ' BLDG',
    ' TOWER',
    ', #',
    ' UNIT',
    ' APT',
    ' BUILDING',
    ',',
    '#',
]

In [11]:
def _marker_start(text, marker):
    """Return the index of the first usable occurrence of `marker` in `text`, or -1.

    When `marker` ends in a letter, an occurrence that is immediately
    followed by another letter is skipped, so that e.g. ' STE' does not
    match inside ' STEVENS'.
    """
    if not marker:
        return -1
    needs_boundary = marker[-1].isalpha()
    start = 0
    while True:
        pos = text.find(marker, start)
        if pos == -1:
            return -1
        end = pos + len(marker)
        following = text[end:end + 1]  # '' at end of string
        if not (needs_boundary and following.isalpha()):
            return pos
        start = pos + 1


def clean_add(address, suffixes=None):
    """Normalize a street address line before geocoding.

    The address is upper-cased, cut off at the earliest secondary-unit
    marker (suite, building, unit, '#', ',' ...), stripped of periods and
    trimmed of surrounding whitespace.

    Parameters
    ----------
    address : str
        Raw address line. Non-string values (for example a missing value)
        are returned unchanged.
    suffixes : iterable of str, optional
        Markers at which to truncate; compared case-insensitively.
        Defaults to the module-level ``end_str`` list.

    Returns
    -------
    str
        The cleaned address (or `address` itself when it is not a string).
    """
    if not isinstance(address, str):
        return address
    markers = end_str if suffixes is None else suffixes
    add_new = address.upper()
    for su in markers:
        # Search the upper-cased text so lower/mixed-case input is handled too.
        sf = _marker_start(add_new, su.upper())
        if sf > -1:
            add_new = add_new[:sf]
    add_new = add_new.replace('.','')
    return add_new.strip()

In [12]:
# Build the geocoder inputs: a 5-character ZIP and a cleaned street line,
# then give the city/state columns short names.
ph_tx['Zip5'] = ph_tx['Provider Business Practice Location Address Postal Code'].str.slice(0, 5)
ph_tx['Address'] = ph_tx['Provider First Line Business Practice Location Address'].apply(clean_add)
ph_tx = ph_tx.rename(
    columns={
        'Provider Business Practice Location Address City Name': 'City',
        'Provider Business Practice Location Address State Name': 'State2',
    }
)

In [13]:
# Display the table again, now with the Zip5 and Address columns and the
# renamed City / State2 columns.
ph_tx

Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider First Line Business Practice Location Address,City,State2,Provider Business Practice Location Address Postal Code,Provider Business Practice Location Address Telephone Number,NPI Deactivation Reason Code,NPI Deactivation Date,NPI Reactivation Date,Zip5,Address
0,1427033703,1,,DELA ROSA,RUSSELL,ANDRADA,MR.,,4860 Y ST,SACRAMENTO,CA,958172307,9167346732,,,,95817,4860 Y ST
1,1184609844,1,,PINTO,DAVID,JOHN,MR.,,4860 Y ST,SACRAMENTO,CA,958172307,9167346732,,,,95817,4860 Y ST
2,1609834274,1,,YAGDJIS,DIMITRI,PETER,MR.,,16122 COVELLO ST,VAN NUYS,CA,914062910,8189885414,,,,91406,16122 COVELLO ST
3,1821042219,1,,SCHAEPPER,JOHANNES,,MR.,,519 N SMITH AVE,CORONA,CA,928806911,9515820153,,,,92880,519 N SMITH AVE
4,1679513204,1,,VITALE,SANDY,L,,,130 REGIS ST,TURLOCK,CA,953821129,2096349021,,,,95382,130 REGIS ST
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1634,1942865605,2,"ACTIVE LIFE, LLC",,,,,,16008 KAMANA RD STE 202A,APPLE VALLEY,CA,923071376,7605156311,,,,92307,16008 KAMANA RD
1635,1952609661,2,"ACTIVE LIFE, LLC",,,,,,7910 FROST ST STE 320,SAN DIEGO,CA,92123,6194886196,,,,92123,7910 FROST ST
1636,1952765711,2,"ACTIVE LIFE, LLC",,,,,,1700 E CESAR E CHAVEZ AVE STE 3750,LOS ANGELES,CA,90033,3233528319,,,,90033,1700 E CESAR E CHAVEZ AVE
1637,1912339474,2,"ALEXANDER PROSTHETICS & ORTHOTICS, INC",,,,,,660 E REGENT ST,INGLEWOOD,CA,903011415,3106749179,,,,90301,660 E REGENT ST


In [14]:
def split_geo(df, add, city, state, zipcode, chunk_size=500):
    """Batch-geocode the addresses in `df` with ``cg.addressbatch``.

    Rows are written, one chunk at a time, to 'temp_geo.csv' in the working
    directory (columns: row id, street, city, state, zip; no header). Each
    file is passed to ``cg.addressbatch`` and the returned records are
    stacked into a single frame. The temporary file is removed afterwards,
    including when a request fails.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the address fields. Its index labels are sent as the
        record ids, so they should be unique.
    add, city, state, zipcode : str
        Names of the street, city, state and ZIP columns in `df`.
    chunk_size : int, default 500
        Maximum number of rows sent per request.

    Returns
    -------
    pandas.DataFrame
        One row per record returned by the geocoder, with the geocoder's
        'id' field renamed to 'row'. Rows are in the order returned; sort on
        'row' if input order matters.

    Raises
    ------
    ValueError
        If `df` has no rows or `chunk_size` is less than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be at least 1')
    n_rows = df.shape[0]
    if n_rows == 0:
        raise ValueError('df has no rows to geocode')

    # First column carries the original index label as the record id. Rows
    # are selected by position below, so this works for any index, not only
    # a 0..n-1 range.
    geo_input = df[[add, city, state, zipcode]].copy()
    geo_input.insert(0, '_row_id', df.index)

    n_chunks = int(np.ceil(n_rows / chunk_size))
    temp_path = 'temp_geo.csv'
    res_li = []
    try:
        for i, positions in enumerate(np.array_split(np.arange(n_rows), n_chunks)):
            # Grab data, export to csv
            geo_input.iloc[positions].to_csv(temp_path, header=False, index=False)
            # Geo the results and turn back into df
            print(f'Geocoding round {i+1} of {n_chunks}, {datetime.now()}')
            result = cg.addressbatch(temp_path)
            res_li.append(pd.DataFrame([dict(r) for r in result]))
            # A pause could be added here if requests start getting cut off.
    finally:
        # Clean up csv file even when a request raised
        if os.path.exists(temp_path):
            os.remove(temp_path)
    final_df = pd.concat(res_li, ignore_index=True)
    return final_df.rename(columns={'id': 'row'})

In [15]:
# Geocode every cleaned provider address, 500 rows per request.
geo_pharm = split_geo(
    ph_tx,
    add='Address',
    city='City',
    state='State2',
    zipcode='Zip5',
    chunk_size=500,
)

Geocoding round 1 of 4, 2023-03-13 10:21:07.268730
Geocoding round 2 of 4, 2023-03-13 10:21:15.376580
Geocoding round 3 of 4, 2023-03-13 10:21:25.250489
Geocoding round 4 of 4, 2023-03-13 10:21:32.447006


In [16]:
# Tally how many addresses the geocoder matched versus missed.
match_counts = geo_pharm['match'].value_counts()
print(match_counts)

True     1496
False     143
Name: match, dtype: int64


In [17]:
# Convert the returned row id to an integer and put the geocoder output
# back into ascending row-id order.
geo_pharm = geo_pharm.assign(rowN=geo_pharm['row'].astype(int))
gp2 = geo_pharm.sort_values(by='rowN')
gp2 = gp2.reset_index(drop=True)

In [26]:
# Geocoder fields to attach (kg) and provider fields to keep (kd).
kg = ['address','match','lat','lon']
kd = ['NPI',
      'Provider Organization Name (Legal Business Name)',
      'Provider Last Name (Legal Name)', 'Provider First Name', 'Provider Middle Name', 
      'Provider Name Prefix Text',
      'Provider First Line Business Practice Location Address',
      'Provider Business Practice Location Address Telephone Number',
      'Address','City','State2','Zip5']

# Attach the geocoder output by row id (rowN matches the ph_tx index), not by
# position. If the geocoder drops a record, that provider gets missing
# geocode fields instead of shifting every following row out of alignment.
final_pharm = ph_tx[kd].join(gp2.set_index('rowN')[kg])

In [27]:
# Display the combined provider + geocode table.
final_pharm

Unnamed: 0,NPI,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider First Line Business Practice Location Address,Provider Business Practice Location Address Telephone Number,Address,City,State2,Zip5,address,match,lat,lon
0,1427033703,,DELA ROSA,RUSSELL,ANDRADA,MR.,4860 Y ST,9167346732,4860 Y ST,SACRAMENTO,CA,95817,"4860 Y ST, SACRAMENTO, CA, 95817",True,38.552486,-121.448804
1,1184609844,,PINTO,DAVID,JOHN,MR.,4860 Y ST,9167346732,4860 Y ST,SACRAMENTO,CA,95817,"4860 Y ST, SACRAMENTO, CA, 95817",True,38.552486,-121.448804
2,1609834274,,YAGDJIS,DIMITRI,PETER,MR.,16122 COVELLO ST,8189885414,16122 COVELLO ST,VAN NUYS,CA,91406,"16122 COVELLO ST, VAN NUYS, CA, 91406",True,34.207523,-118.484261
3,1821042219,,SCHAEPPER,JOHANNES,,MR.,519 N SMITH AVE,9515820153,519 N SMITH AVE,CORONA,CA,92880,"519 N SMITH AVE, CORONA, CA, 92880",True,33.895226,-117.590408
4,1679513204,,VITALE,SANDY,L,,130 REGIS ST,2096349021,130 REGIS ST,TURLOCK,CA,95382,"130 REGIS ST, TURLOCK, CA, 95382",True,37.519412,-120.849607
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1634,1942865605,"ACTIVE LIFE, LLC",,,,,16008 KAMANA RD STE 202A,7605156311,16008 KAMANA RD,APPLE VALLEY,CA,92307,"16008 KAMANA RD, APPLE VALLEY, CA, 92307",True,34.542401,-117.271759
1635,1952609661,"ACTIVE LIFE, LLC",,,,,7910 FROST ST STE 320,6194886196,7910 FROST ST,SAN DIEGO,CA,92123,"7910 FROST ST, SAN DIEGO, CA, 92123",True,32.800261,-117.154388
1636,1952765711,"ACTIVE LIFE, LLC",,,,,1700 E CESAR E CHAVEZ AVE STE 3750,3233528319,1700 E CESAR E CHAVEZ AVE,LOS ANGELES,CA,90033,"1700 E CESAR E CHAVEZ AVE, LOS ANGELES, CA, 90033",True,34.051144,-118.217488
1637,1912339474,"ALEXANDER PROSTHETICS & ORTHOTICS, INC",,,,,660 E REGENT ST,3106749179,660 E REGENT ST,INGLEWOOD,CA,90301,"660 E REGENT ST, INGLEWOOD, CA, 90301",True,33.964526,-118.344432


In [28]:
# Write the combined provider + geocode table to disk without the row index.
final_pharm.to_csv(path_or_buf='clinics_Cali.csv', index=False)