In [9]:
# pip install censusgeocode

In [1]:
import os
import re
import time
from datetime import datetime

import censusgeocode as cg
import geopandas as gpd
import numpy as np
import pandas as pd

### Setting

1. Clinic/Center - Amputee: 261QA0900X
2. Orthotist: 222Z00000X
3. Prosthetist: 224P00000X
4. Prosthetic/Orthotic Supplier: 335E00000X
<br>///////////////////// (codes 5-7 below are related but are not in the `community_pharm` filter list)
5. Prosthetics Case Management: 1744P3200X
6. Orthotic Fitter: 225000000X
7. Pedorthist: 224L00000X

In [2]:
# Columns retained from the NPI csv file: identity, practice-location address,
# (de/re)activation fields and authorized-official contact details.
keep_col = [
    'NPI',
    'Entity Type Code',
    'Replacement NPI',
    'Provider Organization Name (Legal Business Name)',
    'Provider Last Name (Legal Name)',
    'Provider First Name',
    'Provider Middle Name',
    'Provider First Line Business Practice Location Address',
    'Provider Business Practice Location Address City Name',
    'Provider Business Practice Location Address State Name',
    'Provider Business Practice Location Address Postal Code',
    'Provider Business Practice Location Address Telephone Number',
    'NPI Deactivation Reason Code',
    'NPI Deactivation Date',
    'NPI Reactivation Date',
    'Provider Gender Code',
    'Authorized Official Last Name',
    'Authorized Official First Name',
    'Authorized Official Middle Name',
    'Authorized Official Telephone Number',
    'Is Sole Proprietor',
    'Is Organization Subpart',
]

# The taxonomy code / primary-switch columns are numbered _1 .. _15.
taxon_codes = [f'Healthcare Provider Taxonomy Code_{slot}' for slot in range(1, 16)]
taxonswitch_codes = [f'Healthcare Provider Primary Taxonomy Switch_{slot}' for slot in range(1, 16)]
keep_col.extend(taxon_codes)
keep_col.extend(taxonswitch_codes)

# Taxonomy codes used as the row filter (items 1-4 of the "Setting" list above).
community_pharm = ['261QA0900X', '222Z00000X', '224P00000X', '335E00000X']
# Name of the NPI data file that csv_chunks reads.
npi_csv = 'npidata_pfile_20050523-20230212.csv'

# Substrings at which clean_add truncates a street address.
end_str = [
    ' STE', ' SUITE', ' BLDG', ' TOWER', ', #', ' UNIT',
    ' APT', ' BUILDING', ',', '#',
]

In [3]:
def sub_rows(data):
    """Return the rows of ``data`` that pass the state and taxonomy filter.

    A row is kept when its practice-location state equals 'CA' and at least
    one of its ``taxon_codes`` columns holds a code from ``community_pharm``.
    The entity-type and deactivation-reason filters that used to be part of
    this function are currently disabled.
    """
    in_california = data['Provider Business Practice Location Address State Name'].eq('CA')
    has_target_code = data[taxon_codes].isin(community_pharm).any(axis=1)
    return data[in_california & has_target_code]

def csv_chunks(file,chunk_size,keep_cols,row_sub):
    """Stream a large csv in chunks and keep only the rows selected by ``row_sub``.

    Parameters
    ----------
    file : path of the csv file to read (its first line is the header).
    chunk_size : number of data rows parsed per chunk.
    keep_cols : column names to load; every name must exist in the header,
        otherwise pandas raises ``ValueError``.
    row_sub : callable taking a chunk (DataFrame, all columns read as ``str``)
        and returning the sub-frame of rows to keep.

    Returns
    -------
    DataFrame with the kept rows of all chunks concatenated. Columns come back
    in file order, and the index is the row position in the file.

    Notes
    -----
    The header used to be read from the module-level ``npi_csv`` instead of
    ``file``, so the function only worked for that one file. The file is now
    read in a single pass with ``chunksize`` rather than being re-scanned from
    the top with a growing ``skiprows`` for every chunk.
    """
    it_n = 0
    sub_n = 0
    fin_li_dat = []
    with pd.read_csv(file, usecols=keep_cols, dtype='str',
                     chunksize=chunk_size) as reader:
        for file_chunk in reader:
            sub_dat = row_sub(file_chunk)
            fin_li_dat.append( sub_dat.copy() )
            it_n += 1
            sub_n += sub_dat.shape[0]
            print(f'Grabbed iter {it_n} total sub n so far {sub_n}')
    if not fin_li_dat:
        # Header-only file: pd.concat([]) would raise, so return an empty
        # frame that still carries the requested columns.
        return row_sub(pd.read_csv(file, usecols=keep_cols, dtype='str', nrows=0))
    fin_dat = pd.concat(fin_li_dat, axis=0)
    return fin_dat

def clean_add(address, markers=None):
    """Normalise a street address: upper-case it and cut off the unit part.

    Parameters
    ----------
    address : str, the raw first address line.
    markers : optional list of substrings that introduce a unit/suite part.
        Defaults to the module-level ``end_str``.

    Returns
    -------
    The upper-cased address truncated at the first marker, with every '.'
    removed and surrounding whitespace stripped.

    Notes
    -----
    * Markers are matched against the upper-cased address. They used to be
      searched in the original string, so "Suite 5" in mixed case was never
      stripped.
    * A marker that ends in a letter only matches when the next character is
      not an A-Z letter, so ' STE' cuts "AVE STE 6" and "ST STE100" but no
      longer truncates "500 STEVENS CREEK BLVD".
    """
    if markers is None:
        markers = end_str
    add_new = address.upper()
    alternatives = []
    for su in markers:
        su = su.upper()
        if not su:
            # An empty marker would match at position 0 and erase the address.
            continue
        piece = re.escape(su)
        if su[-1].isalpha():
            piece += r'(?![A-Z])'
        alternatives.append(piece)
    if alternatives:
        # re.search returns the leftmost match, i.e. the earliest cut point.
        hit = re.search('|'.join(alternatives), add_new)
        if hit:
            add_new = add_new[:hit.start()]
    add_new = add_new.replace('.','')
    add_new = add_new.strip()
    return add_new

### Run

In [5]:
# # Takes about 3 minutes
# print( datetime.now() )
# pharm_tx = csv_chunks(npi_csv, chunk_size=1000000, keep_cols=keep_col, row_sub=sub_rows)
# print( datetime.now() )

## read temp_ph

In [6]:
# pharm_tx.to_excel('temp_ph.xlsx',index=False)

In [7]:
# Load the filtered NPI subset from its Excel cache. If the cache is missing
# (fresh checkout / Restart & Run All), rebuild it from the raw csv first so
# the notebook does not depend on a manually run, commented-out cell.
if not os.path.exists('temp_ph.xlsx'):
    # Takes about 3 minutes
    print( datetime.now() )
    pharm_tx = csv_chunks(npi_csv, chunk_size=1000000, keep_cols=keep_col, row_sub=sub_rows)
    print( datetime.now() )
    pharm_tx.to_excel('temp_ph.xlsx',index=False)
# Always read back through Excel so column dtypes are the same on both paths.
ph_tx = pd.read_excel('temp_ph.xlsx')

In [9]:
# ph_tx.columns

In [10]:
# Show the Python type of the first postal-code value as loaded from Excel.
type(ph_tx['Provider Business Practice Location Address Postal Code'][0])

numpy.int64

In [11]:
# Replace every missing value in the frame with the literal string 'NaN'
# (a placeholder string, not a float NaN).
ph_tx = ph_tx.fillna('NaN')

In [12]:
# Cast the postal code to text, then keep its first five characters as Zip5.
ph_tx['Provider Business Practice Location Address Postal Code'] = (
    ph_tx['Provider Business Practice Location Address Postal Code'].astype(str)
)
ph_tx['Zip5'] = (
    ph_tx['Provider Business Practice Location Address Postal Code'].str.slice(stop=5)
)

In [13]:
# Cleaned street line from clean_add, plus shorter names for city and state.
ph_tx['Address'] = ph_tx['Provider First Line Business Practice Location Address'].map(clean_add)
ph_tx = ph_tx.rename(
    columns={
        'Provider Business Practice Location Address City Name': 'City',
        'Provider Business Practice Location Address State Name': 'State2',
    }
)

In [14]:
# Remove every row whose organisation name contains "NORDSTROM".
# NOTE(review): the reason for excluding these rows is not recorded here.
# `~` is the supported way to negate a boolean mask; unary minus is rejected
# by numpy for boolean arrays. regex=False matches the text literally, and
# na=False treats a missing name as "no match" instead of a missing mask value.
is_nordstrom = ph_tx['Provider Organization Name (Legal Business Name)'].str.contains(
    "NORDSTROM", regex=False, na=False)
ph_tx = ph_tx[~is_nordstrom].reset_index(drop = True)

In [15]:
# Row/column count after the NORDSTROM rows were removed.
ph_tx.shape

(1359, 54)

In [16]:
# Collect, for every provider, the taxonomy code whose "Primary Taxonomy
# Switch" is 'Y'.
#
# The previous loop never reset its result between rows, so a provider with
# no 'Y' switch silently inherited the previous provider's code (or raised
# NameError if it was the very first row). Such rows now get the placeholder
# 'NaN', the same string the frame uses for missing values after
# fillna('NaN'). If several switches are 'Y', the last one wins, as before.
#
# Pulling both column groups into numpy arrays once avoids a per-cell
# ph_tx.iloc[n][col] lookup inside the double loop.
code_rows = ph_tx[taxon_codes].to_numpy()
switch_rows = ph_tx[taxonswitch_codes].to_numpy()
newc = []
for codes, switches in zip(code_rows, switch_rows):
    primary = 'NaN'
    for code, switch in zip(codes, switches):
        if switch == 'Y':
            primary = code
    newc.append(primary)

In [17]:
# The 30 taxonomy code/switch columns are no longer needed once newc is built.
ph_tx = ph_tx.drop(columns=[*taxon_codes, *taxonswitch_codes]).reset_index(drop=True)

In [18]:
# Shape after dropping the taxonomy code/switch columns.
ph_tx.shape

(1359, 24)

In [19]:
# List the columns that remain in ph_tx.
ph_tx.columns

Index(['NPI', 'Entity Type Code', 'Replacement NPI',
       'Provider Organization Name (Legal Business Name)',
       'Provider Last Name (Legal Name)', 'Provider First Name',
       'Provider Middle Name',
       'Provider First Line Business Practice Location Address', 'City',
       'State2', 'Provider Business Practice Location Address Postal Code',
       'Provider Business Practice Location Address Telephone Number',
       'NPI Deactivation Reason Code', 'NPI Deactivation Date',
       'NPI Reactivation Date', 'Provider Gender Code',
       'Authorized Official Last Name', 'Authorized Official First Name',
       'Authorized Official Middle Name',
       'Authorized Official Telephone Number', 'Is Sole Proprietor',
       'Is Organization Subpart', 'Zip5', 'Address'],
      dtype='object')

In [20]:
# Attach the per-row taxonomy codes collected in newc. newc is in ph_tx row
# order, which relies on no rows being added, removed or reordered since it
# was built.
ph_tx['taxonomy'] = newc

In [22]:
# Distinct deactivation dates in the data. 'NaN' is the placeholder written
# by fillna('NaN') for providers that were never deactivated.
deact_dates = np.unique(ph_tx['NPI Deactivation Date'])
print(deact_dates)
# Derive the list of real dates from the data instead of copying it from the
# printed output, so it stays correct when the input file changes.
deact = [d for d in deact_dates if d != 'NaN']

['03/09/2018' '03/30/2020' '12/27/2021' 'NaN']


In [24]:
# Reactivation dates of the providers that have a deactivation date in deact.
ph_tx.loc[ph_tx['NPI Deactivation Date'].isin(deact), 'NPI Reactivation Date']

865     03/12/2019
956     05/06/2020
1237    02/02/2022
Name: NPI Reactivation Date, dtype: object

Molina orthopedic laboratories

In [25]:
# Distinct values of the taxonomy column, sorted.
np.unique(ph_tx['taxonomy'])

array(['1223P0700X', '156FX1700X', '1744P3200X', '2084P0800X',
       '2086S0129X', '213ES0103X', '222Z00000X', '224900000X',
       '224L00000X', '224P00000X', '225000000X', '225100000X',
       '225X00000X', '225XH1200X', '235Z00000X', '261QA0900X',
       '261QP2000X', '332B00000X', '332BC3200X', '332BP3500X',
       '332BX2000X', '333600000X', '3336C0003X', '3336L0003X',
       '335E00000X'], dtype=object)

In [26]:
# Display the taxonomy codes used as the filter list.
community_pharm

['261QA0900X', '222Z00000X', '224P00000X', '335E00000X']

In [27]:
# Keep only providers whose taxonomy value is one of the community_pharm codes.
ph_tx2 = ph_tx.loc[ph_tx['taxonomy'].isin(community_pharm)]

In [28]:
# ph_tx = ph_tx[ph_tx['Provider Organization Name (Legal Business Name)'].str.contains("ORTHO|PROS|P&O|LIMB")].reset_index(drop = True)

In [29]:
# ph_tx2.to_excel('temp_ph2.xlsx',index=False)

In [31]:
# Shape after restricting to the community_pharm taxonomy codes.
ph_tx2.shape

(1222, 25)

In [32]:
# Split by Entity Type Code: 1 -> ph_indi, 2 -> ph_orgz.
# The comparison is against integers, so the column must hold numeric codes.
ph_indi = ph_tx2.loc[ph_tx2['Entity Type Code'].eq(1)]
ph_orgz = ph_tx2.loc[ph_tx2['Entity Type Code'].eq(2)]

In [33]:
# Pair every organisation (_x columns) with the individuals (_y columns) that
# report the same first address line.
samelo_io = pd.merge(ph_orgz, ph_indi,
                     on=['Provider First Line Business Practice Location Address'],
                     how='inner')
# A street line alone does not identify a location: the same text can occur
# in different ZIP areas. Keep only pairs whose 5-digit ZIP also agrees. This
# is done as a filter rather than an extra join key so the _x/_y column names
# that later cells select stay unchanged.
samelo_io = samelo_io[samelo_io['Zip5_x'] == samelo_io['Zip5_y']].reset_index(drop=True)

In [34]:
# Columns of the merged frame; _x comes from ph_orgz and _y from ph_indi.
samelo_io.columns

Index(['NPI_x', 'Entity Type Code_x', 'Replacement NPI_x',
       'Provider Organization Name (Legal Business Name)_x',
       'Provider Last Name (Legal Name)_x', 'Provider First Name_x',
       'Provider Middle Name_x',
       'Provider First Line Business Practice Location Address', 'City_x',
       'State2_x', 'Provider Business Practice Location Address Postal Code_x',
       'Provider Business Practice Location Address Telephone Number_x',
       'NPI Deactivation Reason Code_x', 'NPI Deactivation Date_x',
       'NPI Reactivation Date_x', 'Provider Gender Code_x',
       'Authorized Official Last Name_x', 'Authorized Official First Name_x',
       'Authorized Official Middle Name_x',
       'Authorized Official Telephone Number_x', 'Is Sole Proprietor_x',
       'Is Organization Subpart_x', 'Zip5_x', 'Address_x', 'taxonomy_x',
       'NPI_y', 'Entity Type Code_y', 'Replacement NPI_y',
       'Provider Organization Name (Legal Business Name)_y',
       'Provider Last Name (Legal 

In [35]:
# One [column name, number of distinct values] pair per column of samelo_io.
listu = [[name, len(samelo_io[name].unique())] for name in samelo_io.columns]

In [36]:
# Tabulate the distinct-value counts: 'col' is the column name, 'num' the count.
datafu = pd.DataFrame(listu, columns=['col', 'num'])
# datafu[datafu['num'] == 1]

In [37]:
# Columns holding a single distinct value carry no information and are
# dropped in the next step, except 'Entity Type Code_x', which is kept.
# That column used to be protected by a positional [1:] slice, which only
# works while it happens to be the first single-valued column; excluding it
# by name does not depend on column order or on the data.
single_valued = datafu.loc[datafu['num'] == 1, 'col'].tolist()
oneuni = [col for col in single_valued if col != 'Entity Type Code_x']
oneuni

['Replacement NPI_x',
 'Provider Last Name (Legal Name)_x',
 'Provider First Name_x',
 'Provider Middle Name_x',
 'State2_x',
 'NPI Deactivation Reason Code_x',
 'Provider Gender Code_x',
 'Is Sole Proprietor_x',
 'Entity Type Code_y',
 'Replacement NPI_y',
 'Provider Organization Name (Legal Business Name)_y',
 'State2_y',
 'NPI Deactivation Reason Code_y',
 'NPI Deactivation Date_y',
 'NPI Reactivation Date_y',
 'Authorized Official Last Name_y',
 'Authorized Official First Name_y',
 'Authorized Official Middle Name_y',
 'Authorized Official Telephone Number_y',
 'Is Organization Subpart_y']

In [38]:
# Remove the constant columns listed in oneuni and renumber the rows.
same_io = samelo_io.drop(oneuni, axis=1).reset_index(drop=True)

In [39]:
# Columns left after the constant columns were dropped.
same_io.columns

Index(['NPI_x', 'Entity Type Code_x',
       'Provider Organization Name (Legal Business Name)_x',
       'Provider First Line Business Practice Location Address', 'City_x',
       'Provider Business Practice Location Address Postal Code_x',
       'Provider Business Practice Location Address Telephone Number_x',
       'NPI Deactivation Date_x', 'NPI Reactivation Date_x',
       'Authorized Official Last Name_x', 'Authorized Official First Name_x',
       'Authorized Official Middle Name_x',
       'Authorized Official Telephone Number_x', 'Is Organization Subpart_x',
       'Zip5_x', 'Address_x', 'taxonomy_x', 'NPI_y',
       'Provider Last Name (Legal Name)_y', 'Provider First Name_y',
       'Provider Middle Name_y', 'City_y',
       'Provider Business Practice Location Address Postal Code_y',
       'Provider Business Practice Location Address Telephone Number_y',
       'Provider Gender Code_y', 'Is Sole Proprietor_y', 'Zip5_y', 'Address_y',
       'taxonomy_y'],
      dtype='obj

In [41]:
# same_io.shape

In [42]:
# same_io.to_excel('temp_ph3.xlsx',index=False)

## read temp_ph3

In [13]:
# Load the organisation/individual pairs from their Excel cache. If the cache
# is missing, write it from same_io first, so a fresh Restart & Run All works
# without manually re-enabling the export cell.
if not os.path.exists('temp_ph3.xlsx'):
    same_io.to_excel('temp_ph3.xlsx',index=False)
io_combine = pd.read_excel('temp_ph3.xlsx')

In [14]:
# Columns of the frame loaded from temp_ph3.xlsx.
io_combine.columns

Index(['NPI_x', 'Entity Type Code_x',
       'Provider Organization Name (Legal Business Name)_x',
       'Provider First Line Business Practice Location Address', 'City_x',
       'Provider Business Practice Location Address Postal Code_x',
       'Provider Business Practice Location Address Telephone Number_x',
       'NPI Deactivation Date_x', 'NPI Reactivation Date_x',
       'Authorized Official Last Name_x', 'Authorized Official First Name_x',
       'Authorized Official Middle Name_x',
       'Authorized Official Telephone Number_x', 'Is Organization Subpart_x',
       'Zip5_x', 'Address_x', 'taxonomy_x', 'NPI_y',
       'Provider Last Name (Legal Name)_y', 'Provider First Name_y',
       'Provider Middle Name_y', 'City_y',
       'Provider Business Practice Location Address Postal Code_y',
       'Provider Business Practice Location Address Telephone Number_y',
       'Provider Gender Code_y', 'Is Sole Proprietor_y', 'Zip5_y', 'Address_y',
       'taxonomy_y'],
      dtype='obj

In [15]:
# Keep the organisation (_x) columns plus the individual's name parts (_y).
# .copy() makes io_combine2 an independent frame, so the later
# io_combine2['fml'] = ... assignment is a normal column insert rather than a
# write to a slice of io_combine.
io_combine2 = io_combine[['NPI_x', 'Entity Type Code_x',
       'Provider Organization Name (Legal Business Name)_x',
       'Provider First Line Business Practice Location Address', 'City_x',
       'Provider Business Practice Location Address Postal Code_x',
       'Provider Business Practice Location Address Telephone Number_x',
       'NPI Deactivation Date_x', 'NPI Reactivation Date_x',
       'Authorized Official Last Name_x', 'Authorized Official First Name_x',
       'Authorized Official Middle Name_x',
       'Authorized Official Telephone Number_x', 'Is Organization Subpart_x',
       'Zip5_x', 'Address_x', 'taxonomy_x','Provider Last Name (Legal Name)_y', 
        'Provider First Name_y','Provider Middle Name_y']].copy()

In [16]:
# io_combine2

In [17]:
# "First,Middle,Last" for every individual; a missing name part becomes ''.
fml = io_combine2['Provider First Name_y'].fillna('').str.cat(
    [
        io_combine2['Provider Middle Name_y'].fillna(''),
        io_combine2['Provider Last Name (Legal Name)_y'].fillna(''),
    ],
    sep=',',
)

In [18]:
# Silence every warning from this point on, for all categories and modules.
# NOTE(review): presumably added to hide the pandas warning raised by the
# column assignment in the next cell - confirm. A blanket 'ignore' also hides
# unrelated warnings.
import warnings
warnings.filterwarnings('ignore')

In [19]:
# Store the combined individual name as a new column.
io_combine2['fml'] = fml

In [20]:
# Columns of io_combine2, now including 'fml'.
io_combine2.columns

Index(['NPI_x', 'Entity Type Code_x',
       'Provider Organization Name (Legal Business Name)_x',
       'Provider First Line Business Practice Location Address', 'City_x',
       'Provider Business Practice Location Address Postal Code_x',
       'Provider Business Practice Location Address Telephone Number_x',
       'NPI Deactivation Date_x', 'NPI Reactivation Date_x',
       'Authorized Official Last Name_x', 'Authorized Official First Name_x',
       'Authorized Official Middle Name_x',
       'Authorized Official Telephone Number_x', 'Is Organization Subpart_x',
       'Zip5_x', 'Address_x', 'taxonomy_x',
       'Provider Last Name (Legal Name)_y', 'Provider First Name_y',
       'Provider Middle Name_y', 'fml'],
      dtype='object')

In [21]:
# The separate name parts are redundant now that 'fml' exists; keep the
# organisation columns plus 'fml'.
st_df = io_combine2.drop(
    columns=[
        'Provider Last Name (Legal Name)_y',
        'Provider First Name_y',
        'Provider Middle Name_y',
    ]
)

In [22]:
# Columns of st_df: the organisation fields plus 'fml'.
st_df.columns

Index(['NPI_x', 'Entity Type Code_x',
       'Provider Organization Name (Legal Business Name)_x',
       'Provider First Line Business Practice Location Address', 'City_x',
       'Provider Business Practice Location Address Postal Code_x',
       'Provider Business Practice Location Address Telephone Number_x',
       'NPI Deactivation Date_x', 'NPI Reactivation Date_x',
       'Authorized Official Last Name_x', 'Authorized Official First Name_x',
       'Authorized Official Middle Name_x',
       'Authorized Official Telephone Number_x', 'Is Organization Subpart_x',
       'Zip5_x', 'Address_x', 'taxonomy_x', 'fml'],
      dtype='object')

In [23]:
# Collapse to one row per organisation, joining the names of all individuals
# at its address with '; '.
# dropna=False is required: by default groupby discards every row in which
# any key column is missing, so an organisation with, for example, no
# authorized-official middle name would silently vanish from the result.
stacked_df = st_df.groupby(['NPI_x', 'Entity Type Code_x',
       'Provider Organization Name (Legal Business Name)_x',
       'Provider First Line Business Practice Location Address', 'City_x',
       'Provider Business Practice Location Address Postal Code_x',
       'Provider Business Practice Location Address Telephone Number_x',
       'Authorized Official Last Name_x', 'Authorized Official First Name_x',
       'Authorized Official Middle Name_x',
       'Authorized Official Telephone Number_x', 'Is Organization Subpart_x',
       'Zip5_x', 'Address_x', 'taxonomy_x'], dropna=False)['fml'].apply('; '.join).reset_index()

In [24]:
# Display the grouped result: one row per group, with the joined names in 'fml'.
stacked_df

Unnamed: 0,NPI_x,Entity Type Code_x,Provider Organization Name (Legal Business Name)_x,Provider First Line Business Practice Location Address,City_x,Provider Business Practice Location Address Postal Code_x,Provider Business Practice Location Address Telephone Number_x,Authorized Official Last Name_x,Authorized Official First Name_x,Authorized Official Middle Name_x,Authorized Official Telephone Number_x,Is Organization Subpart_x,Zip5_x,Address_x,taxonomy_x,fml
0,1013275874,2,ELITE BIOMECHANICAL DESIGN,2208 5TH AVE,OROVILLE,959655816,5305346913,CASEY,MICHAEL,PATRICK,5308946913,N,95965,2208 5TH AVE,335E00000X,"NICHOLAS,D.,MADSEN"
1,1023188943,2,JENNINGS ORTHOPEDIC INC,10683 MAGNOLIA AVE,RIVERSIDE,925051893,9513522029,JENNINGS,HENRY,ALVIN,9513522029,N,92505,10683 MAGNOLIA AVE,224P00000X,"HENRY,ALVIN,JENNINGS"
2,1023302015,2,BALDWIN ORTHOTICS & PROSTHETICS INC,24475 SUNNYMEAD BLVD,MORENO VALLEY,925539313,9518247850,BALDWIN,HARLAN,L,9518247850,N,92553,24475 SUNNYMEAD BLVD,335E00000X,"HARLAN,L,BALDWIN"
3,1033134903,2,"FERRACO, INC.",2933 LONG BEACH BLVD,LONG BEACH,908061517,5629882414,CRONIN,BRIAN,M,6264457797,N,90806,2933 LONG BEACH BLVD,335E00000X,"PETER,L.,KORCHIN"
4,1033367271,2,"PACIFIC MEDICAL, INC.",3001 L ST,SACRAMENTO,958165225,9167061520,WEAVER,MARK,L.,8007269180,N,95816,3001 L ST,335E00000X,"FERHAN,INAM,MAHER; DAVID,J.,SCURTI; GEORGE,,VI..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,1902948250,2,"VALLEY INSTITUTE OF PROSTHETICS & ORTHOTICS, INC",23033 LYONS AVE STE 6,NEWHALL,913212777,6612531191,TOWNSEND,BARRY,W,6613221005,N,91321,23033 LYONS AVE,335E00000X,"ALLEN,,DOLBERRY"
59,1912955881,2,"VALLEY INSTITUTE OF PROSTHETICS & ORTHOTICS, INC.",1524 21ST ST,BAKERSFIELD,933014002,6613221005,TOWNSEND,BARRY,W,6613221005,N,93301,1524 21ST ST,335E00000X,"MICHAEL,KURT,NELSON; NICOLE,CAROLYN,KERR"
60,1932345964,2,BUTTE PROSTHETICS,2260 ESPLANADE,CHICO,959262234,5308934255,THURMAN,RONALD,E,5308934255,N,95926,2260 ESPLANADE,335E00000X,"RONALD,EDWARD,THURMAN"
61,1942697875,2,ACTIVE 1 PROSTHETICS AND ORTHOTICS,295 W CROMWELL AVE,FRESNO,937116167,5594935020,MAGILL,STEPHAN,MARK,5594935020,N,93711,295 W CROMWELL AVE,335E00000X,"STEPHAN,M,MAGILL"


In [36]:
# ph_tx_3 = pd.concat([ph_tx, ph_tx2]).drop_duplicates()

In [37]:
# ph_tx_3.shape

(395, 11)

In [129]:
# Write the grouped table to comb_indorg.xlsx without the index column.
stacked_df.to_excel('comb_indorg.xlsx',index=False)