In [123]:
import pandas as pd
import numpy as np

In [373]:
nppes = pd.read_csv('npidata_pfile_20050523-20230212.csv', chunksize = 10000)

The NPPES dataset contains a large number of fields, only a few of which are relevant to this project:
* 'NPI' 
* Entity Type, indicated by the 'Entity Type Code' field:
    - 1 = Provider (doctors, nurses, etc.)
    - 2 = Facility (Hospitals, Urgent Care, Doctors Offices) 
* Entity Name: Either First/Last or Organization or Other Organization Name contained in the following fields:
    - 'Provider Organization Name (Legal Business Name)'
    - 'Provider Last Name (Legal Name)'
    - 'Provider First Name'
    - 'Provider Middle Name'
    - 'Provider Name Prefix Text'
    - 'Provider Name Suffix Text'
    - 'Provider Credential Text'
* Address: Business Practice Location (not mailing), contained in the following fields:
    - 'Provider First Line Business Practice Location Address'
    - 'Provider Second Line Business Practice Location Address'
    - 'Provider Business Practice Location Address City Name'
    - 'Provider Business Practice Location Address State Name'
    - 'Provider Business Practice Location Address Postal Code'
* The provider's taxonomy code, which is contained in one of the 'Healthcare Provider Taxonomy Code*' columns. 
A provider can have up to 15 taxonomy codes, but we want the one which has Primary Switch = Y in the associated 'Healthcare Provider Primary Taxonomy Switch*' field. 
Note that this does not always occur in spot 1.

In [374]:
nppes = next(nppes)

  if await self.run_code(code, result, async_=asy):


In [375]:
nppes.columns.tolist()

['NPI',
 'Entity Type Code',
 'Replacement NPI',
 'Employer Identification Number (EIN)',
 'Provider Organization Name (Legal Business Name)',
 'Provider Last Name (Legal Name)',
 'Provider First Name',
 'Provider Middle Name',
 'Provider Name Prefix Text',
 'Provider Name Suffix Text',
 'Provider Credential Text',
 'Provider Other Organization Name',
 'Provider Other Organization Name Type Code',
 'Provider Other Last Name',
 'Provider Other First Name',
 'Provider Other Middle Name',
 'Provider Other Name Prefix Text',
 'Provider Other Name Suffix Text',
 'Provider Other Credential Text',
 'Provider Other Last Name Type Code',
 'Provider First Line Business Mailing Address',
 'Provider Second Line Business Mailing Address',
 'Provider Business Mailing Address City Name',
 'Provider Business Mailing Address State Name',
 'Provider Business Mailing Address Postal Code',
 'Provider Business Mailing Address Country Code (If outside U.S.)',
 'Provider Business Mailing Address Telephone Nu

In [376]:
nppes.filter(regex='^Healthcare Provider Primary Taxonomy Switch_').eq('Y').sum()


Healthcare Provider Primary Taxonomy Switch_1     9001
Healthcare Provider Primary Taxonomy Switch_2       62
Healthcare Provider Primary Taxonomy Switch_3        2
Healthcare Provider Primary Taxonomy Switch_4        0
Healthcare Provider Primary Taxonomy Switch_5        1
Healthcare Provider Primary Taxonomy Switch_6        0
Healthcare Provider Primary Taxonomy Switch_7        0
Healthcare Provider Primary Taxonomy Switch_8        0
Healthcare Provider Primary Taxonomy Switch_9        0
Healthcare Provider Primary Taxonomy Switch_10       0
Healthcare Provider Primary Taxonomy Switch_11       0
Healthcare Provider Primary Taxonomy Switch_12       0
Healthcare Provider Primary Taxonomy Switch_13       0
Healthcare Provider Primary Taxonomy Switch_14       0
Healthcare Provider Primary Taxonomy Switch_15       0
dtype: int64

In [377]:
new_nppes = nppes.drop(columns = ['Healthcare Provider Primary Taxonomy Switch_4', 
                'Healthcare Provider Primary Taxonomy Switch_6', 
                'Healthcare Provider Primary Taxonomy Switch_7', 
                'Healthcare Provider Primary Taxonomy Switch_8', 
                'Healthcare Provider Primary Taxonomy Switch_9', 
                'Healthcare Provider Primary Taxonomy Switch_10', 
                'Healthcare Provider Primary Taxonomy Switch_11', 
                'Healthcare Provider Primary Taxonomy Switch_12', 
                'Healthcare Provider Primary Taxonomy Switch_13', 
                'Healthcare Provider Primary Taxonomy Switch_14', 
                'Healthcare Provider Primary Taxonomy Switch_15'], inplace = True)

In [378]:
nppes.filter(regex='^Healthcare Provider Primary Taxonomy Switch_').eq('Y').sum()

Healthcare Provider Primary Taxonomy Switch_1    9001
Healthcare Provider Primary Taxonomy Switch_2      62
Healthcare Provider Primary Taxonomy Switch_3       2
Healthcare Provider Primary Taxonomy Switch_5       1
dtype: int64

In [379]:
nppes_sort = nppes.apply(lambda x: sorted(x, key=lambda y: y == 'Y' if y is not np.nan else -1, reverse=True))



In [380]:
pd.set_option('display.max_columns', None)

In [381]:
nppes_tax =nppes_sort[['Healthcare Provider Taxonomy Code_1',
           'Healthcare Provider Taxonomy Code_2',
           'Healthcare Provider Taxonomy Code_3',
           'Healthcare Provider Taxonomy Code_5']]

In [382]:
nppes_tax = nppes_tax.rename(columns = {'Healthcare Provider Taxonomy Code_1': 'HPTX_Code_1',
                           'Healthcare Provider Taxonomy Code_2': 'HPTX_Code_2',
                           'Healthcare Provider Taxonomy Code_3': 'HPTX_Code_3',
                           'Healthcare Provider Taxonomy Code_5': 'HPTX_Code_5'})

In [383]:
nppes_tax

Unnamed: 0,HPTX_Code_1,HPTX_Code_2,HPTX_Code_3,HPTX_Code_5
0,231H00000X,213EP1101X,103TF0000X,367500000X
1,367500000X,183500000X,2085N0904X,1835P1300X
2,1835P1200X,1041C0700X,163W00000X,1041C0700X
3,1223G0001X,152W00000X,213ES0131X,207RI0011X
4,207Q00000X,207RG0100X,122300000X,225X00000X
...,...,...,...,...
9995,,,,
9996,,,,
9997,,,,
9998,,,,


In [384]:
nppes_tax = nppes_tax.dropna()

nppes_tax

Unnamed: 0,HPTX_Code_1,HPTX_Code_2,HPTX_Code_3,HPTX_Code_5
0,231H00000X,213EP1101X,103TF0000X,367500000X
1,367500000X,183500000X,2085N0904X,1835P1300X
2,1835P1200X,1041C0700X,163W00000X,1041C0700X
3,1223G0001X,152W00000X,213ES0131X,207RI0011X
4,207Q00000X,207RG0100X,122300000X,225X00000X
5,207P00000X,103TC2200X,363LA2200X,2085R0203X
6,207N00000X,363A00000X,367500000X,2085R0203X
7,208000000X,183500000X,207NS0135X,207NP0225X
8,174400000X,183500000X,183500000X,2085R0203X
9,1223G0001X,163WH0200X,2084P2900X,183500000X


In [385]:
nucc_code = pd.read_csv('nucc_taxonomy_230.csv', chunksize = 10000, encoding = 'ISO-8859-1')

In [386]:
nucc = nucc_code

In [390]:
next(nucc)

Unnamed: 0,Code,Grouping,Classification,Specialization,Definition,Notes,Display Name,Section
0,193200000X,Group,Multi-Specialty,,A business group of one or more individual pra...,[7/1/2003: new],Multi-Specialty Group,Individual
1,193400000X,Group,Single Specialty,,A business group of one or more individual pra...,[7/1/2003: new],Single Specialty Group,Individual
2,207K00000X,Allopathic & Osteopathic Physicians,Allergy & Immunology,,An allergist-immunologist is trained in evalua...,"Source: American Board of Medical Specialties,...",Allergy & Immunology Physician,Individual
3,207KA0200X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Allergy,"A physician who specializes in the diagnosis, ...",Source: National Uniform Claim Committee,Allergy Physician,Individual
4,207KI0005X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Clinical & Laboratory Immunology,An allergy and immunology physician who specia...,"Source: National Uniform Claim Committee, 2022...",Clinical & Laboratory Immunology (Allergy & Im...,Individual
...,...,...,...,...,...,...,...,...
868,343800000X,Transportation Services,Secured Medical Transport (VAN),,A public or privately owned transportation ser...,,Secured Medical Transport (VAN),Non-Individual
869,344600000X,Transportation Services,Taxi,,A land commercial vehicle used for the transpo...,,Taxi,Non-Individual
870,347D00000X,Transportation Services,Train,,An organization or business licensed to provid...,,Train,Non-Individual
871,347E00000X,Transportation Services,Transportation Broker,,An organization that provides transportation f...,Source: Section 6083 of the Deficit Reduction ...,Transportation Broker,Non-Individual


In [94]:
tax_switch = []

for col in (nppes.filter(regex='^Healthcare Provider Primary Taxonomy Switch_') == 'Y').columns:
                tax = nppes[tax_filtered].loc[nppes[tax_filtered].eq('Y').any(axis=1)]
                tax_switch.append(tax)
                tax_df = pd.concat(tax_switch)

tax_df
#tax = nppes[tax_cols].loc[nppes[tax_cols].eq('Y').any(axis=1)]

Unnamed: 0,Healthcare Provider Primary Taxonomy Switch_1,Healthcare Provider Primary Taxonomy Switch_2,Healthcare Provider Primary Taxonomy Switch_3,Healthcare Provider Primary Taxonomy Switch_4,Healthcare Provider Primary Taxonomy Switch_5,Healthcare Provider Primary Taxonomy Switch_6,Healthcare Provider Primary Taxonomy Switch_7,Healthcare Provider Primary Taxonomy Switch_8,Healthcare Provider Primary Taxonomy Switch_9,Healthcare Provider Primary Taxonomy Switch_10,Healthcare Provider Primary Taxonomy Switch_11,Healthcare Provider Primary Taxonomy Switch_12,Healthcare Provider Primary Taxonomy Switch_13,Healthcare Provider Primary Taxonomy Switch_14,Healthcare Provider Primary Taxonomy Switch_15
555,Y,,,,,,,,,,,,,,
556,Y,,,,,,,,,,,,,,
557,Y,,,,,,,,,,,,,,
558,Y,,,,,,,,,,,,,,
559,Y,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Y,,,,,,,,,,,,,,
9996,Y,,,,,,,,,,,,,,
9997,Y,,,,,,,,,,,,,,
9998,Y,,,,,,,,,,,,,,


In [118]:
tax_df.iloc[:, 0:10]

Unnamed: 0,Healthcare Provider Primary Taxonomy Switch_1,Healthcare Provider Primary Taxonomy Switch_2,Healthcare Provider Primary Taxonomy Switch_3,Healthcare Provider Primary Taxonomy Switch_4,Healthcare Provider Primary Taxonomy Switch_5,Healthcare Provider Primary Taxonomy Switch_6,Healthcare Provider Primary Taxonomy Switch_7,Healthcare Provider Primary Taxonomy Switch_8,Healthcare Provider Primary Taxonomy Switch_9,Healthcare Provider Primary Taxonomy Switch_10
555,Y,,,,,,,,,
556,Y,,,,,,,,,
557,Y,,,,,,,,,
558,Y,,,,,,,,,
559,Y,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
9995,Y,,,,,,,,,
9996,Y,,,,,,,,,
9997,Y,,,,,,,,,
9998,Y,,,,,,,,,


In [66]:
tax_cols = nppes.filter(regex='^Healthcare Provider Primary Taxonomy Switch_')
tax = nppes[tax_cols].loc[nppes[tax_cols].eq('Y').any(axis=1)]

tax

ValueError: Boolean array expected for the condition, not object

In [39]:
switch = nppes.filter(regex='^Healthcare Provider Primary Taxonomy Switch_').all()

In [40]:
tax_switch = switch[~switch].index.tolist()

In [42]:
nppes.drop(tax_switch, axis =1)

Unnamed: 0,NPI,Entity Type Code,Replacement NPI,Employer Identification Number (EIN),Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,...,Healthcare Provider Taxonomy Group_7,Healthcare Provider Taxonomy Group_8,Healthcare Provider Taxonomy Group_9,Healthcare Provider Taxonomy Group_10,Healthcare Provider Taxonomy Group_11,Healthcare Provider Taxonomy Group_12,Healthcare Provider Taxonomy Group_13,Healthcare Provider Taxonomy Group_14,Healthcare Provider Taxonomy Group_15,Certification Date
0,1740284231,,,,,,,,,,...,,,,,,,,,,
1,1346245800,,,,,,,,,,...,,,,,,,,,,
2,1487650776,,,,,,,,,,...,,,,,,,,,,
3,1033113022,,,,,,,,,,...,,,,,,,,,,
4,1043216138,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1558360792,1.0,,,,RODGERS,SUSAN,V,MRS.,,...,,,,,,,,,,
9996,1467451609,1.0,,,,HILTON,BRET,D,,,...,,,,,,,,,,
9997,1346249596,1.0,,,,STANLEY,GARY,E,,,...,,,,,,,,,,
9998,1982603130,1.0,,,,SLETTEN,CHRISTINA,A.,DR.,,...,,,,,,,,,,


In [44]:
tax = []

for col in nppes.filter(regex='^Healthcare Provider Primary Taxonomy Switch_').columns:
                 tax_chunk = nppes.loc[nppes[col] == 'Y']
                 tax.append(tax_chunk)
                 tax_df = pd.concat(tax)
                
#tax_df['Healthcare Provider Primary Taxonomy Switch_11'].count()  #1-10

TypeError: ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
filtered_df = nppes.loc[(nppes['NPI'] >= 0) & (nppes['gender'] == 'M')]

In [122]:
nppes.loc[nppes['Healthcare Provider Primary Taxonomy Switch_1'] == 'Y']

Unnamed: 0,NPI,Entity Type Code,Replacement NPI,Employer Identification Number (EIN),Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,...,Healthcare Provider Taxonomy Group_7,Healthcare Provider Taxonomy Group_8,Healthcare Provider Taxonomy Group_9,Healthcare Provider Taxonomy Group_10,Healthcare Provider Taxonomy Group_11,Healthcare Provider Taxonomy Group_12,Healthcare Provider Taxonomy Group_13,Healthcare Provider Taxonomy Group_14,Healthcare Provider Taxonomy Group_15,Certification Date
10000,1215936463,1,,,,CAPPIELLO,ENRICO,J,DR.,,...,,,,,,,,,,
10001,1588663736,1,,,,CORLEY,REBECCA,S,,,...,,,,,,,,,,
10002,1376542530,1,,,,BAYLESS,ALVIN,KENT,DR.,,...,,,,,,,,,,
10003,1730188087,1,,,,GORBACK,MICHAEL,SCOTT,DR.,,...,,,,,,,,,,
10004,1649279993,1,,,,BENNETT,CAROL,S.,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,1669467890,1,,,,CORWIN,MARTIN,E,DR.,,...,,,,,,,,,,
19996,1578558706,1,,,,FLEMING,MEI,LEE,,,...,,,,,,,,,,
19997,1548255771,1,,,,FOSTER,HELEN,MONTAGUE,DR.,,...,,,,,,,,,,
19998,1992790125,1,,,,BLUMENTHAL,DAVID,,,,...,,,,,,,,,,


In [136]:
tax = nppes.filter(regex = '^Healthcare Provider Primary Taxonomy Switch_')



Unnamed: 0,Healthcare Provider Primary Taxonomy Switch_1,Healthcare Provider Primary Taxonomy Switch_2,Healthcare Provider Primary Taxonomy Switch_3,Healthcare Provider Primary Taxonomy Switch_4,Healthcare Provider Primary Taxonomy Switch_5,Healthcare Provider Primary Taxonomy Switch_6,Healthcare Provider Primary Taxonomy Switch_7,Healthcare Provider Primary Taxonomy Switch_8,Healthcare Provider Primary Taxonomy Switch_9,Healthcare Provider Primary Taxonomy Switch_10,Healthcare Provider Primary Taxonomy Switch_11,Healthcare Provider Primary Taxonomy Switch_12,Healthcare Provider Primary Taxonomy Switch_13,Healthcare Provider Primary Taxonomy Switch_14,Healthcare Provider Primary Taxonomy Switch_15
10407,N,N,Y,,,,,,,,,,,,
17420,N,N,Y,,,,,,,,,,,,
17757,N,N,Y,,,,,,,,,,,,
18205,N,N,Y,,,,,,,,,,,,


In [125]:
nppes_tax = nppes.apply(lambda x: x.loc[x == 'Y'])


In [39]:
hop = pd.read_csv('DocGraph_Hop_Teaming_2018.csv', 
                          nrows = 100)

In [40]:
hop

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1508062167,1730166109,350,370,53.922,72.612
1,1508065640,1730166109,25,25,49.800,55.006
2,1508052093,1730166109,16,16,109.500,70.593
3,1508172545,1730166109,14,14,103.357,75.483
4,1508285131,1730166109,20,21,89.952,89.880
...,...,...,...,...,...,...
95,1508178229,1730166893,31,32,67.125,61.279
96,1508196445,1730166893,23,24,60.833,59.129
97,1508811076,1730166935,14,15,62.533,62.827
98,1508871252,1730166935,29,31,25.323,36.693
