### Import data table

In [1]:
import numpy as np
import pandas as pd

In [2]:
provider_2016 = pd.read_csv('Medicare_Part_D_Opioid_Prescriber_Summary_File_2016.csv')

In [3]:
provider_2016.shape

(1131550, 11)

In [4]:
provider_2016.sample(5)

Unnamed: 0,NPI,NPPES Provider Last Name,NPPES Provider First Name,NPPES Provider ZIP Code,NPPES Provider State,Specialty Description,Total Claim Count,Opioid Claim Count,Opioid Prescribing Rate,Long-Acting Opioid Claim Count,Long-Acting Opioid Prescribing Rate
798568,1700835956,BANASZAK,LLOYD,55441.0,MN,Family Practice,65,0.0,0.0,0.0,
6521,1003839267,O CONNOR,CHRISTINE,21202.0,MD,Obstetrics & Gynecology,172,,,0.0,
501915,1447290663,HAWLEY,WILLIAM,35960.0,AL,Emergency Medicine,295,87.0,29.49,,
768483,1679895049,CHEUK,JULIA,70039.0,LA,Dentist,41,,,0.0,
866061,1760590673,LAM,TIMOTHY,98104.0,WA,Dentist,38,,,0.0,


In [5]:
provider_2016.columns

Index(['NPI', 'NPPES Provider Last Name', 'NPPES Provider First Name',
       'NPPES Provider ZIP Code', 'NPPES Provider State',
       'Specialty Description', 'Total Claim Count', 'Opioid Claim Count',
       'Opioid Prescribing Rate', 'Long-Acting Opioid Claim Count',
       'Long-Acting Opioid Prescribing Rate'],
      dtype='object')

In [6]:
# Rename the columns for further explore
provider_2016 = provider_2016.rename(columns={'NPPES Provider Last Name':'last_name', 'NPPES Provider First Name':'first_name',
                                  'NPPES Provider ZIP Code':'zip_code', 'NPPES Provider State':'state',
                                  'Specialty Description':'specialty', 'Total Claim Count':'claim_total',
                                  'Opioid Claim Count':'opioid_total', 'Opioid Prescribing Rate':'opioid_rate',
                                  'Long-Acting Opioid Claim Count':'la_opioid_count',
                                  'Long-Acting Opioid Prescribing Rate':'la_rate'})

In [8]:
# Remove prescriber that cannot write opioid drugs
temp = provider_2016.groupby(['specialty']).opioid_total.sum().reset_index(name='sum')
temp = temp[(temp == 0).any(1)]
temp.sample(5)

Unnamed: 0,specialty,sum
166,Prosthetist,0.0
222,Voluntary Health or Charitable Agencies,0.0
36,Community/Behavioral Health,0.0
85,In Home Supportive Care,0.0
12,Audiologist (billing independently),0.0


In [10]:
white_list = list(temp.specialty.unique())
provider_2016 = provider_2016[~provider_2016['specialty'].isin(white_list)]

In [11]:
provider_2016.shape

(1131235, 11)

In [16]:
# reformat all the specialy names
provider_2016['specialty'] = provider_2016.specialty.str.strip().str.lower()
provider_2016.specialty.value_counts().head(10)

nurse practitioner                                                137822
internal medicine                                                 133174
dentist                                                           129650
family practice                                                   109334
physician assistant                                                86733
student in an organized health care education/training program     51615
emergency medicine                                                 47229
obstetrics & gynecology                                            29713
optometry                                                          28570
psychiatry                                                         25528
Name: specialty, dtype: int64

In [18]:
#remove all characters that aren't digits
provider_2016['opioid_rate'] = provider_2016['opioid_rate'].replace('%','').astype('float64')
provider_2016['la_rate'] = provider_2016['la_rate'].replace('%','').astype('float64')

In [19]:
provider_2016.head()

Unnamed: 0,NPI,last_name,first_name,zip_code,state,specialty,claim_total,opioid_total,opioid_rate,la_opioid_count,la_rate
0,1003000126,ENKESHAFI,ARDALAN,21502.0,MD,internal medicine,545,23.0,4.22,,
1,1003000142,KHALIL,RASHID,43623.0,OH,anesthesiology,1733,941.0,54.3,165.0,17.53
2,1003000167,ESCOBAR,JULIO,89403.0,NV,dentist,49,11.0,22.45,0.0,0.0
3,1003000282,BLAKEMORE,ROSIE,37243.0,TN,nurse practitioner,146,,,0.0,
4,1003000407,GIRARDI,DAVID,15825.0,PA,family practice,2225,17.0,0.76,,


In [21]:
# Only keep all the frequent appeared specialty and update the rest to others
count = provider_2016.specialty.value_counts()
count = pd.DataFrame(count)
count = count.loc[count.specialty <100 ]
count = list(count.index)

In [22]:
provider_2016.specialty.replace(count, 'other', inplace=True)
provider_2016.sample(10)

Unnamed: 0,NPI,last_name,first_name,zip_code,state,specialty,claim_total,opioid_total,opioid_rate,la_opioid_count,la_rate
972787,1851785323,REYES,JAVIER,33136.0,FL,student in an organized health care education/...,27,0.0,0.0,0.0,
237896,1215010863,MONGOLD,BRADLEY,25401.0,WV,emergency medicine,173,46.0,26.59,0.0,0.0
1075249,1952311425,ENDRES,DONALD,99508.0,AK,otolaryngology,149,,,0.0,
2730,1003161068,ROY,DAWN,4765.0,ME,nurse practitioner,3602,20.0,0.56,0.0,0.0
105737,1093722480,ENDE,THEODORE,8731.0,NJ,internal medicine,6210,583.0,9.39,188.0,32.25
906892,1801087309,LESSER,LENARD,94111.0,CA,family practice,22,,,,
738002,1659349934,LARSON,JANET,56431.0,MN,nurse practitioner,7836,608.0,7.76,85.0,13.98
970341,1851627376,HARIRI,MARYAM,24012.0,VA,dentist,53,0.0,0.0,0.0,
263108,1235171216,SIMS,SUSAN,29412.0,SC,nurse practitioner,1341,26.0,1.94,0.0,0.0
841068,1740287598,DICKEY,THOMAS,80620.0,CO,physician assistant,30,,,0.0,


In [24]:
specialty_df = provider_2016.groupby('specialty')['opioid_rate'].mean()
specialty_df = pd.DataFrame(data=specialty_df).reset_index()
specialty_df.head()

Unnamed: 0,specialty,opioid_rate
0,addiction medicine,7.451515
1,allergy/ immunology,0.239921
2,allergy/immunology,0.643378
3,anesthesiology,28.827551
4,cardiac surgery,22.294374
