In [1]:
import numpy as np
import pandas as pd
import pickle
import string
import scipy
import re

In [2]:
def clean_drug_name(df,wrong_name, right_name):
    """
    Replace every exact occurrence of a name in the 'drug_name' and 'generic_name' columns, in place.

    Input:
        df: df, dataframe to be used (must have the columns 'drug_name' and 'generic_name')
        wrong_name: Str, wrong name that is listed in df
        right_name: Str, right name to be changed to
    Output:
        None (df is modified in place)
    """
    for col in ('drug_name', 'generic_name'):
        #Plain boolean array -> rows are selected by position, so only the rows that really
        #hold wrong_name are touched, even when the index contains repeated labels
        matches = (df[col] == wrong_name).values
        df.loc[matches, col] = right_name

In [4]:
#Positions of the columns to load from the prescriptions file (used as usecols when reading it)
cols = list(range(6)) + [7, 8, 10, 12, 13]

In [5]:
#Reading only the needed columns of the 2016 prescriptions file, then keeping the NJ docs
scripts16 = pd.read_csv('/Volumes/Seagate/Galvanize/Prescriptions 2016.csv', usecols=cols)
in_nj = scripts16['nppes_provider_state']=='NJ'
scriptsnj16 = scripts16[in_nj]
#Columns taken from the full NPI file; they are only needed to remove organizations and are dropped again below
npi_columns = ['Entity Type Code','NPI','Provider Middle Name',
               'Provider First Name','Provider Last Name (Legal Name)']
npi = pd.read_csv('/Volumes/Seagate/Galvanize/NPPES_Data_Dissemination_May_2018/npidata_pfile_20050523-20180513.csv',
                  usecols=npi_columns)
#Keeping only the individuals (entity type 1) from the npi database
npi = npi[npi['Entity Type Code'].isin([1.0])]
#The merge keeps only prescribers found among those individuals; the NPI columns are then discarded
scriptsnj16 = scriptsnj16.merge(npi, left_on='npi',right_on='NPI').drop(npi_columns, axis=1)
scriptsnj16.to_csv('/Volumes/Seagate/Galvanize/2016_scriptsnj.csv',index=False)

In [8]:
#Normalising the drug & generic names so formatting differences don't cause mismatches.
#Applied to both columns, in this exact order:
#   1. upper-case the name and replace every symbol with a space
#   2. remove some common extras (' HCL', ' XL', ...) and spell out ' HBR'
#   3. remove ' WITH' and collapse extra spaces
#NOTE(review): these are plain substring replacements, so a tag is also removed when it starts a longer
#word. Some hard-coded names further down (e.g. 'BETAMETHASONEROPIONATE') look like the result of that
#and rely on it - confirm before tightening the matching.
name_extras = [' HCL', ' XL', ' ER', ' DIP', ' PF', ' XR']

def _tidy_name(raw):
    """Return the normalised form of one drug/generic name (see the steps listed above)."""
    name = re.sub(r'[^\w\s]',' ',str(raw).upper())
    for extra in name_extras:
        name = name.replace(extra, '')
    name = name.replace(' HBR', '  HYDROBROMIDE')
    return ' '.join(str(name).replace(' WITH','').split())

for name_col in ['drug_name', 'generic_name']:
    scriptsnj16[name_col] = [_tidy_name(x) for x in scriptsnj16[name_col]]
#Names that still don't match after this are hard coded in further down (same approach as for the 2013 data)

In [11]:
#Show up to 65 rows when a frame is displayed
pd.set_option('display.max_rows', 65)

In [12]:
#Rows whose drug name appears inside a (different) generic name, counted per generic/drug pair
drug_in_generic = pd.Series([drug in generic for drug, generic in zip(scriptsnj16['drug_name'], scriptsnj16['generic_name'])],
                            index=scriptsnj16.index)
names_differ = scriptsnj16['drug_name']!=scriptsnj16['generic_name']
d1 = (scriptsnj16[drug_in_generic & names_differ]
      .groupby(['generic_name','drug_name'])
      .agg({'total_claim_count':'count'})
      .reset_index())
d1

Unnamed: 0,generic_name,drug_name,total_claim_count
0,0 9 SODIUM CHLORIDE,SODIUM CHLORIDE,113
1,ABACAVIR SULFATE,ABACAVIR,60
2,ACETIC ACID ALUMINUM ACETATE,ACETIC ACID ALUMINUM,3
3,BACITRACIN POLYMYXIN B SULFATE,BACITRACIN POLYMYXIN,7
4,CALCITONIN SALMON SYNTHETIC,CALCITONIN SALMON,231
5,CEFAZOLIN SODIUM DEXTROSE ISO,CEFAZOLIN SODIUM DEXTROSE,4
6,CEFEPIME IN ISO OSM DEXTROSE,CEFEPIME,5
7,CEFTRIAXONE IN IS OSM DEXTROSE,CEFTRIAXONE,7
8,CEFTRIAXONE SODIUM,CEFTRIAXONE,92
9,CEFUROXIME AXETIL,CEFUROXIME,1234


In [13]:
#Listing the pairs where the drug name is contained in the generic name but the two differ.
#These are mostly shorthands that are not being linked to their generic, even though they are one.
drug_in_generic = pd.Series([drug in generic for drug, generic in zip(scriptsnj16['drug_name'], scriptsnj16['generic_name'])],
                            index=scriptsnj16.index)
d1 = (scriptsnj16[drug_in_generic & (scriptsnj16['drug_name']!=scriptsnj16['generic_name'])]
      .groupby(['generic_name','drug_name'])
      .agg({'total_claim_count':'count'})
      .reset_index())
#Leaving out the rows where these are actually name brand drugs
real_brands = ['CIPRO','DIGOX','DILT','ERY','FROVA','TOPROL','URSO']
keep_as_is = d1['drug_name'].isin(real_brands) | (d1['generic_name']=='FENOFIBRATE MICRONIZED')
d1.drop(d1[keep_as_is].index,inplace=True)
#Every remaining generic name is renamed to the shorter drug name
for generic, drug in zip(d1['generic_name'], d1['drug_name']):
    clean_drug_name(scriptsnj16,generic,drug)


In [16]:
#The opposite direction: the generic name is contained in a (different) drug name
generic_in_drug = pd.Series([generic in drug for generic, drug in zip(scriptsnj16['generic_name'], scriptsnj16['drug_name'])],
                            index=scriptsnj16.index)
names_differ = scriptsnj16['drug_name']!=scriptsnj16['generic_name']
d2 = (scriptsnj16[generic_in_drug & names_differ]
      .groupby(['generic_name','drug_name'])
      .agg({'total_claim_count':'count'})
      .reset_index())
d2

Unnamed: 0,generic_name,drug_name,total_claim_count
0,PEN NEEDLES,ADVOCATE PEN NEEDLES,1
1,PEN NEEDLES,EASY COMFORT PEN NEEDLES,244
2,PEN NEEDLES,RELION PEN NEEDLES,1


In [15]:
#Dropping the rows where these are actually name brand drugs (branded pen needles).
#The generic shows up as 'PEN NEEDLES' in the d2 listing, so both spellings are checked; testing only
#'PEN NEEDLE' left the branded rows in and the loop below then renamed the generic to a brand name.
pen_needle_generics = ['PEN NEEDLE', 'PEN NEEDLES']
branded_pen_needles = d2['generic_name'].isin(pen_needle_generics) & (d2['drug_name']!='PEN NEEDLES')
d2.drop(d2[branded_pen_needles].index,inplace=True)
#Using a for loop to go through & fix the names
for generic, drug in zip(d2['generic_name'], d2['drug_name']):
    clean_drug_name(scriptsnj16,generic,drug)

In [17]:
#Finding names made of the same words in a different order Ex: PROBENECID COLCHICINE != COLCHICINE PROBENECID
same_words = pd.Series([sorted(generic.split(' ')) == sorted(drug.split(' '))
                        for generic, drug in zip(scriptsnj16['generic_name'], scriptsnj16['drug_name'])],
                       index=scriptsnj16.index)
d3 = (scriptsnj16[same_words & (scriptsnj16['drug_name']!=scriptsnj16['generic_name'])]
      .groupby(['generic_name','drug_name'])
      .agg({'total_claim_count':'count'})
      .reset_index())
#Renaming each swapped generic name to the drug name's word order
for generic, drug in zip(d3['generic_name'], d3['drug_name']):
    clean_drug_name(scriptsnj16,generic,drug)


In [18]:
#Special case: the drug 'INSULIN SYRINGE' is listed under several different generic names.
#Total claims per generic name for that drug:
insulin_syringe_rows = scriptsnj16.loc[scriptsnj16['drug_name']=='INSULIN SYRINGE']
insulin_syringe_rows.groupby(['generic_name','drug_name']).agg({'total_claim_count':'sum'})
#All of them get the generic 'INSULIN SYRINGE' next, because I only care about whether the drug prescribed is a generic or not

Unnamed: 0_level_0,Unnamed: 1_level_0,total_claim_count
generic_name,drug_name,Unnamed: 2_level_1
SYRGE NDL INS 0 3 ML HALF MARK,INSULIN SYRINGE,73
SYRING NEEDL DISP INSUL 0 3 ML,INSULIN SYRINGE,2063
SYRINGE AND NEEDLE INSULIN 1ML,INSULIN SYRINGE,6268
SYRINGE NEEDLE INSULIN 0 5 ML,INSULIN SYRINGE,9029
SYRINGE W O NEEDL INSULIN 1 ML,INSULIN SYRINGE,12


In [19]:
#I don't care if it is a sep drug or not: every INSULIN SYRINGE drug gets the generic INSULIN SYRINGE.
#One vectorised assignment instead of a row-by-row .at loop; the mask is positional, so only matching rows change
is_insulin_syringe = (scriptsnj16['drug_name']=='INSULIN SYRINGE').values
scriptsnj16.loc[is_insulin_syringe,'generic_name'] = 'INSULIN SYRINGE'

In [20]:
"""Hopefully this gives you an idea of how messy some data is :)"""
#Same ones from 2013, just incase
#NOTE: the names are compared after the symbol clean-up (every symbol replaced by a space), so a name
#      can never contain a comma. Two entries below used to carry a stray comma and could never match;
#      they are now written in the cleaned form.
clean_drug_name(scriptsnj16,'DORZOLAMIDE TIMOLOL MALEAT','DORZOLAMIDE TIMOLOL')
clean_drug_name(scriptsnj16,'CEFUROXIME AXETIL','CEFUROXIME')
clean_drug_name(scriptsnj16,'CLOPIDOGREL BISULFATE','CLOPIDOGREL')
#   Still the same generic for my purposes of brand name v generic
clean_drug_name(scriptsnj16,'DEXTROSE 70 IN WATER','DEXTROSE IN WATER')
clean_drug_name(scriptsnj16,'DEXTROSE 5 IN WATER','DEXTROSE IN WATER')
#   Still the same generic for my purposes of brand name v generic
clean_drug_name(scriptsnj16,'SYRING W NDL DISP INSUL 0 3ML','INSULIN SYRINGE')
clean_drug_name(scriptsnj16,'SYRING W NDL DISP INSUL 0 5ML','INSULIN SYRINGE')
clean_drug_name(scriptsnj16,'SYRINGE NEEDLE INSULIN 1 ML', 'INSULIN SYRINGE')
clean_drug_name(scriptsnj16,'SYR W NDL INS 0 3 ML HALF MARK',"INSULIN SYRINGE")
#   NITROFURANTOIN MONO MACRO is also known as NITROFURANTOIN MONOHYD M CRYST, both are the same generic
clean_drug_name(scriptsnj16,'NITROFURANTOIN MONO MACRO','NITROFURANTOIN')
clean_drug_name(scriptsnj16,'NITROFURANTOIN MONOHYD M CRYST','NITROFURANTOIN')
clean_drug_name(scriptsnj16,'NITROFURANTOIN MACROCRYSTAL','NITROFURANTOIN')
#   Generic names that got cut off, plus the other hand-matched names carried over from 2013.
#   Each pair is (wrong name, right name). The pairs are applied top to bottom and the order matters,
#   because a later pair can rename the result of an earlier one. Repeated pairs are kept as they were.
carried_over_fixes = [
    ('ACETIC ACID ALUMINUM','ACETIC ACID ALUMINUM ACETATE'),
    ('ALCOHOL ANTISEPTIC PADS','ALCOHOL PADS'),
    ('ALCOHOL PREP PADS','ALCOHOL PADS'),
    ('ALCOHOL SWAB','ALCOHOL PADS'),
    ('ALCOHOL SWABS','ALCOHOL PADS'),
    ('ALCOHOL WIPES','ALCOHOL PADS'),
    ('SINGLE USE SWAB','ALCOHOL PADS'),
    ('YF VAX','YELLOW FEVER VACCINE LIVE'),
    ('0 9 SODIUM CHLORIDE','SODIUM CHLORIDE IRRIG SOLUTION'),
    ('SODIUM CHLORIDE 0 45','SODIUM CHLORIDE IRRIG SOLUTION'),
    ('ABACAVIR','ABACAVIR SULFATE'),
    ('ABATACEPT MALTOSE','ABATACEPT'),
    ('TYLENOL CODEINE NO 3','TYLENOL CODEINE'),
    ('TYLENOL CODEINE NO 4','TYLENOL CODEINE'),
    ('ACETIC ACID HYDROCORTISONE','HYDROCORTISONE ACETIC ACID'),
    ('WATER FOR IRRIGATION STERILE','WATER'),
    ('VERAPAMIL PM','VERAPAMIL'),
    ('CHLORDIAZEPOXIDE AMITRIPTYLINE','AMITRIP CHLORDIAZEPOXIDE'),
    ('AMOXICILLIN POTASSIUM CLAV','AMOXICILLIN CLAVULANATE'),
    ('AMOX TR POTASSIUM CLAVULANATE','AMOXICILLIN CLAVULANATE'),
    ('AMPICILLIN SODIUM SULBACTAM NA','AMPICILLIN SULBACTAM'),
    ('ATROPINE CARE','ATROPINE SULFATE'),
    ('AZACTAM ISO OSMOTIC DEXTROSE','AZTREONAM DEXTROSE WATER'),
    ('BETAMETHASONEROPIONATE','BETAMETHASONE PROPYLENE GLYCOL'),
    ('BUTALBITAL ACETAMINOPHEN CAFFE','BUTALBITAL ACETAMINOPHEN CAFFEINE'),
    ('BUTALB ACETAMINOPHEN CAFFEINE','BUTALBITAL ACETAMINOPHEN CAFFEINE'),
    ('ATROPINE CARE','ATROPINE SULFATE'),
    ('BISOPROLOL HYDROCHLOROTHIAZIDE','BISOPROLOL FUMARATE HCTZ'),
    ('BUTALB CAFF ACETAMINOPH CODEIN','BUTALBIT ACETAMIN CAFF CODEIN'),
    ('CALCIUM FOLIC ACID PLUS D','CAL CARB MGOX D3 B12 FA B6 BOR'),
    ('CIPROFLOXACIN LACTATE D5W','CIPROFLOXACIN D5W'),
    ('CITALOPRAM HYDROBROMIDE','CITALOPRAM HBR'),
    ('CLINDAMYCIN PHOS BENZOYL PEROX','CLINDAMYCIN BENZOYL PEROXIDE'),
    ('CLOBETASOL EMULSION','CLOBETASOL PROPIONATE EMOLL'),
    ('CLOBETASOL EMOLLIENT','CLOBETASOL PROPIONATE EMOLL'),
    ('CODEINE BUTALBITAL ASA CAFFEIN','BUTALBITAL COMPOUND CODEINE'),
    ('FOLIC ACID VIT B6 VIT B12','CYANOCOBALAMIN FA PYRIDOXINE'),
    ('CYANOCOBALAMIN INJECTION','CYANOCOBALAMIN VITAMIN B 12'),
    ('DESMOPRESSIN NONREFRIGERATED','DESMOPRESSIN ACETATE'),
    ('DEXAMETHASONE SOD PHOSPHATE','DEXAMETHASONE SODIUM PHOSPHATE'),
    ('AMPHETAMINE SALT COMBO','DEXTROAMPHETAMINE AMPHETAMINE'),
    ('DEXTROSE 5 0 45 NACL','DEXTROSE SODIUM CHLORIDE'),
    ('DEXTROSE 5 AND 0 9 NACL','DEXTROSE SODIUM CHLORIDE'),
    ('FLUCONAZOLE IN SALINE','FLUCONAZOLE IN NACL ISO OSM'),
    ('FLUOCINOLONE ACETONIDE','FLUOCINOLONE SHOWER CAP'),
    ('GLUCAGON EMERGENCY KIT','GLUCAGON HUMAN RECOMBINANT'),
    ('HALDOL DECANOATE 100','HALOPERIDOL DECANOATE 100'),
    ('HALDOL DECANOATE 50','HALOPERIDOL DECANOATE 100'),
    ('HYDROCODONE BT HOMATROPINE MBR','HYDROCODONE HOMATROPINE MBR'),
    ('HYDROCODONE BIT HOMATROP ME BR','HYDROCODONE HOMATROPINE MBR'),
    ('HYDROCODONE CHLORPHEN POLIS','HYDROCODONE CHLORPHENIRAMINE'),
    ('HYDROCODONE BIT IBUPROFEN','HYDROCODONE IBUPROFEN'),
    ('LIDOCAINE HYDROCORTISON','HYDROCORTISONE AC LIDOCAINE'),
    ('L METHYLFOLATE CALCIUM','LEVOMEFOLATE CALCIUM'),
    ('L METHYL B6 B12','METHYL B12 L MEFOLATE B6 PHOS'),
    ('METHYLPHENIDATE LA','METHYLPHENIDATE CD'),
    ('METHYLPHENIDATE SR','METHYLPHENIDATE CD'),
    ('PEN NEEDLE','NEEDLES INSULIN DISPOSABLE'),
    ('PEN NEEDLES','NEEDLES INSULIN DISPOSABLE'),
    ('NEOMYCIN POLYMYXIN HC','NEOMYCIN POLYMYXIN B SULF HC'),
    ('NEOMYCIN POLYMYXIN HYDROCORT','NEOMYCIN POLYMYXIN B SULF HC'),
    ('NEOMYCIN POLYMYXN B GRAMICIDIN','NEOMYCIN POLYMYXIN GRAMICIDIN'),
    ('PEN NEEDLES','NEEDLES INSULIN DISPOSABLE'),
    ('MULTIVITAMINS FLUORIDE','PEDI M VIT NO 17 FLUORIDE'),
    ('PEG 3350 AND ELECTROLYTES','PEG 3350 NA SULF BICARB CL KCL'),
    ('PEG 3350 ELECTROLYTE','PEG 3350 NA SULF BICARB CL KCL'),
    ('PIPERACILLIN TAZOBACTAM','PIPERACILLIN SODIUM TAZOBACTAM'),
    ('POLYMYXIN B SUL TRIMETHOPRIM','POLYMYXIN B SULF TRIMETHOPRIM'),
    ('DEXTROSE 5 0 45 NACL KCL','POTASSIUM CHLORIDE D5 0 45NACL'),
    ('PREDNISOLONE SOD PHOSPHATE','PREDNISOLONE SODIUM PHOSPHATE'),
    ('PROMETHAZINE VC CODEINE','PROMETHAZINE PHENYLEPH CODEINE'),
    ('SSD','SILVER SULFADIAZINE'),
    ('PEG 3350','SODIUM CHLORIDE NAHCO3 KCL PEG'),
    ('PEG 3350 FLAVOR PACKS','SODIUM CHLORIDE NAHCO3 KCL PEG'),
    ('SF','SODIUM FLUORIDE'),
    ('SF 5000 PLUS','SODIUM FLUORIDE'),
    ('SPS','SODIUM POLYSTYRENE SULFONATE'),
    ('SPIRONOLACTONE HCTZ','SPIRONOLACT HYDROCHLOROTHIAZID'),
    ('TETANUSHTHERIA TOXOIDS','TETANUS HTHERIA TOX ADULT'),
    ('TRIAMTERENE HCTZ','TRIAMTERENE HYDROCHLOROTHIAZID'),
    ('ALPRAZOLAM ODT','ALPRAZOLAM INTENSOL'),
]
for wrong, right in carried_over_fixes:
    clean_drug_name(scriptsnj16,wrong,right)
#New ones from 2014
#Each pair is (wrong name, right name); applied top to bottom, in the original order
fixes_from_2014 = [
    ('RENAL CAPS','B COMPLEX C NO 20 FOLIC ACID'),
    ('BETAMETHASONE PROPYLENE GLYCOL','BETAMETHASONE PROPYLENE GLYC'),
    ('BUTALB ACETAMINOPH CAFF CODEIN','BUTALBIT ACETAMIN CAFF CODEINE'),
    ('BUTALBIT ACETAMIN CAFF CODEIN','BUTALBIT ACETAMIN CAFF CODEINE'),
    ('CHOLESTYRAMINE LIGHT','CHOLESTYRAMINE ASPARTAME'),
    ('CARISOPRODOL COMPOUND CODEINE','CODEINE CARISOPRODOL ASPIRIN'),
    ('DEXTROAMPHETAMINE AMPHETAMINE','DEXTROAMPHETAMINE AMPHET'),
    ('DEXTROSE IN LACTATED RINGERS','DEXTROSE 5 LACTATED RINGERS'),
    ('DEXTROSE 5 0 9 NACL','DEXTROSE SODIUM CHLORIDE'),
    ('DILTIAZEM 24HR','DILTIAZEM'),
    ('DILTIAZEM 12HR','DILTIAZEM'),
    ('DILTIAZEM 24HR CD','DILTIAZEM'),
    ('DOBUTAMINE IN DEXTROSE','DOBUTAMINE D5W'),
    ('DOXORUBICIN PEG LIPOSOMAL','DOXORUBICIN LIPOSOMAL'),
    ('HYDROCODONE CHLORPHENIRAMNE','HYDROCODONE CHLORPHEN P STIREX'),
    ('LANSOPRAZOL AMOXICIL CLARITHRO','LANSOPRAZOLE AMOXICILN CLARITH'),
    ('MYCOPHENOLATE SODIUM','MYCOPHENOLIC ACID'),
    ('INSULIN PEN NEEDLE','NEEDLES INSULIN DISPOSABLE'),
    ('NEOMYCIN POLYMYXIN DEXAMETH','NEO POLYMYX B SULF DEXAMETH'),
    ('NEOMYCIN BACITRACIN POLY HC','NEOMY SULF BACITRAC ZN POLY HC'),
    ('NITROFURANTOIN','NITROFURANTOIN MONOHYD M CRYST'),
    ('MULTIVITAMIN FLUORIDE','PEDI M VIT NO 17 FLUORIDE'),
    ('DEXTROSE 5 1 2NS KCL','POTASSIUM CHLORIDE D5 0 45NACL'),
    ('LACTATED RINGERS','RINGERS SOLUTION LACTATED'),
    ('VERAPAMIL SR','VERAPAMIL'),
]
for wrong, right in fixes_from_2014:
    clean_drug_name(scriptsnj16,wrong,right)
#New ones from 2016
#Each pair is (wrong name, right name); applied top to bottom, in the original order
fixes_from_2016 = [
    ('ALCOHOL PREP SWABS','ALCOHOL PADS'),
    ('AMLODIPINE VALSARTAN HCTZ','AMLODIPINE VALSARTAN HCTHIAZID'),
    ('AMOXICILLIN CLAVULANATE POT','AMOXICILLIN CLAVULANATE'),
    ('AMOXICILLIN CLAVULANATE POTASS','AMOXICILLIN CLAVULANATE'),
    ('DOXYCYCLINE IR DR','DOXYCYCLINE MONOHYDRATE'),
    ('FLUCONAZOLE IN NACL ISO OSM','FLUCONAZOLE NACL'),
    ('GENTAMICIN SULFATE IN NS','GENTAMICIN IN NACL ISO OSM'),
    ('LEVONORGESTREL ETH ESTRADIOL','LEVONORGESTREL ETHIN ESTRADIOL'),
    ('MILRINONE IN 5 DEXTROSE','MILRINONE LACTATE D5W'),
    ('NORETHINDRON ETHINYL ESTRADIOL','NORETHINDRON AC ETH ESTRADIOL'),
    ('K EFFERVESCENT','POTASSIUM BICARBONATE CIT AC'),
]
for wrong, right in fixes_from_2016:
    clean_drug_name(scriptsnj16,wrong,right)

In [21]:
#With the names cleaned, a prescription counts as brand name when its drug name differs from its generic name
is_brand = scriptsnj16['drug_name']!=scriptsnj16['generic_name']
scriptsnj16['brand_drug?'] = is_brand
#Number of brand-name claims per row: the claim count when the drug is a brand, otherwise 0
scriptsnj16['amount_brand'] = scriptsnj16['total_claim_count']*scriptsnj16['brand_drug?']
#The drug cost comes in as a string with a dollar sign; strip the sign and convert to float
cost_without_sign = scriptsnj16['total_drug_cost'].map(lambda cost: cost.replace('$', ''))
scriptsnj16['total_drug_cost'] = cost_without_sign.astype('float')
#Tagging every row with the year of this file
scriptsnj16['year'] = 2016

In [22]:
#Distinct specialties present in the NJ data
scriptsnj16.loc[:, 'specialty_description'].unique()

array(['Internal Medicine', 'Physician Assistant',
       'Cardiovascular Disease (Cardiology)', 'Hematology-Oncology',
       'Family Practice', 'Nurse Practitioner', 'Urology', 'Dentist',
       'Rheumatology', 'Gastroenterology', 'Pediatric Medicine',
       'Pulmonary Disease', 'Infectious Disease',
       'Interventional Cardiology', 'Psychiatry', 'Neurology',
       'Nuclear Medicine', 'Emergency Medicine', 'Podiatry',
       'Otolaryngology', 'Nephrology', 'Neurosurgery', 'Dermatology',
       'Pain Management', 'Optometry', 'Orthopedic Surgery',
       'Geriatric Medicine', 'Diagnostic Radiology', 'Endocrinology',
       'General Surgery', 'Ophthalmology', 'Obstetrics & Gynecology',
       'Colorectal Surgery (Proctology)',
       'Physical Medicine and Rehabilitation', 'Anesthesiology',
       'General Practice', 'Hematology', 'Psychiatry & Neurology',
       'Student in an Organized Health Care Education/Training Program',
       'Registered Nurse', 'Gynecological Oncology', 

In [23]:
#NPIs of prescribers whose specialty is one of the two "unknown" labels
unknown_specialties = ['Unknown Supplier/Provider','Unknown Physician Specialty Code']
scriptsnj16.loc[scriptsnj16['specialty_description'].isin(unknown_specialties), 'npi'].unique()

array([], dtype=int64)

In [24]:
#Columns of the payments file that are read as text (dtype object) instead of being parsed as numbers
payment_text_columns = ['zip','name_d1','name_d5','ndc_d1','ndc_d2','ndc_d3','ndc_d4','ndc_d5',
                        'npi','company_id','payment_id','record_id']
paymentsnj16 = pd.read_csv('/Volumes/Seagate/Galvanize/nj_payments_2016_consl.csv',
                           dtype=dict.fromkeys(payment_text_columns, object))

In [25]:
#Flagging the prescribers that also appear in the payments data.
#The payments npi is read as text (dtype object) while the prescriptions npi is numeric, and a string
#never equals a number - so the raw values can never intersect. Both sides are compared as strings.
#NOTE(review): assumes the payments npi text is the plain digits (no '.0' suffix or padding) - confirm
script_npis = scriptsnj16['npi'].astype(str)
payment_npis = set(paymentsnj16['npi'].astype(str))
paid_docs = list(payment_npis.intersection(set(script_npis)))
scriptsnj16['recieved_payments'] = script_npis.isin(paid_docs)

In [26]:
#Writing the cleaned 2016 NJ prescriptions back to disk (without the index column)
scriptsnj16.to_csv(path_or_buf='/Volumes/Seagate/Galvanize/2016_scriptsnj.csv', index=False)

In [43]:
#Re-ordering the columns by position: the first six, then column 9, columns 7-8, column 6, and the rest
#NOTE(review): scriptsnjfull is not created in any cell above this one - presumably it was still in the
#kernel from an earlier run; confirm where it should come from
full_columns = scriptsnjfull.columns.tolist()
new_order = full_columns[:6] + full_columns[9:10] + full_columns[7:9] + full_columns[6:7] + full_columns[10:]
s2 = scriptsnjfull[new_order]

In [34]:
#Renaming the provider city column to the shorter 'city'
scriptsnjfull = scriptsnjfull.rename({'nppes_provider_city': 'city'}, axis='columns')

In [35]:
#First two rows, to check the columns
scriptsnjfull.iloc[:2]

Unnamed: 0,npi,ln,fn,city,state,specialty_description,amount_brand,recieved_payments,total_claim_count,year,drug_name,generic_name,brand_drug?,total_day_supply,total_drug_cost
0,1669522595,YAGER,SCOTT,EAST BRUNSWICK,NJ,Internal Medicine,0,False,79,2016,PRAVASTATIN SODIUM,PRAVASTATIN SODIUM,False,6090,2337.89
1,1669522595,YAGER,SCOTT,EAST BRUNSWICK,NJ,Internal Medicine,0,False,12,2016,AMIODARONE,AMIODARONE,False,360,108.0


In [29]:
#Loading the combined file of the previous years; zip and npi are kept as text
s = pd.read_csv('/Volumes/Seagate/Galvanize/nj_scripts_all_years.csv',
                dtype=dict.fromkeys(['zip','npi'], object))

In [44]:
#First two rows of the previous years' data
s.iloc[:2]

Unnamed: 0,npi,ln,fn,city,state,specialty_description,year,recieved_payments,total_claim_count,amount_brand,drug_name,generic_name,brand_drug?,total_day_supply,total_drug_cost,drug_company
0,1528036670,LANTERI,VINCENT,MAYWOOD,NJ,Urology,2013,True,48,48,VIAGRA,SILDENAFIL,True,1481,8957.33,PFIZER
1,1528036670,LANTERI,VINCENT,MAYWOOD,NJ,Urology,2013,True,16,0,CLOTRIMAZOLE BETAMETHASONE,CLOTRIMAZOLE BETAMETHASONE,False,367,806.59,GENERIC


In [45]:
#First two rows of the re-ordered 2016 data, to compare its columns with s
s2.iloc[:2]

Unnamed: 0,npi,ln,fn,city,state,specialty_description,year,recieved_payments,total_claim_count,amount_brand,drug_name,generic_name,brand_drug?,total_day_supply,total_drug_cost
0,1669522595,YAGER,SCOTT,EAST BRUNSWICK,NJ,Internal Medicine,2016,False,79,0,PRAVASTATIN SODIUM,PRAVASTATIN SODIUM,False,6090,2337.89
1,1669522595,YAGER,SCOTT,EAST BRUNSWICK,NJ,Internal Medicine,2016,False,12,0,AMIODARONE,AMIODARONE,False,360,108.0


In [40]:
#Dropping the stray column labelled 10 when it is present.
#errors='ignore' keeps this cell re-runnable: the s2 preview shows only named columns, and a plain drop
#raises KeyError when the label does not exist
s2 = s2.drop(10,axis=1,errors='ignore')

In [46]:
#Stacking the previous years (s) and 2016 (s2).
#ignore_index gives the combined frame a fresh 0..n-1 index; without it each input keeps its own row
#labels, so labels repeat and any later label-based edit (.at/.loc by label) hits several rows at once
scriptsnjfull = pd.concat([s,s2], ignore_index=True)

In [47]:
#First five rows of the combined data
scriptsnjfull.head(n=5)

Unnamed: 0,amount_brand,brand_drug?,city,drug_company,drug_name,fn,generic_name,ln,npi,recieved_payments,specialty_description,state,total_claim_count,total_day_supply,total_drug_cost,year
0,48,True,MAYWOOD,PFIZER,VIAGRA,VINCENT,SILDENAFIL,LANTERI,1528036670,True,Urology,NJ,48,1481,8957.33,2013
1,0,False,MAYWOOD,GENERIC,CLOTRIMAZOLE BETAMETHASONE,VINCENT,CLOTRIMAZOLE BETAMETHASONE,LANTERI,1528036670,True,Urology,NJ,16,367,806.59,2013
2,0,False,MAYWOOD,GENERIC,IMIPRAMINE,VINCENT,IMIPRAMINE,LANTERI,1528036670,True,Urology,NJ,27,1380,519.91,2013
3,0,False,MAYWOOD,GENERIC,NITROFURANTOIN,VINCENT,NITROFURANTOIN,LANTERI,1528036670,True,Urology,NJ,34,1480,1502.16,2013
4,29,True,MAYWOOD,ABBVIE,ANDROGEL,VINCENT,TESTOSTERONE,LANTERI,1528036670,True,Urology,NJ,29,1236,17234.63,2013


In [48]:
#Overwriting the all-years file with the combined data (without the index column)
scriptsnjfull.to_csv(path_or_buf='/Volumes/Seagate/Galvanize/nj_scripts_all_years.csv', index=False)

In [52]:
#Drug reference table used to look up which company makes each drug
medsgrp = pd.read_csv(filepath_or_buffer='/Volumes/Seagate/Galvanize/medicare_drugs.csv')

In [53]:
#Only the two name columns are needed to list the brand drugs
name_columns = ['drug_name','generic_name']
scripts = scriptsnjfull[name_columns]

In [54]:
#Brand rows only: the drug name differs from the generic name
#(the variable name 'cripts' is kept because the next cell reads it)
cripts = scripts.loc[scripts['drug_name'].ne(scripts['generic_name'])]

In [55]:
#One row per brand drug name, with the number of distinct generic names it was listed under
generics_per_drug = cripts.groupby('drug_name')['generic_name'].nunique()
scriptsgrp = generics_per_drug.reset_index()

In [56]:
#Saving the list of brand drug names (without the index column)
scriptsgrp.to_csv(path_or_buf='/Volumes/Seagate/Galvanize/brand_drugs_scripts.csv', index=False)

In [57]:
d = {}
def company_drug(x):
    """
    Record in the dict d the labeler listed most often for one drug.

    Input:
        x: row with a 'drug_name' entry
    Output:
        None; on a match d[drug name] is set to the 'Labeler Name' with the highest NDC count among the
        medsgrp rows whose 'FDA Product Name' contains the drug name
    Notes:
        - Names already in d are skipped.
        - A name with no matching product is left out of d on purpose: a later pass over another
          reference table can still fill it in, and mapping a column through d gives NaN for it anyway.
          (The old except branch wrote NaN into the local frame, which never reached d.)
        - The empty result is tested directly instead of catching IndexError, because newer pandas
          versions raise KeyError for [0] on an empty result.
    """
    name = x['drug_name']
    if name in d:
        return
    #Literal substring match; products without a name simply don't match
    matches = medsgrp['FDA Product Name'].str.contains(name, regex=False, na=False)
    per_labeler = medsgrp[matches].groupby('Labeler Name')['NDC'].count()
    if per_labeler.empty:
        return
    ranked = per_labeler.to_frame().reset_index().sort_values('NDC',ascending=False).reset_index().drop('index',axis=1)
    d[name] = ranked['Labeler Name'].iloc[0]
scriptsgrp.apply(company_drug,axis=1)

0       None
1       None
2       None
3       None
4       None
5       None
6       None
7       None
8       None
9       None
10      None
11      None
12      None
13      None
14      None
15      None
16      None
17      None
18      None
19      None
20      None
21      None
22      None
23      None
24      None
25      None
26      None
27      None
28      None
29      None
30      None
31      None
        ... 
1539    None
1540    None
1541    None
1542    None
1543    None
1544    None
1545    None
1546    None
1547    None
1548    None
1549    None
1550    None
1551    None
1552    None
1553    None
1554    None
1555    None
1556    None
1557    None
1558    None
1559    None
1560    None
1561    None
1562    None
1563    None
1564    None
1565    None
1566    None
1567    None
1568    None
1569    None
1570    None
Length: 1571, dtype: object

In [59]:
#Reading three columns (positions 1, 3 and 12) of the NDC product file
ndc_product_columns = [1,3,12]
comps = pd.read_csv('/Volumes/Seagate/Galvanize/ndcxls/product.csv', usecols=ndc_product_columns)

In [61]:
#Upper-casing the product and labeler names, removing every symbol (no space is put in its place here)
#and trimming the surrounding whitespace
for text_col in ['PROPRIETARYNAME', 'LABELERNAME']:
    comps[text_col] = [re.sub(r'[^\w\s]','',str(value).upper()).strip() for value in comps[text_col]]
#Grouping: number of distinct product NDCs per (product name, labeler) pair
ndc_per_pair = comps.groupby(['PROPRIETARYNAME','LABELERNAME'])['PRODUCTNDC'].nunique()
medsgrp = ndc_per_pair.reset_index()

In [71]:
def company_drug(x):
    """
    Record in the dict d the labeler listed most often for one drug (NDC product table version).

    Input:
        x: row with a 'drug_name' entry
    Output:
        None; on a match d[drug name] is set to the 'LABELERNAME' with the highest row count among the
        medsgrp rows whose 'PROPRIETARYNAME' contains the drug name
    Notes:
        - Names already in d are skipped, so entries found earlier are kept.
        - A name with no matching product is left out of d on purpose; mapping a column through d
          gives NaN for it anyway. (The old except branch wrote NaN into the local frame, which
          never reached d.)
        - The empty result is tested directly instead of catching IndexError, because newer pandas
          versions raise KeyError for [0] on an empty result.
    """
    name = x['drug_name']
    if name in d:
        return
    #Literal substring match; products without a name simply don't match
    matches = medsgrp['PROPRIETARYNAME'].str.contains(name, regex=False, na=False)
    per_labeler = medsgrp[matches].groupby('LABELERNAME')['PRODUCTNDC'].count()
    if per_labeler.empty:
        return
    ranked = per_labeler.to_frame().reset_index().sort_values('PRODUCTNDC',ascending=False).reset_index().drop('index',axis=1)
    d[name] = ranked['LABELERNAME'].iloc[0]
scriptsgrp.apply(company_drug,axis=1)

0       None
1       None
2       None
3       None
4       None
5       None
6       None
7       None
8       None
9       None
10      None
11      None
12      None
13      None
14      None
15      None
16      None
17      None
18      None
19      None
20      None
21      None
22      None
23      None
24      None
25      None
26      None
27      None
28      None
29      None
30      None
31      None
        ... 
1539    None
1540    None
1541    None
1542    None
1543    None
1544    None
1545    None
1546    None
1547    None
1548    None
1549    None
1550    None
1551    None
1552    None
1553    None
1554    None
1555    None
1556    None
1557    None
1558    None
1559    None
1560    None
1561    None
1562    None
1563    None
1564    None
1565    None
1566    None
1567    None
1568    None
1569    None
1570    None
Length: 1571, dtype: object

In [67]:
#First five rows of the grouped product table
medsgrp.head(n=5)

Unnamed: 0,PROPRIETARYNAME,LABELERNAME,PRODUCTNDC
0,01 MOISTURE FOUNDATION SPF 15,THE BODY SHOP WAKE FOREST,1
1,02 MOISTURE FOUNDATION SPF 15,THE BODY SHOP WAKE FOREST,1
2,03 MOISTURE FOUNDATION SPF 15,THE BODY SHOP WAKE FOREST,1
3,04 MOISTURE FOUNDATION SPF 15,THE BODY SHOP WAKE FOREST,1
4,05 MOISTURE FOUNDATION SPF 15,THE BODY SHOP WAKE FOREST,1


In [72]:
#First five rows of the combined prescriptions
scriptsnjfull.head(n=5)

Unnamed: 0,amount_brand,brand_drug?,city,drug_company,drug_name,fn,generic_name,ln,npi,recieved_payments,specialty_description,state,total_claim_count,total_day_supply,total_drug_cost,year
0,48,True,MAYWOOD,PFIZER,VIAGRA,VINCENT,SILDENAFIL,LANTERI,1528036670,True,Urology,NJ,48,1481,8957.33,2013
1,0,False,MAYWOOD,GENERIC,CLOTRIMAZOLE BETAMETHASONE,VINCENT,CLOTRIMAZOLE BETAMETHASONE,LANTERI,1528036670,True,Urology,NJ,16,367,806.59,2013
2,0,False,MAYWOOD,GENERIC,IMIPRAMINE,VINCENT,IMIPRAMINE,LANTERI,1528036670,True,Urology,NJ,27,1380,519.91,2013
3,0,False,MAYWOOD,GENERIC,NITROFURANTOIN,VINCENT,NITROFURANTOIN,LANTERI,1528036670,True,Urology,NJ,34,1480,1502.16,2013
4,29,True,MAYWOOD,ABBVIE,ANDROGEL,VINCENT,TESTOSTERONE,LANTERI,1528036670,True,Urology,NJ,29,1236,17234.63,2013


In [78]:
#Working on an independent copy so scriptsnjfull itself stays untouched
scripts = scriptsnjfull.copy(deep=True)

In [6]:
#Looking up each drug's company in the dict d; drug names without an entry come out as NaN
company_per_row = scripts["drug_name"].map(d)
scripts["drug_company"] = company_per_row

In [7]:
def clean_company_name(df,wrong_name, right_name):
    """
    Replace every exact occurrence of a company name in the 'drug_company' column, in place.

    Input:
        df: df, dataframe to be used (must have the column 'drug_company')
        wrong_name: Str, wrong name that is listed in df
        right_name: Str, right name to be changed to
    Output:
        None (df is modified in place)
    """
    #Plain boolean array -> rows are selected by position, so only the rows that really hold
    #wrong_name are touched, even when the index contains repeated labels
    matches = (df['drug_company'] == wrong_name).values
    df.loc[matches, 'drug_company'] = right_name

In [8]:
#Both Actavis spellings become plain ACTAVIS
for actavis_variant in ['ACTAVIS KADIAN', 'ACTAVIS PHARMA']:
    clean_company_name(scripts,actavis_variant,'ACTAVIS')

In [9]:
#(wrong name, right name) pairs, applied in order
for wrong, right in [('ASTELLAS PHARMA US','ASTELLAS'),
                     ('BRISTOLMYERS SQUIBB AND GILEAD SCIENCE','GILEAD SCIENCES'),
                     ('BRISTOLMYERS SQUIBB COMPANY','BRISTOLMYERS SQUIBB')]:
    clean_company_name(scripts,wrong,right)

In [10]:
#Shortening the Arbor name to ARBOR PHARMACEUTICALS
clean_company_name(scripts,
                   wrong_name='ARBOR PHARMACEUTICALS IRELAND LIMITED',
                   right_name='ARBOR PHARMACEUTICALS')

In [11]:
#Two more Bristol-Myers Squibb spellings, both mapped to BRISTOLMYERS SQUIBB
for bms_variant in ['BRISTOLMYERS SQUIBBSANOFI PARTNERSHIP', 'BRISTOLMYERS SQUIBB PHARMA CO']:
    clean_company_name(scripts,bms_variant,'BRISTOLMYERS SQUIBB')

In [13]:
#Removing the legal-form words from the company names and collapsing the spaces left behind.
#The words are removed in this order, as plain substrings.
#NOTE(review): because these are substring removals, the letters are also cut out of the middle of longer
#words, and str() turns a missing company into the text 'nan'. Hard-coded company names elsewhere in the
#notebook (e.g. 'MALLKRODT') look like they rely on this - confirm before changing.
legal_words = ['INCORPORATED','CORPORATION','INC','CORP','LLC','LP']

def _drop_legal_words(company):
    """Return the company name as text with the legal-form words removed and spaces collapsed."""
    text = str(company)
    for word in legal_words:
        text = text.replace(word,'')
    return ' '.join(text.split())

scripts['drug_company'] = scripts['drug_company'].map(_drop_legal_words)

In [14]:
#Saving to the same file the notebook reads back later. The path now uses '/Volumes' like every other
#path in the notebook; the lower-case '/volumes' only resolves on a case-insensitive filesystem
scripts.to_csv('/Volumes/Seagate/Galvanize/nj_scripts_all_years.csv',index=False)

In [2]:
#Re-loading the all-years prescriptions from disk
scripts = pd.read_csv(filepath_or_buffer='/Volumes/Seagate/Galvanize/nj_scripts_all_years.csv')

In [3]:
#Loading the saved drug -> company dict; the with-block closes the file handle after reading.
#NOTE: pickle.load runs whatever the file contains - only load a pickle this project wrote itself
with open('script_company_dict.pkl', 'rb') as pkl_file:
    d = pickle.load(pkl_file)

In [12]:
#Bringing the different spellings of the same company together.
#Each pair is (wrong name, right name); applied top to bottom, in the original order
company_fixes = [
    ('COVIS PHARMA SARL','COVIS PHARMACEUTICALS'),
    ('CSL BEHRING GMBH','CSL BEHRING'),
    ('GLAXOSMITHKLINE BIOLOGICALS SA','GLAXOSMITHKLINE'),
    ('GSK CONSUMER HEALTHCARE','GSK CONSUMER HEALTH'),
    ('IMPAX SPECIALTY PHARMA','IMPAX LABORATORIES'),
    ('JANSSEN BIOTECH','JANSSEN PHARMACEUTICALS'),
    ('JANSSEN PRODUCTS','JANSSEN PHARMACEUTICALS'),
    ('JAZZ PHARMACEUTICALS COMMERCIAL','JAZZ PHARMACEUTICALS'),
    ('KREMERS URBAN','KREMERS URBAN PHARMACEUTICALS'),
    ('MALLKRODT BRAND PHARMACEUTICALS','MALLKRODT PHARMACEUTICALS'),
    ('MALLKRODT','MALLKRODT PHARMACEUTICALS'),
    ('MERCK SHARP DOHME','MERCK'),
    ('MERCKSCHERINGPLOUGH JV','MERCK'),
    ('MYLAN INSTITUTIONAL','MYLAN PHARMACEUTICALS'),
    ('MYLAN SPECIALTY','MYLAN PHARMACEUTICALS'),
    ('PAR PHARMACEUTICAL','PAR PHARMACEUTICALS'),
    ('SCHERING HEALTHCARE PRODUCTS','SCHERING'),
    ('TEVA GLOBAL RESPIRATORY RESEARCH','TEVA PHARMACEUTICALS'),
    ('TEVA NEUROSCIENCE','TEVA PHARMACEUTICALS'),
    ('TEVA PARENTERAL MEDICINES','TEVA PHARMACEUTICALS'),
    ('TEVA PHARMACEUTICALS USA','TEVA PHARMACEUTICALS'),
    ('TEVA RESPIRATORY','TEVA PHARMACEUTICALS'),
    ('TEVA WOMENS HEALTH','TEVA PHARMACEUTICALS'),
    ('UCB','UCB PHARMA'),
    ('UCB MANUFACTURING','UCB PHARMA'),
    ('WYETH LABORATORIES','PFIZER'),
    ('WYETH PHARMACEUTICALS A SUBSIDIARY OF PFIZER','PFIZER'),
]
for wrong, right in company_fixes:
    clean_company_name(scripts,wrong,right)