# Drug Utilization EDA
Explore the drug utilization CSV found at https://data.medicaid.gov/api/1/datastore/query/daba7980-e219-5996-9bec-90358fd156f1/0/download

In [1]:
import pandas as pd
import os

The preferred route to get all the raw data is via the Makefile.\
The following cell is just a catch-all to ensure the data is available for EDA.

In [2]:
# Safety net for fresh checkouts: when the 2019 utilization CSV is not on disk,
# ask the get_data module to download it to `path`.
path = 'raw_data/util/drug_utilization_2019.csv'
csv_is_missing = not os.path.exists(path)
if csv_is_missing:
    import get_data  # imported lazily: only needed when the file is absent
    get_data.download_drug_utiliztion(path)

In [3]:
# Load the 2019 utilization CSV; no dtype mapping is given, so pandas infers the column types.
df = pd.read_csv(path)

### What do we notice about the first 3 rows?

In [4]:
# Display the first three rows of the 2019 frame.
df.head(3)

Unnamed: 0,utilization_type,state,ndc,labeler_code,product_code,package_size,year,quarter,suppression_used,product_name,units_reimbursed,number_of_prescriptions,total_amount_reimbursed,medicaid_amount_reimbursed,non_medicaid_amount_reimbursed
0,FFSU,AK,2143380,2,1433,80.0,2019,4,False,TRULICITY,250.0,125.0,90617.88,87561.3,3056.58
1,FFSU,AK,2143480,2,1434,80.0,2019,4,False,TRULICITY,262.0,131.0,95679.25,92129.08,3550.17
2,FFSU,AK,2143611,2,1436,11.0,2019,4,True,EMGALITY P,,,,,


In [5]:
# Show the dtype pandas inferred for each column of the 2019 frame.
df.dtypes

utilization_type                   object
state                              object
ndc                                 int64
labeler_code                        int64
product_code                        int64
package_size                      float64
year                                int64
quarter                             int64
suppression_used                     bool
product_name                       object
units_reimbursed                  float64
number_of_prescriptions           float64
total_amount_reimbursed           float64
medicaid_amount_reimbursed        float64
non_medicaid_amount_reimbursed    float64
dtype: object

In [6]:
 # Candidate pandas dtype for each column of the utilization CSV, keyed by
 # column name.  In this first attempt only `ndc` is kept as text ('object');
 # `package_size` is still declared as a float.
 dtypes = dict(
     utilization_type='object',
     state='object',
     ndc='object',
     labeler_code='int64',
     product_code='int64',
     package_size='float64',
     year='int64',
     quarter='int64',
     suppression_used='bool',
     product_name='object',
     units_reimbursed='float64',
     number_of_prescriptions='float64',
     total_amount_reimbursed='float64',
     medicaid_amount_reimbursed='float64',
     non_medicaid_amount_reimbursed='float64',
 )

In [7]:
# Read the 2014 file and let pandas infer the column types, then list the
# inferred dtype of every column.
dfdt = pd.read_csv('raw_data/util/drug_utilization_2014.csv') # dtype=dtypes is not passed here
dfdt.dtypes

  dfdt = pd.read_csv('raw_data/util/drug_utilization_2014.csv') #, dtype=dtypes)


utilization_type                   object
state                              object
ndc                                object
labeler_code                        int64
product_code                        int64
package_size                       object
year                                int64
quarter                             int64
suppression_used                     bool
product_name                       object
units_reimbursed                  float64
number_of_prescriptions           float64
total_amount_reimbursed           float64
medicaid_amount_reimbursed        float64
non_medicaid_amount_reimbursed    float64
dtype: object

In [8]:
# List the distinct raw package_size values to see why the column was read as 'object'.
dfdt['package_size'].unique()

array(['90', '30', '60', '33', '02', '01', '59', '17', '05', '20', '28',
       '21', '11', '12', '10', '22', '16', '09', '29', '85', '50', '82',
       '31', '54', '61', '44', '62', '36', '18', '71', '03', '32', '13',
       '25', '04', '51', '55', '06', '19', '08', '35', '23', '81', '70',
       '07', '91', '83', '45', '00', '41', '46', '58', '26', '27', '63',
       '24', '99', '49', '47', '75', '92', '15', '14', '56', '38', '86',
       '40', '68', '66', '80', '42', '52', '34', '87', '98', '43', '53',
       '94', '73', '74', '64', '79', '76', '78', '65', '89', '88', '95',
       '93', '37', '97', '96', '39', '48', '77', '69', '67', '72', '57',
       '84', '5-', '7-', 10, 13, 21, 81, 91, 11, 12, 83, 1, 22, 28, 73,
       45, 5, 30, 44, 79, 60, 31, 33, 88, 6, 3, 2, 4, 0, 32, 85, 25, 20,
       29, 41, 46, 58, 47, 26, 27, 63, 49, 24, 55, 19, 16, 7, 68, 50, 18,
       36, 57, 99, 23, 15, 51, 70, 75, 90, 92, 42, 35, 8, 48, 14, 96, 62,
       71, 56, 38, 93, 66, 34, 40, 53, 80, 54, 67,

In [14]:
# NDC values of the rows whose package_size is the malformed string '7-'.
dfdt.loc[dfdt['package_size'] == '7-', 'ndc']

31185      59762-4537-
1847206    59762-4537-
Name: ndc, dtype: object

In [15]:
# Column -> dtype mapping handed to pd.read_csv for the 2014 file.
# `ndc` and `package_size` are declared as text ('object') rather than
# 'int64' / 'float64' because their raw values can carry stray '-' characters
# (e.g. '59762-4537-' and '7-'); they are converted to integers after cleaning.
dtypes = dict(
    utilization_type='object',
    state='object',
    ndc='object',
    labeler_code='int64',
    product_code='int64',
    package_size='object',
    year='int64',
    quarter='int64',
    suppression_used='bool',
    product_name='object',
    units_reimbursed='float64',
    number_of_prescriptions='float64',
    total_amount_reimbursed='float64',
    medicaid_amount_reimbursed='float64',
    non_medicaid_amount_reimbursed='float64',
)

In [16]:
# Re-read the 2014 file, this time passing the `dtypes` mapping so pandas uses
# the declared column types.
dfdt = pd.read_csv('raw_data/util/drug_utilization_2014.csv', dtype=dtypes)


In [32]:
%time
dfdt.ndc = dfdt.ndc.str.replace('-','').str.strip()
dfdt.package_size = dfdt.package_size.str.replace('-','').str.strip()
dfdt = dfdt.astype({'ndc':'int64', 'package_size':'int64'})

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs


In [33]:
# List the column dtypes of the 2014 frame after the cast to int64.
dfdt.dtypes

utilization_type                   object
state                              object
ndc                                 int64
labeler_code                        int64
product_code                        int64
package_size                        int64
year                                int64
quarter                             int64
suppression_used                     bool
product_name                       object
units_reimbursed                  float64
number_of_prescriptions           float64
total_amount_reimbursed           float64
medicaid_amount_reimbursed        float64
non_medicaid_amount_reimbursed    float64
dtype: object

In [28]:
# Distinct package_size values of the 2014 frame.
dfdt['package_size'].unique()

array(['90', '30', '60', '33', '02', '01', '59', '17', '05', '20', '28',
       '21', '11', '12', '10', '22', '16', '09', '29', '85', '50', '82',
       '31', '54', '61', '44', '62', '36', '18', '71', '03', '32', '13',
       '25', '04', '51', '55', '06', '19', '08', '35', '23', '81', '70',
       '07', '91', '83', '45', '00', '41', '46', '58', '26', '27', '63',
       '24', '99', '49', '47', '75', '92', '15', '14', '56', '38', '86',
       '40', '68', '66', '80', '42', '52', '34', '87', '98', '43', '53',
       '94', '73', '74', '64', '79', '76', '78', '65', '89', '88', '95',
       '93', '37', '97', '96', '39', '48', '77', '69', '67', '72', '57',
       '84', '5-', '7-', '0', '2', '1', '6', '3', '5'], dtype=object)

In [10]:
# All 2019 rows for NDC 2143480 reported by the state 'AK'.
df.loc[(df['ndc'] == 2143480) & (df['state'] == 'AK')]

Unnamed: 0,utilization_type,state,ndc,labeler_code,product_code,package_size,year,quarter,suppression_used,product_name,units_reimbursed,number_of_prescriptions,total_amount_reimbursed,medicaid_amount_reimbursed,non_medicaid_amount_reimbursed
1,FFSU,AK,2143480,2,1434,80.0,2019,4,False,TRULICITY,262.0,131.0,95679.25,92129.08,3550.17
10638,FFSU,AK,2143480,2,1434,80.0,2019,3,False,TRULICITY,220.0,110.0,80252.04,77907.68,2344.36
21363,FFSU,AK,2143480,2,1434,80.0,2019,2,False,TRULICITY,162.0,81.0,59392.87,57762.24,1630.63
32020,FFSU,AK,2143480,2,1434,80.0,2019,1,False,TRULICITY,98.0,49.0,35072.64,34000.13,1072.51


Product names are more human readable than ndc codes so let's see how many we're dealing with.

In [27]:
# Inspect the distinct product name(s) recorded for one specific NDC.
df.loc[df['ndc'] == 2143380, 'product_name'].unique()

array(['TRULICITY '], dtype=object)

Padding can be kind of a pain.  Let's strip it out and see whether that changes our unique count.

In [28]:
# Number of distinct product_name values while the padding is still present.
len(df.loc[:, 'product_name'].unique())

15248

In [29]:
# Strip leading/trailing whitespace from every product name (overwrites the column).
df['product_name'] = df['product_name'].str.strip()

# count of unique product_name after blanks stripped out - should be the same as above
len(df['product_name'].unique())

15248

Since there are no ill effects (the before/after counts are the same), we'll include stripping the padding in our cleaning step (clean.py), as it will make dealing with product name queries a little easier.

### General data set stats

In [30]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,ndc,labeler_code,product_code,package_size,year,quarter,units_reimbursed,number_of_prescriptions,total_amount_reimbursed,medicaid_amount_reimbursed,non_medicaid_amount_reimbursed
count,4948336.0,4948336.0,4948336.0,4948334.0,4948336.0,4948336.0,2518495.0,2518495.0,2518495.0,2518495.0,2518495.0
mean,34696530000.0,34696.65,1493.848,22.4945,2019.0,2.499867,35340.84,582.1268,56362.09,53685.69,2676.403
std,28460410000.0,28460.52,2237.929,27.47396,0.0,1.120011,551355.0,5463.931,949193.1,930119.4,86841.07
min,275.0,2.0,0.0,0.0,2019.0,1.0,0.001,11.0,0.0,0.0,0.0
25%,591086000.0,591.0,192.0,1.0,2019.0,1.0,750.0,22.0,401.58,381.71,0.0
50%,43598020000.0,43598.0,519.0,10.0,2019.0,3.0,2260.0,56.0,1487.23,1409.31,0.0
75%,62756000000.0,62756.0,1346.0,31.0,2019.0,4.0,9019.75,197.0,7240.89,6819.01,58.5
max,100000000000.0,99999.0,9999.0,99.0,2019.0,4.0,210429800.0,1723822.0,230875100.0,229729400.0,48962100.0


The first thing I notice is while there are 5M rows, only about 1/2 of them have entries for: 
* units_reimbursed
* number_of_prescriptions
* total_amount_reimbursed
* medicaid_amount_reimbursed
* non_medicaid_amount_reimbursed

These seem to correlate with suppression_used being false.  To confirm, we'll perform the following two tests:
1. Whenever suppression_used is true, all the above fields are na (no non na fields)
2. Whenever suppression_used is false, none of the above fields are na (no na fields)

##### 1. Whenever suppression_used is true, all the above fields are na (no non na fields)

In [31]:
# Sanity check: the suppressed subset (suppression_used == True) must be non-empty.
len(df.loc[df['suppression_used']])

2429841

In [32]:
# Count the rows that are flagged as suppressed yet carry a value in at least
# one of the five reported fields.  "At least one field is non-NA" is written
# as the negation of "every field is NA" (De Morgan).
# A count of 0 means suppression_used == True never overlaps with a non-NA value.

len(df[
    df['suppression_used']
    & ~(
        df['number_of_prescriptions'].isna()
        & df['units_reimbursed'].isna()
        & df['total_amount_reimbursed'].isna()
        & df['medicaid_amount_reimbursed'].isna()
        & df['non_medicaid_amount_reimbursed'].isna()
    )
])

0

##### 2. Whenever suppression_used is false, none of the above fields are na (no na fields)

In [33]:
# Sanity check: the unsuppressed subset (suppression_used == False) must be non-empty.
len(df.loc[~df['suppression_used']])

2518495

In [34]:
# Count the rows that are NOT flagged as suppressed yet are missing a value in
# at least one of the five reported fields.
# A count of 0 means suppression_used == False never overlaps with an NA value.

len(df[
    ~df['suppression_used']
    & (
        df['number_of_prescriptions'].isna()
        | df['units_reimbursed'].isna()
        | df['total_amount_reimbursed'].isna()
        | df['medicaid_amount_reimbursed'].isna()
        | df['non_medicaid_amount_reimbursed'].isna()
    )
])

0

#### Conclusion
suppression_used can be used as a filter: in the 2019 data, the fields listed above are non-NA exactly when suppression_used is False.