# Python Code to Process Vaccine Allocation-Plan Data from CDC

Sources of the weekly updated raw data:

https://data.cdc.gov/Vaccinations/COVID-19-Vaccine-Distribution-Allocations-by-Juris/saz5-9hgg

https://data.cdc.gov/Vaccinations/COVID-19-Vaccine-Distribution-Allocations-by-Juris/b7pe-5nws

In [None]:
import pandas as pd
import re

In [None]:
# load the Pfizer and Moderna allocation tables for one snapshot date
# (file_dir is a path prefix: the brand, date and extension are appended to it)
file_dir = '/Users/xywu/Documents/HPC_datahub/vaccine/COVID-19_Vaccine_Distribution_Allocations_by_Jurisdiction_-_'
temp_date = '20210209'
pfizer, moderna = (
    pd.read_csv(''.join([file_dir, brand_prefix, temp_date, '.csv']))
    for brand_prefix in ('Pfizer_', 'Moderna_')
)


In [None]:
# check whether each second-dose column is identical to the first-dose column
# that precedes it (the first two columns are skipped; pairs start at column 2)
#   Pfizer
pfizer_pairs = pfizer.shape[1] // 2
pfizer_dup = [
    pfizer.iloc[:, first].equals(pfizer.iloc[:, first + 1])
    for first in range(2, 2 * pfizer_pairs, 2)
]
pfizer_dup  # all True => every second-dose column duplicates its first-dose column

In [None]:
#   Moderna: same pairwise comparison of adjacent columns, starting at column 2
moderna_pairs = moderna.shape[1] // 2
moderna_dup = [
    moderna.iloc[:, first].equals(moderna.iloc[:, first + 1])
    for first in range(2, 2 * moderna_pairs, 2)
]
moderna_dup  # all True => every second-dose column duplicates its first-dose column

In [None]:
# define an auxiliary function to extract the date from a column name
def edit_colname(colname, brand):
    '''
    Build a standardized column name from a raw column header.

    Parameters
        colname: a string of column name
        brand: a string naming the vaccine brand; it is embedded in the
            new name, and 'pfizer' selects a 21-day lag label while any
            other value selects a 28-day lag label

    Returns:
        rv: colname unchanged if it contains no two-digit/two-digit date
            (e.g. a jurisdiction or region code column); otherwise a
            string '<year><MMDD>_allocation_<brand>_<dose1|dose2>',
            with '_<lag>DaysLater' appended when the header mentions
            'later' / 'Later'
    '''
    date_match = re.search(r'[0-9]{2}/[0-9]{2}', colname)
    if date_match is None:
        # not a dated data column: leave the name as it is
        return colname

    temp_date = date_match.group(0).replace('/', '')

    # the header carries no year: month 12 is mapped to 2020 and every
    # other month to 2021
    temp_year = '2020' if temp_date[:2] == '12' else '2021'

    # first dose or second dose
    if re.search(r'[Ss]econd', colname) is None:
        temp_dose = 'dose1'
    else:
        temp_dose = 'dose2'

    # whether plan or actual allocation
    if re.search(r'[Ll]ater', colname) is None:
        temp_sufix = ''
    else:
        temp_lag = '21' if brand == 'pfizer' else '28'
        temp_sufix = '_' + temp_lag + 'DaysLater'

    return temp_year + temp_date + '_allocation_' + brand + '_' + \
        temp_dose + temp_sufix

In [None]:
# standardize the column headers of both tables: dated columns are renamed by
# edit_colname, then the two total-shipment columns are renamed explicitly
pfizer_totals = {
    'Total Pfizer Allocation "First Dose" Shipments': 'total_pfizer_dose1_shipments',
    'Total Allocation Pfizer "Second Dose" Shipments': 'total_pfizer_dose2_shipments',
}
moderna_totals = {
    'Total Moderna Allocation "First Dose" Shipments': 'total_moderna_dose1_shipments',
    'Total Allocation Moderna"Second Dose" Shipments': 'total_moderna_dose2_shipments',
}

pfizer.columns = [edit_colname(header, 'pfizer') for header in pfizer.columns]
pfizer = pfizer.rename(columns=pfizer_totals)

moderna.columns = [edit_colname(header, 'moderna') for header in moderna.columns]
moderna = moderna.rename(columns=moderna_totals)


In [None]:
# join the Pfizer and Moderna tables on the jurisdiction / region key columns
join_keys = ['Jurisdiction', 'HHS Region']
df = pd.merge(pfizer, moderna, left_on=join_keys, right_on=join_keys)


In [None]:
## remove the US territory and federal entities rows
to_rm = ['Puerto Rico', 'U.S. Virgin Islands', 'San Antonio ~', 'Houston ~',\
         'American Samoa**', 'Guam**', 'Marshall Islands*', 'Micronesia*', \
         'Mariana Islands**', 'Palau*', 'Federal Entities ****', 'Total']

# Fail with a readable message if the raw labels changed (e.g. a different
# number of asterisks), instead of an opaque IndexError.
present = set(df['Jurisdiction'])
missing = [name for name in to_rm if name not in present]
if missing:
    raise KeyError('jurisdictions to remove not found in merged data: '
                   + ', '.join(missing))

# Look the rows up in the merged frame itself: a mask built from the Pfizer
# table only lines up with df when both have the same rows in the same order.
to_rm_id = df.index[df['Jurisdiction'].isin(to_rm)].tolist()
df.loc[to_rm_id, 'Jurisdiction']  # display the rows about to be dropped
df = df.drop(to_rm_id, axis=0)


In [None]:
## remove the star marks from the jurisdiction names
# The pattern drops every character that is neither a word character nor
# whitespace, so any other punctuation in a name is removed as well.
# regex=True must be explicit: since pandas 2.0 str.replace treats the
# pattern as a literal string by default, which would leave the names as-is.
df['Jurisdiction'] = df['Jurisdiction'].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
## write the cleaned, merged table to a csv file (row index omitted)
output_path = '/Users/xywu/Documents/HPC_Datahub/vaccine/vaccine_allocation.csv'
df.to_csv(output_path, index=False)
