In [1]:
from datetime import datetime
import pandas as pd

In [2]:
# State lookup table: its `id` column is the foreign key looked up by state name
# further down.
df_ref = pd.read_csv(filepath_or_buffer="./cleaned/State.csv")
df_ref.head(n=5)

Unnamed: 0,id,name,country,country_id
0,1,Alabama,United States,224
1,2,Alaska,United States,224
2,3,Arizona,United States,224
3,4,Arkansas,United States,224
4,5,California,United States,224


In [3]:
# Load the raw jurisdiction-level vaccination CSV.
# `Date` holds MM/DD/YYYY text. It is converted with a single vectorized
# `pd.to_datetime` call and an explicit format instead of
# `read_csv(parse_dates=..., date_parser=...)`: the `date_parser` argument is
# deprecated since pandas 2.0, while this form works on old and new versions
# alike and yields the same datetime64 column.
df_in = pd.read_csv("./raw/COVID-19_Vaccinations_in_the_United_States_Jurisdiction.csv")
df_in["Date"] = pd.to_datetime(df_in["Date"], format="%m/%d/%Y")
df_in.head()

Unnamed: 0,Date,MMWR_week,Location,Distributed,Distributed_Janssen,Distributed_Moderna,Distributed_Pfizer,Distributed_Unk_Manuf,Dist_Per_100K,Distributed_Per_100k_12Plus,...,Additional_Doses_18Plus,Additional_Doses_18Plus_Vax_Pct,Additional_Doses_50Plus,Additional_Doses_50Plus_Vax_Pct,Additional_Doses_65Plus,Additional_Doses_65Plus_Vax_Pct,Additional_Doses_Moderna,Additional_Doses_Pfizer,Additional_Doses_Janssen,Additional_Doses_Unk_Manuf
0,2021-11-09,45,AS,75120,600,20700,53820,0,158508,203483,...,499.0,1.9,285.0,3.0,120.0,4.5,126.0,377.0,0.0,0.0
1,2021-11-09,45,DD2,5668520,204300,1950440,3513780,0,0,0,...,686.0,0.0,29.0,0.0,12.0,0.0,162.0,516.0,10.0,0.0
2,2021-11-09,45,DE,1768425,79500,702720,986205,0,181607,210575,...,87624.0,15.8,76940.0,23.6,60085.0,35.2,31188.0,55879.0,625.0,4.0
3,2021-11-09,45,MD,11320030,520900,4191520,6607610,0,187242,219258,...,600955.0,16.2,477187.0,25.3,334355.0,38.3,229045.0,363763.0,8775.0,107.0
4,2021-11-09,45,ME,2468160,136200,1014660,1317300,0,183614,208295,...,160194.0,17.8,138334.0,26.1,105486.0,38.7,58461.0,99967.0,1718.0,139.0


In [4]:
# Distinct values of the `Location` column, sorted so they are easy to scan.
country_codes = df_in.loc[:, "Location"].unique()
country_codes.sort()  # ndarray.sort() sorts in place and returns None
country_codes

array(['AK', 'AL', 'AR', 'AS', 'AZ', 'BP2', 'CA', 'CO', 'CT', 'DC', 'DD2',
       'DE', 'FL', 'FM', 'GA', 'GU', 'HI', 'IA', 'ID', 'IH2', 'IL', 'IN',
       'KS', 'KY', 'LA', 'LTC', 'MA', 'MD', 'ME', 'MH', 'MI', 'MN', 'MO',
       'MP', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY',
       'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'RP', 'SC', 'SD', 'TN', 'TX',
       'US', 'UT', 'VA', 'VA2', 'VI', 'VT', 'WA', 'WI', 'WV', 'WY'],
      dtype=object)

In [5]:
# Location code -> full state name (51 entries: the 50 states plus DC).
# The names are matched exactly against the `name` column of the State table
# when resolving `state_id`, so a misspelling here silently drops that state
# from the output. Location codes that have no entry here are skipped.
state_abbreviation_mapping = {
    "AK": "Alaska",
    "AL": "Alabama",
    "AR": "Arkansas",
    "AZ": "Arizona",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DE": "Delaware",
    "DC": "District of Columbia",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "IA": "Iowa",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "ME": "Maine",
    "MD": "Maryland",
    "MA": "Massachusetts",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MS": "Mississippi",
    "MO": "Missouri",
    "MT": "Montana",
    "NE": "Nebraska",
    "NV": "Nevada",
    "NH": "New Hampshire",  # was misspelled "New Hamsphire"
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NY": "New York",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VT": "Vermont",
    "VA": "Virginia",
    "WA": "Washington",
    "WV": "West Virginia",
    "WI": "Wisconsin",
    "WY": "Wyoming"
}

In [6]:
# Output schema, assembled from its logical groups: the record date, the daily
# first-dose counts, the daily fully-vaccinated counts, then the state name
# and its id.
first_dose_columns = [
    "daily_first_doses",
    "daily_first_doses_12plus",
    "daily_first_doses_18plus",
    "daily_first_doses_65plus",
]
fully_vaccinated_columns = [
    "daily_fully_vaccinated",
    "daily_fully_vaccinated_12plus",
    "daily_fully_vaccinated_18plus",
    "daily_fully_vaccinated_65plus",
]
columns = [
    "record_date",
    *first_dose_columns,
    *fully_vaccinated_columns,
    "state",
    "state_id",
]

# Empty frame that carries only the schema.
df_out = pd.DataFrame(data=[], columns=columns)
df_out

Unnamed: 0,record_date,daily_first_doses,daily_first_doses_12plus,daily_first_doses_18plus,daily_first_doses_65plus,daily_fully_vaccinated,daily_fully_vaccinated_12plus,daily_fully_vaccinated_18plus,daily_fully_vaccinated_65plus,state,state_id


In [7]:
# Convert the cumulative per-jurisdiction counts in `df_in` into daily counts,
# one row per (state, date), and attach the state's id from `df_ref`.
#
# Differences from the previous row-by-row version:
#   * `DataFrame.append` no longer exists in pandas 2.0 and re-copied the whole
#     frame for every row; per-state frames are now collected in a list and
#     concatenated once.
#   * The per-row `iloc` arithmetic is replaced by a vectorized
#     "value minus previous value" per state.
#   * `df_out` is rebuilt from scratch, so re-running this cell does not
#     duplicate rows.

# Cumulative source column -> daily output column (order defines output order).
CUMULATIVE_TO_DAILY = {
    "Administered_Dose1_Recip": "daily_first_doses",
    "Administered_Dose1_Recip_12Plus": "daily_first_doses_12plus",
    "Administered_Dose1_Recip_18Plus": "daily_first_doses_18plus",
    "Administered_Dose1_Recip_65Plus": "daily_first_doses_65plus",
    "Series_Complete_Yes": "daily_fully_vaccinated",
    "Series_Complete_12Plus": "daily_fully_vaccinated_12plus",
    "Series_Complete_18Plus": "daily_fully_vaccinated_18plus",
    "Series_Complete_65Plus": "daily_fully_vaccinated_65plus",
}
OUTPUT_COLUMNS = ["record_date", *CUMULATIVE_TO_DAILY.values(), "state", "state_id"]

state_frames = []
for state_code in df_in["Location"].unique():
    state = state_abbreviation_mapping.get(state_code)
    if not state:
        # Location codes without a mapping entry are not processed.
        continue

    state_ids = df_ref.loc[df_ref["name"] == state, "id"]
    if state_ids.empty:
        # A mapped name with no row in the State table would otherwise vanish
        # from the output without a trace (e.g. because of a spelling mismatch).
        print("[WARN] No State row named {!r} (code {}); skipping".format(state, state_code))
        continue
    state_id = state_ids.iloc[0]
    print("[INFO] State_id: {}, State: {}".format(state_id, state))

    df_state = df_in[df_in["Location"] == state_code].sort_values(by=['Date'])
    cumulative = df_state[list(CUMULATIVE_TO_DAILY)]
    # Subtract the previous day's cumulative value. `fill_value=0` makes the
    # earliest row keep its cumulative value (as before) and, unlike `diff()`,
    # does not force integer columns to float.
    daily = (
        (cumulative - cumulative.shift(1, fill_value=0))
        .rename(columns=CUMULATIVE_TO_DAILY)
        .assign(state=state, state_id=state_id)
    )
    daily.insert(0, "record_date", df_state["Date"])
    state_frames.append(daily)

if state_frames:
    df_out = pd.concat(state_frames, ignore_index=True)[OUTPUT_COLUMNS]
else:
    # Nothing matched: keep an empty frame with the expected schema.
    df_out = pd.DataFrame([], columns=OUTPUT_COLUMNS)

[INFO] State_id: 8, State: Delaware
[INFO] State_id: 21, State: Maryland
[INFO] State_id: 20, State: Maine
[INFO] State_id: 13, State: Idaho
[INFO] State_id: 34, State: North Carolina
[INFO] State_id: 40, State: Rhode Island
[INFO] State_id: 45, State: Utah
[INFO] State_id: 51, State: Wyoming
[INFO] State_id: 28, State: Nebraska
[INFO] State_id: 43, State: Tennessee
[INFO] State_id: 22, State: Massachusetts
[INFO] State_id: 27, State: Montana
[INFO] State_id: 6, State: Colorado
[INFO] State_id: 18, State: Kentucky
[INFO] State_id: 26, State: Missouri
[INFO] State_id: 14, State: Illinois
[INFO] State_id: 4, State: Arkansas
[INFO] State_id: 2, State: Alaska
[INFO] State_id: 11, State: Georgia
[INFO] State_id: 19, State: Louisiana
[INFO] State_id: 9, State: District of Columbia
[INFO] State_id: 7, State: Connecticut
[INFO] State_id: 10, State: Florida
[INFO] State_id: 5, State: California
[INFO] State_id: 49, State: West Virginia
[INFO] State_id: 12, State: Hawaii
[INFO] State_id: 47, Sta

In [8]:
# Preview the first five rows of the assembled daily table.
df_out.head(n=5)

Unnamed: 0,record_date,daily_first_doses,daily_first_doses_12plus,daily_first_doses_18plus,daily_first_doses_65plus,daily_fully_vaccinated,daily_fully_vaccinated_12plus,daily_fully_vaccinated_18plus,daily_fully_vaccinated_65plus,state,state_id
0,2020-12-14,0,0,0,0,0,0,0,0,Delaware,8
1,2020-12-15,0,0,0,0,0,0,0,0,Delaware,8
2,2020-12-16,0,0,0,0,0,0,0,0,Delaware,8
3,2020-12-17,0,0,0,0,0,0,0,0,Delaware,8
4,2020-12-18,0,0,0,0,0,0,0,0,Delaware,8


In [9]:
# Write the result to disk; index=False keeps the positional row index out of the file.
df_out.to_csv(path_or_buf="./cleaned/Vaccination_data.csv", index=False)