<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Read-data-from-CDC" data-toc-modified-id="Read-data-from-CDC-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Read data from CDC</a></span></li><li><span><a href="#Create-a-2nd-df-to-manipulate" data-toc-modified-id="Create-a-2nd-df-to-manipulate-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Create a 2nd df to manipulate</a></span><ul class="toc-item"><li><span><a href="#Create-a-subset-based-on-date" data-toc-modified-id="Create-a-subset-based-on-date-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Create a subset based on date</a></span></li></ul></li><li><span><a href="#Create-a-df-with-just-data-from-Delaware" data-toc-modified-id="Create-a-df-with-just-data-from-Delaware-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Create a df with just data from Delaware</a></span></li><li><span><a href="#Create-a-df-that-contains-data-summarized-by-year-and-month" data-toc-modified-id="Create-a-df-that-contains-data-summarized-by-year-and-month-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Create a df that contains data summarized by year and month</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#What-needs-to-change-if-we-want-to-the-same-for-v2DE-but-we-want-to-exclude-the-mean-and-include-dist_first,-dist_last?" data-toc-modified-id="What-needs-to-change-if-we-want-to-the-same-for-v2DE-but-we-want-to-exclude-the-mean-and-include-dist_first,-dist_last?-4.0.1"><span class="toc-item-num">4.0.1&nbsp;&nbsp;</span>What needs to change if we want to the same for v2DE but we want to exclude the mean and include dist_first, dist_last?</a></span></li></ul></li></ul></li><li><span><a href="#Write-out-each-df-as-a-csv-file" data-toc-modified-id="Write-out-each-df-as-a-csv-file-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Write out each df as a csv file</a></span></li></ul></div>

In [None]:
# Mount Google Drive only when the notebook runs on Google Colab.
# The google.colab package is missing in other environments (local Jupyter,
# CI), so a plain import would stop "Restart & Run All" at the first cell.
try:
    from google.colab import drive
except ImportError:
    print('google.colab is not available; skipping the Google Drive mount.')
else:
    drive.mount('/content/drive')

# Read data from CDC

In [1]:
# Socrata credentials are read from environment variables so that no secret is
# stored in the notebook:
#   SOCRATA_APP_TOKEN, SOCRATA_USERNAME, SOCRATA_PASSWORD
# Any variable that is unset or empty is passed as None (anonymous access).

import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import requests
from sodapy import Socrata

CDC_DOMAIN = 'data.cdc.gov'
# Identifier of the CDC dataset queried below.
VACCINATIONS_DATASET_ID = 'unsk-b7fc'
# Maximum number of rows requested in the single call below.
MAX_ROWS = 150000

client = Socrata(CDC_DOMAIN,
                 os.environ.get('SOCRATA_APP_TOKEN') or None,
                 username=os.environ.get('SOCRATA_USERNAME') or None,
                 password=os.environ.get('SOCRATA_PASSWORD') or None)

try:
    #results = client.get("8xkx-amqh", limit = 100000)
    results = client.get(VACCINATIONS_DATASET_ID, limit=MAX_ROWS)
finally:
    # Release the underlying HTTP session even if the request fails.
    client.close()

# Receiving exactly the requested maximum suggests more rows exist upstream.
if len(results) >= MAX_ROWS:
    print(f'Warning: received {len(results)} rows, the requested maximum; '
          'the data may be truncated - raise MAX_ROWS.')

vaccines = pd.DataFrame(results)

# This is where the documentation is:
# https://data.cdc.gov/Vaccinations/COVID-19-Vaccinations-in-the-United-States-Jurisdi/unsk-b7fc

In [2]:
# Print the column index first, then the (rows, columns) shape of the raw frame.
for summary in (vaccines.columns, vaccines.shape):
    print(summary)

Index(['date', 'mmwr_week', 'location', 'distributed', 'distributed_janssen',
       'distributed_moderna', 'distributed_pfizer', 'distributed_unk_manuf',
       'dist_per_100k', 'distributed_per_100k_12plus',
       'distributed_per_100k_18plus', 'distributed_per_100k_65plus',
       'administered', 'administered_12plus', 'administered_18plus',
       'administered_65plus', 'administered_janssen', 'administered_moderna',
       'administered_pfizer', 'administered_unk_manuf', 'admin_per_100k',
       'admin_per_100k_12plus', 'admin_per_100k_18plus',
       'admin_per_100k_65plus', 'recip_administered',
       'administered_dose1_recip', 'administered_dose1_pop_pct',
       'administered_dose1_recip_1', 'administered_dose1_recip_2',
       'administered_dose1_recip_3', 'administered_dose1_recip_4',
       'administered_dose1_recip_5', 'administered_dose1_recip_6',
       'series_complete_yes', 'series_complete_pop_pct',
       'series_complete_12plus', 'series_complete_12pluspop',
    

# Create a 2nd df to manipulate

## Create a subset based on date

In [None]:
# Keep only the rows dated on or after 2021-04-27.
# At this point, date is still an object column rather than a datetime, so the
# comparison below is made against a plain string.
on_or_after_cutoff = vaccines['date'] >= '2021-04-27'
v2 = vaccines[on_or_after_cutoff]
v2.shape

In [None]:
# Narrow the frame to just the required columns.
# Pattern: new = old[['A', 'C', 'D']]
#
# The selection is taken from v2 (the date subset built in the previous cell),
# not from vaccines: indexing vaccines again would silently bring back every
# row and throw the date filter away.
keep_columns = [
    'date', 'mmwr_week', 'location', 'distributed', 'administered',
    'distributed_janssen', 'distributed_moderna', 'distributed_pfizer',
    'additional_doses', 'administered_12plus', 'administered_18plus',
    'administered_65plus', 'series_complete_yes', 'series_complete_12plus',
    'series_complete_18plus', 'series_complete_65plus',
]

# .copy() makes v2 an independent frame, so later column assignments cannot
# be mistaken for writes into a slice of vaccines.
v2 = v2[keep_columns].copy()

In [None]:
# Preview the first rows of the column-reduced frame.
v2.head()

In [None]:
# Summarize v2: column dtypes and non-null counts before any cleaning.
v2.info()

In [None]:
# Remove the per-manufacturer distribution columns from v2.
drop_columns = {
    'distributed_janssen',
    'distributed_moderna',
    'distributed_pfizer',
}

v2 = v2.drop(labels=drop_columns, axis='columns')

In [None]:
# Count the missing values in each column of v2.

v2.isna().sum()

In [None]:
# Replace the missing additional_doses entries with zero.
# The result is assigned back rather than using fillna(0, inplace = True)
# on the selected column.
v2 = v2.assign(additional_doses=v2['additional_doses'].fillna(0))

# Re-count the nulls per column after the fill.
v2.isnull().sum()

In [None]:
# Change the datatypes

# Columns holding dose / recipient counts that must become integers.
count_columns = [
    'distributed', 'administered', 'additional_doses',
    'administered_12plus', 'administered_18plus', 'administered_65plus',
    'series_complete_yes', 'series_complete_12plus',
    'series_complete_18plus', 'series_complete_65plus',
]

# date: parse the text values into datetime64 timestamps.
# counts: convert to numbers, then to an explicit 64-bit integer. Plain `int`
# can map to a 32-bit type on some platforms, which is too narrow once these
# counts are summed. The integer cast raises if a column still contains
# missing values, so fill those before running this cell.
v2 = v2.assign(
    date=pd.to_datetime(v2['date']),
    **{column: pd.to_numeric(v2[column]).astype('int64') for column in count_columns},
)


In [None]:
# Re-check the column dtypes after the conversions in the previous cell.
v2.info()

In [None]:
# Derive year, month and day columns from the date column.

date_parts = v2['date'].dt
v2 = v2.assign(
    year=date_parts.year,
    month=date_parts.month,
    day=date_parts.day,
)
v2.head()

In [None]:
# Create a column to classify the distribution volume.
#
# Bins are closed on the left (right=False):
#   low    : 0         <= distributed < 100,000
#   medium : 100,000   <= distributed < 1,000,000
#   high   : 1,000,000 <= distributed
# The top bin is open-ended. With a finite upper edge of 100,000,000 any row
# at or above that value fell outside every bin and was labelled NaN.
volume_bins = [0, 100_000, 1_000_000, float('inf')]
volume_labels = ['low', 'medium', 'high']

v2['volume'] = pd.cut(v2['distributed'], bins=volume_bins, labels=volume_labels, right=False)
v2.head()

# Create a df with just data from Delaware

In [None]:
# Keep only the rows whose location code is 'DE'.
v2DE = v2.loc[v2['location'] == 'DE']

In [None]:
# Order the Delaware rows chronologically (ascending date).
v2DE = v2DE.sort_values(by='date', ascending=True)

In [None]:
# (rows, columns) of the Delaware subset.
v2DE.shape

In [None]:
# Preview the first rows of the date-sorted Delaware subset.
v2DE.head()

In [None]:
# For every row, record the 'distributed' value seen on the first and on the
# last available day of that row's (year, month).
# transform() returns one value per original row, aligned on the index, so the
# results can be assigned straight back onto v2DE.
distributed_by_month = (
    v2DE
    .sort_values(by=['day'])
    .groupby(['year', 'month'])['distributed']
)
v2DE['dist_first'] = distributed_by_month.transform('first')
v2DE['dist_last'] = distributed_by_month.transform('last')

In [None]:
# Preview v2DE again, now including the dist_first / dist_last columns.
v2DE.head()

In [None]:
# Show the date parts next to the monthly first/last values for checking.
month_check_columns = [
    'date', 'mmwr_week', 'distributed',
    'year', 'month', 'day',
    'dist_first', 'dist_last',
]
v2DE[month_check_columns]

# Create a df that contains data summarized by year and month

In [None]:
# Summarize by (year, month): sum and mean of distributed and administered only.

v2_agg = (
    v2
    .groupby(['year', 'month'])[['distributed', 'administered']]
    .agg(['sum', 'mean'])
    .reset_index()
)
v2_agg.head()

# Template: df_new = df.groupby(['col1', 'col2'])[['col3', 'col4']].sum()

### What needs to change if we want to the same for v2DE but we want to exclude the mean and include dist_first, dist_last?

In [None]:
# Monthly Delaware summary: sums only (no mean), plus the dist_first and
# dist_last columns reduced with 'min' to one value per (year, month).
v2DE2 = (
    v2DE
    .groupby(['year', 'month'])
    .agg(
        distributed=pd.NamedAgg(column='distributed', aggfunc='sum'),
        administered=pd.NamedAgg(column='administered', aggfunc='sum'),
        dist_first=pd.NamedAgg(column='dist_first', aggfunc='min'),
        dist_last=pd.NamedAgg(column='dist_last', aggfunc='min'),
    )
    .reset_index()
)
v2DE2.head()

In [None]:
# Draft (left disabled): previous month's dist_last within each year.
#v2DE2['prev_last'] = v2DE2.sort_values(by=['month']).groupby(['year'])['dist_last'].shift(1)

# Write out each df as a csv file

In [None]:
# Write each frame to its own CSV file, without the index column.
csv_exports = {
    'vaccines_delivered.csv': v2,
    'DE vaccines delivered.csv': v2DE,
    'v2 aggregated.csv': v2_agg,
}
for csv_name, frame in csv_exports.items():
    frame.to_csv(csv_name, index=False)