In [1]:
# importing dependencies

import os

import pandas as pd
import requests
from sqlalchemy import create_engine, inspect

In [2]:
# Fetch the daily per-state records as JSON from https://covidtracking.com/api/v1/states/daily.json

query_url = "https://covidtracking.com/api/v1/states/daily.json"

# A timeout keeps the notebook from hanging indefinitely on a stalled connection.
response = requests.get(query_url, timeout=30)
# Fail loudly on an HTTP error status instead of trying to parse an error page as JSON.
response.raise_for_status()
data_json = response.json()


In [3]:
# Check how many records the API returned

len(data_json)

2825

In [4]:
# from the API documentation :
# state - State or territory postal code abbreviation.
# positive - Total cumulative positive test results.
# positiveIncrease - Increase from the day before.
# negative - Total cumulative negative test results.
# negativeIncrease - Increase from the day before.
# pending - Tests that have been submitted to a lab but no results have been reported yet.
# totalTestResults - Calculated value (positive + negative) of total test results.
# totalTestResultsIncrease - Increase from the day before.
# death - Total cumulative number of people that have died.
# deathIncrease - Increase from the day before.
#
# NOTE: apart from "pending", the loop below stores the daily *Increase fields,
# not the cumulative totals.


def _daily_value(record, key):
    """Return ``record[key]``, substituting 0 when no usable value is present.

    Parameters:
        record: one dict from the API response.
        key: name of the field to read.

    Returns:
        0 when the key is missing, or when its value is None or the string "[]";
        otherwise the stored value, unchanged.
    """
    value = record.get(key)
    if value is None or value == "[]":
        return 0
    return value


# lists to hold the desired data points, one entry per API record
date_list = []
state_list = []
positive_list = []
negative_list = []
pending_list = []
death_list = []
totalTestResults_list = []
hash_no_list = []

# walk every record in the json data and keep only the data points that are required
for record in data_json:
    # date, state and hash are read directly, so a record missing any of them raises KeyError
    date_list.append(record["date"])
    state_list.append(record["state"])

    # the count fields may be missing from a record (see the helper above);
    # a missing or empty value is stored as 0
    positive_list.append(_daily_value(record, "positiveIncrease"))
    negative_list.append(_daily_value(record, "negativeIncrease"))
    pending_list.append(_daily_value(record, "pending"))
    death_list.append(_daily_value(record, "deathIncrease"))
    totalTestResults_list.append(_daily_value(record, "totalTestResultsIncrease"))

    hash_no_list.append(record["hash"])
             


In [5]:
# Sanity check: print the length of each parsed list so it can be compared with the data set length

parsed_columns = (
    date_list,
    state_list,
    positive_list,
    negative_list,
    pending_list,
    death_list,
    totalTestResults_list,
    hash_no_list,
)
for column_values in parsed_columns:
    print(len(column_values))

2825
2825
2825
2825
2825
2825
2825
2825


In [6]:
# Assemble the parsed lists into a single data frame; "country" is the constant "US" on every row
column_data = {
    "report_date": date_list,
    "country": "US",
    "state_id": state_list,
    "positive_cases": positive_list,
    "negative_cases": negative_list,
    "pending_cases": pending_list,
    "deaths": death_list,
    "total_test_results": totalTestResults_list,
    "id": hash_no_list,
}
us_states_covid_data_df = pd.DataFrame(column_data)

us_states_covid_data_df.head()

Unnamed: 0,report_date,country,state_id,positive_cases,negative_cases,pending_cases,deaths,total_test_results,id
0,20200424,US,AK,2,118,0,0,120,0b9cdafd1d82e743b065f4cd36e3a4f97a0d0f0e
1,20200424,US,AL,54,0,0,0,54,9d8fbcc7573f833b3b7e9936e784ac0bdee267f4
2,20200424,US,AR,276,3712,0,0,3988,8019de595733d1e5650d3be233a479378a4a29bf
3,20200424,US,AS,0,0,17,0,0,aa59004e05304324850c80a3cf7e8d3c7f92c3c3
4,20200424,US,AZ,276,1741,0,17,2017,5ce13a11a96830ada8b9956376db507f09a391d3


In [7]:
# Convert report_date from the API's YYYYMMDD value (e.g. 20200424) to a datetime column.
# The dtype guard makes this cell safe to re-run: once converted, stringifying the column
# yields values like "2020-04-24", which no longer match the format and would raise.
if not pd.api.types.is_datetime64_any_dtype(us_states_covid_data_df['report_date']):
    us_states_covid_data_df['report_date'] = pd.to_datetime(
        us_states_covid_data_df['report_date'].astype(str), format='%Y%m%d'
    )
us_states_covid_data_df.head()

Unnamed: 0,report_date,country,state_id,positive_cases,negative_cases,pending_cases,deaths,total_test_results,id
0,2020-04-24,US,AK,2,118,0,0,120,0b9cdafd1d82e743b065f4cd36e3a4f97a0d0f0e
1,2020-04-24,US,AL,54,0,0,0,54,9d8fbcc7573f833b3b7e9936e784ac0bdee267f4
2,2020-04-24,US,AR,276,3712,0,0,3988,8019de595733d1e5650d3be233a479378a4a29bf
3,2020-04-24,US,AS,0,0,17,0,0,aa59004e05304324850c80a3cf7e8d3c7f92c3c3
4,2020-04-24,US,AZ,276,1741,0,17,2017,5ce13a11a96830ada8b9956376db507f09a391d3


In [8]:
# Count the non-null values in each column of the data frame
us_states_covid_data_df.count()

report_date           2825
country               2825
state_id              2825
positive_cases        2825
negative_cases        2825
pending_cases         2825
deaths                2825
total_test_results    2825
id                    2825
dtype: int64

In [9]:
# Connect to the PostgreSQL database.
# The credentials/host part ("user:password@host:port/database") can be supplied through
# the COVID19_DB_CONNECTION environment variable so it does not have to live in the
# notebook; the original local-development value is kept as the fallback.
connection_string = os.environ.get(
    "COVID19_DB_CONNECTION", "postgres:postgres@localhost:5432/covid19_db"
)
engine = create_engine(f'postgresql://{connection_string}')

In [10]:
# Getting table names
# Engine.table_names() is deprecated since SQLAlchemy 1.4 and removed in 2.0;
# the inspector API returns the same list of names and works on both.
inspect(engine).get_table_names()

['state',
 'us_states_cases',
 'country',
 'index_prices',
 'country_cases',
 'us_unemployment_stats',
 'hospital_beds',
 'gas_price']

In [11]:
# Work on an independent (deep) copy so the frame built above is left untouched
new_us_states_covid_data_df = us_states_covid_data_df.copy(deep=True)
new_us_states_covid_data_df.head(5)

Unnamed: 0,report_date,country,state_id,positive_cases,negative_cases,pending_cases,deaths,total_test_results,id
0,2020-04-24,US,AK,2,118,0,0,120,0b9cdafd1d82e743b065f4cd36e3a4f97a0d0f0e
1,2020-04-24,US,AL,54,0,0,0,54,9d8fbcc7573f833b3b7e9936e784ac0bdee267f4
2,2020-04-24,US,AR,276,3712,0,0,3988,8019de595733d1e5650d3be233a479378a4a29bf
3,2020-04-24,US,AS,0,0,17,0,0,aa59004e05304324850c80a3cf7e8d3c7f92c3c3
4,2020-04-24,US,AZ,276,1741,0,17,2017,5ce13a11a96830ada8b9956376db507f09a391d3


In [12]:
# Load the data frame into the us_states_cases table; the table is replaced on every load
new_us_states_covid_data_df.to_sql(
    name='us_states_cases',
    con=engine,
    if_exists='replace',
    index=False,
)

In [13]:
# Select the first five records from the table.
# LIMIT makes the database return only those rows instead of transferring the
# whole table and trimming it in pandas.
pd.read_sql_query('select * from us_states_cases limit 5', con=engine)

Unnamed: 0,report_date,country,state_id,positive_cases,negative_cases,pending_cases,deaths,total_test_results,id
0,2020-04-24,US,AK,2,118,0,0,120,0b9cdafd1d82e743b065f4cd36e3a4f97a0d0f0e
1,2020-04-24,US,AL,54,0,0,0,54,9d8fbcc7573f833b3b7e9936e784ac0bdee267f4
2,2020-04-24,US,AR,276,3712,0,0,3988,8019de595733d1e5650d3be233a479378a4a29bf
3,2020-04-24,US,AS,0,0,17,0,0,aa59004e05304324850c80a3cf7e8d3c7f92c3c3
4,2020-04-24,US,AZ,276,1741,0,17,2017,5ce13a11a96830ada8b9956376db507f09a391d3
