In [1]:
#Dependencies
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import scipy.stats as st
from scipy.stats import linregress
from pprint import pprint

In [2]:
#Create dataframe for Air Quality data from csv files.
aq2020_df = pd.read_csv("Resources/aq2020.csv")
aq2019Q1_df = pd.read_csv("Resources/aq2019Q1.csv")
aq2019Q2_df = pd.read_csv("Resources/aq2019Q2.csv")
aq2019Q3_df = pd.read_csv("Resources/aq2019Q3.csv")

#Create a df to hold all of 2019 data
aq2019_df=pd.concat([aq2019Q1_df,aq2019Q2_df,aq2019Q3_df])

#Format dataframe dates
aq2020_df['Date']= pd.to_datetime(aq2020_df['Date'])
aq2019_df['Date']= pd.to_datetime(aq2019_df['Date'])

In [3]:
#Drop nullvalues and clean AQ data

#drop null values and reset index 2020
clean_aq2020_df = aq2020_df.dropna(how='any')
clean_aq2020_df.reset_index(inplace=True)
del clean_aq2020_df['index']
clean_aq2020_df

#drop null values and reset index 2020
clean_aq2019_df = aq2019_df.dropna(how='any')
clean_aq2019_df.reset_index(inplace=True)
del clean_aq2019_df['index']
clean_aq2019_df



Unnamed: 0,Date,Country,City,Specie,count,min,max,median,variance
0,2019-01-16,AE,Abu Dhabi,pm10,24.0,86.0,99.0,97.0,179.40
1,2019-01-22,AE,Abu Dhabi,pm10,24.0,51.0,57.0,55.0,23.75
2,2019-01-26,AE,Abu Dhabi,pm10,24.0,136.0,173.0,160.0,941.96
3,2019-01-07,AE,Abu Dhabi,pm10,24.0,60.0,91.0,72.0,1006.88
4,2019-01-10,AE,Abu Dhabi,pm10,24.0,82.0,93.0,87.0,57.97
...,...,...,...,...,...,...,...,...,...
1596666,2019-07-09,HU,Budapest,wind-gust,282.0,0.1,12.6,4.4,109.64
1596667,2019-07-18,HU,Budapest,wind-gust,338.0,0.1,9.7,3.4,37.78
1596668,2019-07-21,HU,Budapest,wind-gust,337.0,0.1,17.9,3.8,137.24
1596669,2019-07-24,HU,Budapest,wind-gust,325.0,0.2,6.1,2.7,33.09


In [4]:
#Covid API URL
base_url = "https://api.covid19api.com/dayone/country/"
end_url = "/status/confirmed"

Country_list=['Germany','Italy','New Zealand','Spain','Brazil','USA','China','Japan']
Covid_date=[]
Covid_city=[]
Covid_province=[]
Covid_country_code=[]
Covid_cases=[]


# Loop through the list of coutries and perform a request for data on each
print("Beginning Data Retrieval")
print("--------------------------")
for country in Country_list:
    response = requests.get(base_url+country+end_url).json()
    response
    for i in range(len(response)):
        #exception handling  
        try:  
            #store covid data into respective variables
            Covid_date.append(response[i]['Date'])
            Covid_country_code.append(response[i]['CountryCode'])
            Covid_city.append(response[i]['City'])
            Covid_province.append(response[i]['Province'])
            Covid_cases.append(response[i]['Cases'])
        except KeyError:
            print("Coutry not found. Skipping...")
print("-----------------------------")
print("Data Retrieval Complete")      
print("-----------------------------")

Beginning Data Retrieval
--------------------------
-----------------------------
Data Retrieval Complete
-----------------------------


In [5]:
#Create a dataframe to store covid data

covid_dict = {
    "Date": Covid_date,
    "Country": Covid_country_code,
    "Province": Covid_province,
    "City":Covid_city,
    "Cases": Covid_cases
}
Covid_data = pd.DataFrame(covid_dict)


#Format covid dataframe dates (now mergable with air quality dataframes)
Covid_data['Date'] = Covid_data['Date'].str.strip('T00:00:00Z')
Covid_data['Date'] = pd.to_datetime(Covid_data['Date'])

#Drop duplicates
Clean_Covid_data= Covid_data.drop_duplicates(["City", "Date", "Country","Province" ], keep = 'first')
Clean_Covid_data



Unnamed: 0,Date,Country,Province,City,Cases
0,2020-01-27,DE,,,1
1,2020-01-28,DE,,,4
2,2020-01-29,DE,,,4
3,2020-01-03,DE,,,4
4,2020-01-31,DE,,,5
...,...,...,...,...,...
356086,2020-07-15,JP,,,23172
356087,2020-07-16,JP,,,23510
356088,2020-07-17,JP,,,24104
356089,2020-07-18,JP,,,24946


In [6]:
#Define a function to create Linear Regression plots

def L_regress(x_value, y_value, x_lbl, y_lbl,title,linepos,fignum):
    
    # Perform a linear regression
    (slope, intercept, rvalue, pvalue, stderr) = st.linregress(x_value, y_value)

    # Get regression values
    regress_value = x_value * slope + intercept
    
    # Print r square value
    print(f"The r-squared is: {rvalue**2}")
    
    #calculate correlation coefficient and print
    correlation = st.pearsonr(x_value,y_value)
    print(f"The correlation between covid cases and Air quality Pm2.5 value is {round(correlation[0],2)}")
    
    # Create Plot
    plt.scatter(x_value,y_value, marker="o", facecolors="blue", edgecolors="black", alpha=0.75)
    plt.plot(x_value,regress_value,"r-")
    plt.grid(b=None, which='major', axis='both')
    
    # Create line equation string
    line_eq = "y = " + str(round(slope,2)) + "x +" + str(round(intercept,2))
    
    #Label plot and annotate the line equation
    plt.xlabel(x_lbl)
    plt.ylabel(y_lbl)
    plt.title(title)
    plt.annotate(line_eq,linepos,fontsize=15,color="red")

    #save figure 
    plt.savefig(f"output_data/fig{fignum}.png")

    # Show plot
    plt.show()