In [20]:
import os
import pandas as pd
import altair as alt
import numpy as np

In [21]:
# Load the two pre-cleaned input tables; both paths are relative to the working directory.
weather_data, covid_data = (
    pd.read_csv(csv_name)
    for csv_name in ('clean_weather_data.csv', 'clean_covid_data.csv')
)

In [22]:
# Discard the leading column of each table (presumably the index column written when the
# CSVs were saved — TODO confirm).
# Caution: each line rebinds the same name, so re-running this cell removes one more column.
weather_data = weather_data.drop(columns=[weather_data.columns[0]])
covid_data = covid_data.drop(columns=[covid_data.columns[0]])

In [23]:
# Show the COVID table as it stands after its first column was dropped.
covid_data

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,32,2020-01-22,Washington,US,1/22/2020 17:00,1.0,0.0,0.0
1,71,2020-01-23,Washington,US,1/23/20 17:00,1.0,0.0,0.0
2,120,2020-01-24,Washington,US,1/24/20 17:00,1.0,0.0,0.0
3,162,2020-01-25,Washington,US,1/25/20 17:00,1.0,0.0,0.0
4,163,2020-01-25,Illinois,US,1/25/20 17:00,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
14870,192089,2020-12-31,Virginia,US,2021-04-02 15:13:53,349584.0,5032.0,0.0
14871,192098,2020-12-31,Washington,US,2021-04-02 15:13:53,246752.0,3461.0,0.0
14872,192101,2020-12-31,West Virginia,US,2021-04-02 15:13:53,85334.0,1338.0,0.0
14873,192103,2020-12-31,Wisconsin,US,2021-04-02 15:13:53,520438.0,5242.0,0.0


In [24]:
# Show the weather table as it stands after its first column was dropped.
weather_data

Unnamed: 0,date,Average Temperature (deg F),location,fids
0,2020-03-10,61.769231,Alabama,FIPS:01
1,2020-03-11,63.076923,Alabama,FIPS:01
2,2020-03-12,67.384615,Alabama,FIPS:01
3,2020-03-13,66.538462,Alabama,FIPS:01
4,2020-03-14,66.153846,Alabama,FIPS:01
...,...,...,...,...
13001,2020-12-25,27.888889,Wyoming,FIPS:56
13002,2020-12-26,29.411765,Wyoming,FIPS:56
13003,2020-12-27,21.461538,Wyoming,FIPS:56
13004,2020-12-30,14.727273,Wyoming,FIPS:56


In [25]:
# Pair each state's daily COVID counts with that state's average temperature on the same day,
# so temperature can be compared against confirmed cases, deaths, and recoveries.
# how='inner' keeps only (date, state) combinations that appear in both tables.
merged_df = pd.merge(
    covid_data,
    weather_data,
    how='inner',
    left_on=['ObservationDate', 'Province/State'],
    right_on=['date', 'location'],
)
merged_df

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,date,Average Temperature (deg F),location,fids
0,163,2020-01-25,Illinois,US,1/25/20 17:00,1.0,0.0,0.0,2020-01-25,33.600000,Illinois,FIPS:17
1,208,2020-01-26,Illinois,US,1/26/20 16:00,1.0,0.0,0.0,2020-01-26,33.580000,Illinois,FIPS:17
2,209,2020-01-26,California,US,1/26/20 16:00,2.0,0.0,0.0,2020-01-26,45.983280,California,FIPS:06
3,256,2020-01-27,Illinois,US,1/27/20 23:59,1.0,0.0,0.0,2020-01-27,30.400000,Illinois,FIPS:17
4,257,2020-01-27,California,US,1/27/20 23:59,2.0,0.0,0.0,2020-01-27,45.399615,California,FIPS:06
...,...,...,...,...,...,...,...,...,...,...,...,...
12870,192089,2020-12-31,Virginia,US,2021-04-02 15:13:53,349584.0,5032.0,0.0,2020-12-31,48.800000,Virginia,FIPS:51
12871,192098,2020-12-31,Washington,US,2021-04-02 15:13:53,246752.0,3461.0,0.0,2020-12-31,37.500000,Washington,FIPS:53
12872,192101,2020-12-31,West Virginia,US,2021-04-02 15:13:53,85334.0,1338.0,0.0,2020-12-31,43.600000,West Virginia,FIPS:54
12873,192103,2020-12-31,Wisconsin,US,2021-04-02 15:13:53,520438.0,5242.0,0.0,2020-12-31,19.240000,Wisconsin,FIPS:55


In [34]:
# Lift Altair's row limit so the whole merged table can be passed to the chart.
alt.data_transformers.disable_max_rows()

# Small multiples: one temperature-vs-confirmed scatter per state, five panels per row.
# Both axes are resolved independently, so each panel gets its own scales.
(
    alt.Chart(merged_df)
    .mark_point()
    .encode(x='Average Temperature (deg F)', y='Confirmed')
    .properties(width=100, height=100)
    .facet('location:N', columns=5, spacing=4)
    .resolve_scale(x='independent', y='independent')
)

In [35]:
# Small multiples: one temperature-vs-deaths scatter per state, five panels per row.
# Both axes are resolved independently, so each panel gets its own scales.
(
    alt.Chart(merged_df)
    .mark_point()
    .encode(x='Average Temperature (deg F)', y='Deaths')
    .properties(width=100, height=100)
    .facet('location:N', columns=5, spacing=4)
    .resolve_scale(x='independent', y='independent')
)

In [36]:
# Pearson correlation between confirmed cases and average temperature, computed separately
# for each state (states are visited in order of first appearance in merged_df).
# States whose correlation is undefined (NaN) are left out of the list.
# NOTE(review): 'Confirmed' appears to be a running total in the displayed rows (it only grows
# over the year), so these correlations may largely reflect the time trend — confirm.
confirmed_correlations = []
for state_name, state_rows in merged_df.groupby('location', sort=False):
    state_corr = state_rows[['Confirmed', 'Average Temperature (deg F)']].corr().iloc[0, 1]
    if not np.isnan(state_corr):
        confirmed_correlations.append(state_corr)

In [37]:
# Summarise the per-state correlations: their mean and their spread.
# np.nanstd uses ddof=0 by default, i.e. the population standard deviation.
mean_corr = np.nanmean(confirmed_correlations)
std_corr = np.nanstd(confirmed_correlations)
print("Average corr:", mean_corr)
print("Average corr std:", std_corr)

# Strip plot: one point per state, with the x axis fixed to the full correlation range [-1, 1].
(
    alt.Chart(pd.DataFrame({'corr': confirmed_correlations}))
    .mark_point()
    .encode(x=alt.X('corr', scale=alt.Scale(domain=[-1, 1])))
)

Average corr: -0.47163340686339456
Average corr std: 0.21859236951948813


In [38]:
# Density estimate of the per-state correlations, drawn as an area chart.
# The x axis is fixed to the full correlation range [-1, 1].
(
    alt.Chart(pd.DataFrame({'corr': confirmed_correlations}))
    .transform_density('corr', as_=['corr', 'density'])
    .mark_area()
    .encode(
        x=alt.X('corr:Q', scale=alt.Scale(domain=[-1, 1])),
        y='density:Q',
    )
)

In [31]:
#There are not enough points to inform the tails of the distribution, but it is clear that our mean correlation lies somewhere between -0.4 and -0.8.
#This is not meant to be a bootstrap, although bootstrapping would probably help reduce the standard error of our estimated correlation here.