In [None]:
import pandas as pd
import numpy as np

# Import and Transform Data

In [None]:
# Load the two COVID tables and the weather table from CSV.
# In every file the first CSV column becomes the DataFrame index.
sparse_covid_df, dense_covid_df, weather_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../data/covid/sparse_county_covid.csv',
        '../data/covid/dense_county_covid.csv',
        '../data/weather/county_weather.csv',
    )
)

In [None]:
# Per-county lookup tables, keyed by lowercase county name.
# Both dicts must contain exactly the same set of counties.

# Population density per county (units are not stated in this notebook).
density = {
    'baltimore': 6866,
    'essex': 6168,
    'cook': 5301,
    'union': 5150,
    'norfolk': 5026,
    'nassau': 4954,
    'harrisonburg': 4765,
    'fairfax': 2454,
    'camden': 2289,
    'harris': 2700,
    'franklin': 2186,
    'marion': 2466,
    'dekalb': 2482,
    'duval': 1305,
    'wake': 1377,
    'bexar': 1620,
}

# Total population per county.
# NOTE(review): the 'nassau' population looks small next to its density
# value above; the two figures may refer to different Nassau counties.
# Verify against the data source.
population = {
    'baltimore': 621342,
    'essex': 755618,
    'cook': 5231351,
    'union': 543976,
    'norfolk': 245782,
    'nassau': 74629,
    'harrisonburg': 50981,
    'fairfax': 1118602,
    'camden': 513539,
    'harris': 4253700,
    'franklin': 1195537,
    'marion': 918977,
    'dekalb': 707089,
    'duval': 879602,
    'wake': 952151,
    'bexar': 1785704,
}

In [None]:
# Attach each row's county population and population density.
#
# Values are looked up by the row's 'county' name in the `population` and
# `density` dicts. Columns are addressed by name rather than by hard-coded
# position, so the result does not depend on how many columns the CSVs
# contain, and the vectorised `.map` replaces the per-row Python loop.
# The new columns hold the integer dict values directly.
for covid_df in (sparse_covid_df, dense_covid_df):
    # Fail loudly (as a plain dict lookup would) when a county has no entry,
    # instead of letting `.map` silently fill in NaN.
    known_counties = population.keys() & density.keys()
    unknown_counties = set(covid_df['county'].unique()) - known_counties
    if unknown_counties:
        raise KeyError(
            'No population/density entry for: '
            + ', '.join(sorted(map(str, unknown_counties)))
        )

    covid_df['population'] = covid_df['county'].map(population)
    covid_df['pop_density'] = covid_df['county'].map(density)

In [None]:
# Add `past_delta14`: the change in the case count relative to the row 14
# positions earlier *within the same county*.
#
# Each frame can hold several counties, so a plain "row i minus row i-14"
# offset over the whole frame subtracts values that belong to a different
# county. Grouping by county keeps every difference inside one county; the
# first 14 rows of each county have no 14-row history and stay NaN.
#
# Assumes each county's rows are in chronological order with one row per
# day, the same assumption the positional offset relied on. TODO confirm.
PAST_WINDOW_ROWS = 14

for covid_df in (sparse_covid_df, dense_covid_df):
    # The case count is read from the fourth column (position 3), as before.
    # NOTE(review): confirm which named column this is and select it by name.
    case_counts = covid_df.iloc[:, 3]

    # Group on the raw county values (positional, so a non-unique row index
    # cannot cause misalignment). `diff` returns rows in their original order.
    per_county_delta = case_counts.groupby(
        covid_df['county'].to_numpy(), sort=False
    ).diff(PAST_WINDOW_ROWS)

    covid_df['past_delta14'] = per_county_delta.to_numpy()

In [None]:
# Split the weather table into the rows belonging to each county set.
# (This is a filter on the 'county' column; row order is left unchanged.)
sparse_counties = sparse_covid_df['county'].unique()
dense_counties = dense_covid_df['county'].unique()

# Keep only the weather rows whose county appears in the matching COVID frame
in_sparse_set = weather_df['county'].isin(sparse_counties)
in_dense_set = weather_df['county'].isin(dense_counties)

sparse_weather_df = weather_df.loc[in_sparse_set]
dense_weather_df = weather_df.loc[in_dense_set]

In [None]:
# Join each COVID frame with its weather rows on the (date, county) pair.
# Inner join: only rows present in both tables are kept.
merge_keys = ['date', 'county']

sparse_weather_covid_df = pd.merge(
    sparse_covid_df, sparse_weather_df, how='inner', on=merge_keys
)
dense_weather_covid_df = pd.merge(
    dense_covid_df, dense_weather_df, how='inner', on=merge_keys
)

In [None]:
# Preview the first five rows of the merged sparse-county frame
sparse_weather_covid_df.head(n=5)

In [None]:
# Preview the first five rows of the merged dense-county frame
dense_weather_covid_df.head(n=5)

In [None]:
# Column-name groups used to select and order columns for analysis.

# Weather measurements
weather_features = [
    'temp_mean(C)',
    'precip_sum(mm)',
    'wind_max(km/h)',
    'min_humidity(%)',
    'max_humidity(%)',
    'mean_humidity(%)',
]

# Static per-county attributes
county_features = [
    'population',
    'pop_density',
]

# COVID case features
covid_features = [
    'new_cases',
    'past_delta14',
]

# Target column (kept as a list so it concatenates with the groups above)
target = [
    'future_delta14',
]

In [None]:
# Keep only the target and feature columns, in a fixed order:
# target, county features, COVID features, weather features.
# Every other column is dropped.
ordered_columns = target + county_features + covid_features + weather_features

sparse_weather_covid_df = sparse_weather_covid_df.loc[:, ordered_columns]
dense_weather_covid_df = dense_weather_covid_df.loc[:, ordered_columns]

# Analyze Data

## Sparse County Data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the sparse-county data
sparse_weather_covid_df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Pairwise Pearson correlation between all columns of the sparse-county data
sparse_weather_covid_df.corr(method='pearson')

In [None]:
# Scatter matrix: target vs. county features (sparse counties)
pd.plotting.scatter_matrix(
    frame=sparse_weather_covid_df.loc[:, target + county_features]
)

In [None]:
# Scatter matrix: target vs. COVID features (sparse counties)
pd.plotting.scatter_matrix(
    frame=sparse_weather_covid_df.loc[:, target + covid_features]
)

In [None]:
# Scatter matrix: target vs. weather features (sparse counties)
pd.plotting.scatter_matrix(
    frame=sparse_weather_covid_df.loc[:, target + weather_features]
)

## Dense County Data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the dense-county data
dense_weather_covid_df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Pairwise Pearson correlation between all columns of the dense-county data
dense_weather_covid_df.corr(method='pearson')

In [None]:
# Scatter matrix: target vs. county features (dense counties)
pd.plotting.scatter_matrix(
    frame=dense_weather_covid_df.loc[:, target + county_features]
)

In [None]:
# Scatter matrix: target vs. COVID features (dense counties)
pd.plotting.scatter_matrix(
    frame=dense_weather_covid_df.loc[:, target + covid_features]
)

In [None]:
# Scatter matrix: target vs. weather features (dense counties).
# This cell sits in the dense-county section, so it plots the dense frame;
# it previously plotted `sparse_weather_covid_df`, repeating the sparse plot.
pd.plotting.scatter_matrix(dense_weather_covid_df[target + weather_features])