In [8]:
import math
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
# Read every input table; each one is keyed by county.
# skiprows drops the first 1 or 2 lines of a file before parsing
# (presumably title/preamble lines above the real header -- confirm against the files).
covid_df = pd.read_csv(
    'data/covid-2021-us-counties.csv',
    index_col=False,
)
population_df = pd.read_csv(
    'data/2020CensusTotalPopulation.csv',
    index_col=False,
    skiprows=1,
)
obesity_df = pd.read_csv(
    'data/ObesityData.csv',
    index_col=False,
    skiprows=2,
)
# pi = physical inactivity
pi_df = pd.read_csv(
    'data/Physical_Inactivity_Data.csv',
    index_col=False,
    skiprows=2,
)
diabetes_df = pd.read_csv(
    'data/DiabetesAtlasData.csv',
    index_col=False,
    skiprows=2,
)

In [3]:
# util function
def isfloat(num):
    """Return True if ``float(num)`` succeeds, False otherwise.

    Parameters
    ----------
    num : object
        Candidate value, typically a string read from a CSV cell.

    Returns
    -------
    bool
        True when the value converts to a float. None, the empty string,
        non-numeric text and non-scalar objects (lists, dicts, ...) all
        yield False.

    Notes
    -----
    Numeric zero (``0`` / ``0.0``) is a valid float and returns True.
    Strings such as ``"nan"`` or ``"inf"`` are accepted by ``float`` and
    therefore also return True.
    """
    try:
        float(num)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None or a non-scalar object; ValueError: unparsable
        # text (including ""); OverflowError: an int too large for a float.
        return False
    return True

In [15]:
# Keep only the 2021-02-28 rows of the covid table (cases and deaths per county)
date_df = covid_df[covid_df['date'] == "2021-02-28"]

# ccdata maps FIPS (county id) -> one record per county. Records start with
# the covid fields below; later cells add further fields:
#  - covid cases, deaths, population, pi, obesity, diabetes,
#  - cases pct, deaths pct, pi pct, obesity pct, diabetes pct
ccdata = {}

print("covid size: ", date_df['fips'].size)
for _, covid_row in date_df.iterrows():
    fips = covid_row['fips']
    # rows without a county id cannot be keyed, so they are reported and skipped
    if math.isnan(fips):
        print("fips (", fips, ")")
        continue

    cases = covid_row['cases']
    if math.isnan(cases):
        print("cases (", cases, ")")
        continue

    deaths = covid_row['deaths']
    ccdata[fips] = {
        'fips': fips,
        'state': covid_row['state'],
        'county': covid_row['county'],
        'cases': cases,
        # a missing death count is recorded as zero
        'deaths': 0 if math.isnan(deaths) else deaths,
    }

print(ccdata[1001])
print(len(ccdata))

covid size:  3246
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
fips ( nan )
{'fips': 1001.0, 'state': 'Alabama', 'county': 'Autauga', 'cases': 6264, 'deaths': 91.0}
3218


In [19]:
# Attach the census population to each county record and derive the
# case/death counts as a percentage of that population.
for _, census_row in population_df.iterrows():
    # the trailing five characters of the census id are used as the FIPS code
    fips_str = census_row['id'][-5:]
    if not isfloat(fips_str):
        print("fips: ", fips_str)
        continue

    # ccdata is keyed by float FIPS values, so convert before the lookup
    fips = float(fips_str)
    record = ccdata.get(fips)
    if not record:
        print("no ccdata for: ", fips)
        continue

    population = census_row['Estimate!!Total']
    cases_pct = (record['cases'] / population) * 100
    deaths_pct = (record['deaths'] / population) * 100
    record.update(
        population=population,
        cases_pct=cases_pct,
        deaths_pct=deaths_pct,
    )

print(ccdata[1001])

no ccdata for:  2060.0
no ccdata for:  2063.0
no ccdata for:  2066.0
no ccdata for:  2105.0
no ccdata for:  2164.0
no ccdata for:  2282.0
no ccdata for:  36005.0
no ccdata for:  36047.0
no ccdata for:  36061.0
no ccdata for:  36081.0
no ccdata for:  36085.0
fips:  000US
{'fips': 1001.0, 'state': 'Alabama', 'county': 'Autauga', 'cases': 6264, 'deaths': 91.0, 'population': 55639, 'cases_pct': 11.258290048347382, 'deaths_pct': 0.16355434137924837}


In [22]:
# Copy each county's obesity percentage onto its ccdata record;
# counties with no record in ccdata are reported and left out.
for _, obesity_row in obesity_df.iterrows():
    fips = obesity_row['County_FIPS']
    record = ccdata.get(fips)
    if record:
        record['obesity_pct'] = obesity_row['Obesity Percentage']
    else:
        print("no ccdata for: ", fips)

print(ccdata[1001])

no ccdata for:  2060.0
no ccdata for:  2105.0
no ccdata for:  2164.0
no ccdata for:  2282.0
no ccdata for:  36005.0
no ccdata for:  36047.0
no ccdata for:  36061.0
no ccdata for:  36081.0
no ccdata for:  36085.0
no ccdata for:  nan
{'fips': 1001.0, 'state': 'Alabama', 'county': 'Autauga', 'cases': 6264, 'deaths': 91.0, 'population': 55639, 'cases_pct': 11.258290048347382, 'deaths_pct': 0.16355434137924837, 'obesity_pct': 29.6}


In [25]:
# Copy each county's diagnosed-diabetes percentage onto its ccdata record.
for _, diabetes_row in diabetes_df.iterrows():
    fips = diabetes_row['County_FIPS']
    record = ccdata.get(fips)
    if not record:
        # no covid record was stored for this FIPS value
        print("no ccdata for: ", fips)
        continue
    record['diabetes_pct'] = diabetes_row['Diagnosed Diabetes Percentage']
print(ccdata[1001])

no ccdata for:  2060.0
no ccdata for:  2105.0
no ccdata for:  2164.0
no ccdata for:  2282.0
no ccdata for:  36005.0
no ccdata for:  36047.0
no ccdata for:  36061.0
no ccdata for:  36081.0
no ccdata for:  36085.0
no ccdata for:  nan
{'fips': 1001.0, 'state': 'Alabama', 'county': 'Autauga', 'cases': 6264, 'deaths': 91.0, 'population': 55639, 'cases_pct': 11.258290048347382, 'deaths_pct': 0.16355434137924837, 'obesity_pct': 29.6, 'diabetes_pct': 9.5}


In [31]:
# Copy each county's physical-inactivity percentage onto its ccdata record.
for _, pi_row in pi_df.iterrows():
    fips = pi_row['County_FIPS']
    record = ccdata.get(fips)
    if not record:
        # no covid record was stored for this FIPS value
        print("no ccdata for: ", fips)
        continue
    record['pi_pct'] = pi_row['Physical Inactivity Percentage']
print(ccdata[1001])

no ccdata for:  2060.0
no ccdata for:  2105.0
no ccdata for:  2164.0
no ccdata for:  2282.0
no ccdata for:  36005.0
no ccdata for:  36047.0
no ccdata for:  36061.0
no ccdata for:  36081.0
no ccdata for:  36085.0
no ccdata for:  nan
{'fips': 1001.0, 'state': 'Alabama', 'county': 'Autauga', 'cases': 6264, 'deaths': 91.0, 'population': 55639, 'cases_pct': 11.258290048347382, 'deaths_pct': 0.16355434137924837, 'obesity_pct': 29.6, 'diabetes_pct': 9.5, 'physical_inactivity_pct': 23.8, 'pi_pct': 23.8}


In [39]:
# Flatten ccdata into a DataFrame, keeping only counties that carry every
# metric the table needs.

# (ccdata key, ccdf column name) pairs; their order is the column order of ccdf.
ccdf_fields = [
    ('fips', 'County FIPS'),
    ('state', 'State'),
    ('county', 'County'),
    ('cases', 'Covid Cases'),
    ('deaths', 'Covid Deaths'),
    ('population', 'County Population'),
    ('cases_pct', 'Covid Case Percentage'),
    ('deaths_pct', 'Covid Death Percentage'),
    ('obesity_pct', 'Obesity Percentage'),
    ('diabetes_pct', 'Diabetes Percentage'),
    ('pi_pct', 'Physical Inactivity Percentage'),
]

# Keys that are only present when the matching merge cell found the county.
# All four are read below, so all four are checked; a county missing any of
# them is reported and skipped instead of raising KeyError.
required_metrics = [
    ('population', 'population'),
    ('obesity_pct', 'obesity'),
    ('diabetes_pct', 'diabetes'),
    ('pi_pct', 'physical inactivity'),
]

ccdatalist = []
for v in ccdata.values():
    # report only the first missing metric, in the order listed above
    missing = next(
        (label for key, label in required_metrics if key not in v),
        None,
    )
    if missing is not None:
        print("no " + missing + " data, ", v['fips'])
        continue

    ccdatalist.append([v[key] for key, _ in ccdf_fields])

ccdf = pd.DataFrame(ccdatalist, columns=[column for _, column in ccdf_fields])

no population data,  2997.0
no population data,  2261.0
no population data,  2998.0
no obesity data,  35039.0
no population data,  69110.0
no population data,  69120.0
no obesity data,  72001.0
no obesity data,  72003.0
no obesity data,  72005.0
no obesity data,  72007.0
no obesity data,  72009.0
no obesity data,  72011.0
no obesity data,  72013.0
no obesity data,  72015.0
no obesity data,  72017.0
no obesity data,  72019.0
no obesity data,  72021.0
no obesity data,  72023.0
no obesity data,  72025.0
no obesity data,  72027.0
no obesity data,  72029.0
no obesity data,  72031.0
no obesity data,  72033.0
no obesity data,  72035.0
no obesity data,  72037.0
no obesity data,  72039.0
no obesity data,  72041.0
no obesity data,  72043.0
no obesity data,  72045.0
no obesity data,  72047.0
no obesity data,  72049.0
no obesity data,  72051.0
no obesity data,  72053.0
no obesity data,  72054.0
no obesity data,  72055.0
no obesity data,  72057.0
no obesity data,  72059.0
no obesity data,  72061.0


In [136]:
# Pairwise Pearson correlation of the numeric columns of ccdf.
# ccdf also holds the text columns 'State' and 'County'; pandas >= 2.0 no
# longer drops non-numeric columns silently in DataFrame.corr and raises
# instead, so the numeric columns are selected explicitly (this form works on
# older pandas versions as well).
# NOTE(review): 'County FIPS' is an identifier, so its row/column in the
# result carries no analytical meaning.
ccdf.select_dtypes(include='number').corr(method='pearson')

Unnamed: 0,County FIPS,Covid Cases,Covid Deaths,County Population,Covid Case Percentage,Covid Death Percentage,Obesity Percentage,Diabetes Percentage,Physical Inactivity Percentage
County FIPS,1.0,0.004371,0.003397,-0.055882,-0.041958,-0.035142,0.005332,-0.082733,-0.155159
Covid Cases,0.004371,1.0,0.955142,0.267822,0.006598,-0.024492,-0.072476,-0.045507,-0.0314
Covid Deaths,0.003397,0.955142,1.0,0.178252,0.01281,-0.015572,-0.056952,-0.032139,-0.025655
County Population,-0.055882,0.267822,0.178252,1.0,-0.103838,-0.109318,-0.032191,-0.017881,-0.066131
Covid Case Percentage,-0.041958,0.006598,0.01281,-0.103838,1.0,0.982783,-0.109518,-0.065316,-0.046833
Covid Death Percentage,-0.035142,-0.024492,-0.015572,-0.109318,0.982783,1.0,-0.052663,-0.00046,-0.000539
Obesity Percentage,0.005332,-0.072476,-0.056952,-0.032191,-0.109518,-0.052663,1.0,0.628975,0.677908
Diabetes Percentage,-0.082733,-0.045507,-0.032139,-0.017881,-0.065316,-0.00046,0.628975,1.0,0.718043
Physical Inactivity Percentage,-0.155159,-0.0314,-0.025655,-0.066131,-0.046833,-0.000539,0.677908,0.718043,1.0


In [40]:
# number of rows in the assembled county table
print("processed data # of rows: ", len(ccdf.index))

processed data # of rows:  3131


In [41]:
# restrict the analysis to counties with a population of at least 50K
df_sub = ccdf[ccdf['County Population'] >= 50000]

###### whole country - Physical Inactivity correlation
print("# of counties: ", df_sub['County FIPS'].size)

# Pearson r and p-value of each covid rate against physical inactivity
pi_pct = df_sub['Physical Inactivity Percentage']
for label, outcome_col in (
    ("covid case - pi correlation:", 'Covid Case Percentage'),
    ("covid death - pi correlation:", 'Covid Death Percentage'),
):
    r, p = stats.pearsonr(df_sub[outcome_col], pi_pct)
    print(label, r, "p-value:", p)

# of counties:  985
covid case - pi correlation: 0.30083175293684705 p-value: 4.726008887156151e-22
covid death - pi correlation: 0.4060170420839583 p-value: 2.198308964206128e-40


In [42]:
###### whole country - Obesity correlation
print("# of counties: ", df_sub['County FIPS'].size)

# Pearson r and p-value of each covid rate against the obesity percentage
obesity_pct = df_sub['Obesity Percentage']
for label, outcome_col in (
    ("covid case - obesity correlation:", 'Covid Case Percentage'),
    ("covid death - obesity correlation:", 'Covid Death Percentage'),
):
    r, p = stats.pearsonr(df_sub[outcome_col], obesity_pct)
    print(label, r, "p-value:", p)

# of counties:  985
covid case - obesity correlation: 0.1726919522412191 p-value: 4.924064175306927e-08
covid death - obesity correlation: 0.2023788834323017 p-value: 1.4554021251953065e-10


In [43]:
###### whole country - Diabetes correlation
print("# of counties: ", df_sub['County FIPS'].size)

# Pearson r and p-value of each covid rate against the diabetes percentage
diabetes_pct = df_sub['Diabetes Percentage']
outcome_labels = {
    "covid case correlation:": 'Covid Case Percentage',
    "covid death correlation:": 'Covid Death Percentage',
}
for label, outcome_col in outcome_labels.items():
    r, p = stats.pearsonr(df_sub[outcome_col], diabetes_pct)
    print(label, r, "p-value:", p)

# of counties:  985
covid case correlation: 0.22372951996649532 p-value: 1.2208070327342218e-12
covid death correlation: 0.29895167831596625 p-value: 8.742715933432988e-22
