In [1]:
import pandas as pd
import numpy as np
import warnings

# NOTE(review): this silences every FutureWarning raised in the notebook,
# pandas deprecation notices included — consider removing it once the code
# no longer triggers any.
warnings.simplefilter(action='ignore',category=FutureWarning)

# Load the county-level daily data and discard the columns that are not used
# later in the notebook. `columns=` is used instead of a positional axis
# argument, which pandas 2.0 no longer accepts. Reading and dropping in one
# expression keeps the cell safe to re-run.
covid_df = pd.read_csv("us_counties_covid19_daily.csv").drop(columns=['fips','deaths'])
covid_df.head()

Unnamed: 0,date,county,state,cases
0,2020-01-21,Snohomish,Washington,1
1,2020-01-22,Snohomish,Washington,1
2,2020-01-23,Snohomish,Washington,1
3,2020-01-24,Cook,Illinois,1
4,2020-01-24,Snohomish,Washington,1


In [2]:
# (state, county) pairs that make up the "dense" county group.
# Each pair is matched exactly against the `state` and `county` columns of
# covid_df, so the spelling (including suffixes such as "city") must match
# the source data. The list order is the order used when processing.
dense_state_county = [
    ("Maryland", "Baltimore"),
    ("New Jersey", "Essex"),
    ("Illinois", "Cook"),
    ("New Jersey", "Union"),
    ("Virginia", "Norfolk city"),
    ("New York", "Nassau"),
    ("Virginia", "Harrisonburg city"),
]

In [3]:
# Process data for each dense county

# Collect one processed frame per county and concatenate them once at the
# end, rather than growing a DataFrame inside the loop.
county_frames = []

for state, county in dense_state_county:
    # Get the correct county data, re-indexed from 0
    county_mask = (covid_df['state'] == state) & (covid_df['county'] == county)
    county_df = covid_df[county_mask].reset_index(drop=True)

    if county_df.empty:
        # Fail with a clear message (e.g. for a misspelled name) instead of
        # an IndexError further down.
        raise ValueError(f"No rows found for county {county!r} in state {state!r}")

    cases = county_df['cases']

    # new_cases column
    # Cases added since the previous row. The first row has no predecessor,
    # so it is set to that row's own cumulative count.
    new_cases = cases.diff()
    new_cases.iloc[0] = cases.iloc[0]
    county_df['new_cases'] = new_cases

    # future_delta7 / future_delta14 columns
    # Change in cumulative cases from this row to the row 7 / 14 positions
    # later; left missing for the last 7 / 14 rows of the county.
    # NOTE(review): the offsets are row-based, so they equal "days ahead"
    # only if the data holds exactly one row per day in date order — confirm.
    county_df['future_delta7'] = cases.shift(-7) - cases
    county_df['future_delta14'] = cases.shift(-14) - cases

    # remove secondary words like 'city' from county names
    county_df['county'] = county.split(' ')[0]

    county_frames.append(county_df)

dense_county_covid_df = (
    # ignore_index=True gives the combined frame a fresh 0..n-1 index
    pd.concat(county_frames, ignore_index=True)
    # Switch to pandas' nullable dtypes so the whole-number float columns
    # become integers, with <NA> where no value was computed
    .convert_dtypes()
    # make all county and state names lowercase
    .assign(
        county=lambda df: df['county'].str.lower(),
        state=lambda df: df['state'].str.lower(),
    )
    # rename 'cases' to 'total_cases'
    .rename(columns={'cases':'total_cases'})
)

# save county results to csv
dense_county_covid_df.to_csv("data/covid/dense_county_covid.csv")

# check results
dense_county_covid_df.head(30)

Unnamed: 0,date,county,state,total_cases,new_cases,future_delta7,future_delta14
0,2020-03-11,baltimore,maryland,1,1,6,50
1,2020-03-12,baltimore,maryland,1,0,12,80
2,2020-03-13,baltimore,maryland,2,1,11,101
3,2020-03-14,baltimore,maryland,3,1,16,138
4,2020-03-15,baltimore,maryland,3,0,25,159
5,2020-03-16,baltimore,maryland,4,1,31,182
6,2020-03-17,baltimore,maryland,4,0,38,223
7,2020-03-18,baltimore,maryland,7,3,44,282
8,2020-03-19,baltimore,maryland,13,6,68,340
9,2020-03-20,baltimore,maryland,13,0,90,414


In [4]:
# Inspect the final 30 rows of the combined dense-county frame
dense_county_covid_df.iloc[-30:]

Unnamed: 0,date,county,state,total_cases,new_cases,future_delta7,future_delta14
1903,2020-11-06,harrisonburg,virginia,3183,22,52.0,136.0
1904,2020-11-07,harrisonburg,virginia,3197,14,50.0,150.0
1905,2020-11-08,harrisonburg,virginia,3197,0,50.0,162.0
1906,2020-11-09,harrisonburg,virginia,3205,8,51.0,169.0
1907,2020-11-10,harrisonburg,virginia,3217,12,74.0,173.0
1908,2020-11-11,harrisonburg,virginia,3224,7,72.0,173.0
1909,2020-11-12,harrisonburg,virginia,3225,1,71.0,197.0
1910,2020-11-13,harrisonburg,virginia,3235,10,84.0,189.0
1911,2020-11-14,harrisonburg,virginia,3247,12,100.0,214.0
1912,2020-11-15,harrisonburg,virginia,3247,0,112.0,224.0


In [5]:
# (state, county) pairs that make up the "sparse" county group.
# Each pair is matched exactly against the `state` and `county` columns of
# covid_df, so the spelling must match the source data. The list order is
# the order used when processing.
sparse_state_county = [
    ("Virginia", "Fairfax"),
    ("New Jersey", "Camden"),
    ("Texas", "Harris"),
    ("Ohio", "Franklin"),
    ("Indiana", "Marion"),
    ("Georgia", "DeKalb"),
    ("Florida", "Duval"),
    ("North Carolina", "Wake"),
    ("Texas", "Bexar"),
]

In [6]:
# Process data for each sparse county

# Collect one processed frame per county and concatenate them once at the
# end, rather than growing a DataFrame inside the loop.
county_frames = []

for state, county in sparse_state_county:
    # Get the correct county data, re-indexed from 0
    county_mask = (covid_df['state'] == state) & (covid_df['county'] == county)
    county_df = covid_df[county_mask].reset_index(drop=True)

    if county_df.empty:
        # Fail with a clear message (e.g. for a misspelled name) instead of
        # an IndexError further down.
        raise ValueError(f"No rows found for county {county!r} in state {state!r}")

    cases = county_df['cases']

    # new_cases column
    # Cases added since the previous row. The first row has no predecessor,
    # so it is set to that row's own cumulative count.
    new_cases = cases.diff()
    new_cases.iloc[0] = cases.iloc[0]
    county_df['new_cases'] = new_cases

    # future_delta7 / future_delta14 columns
    # Change in cumulative cases from this row to the row 7 / 14 positions
    # later; left missing for the last 7 / 14 rows of the county.
    # NOTE(review): the offsets are row-based, so they equal "days ahead"
    # only if the data holds exactly one row per day in date order — confirm.
    county_df['future_delta7'] = cases.shift(-7) - cases
    county_df['future_delta14'] = cases.shift(-14) - cases

    county_frames.append(county_df)

sparse_county_covid_df = (
    # ignore_index=True gives the combined frame a fresh 0..n-1 index
    pd.concat(county_frames, ignore_index=True)
    # Switch to pandas' nullable dtypes so the whole-number float columns
    # become integers, with <NA> where no value was computed
    .convert_dtypes()
    # make all county and state names lowercase (county names are otherwise
    # kept exactly as listed in sparse_state_county)
    .assign(
        county=lambda df: df['county'].str.lower(),
        state=lambda df: df['state'].str.lower(),
    )
    # rename 'cases' to 'total_cases'
    .rename(columns={'cases':'total_cases'})
)

# save county results to csv
sparse_county_covid_df.to_csv("data/covid/sparse_county_covid.csv")

# check results
sparse_county_covid_df.head(30)

Unnamed: 0,date,county,state,total_cases,new_cases,future_delta7,future_delta14
0,2020-03-07,fairfax,virginia,1,1,9,21
1,2020-03-08,fairfax,virginia,2,1,8,29
2,2020-03-09,fairfax,virginia,4,2,6,39
3,2020-03-10,fairfax,virginia,4,0,8,42
4,2020-03-11,fairfax,virginia,4,0,10,72
5,2020-03-12,fairfax,virginia,4,0,12,120
6,2020-03-13,fairfax,virginia,6,2,10,118
7,2020-03-14,fairfax,virginia,10,4,12,146
8,2020-03-15,fairfax,virginia,10,0,21,177
9,2020-03-16,fairfax,virginia,10,0,33,215


In [7]:
# Inspect the final 30 rows of the combined sparse-county frame
sparse_county_covid_df.iloc[-30:]

Unnamed: 0,date,county,state,total_cases,new_cases,future_delta7,future_delta14
2454,2020-11-06,bexar,texas,67139,218,1249.0,5186.0
2455,2020-11-07,bexar,texas,67377,238,2428.0,5440.0
2456,2020-11-08,bexar,texas,67639,262,2415.0,6255.0
2457,2020-11-09,bexar,texas,68056,417,2192.0,6547.0
2458,2020-11-10,bexar,texas,68388,332,2152.0,7342.0
2459,2020-11-11,bexar,texas,68388,0,2518.0,8374.0
2460,2020-11-12,bexar,texas,68388,0,3001.0,8374.0
2461,2020-11-13,bexar,texas,68388,0,3937.0,8374.0
2462,2020-11-14,bexar,texas,69805,1417,3012.0,8618.0
2463,2020-11-15,bexar,texas,70054,249,3840.0,10015.0
