In [1]:
from datetime import date
import covidcast

The date range may be widened later to provide more training data.

In [2]:
# Study window passed to every covidcast.signal() request below.
start, end = date(2020, 3, 1), date(2021, 3, 1)

# Look up every FIPS code matching the "06" prefix and invert the result into
# {name: fips}. Only the first element of the lookup result is used, and only
# the first name listed for each code.
CA_counties_to_fips = {
    names[0]: fips
    for fips, names in covidcast.fips_to_name('^06.*', ties_method='all')[0].items()
}

# CA_counties_to_fips

In [10]:
# Convert the county names back into FIPS codes; the first code returned is
# discarded by the [1:] slice.
# NOTE(review): the names are looked up without any state constraint, so a
# county name shared with another state may resolve to a non-California code
# (presumably the origin of '04023' in the hospital-admissions output further
# down) -- confirm against the covidcast name_to_fips documentation.
CA_counties = covidcast.name_to_fips(list(CA_counties_to_fips))[1:]

# CA_counties

### Indicator Combination: ground truth

In [4]:
# Fetch the 'confirmed_incidence_num' signal for the requested counties over
# the study window; it is used as the ground truth further down.
indicator_combination = covidcast.signal(
    data_source='indicator-combination',
    signal='confirmed_incidence_num',
    start_day=start, end_day=end, geo_values=CA_counties
)

# Keep California rows only (FIPS prefix "06"). This replaces a positional
# drop([0, 1]), which removed whichever rows carried the index labels 0 and 1
# rather than selecting rows by county.
indicator_combination = indicator_combination[
    indicator_combination['geo_value'].str.startswith('06')
]
indicator_combination['geo_value'].unique()

array(['06001', '06003', '06005', '06007', '06009', '06011', '06013',
       '06015', '06017', '06019', '06021', '06023', '06025', '06027',
       '06029', '06031', '06033', '06035', '06037', '06039', '06041',
       '06043', '06045', '06047', '06049', '06051', '06053', '06055',
       '06059', '06061', '06063', '06065', '06067', '06069', '06071',
       '06073', '06075', '06077', '06079', '06081', '06083', '06085',
       '06089', '06091', '06093', '06095', '06097', '06099', '06101',
       '06103', '06105', '06107', '06109', '06111', '06113', '06115'],
      dtype=object)

### Change Healthcare: % of outpatient doctor visits with confirmed COVID-19

In [5]:
# Fetch the Change Healthcare 'smoothed_adj_outpatient_covid' signal for the
# requested counties over the study window.
change_health = covidcast.signal(
    data_source='chng',
    signal='smoothed_adj_outpatient_covid',
    start_day=start, end_day=end, geo_values=CA_counties
)

# Keep California rows only (FIPS prefix "06"). This replaces a positional
# drop([0, 1]), which removed whichever rows carried the index labels 0 and 1
# rather than selecting rows by county.
change_health = change_health[change_health['geo_value'].str.startswith('06')]
change_health['geo_value'].unique()

array(['06001', '06005', '06007', '06009', '06011', '06013', '06015',
       '06017', '06019', '06021', '06023', '06025', '06027', '06029',
       '06031', '06033', '06035', '06037', '06039', '06041', '06043',
       '06045', '06047', '06049', '06051', '06053', '06055', '06059',
       '06061', '06063', '06065', '06067', '06069', '06071', '06073',
       '06075', '06077', '06079', '06081', '06083', '06085', '06089',
       '06091', '06093', '06095', '06097', '06099', '06101', '06103',
       '06105', '06107', '06109', '06111', '06113', '06115'], dtype=object)

### Hospital Admissions: % of new hospital admissions with COVID-associated diagnoses, based on claims data from health system partners, smoothed in time using a Gaussian linear smoother

In [6]:
# Fetch the hospital-admissions claims signal for the requested counties over
# the study window, then list which counties actually came back.
hospital_admit = covidcast.signal(
    data_source='hospital-admissions',
    signal='smoothed_adj_covid19_from_claims',
    start_day=start,
    end_day=end,
    geo_values=CA_counties,
)

hospital_admit.geo_value.unique()

array(['06001', '06013', '06029', '06037', '06059', '06061', '06065',
       '06067', '06071', '06073', '06075', '06081', '06085', '06111',
       '06083', '06077', '06019', '06031', '06099', '06047', '06095',
       '06041', '06079', '06097', '06053', '06107', '04023', '06007',
       '06113', '06017'], dtype=object)

In [22]:
# Keep California rows only (FIPS prefix "06"). The previous version excluded
# the single code '04023'; filtering on the prefix removes that row as well as
# any other non-California code that may be returned.
hospital_admit = hospital_admit[hospital_admit['geo_value'].str.startswith('06')]
hospital_admit['geo_value'].unique()

array(['06001', '06013', '06029', '06037', '06059', '06061', '06065',
       '06067', '06071', '06073', '06075', '06081', '06085', '06111',
       '06083', '06077', '06019', '06031', '06099', '06047', '06095',
       '06041', '06079', '06097', '06053', '06107', '06007', '06113',
       '06017'], dtype=object)

### Doctor Visits: % of outpatient doctor visits primarily about COVID-like illness (a different data source from Change Healthcare)

In [7]:
# Fetch the doctor-visits 'smoothed_adj_cli' signal for the requested counties
# over the study window.
doc_visits = covidcast.signal(
    data_source="doctor-visits",
    signal="smoothed_adj_cli",
    start_day=start, end_day=end, geo_values=CA_counties
)

# Keep California rows only (FIPS prefix "06"). This replaces a positional
# drop([0]), which removed whichever rows carried the index label 0 rather
# than selecting rows by county.
doc_visits = doc_visits[doc_visits['geo_value'].str.startswith('06')]
doc_visits['geo_value'].unique()

array(['06001', '06005', '06007', '06011', '06013', '06017', '06019',
       '06023', '06025', '06029', '06031', '06037', '06039', '06041',
       '06045', '06047', '06053', '06055', '06059', '06061', '06065',
       '06067', '06069', '06071', '06073', '06075', '06077', '06079',
       '06081', '06083', '06085', '06089', '06095', '06097', '06099',
       '06101', '06107', '06111', '06113', '06009', '06033', '06115',
       '06103', '06109', '06043', '06093', '06021', '06063', '06027',
       '06035'], dtype=object)

### Mobility data

In [11]:
# Fetch the SafeGraph 'restaurants_visit_prop' signal for the requested
# counties over the study window.
restaurants_prop = covidcast.signal(
    data_source="safegraph",
    signal="restaurants_visit_prop",
    start_day=start, end_day=end, geo_values=CA_counties
)

# Keep California rows only (FIPS prefix "06"). This replaces a positional
# drop([0]), which removed whichever rows carried the index label 0 rather
# than selecting rows by county.
restaurants_prop = restaurants_prop[
    restaurants_prop['geo_value'].str.startswith('06')
]
restaurants_prop['geo_value'].unique()



array(['06001', '06003', '06005', '06007', '06009', '06011', '06013',
       '06015', '06017', '06019', '06021', '06023', '06025', '06027',
       '06029', '06031', '06033', '06035', '06037', '06039', '06041',
       '06045', '06047', '06051', '06053', '06055', '06059', '06061',
       '06063', '06065', '06067', '06069', '06071', '06073', '06075',
       '06077', '06079', '06081', '06083', '06085', '06089', '06091',
       '06093', '06095', '06097', '06099', '06101', '06103', '06105',
       '06107', '06109', '06111', '06113', '06115', '06043'], dtype=object)

### Merge

In [120]:
# Signals to join into one wide frame. The order is significant: the rename in
# the next cell expects value columns whose names carry these list positions
# (..._0_value, ..._1_value, ...).
# restaurants_prop is currently left out of the join:
# df_list = [change_health, hospital_admit, doc_visits, restaurants_prop, indicator_combination]
df_list = [
    change_health,          # position 0
    hospital_admit,         # position 1
    doc_visits,             # position 2
    indicator_combination,  # position 3
]

merged = covidcast.aggregate_signals(df_list)

In [121]:
import numpy as np

# Give the aggregated value columns short names. The numeric part of each
# original name is the signal's position in df_list, so this mapping has to be
# kept in step with that list. Entries for signals that are not part of the
# join (fb-survey, safegraph restaurants) are intentionally absent.
merged = merged.rename(
    columns={
        'chng_smoothed_adj_outpatient_covid_0_value': 'change_health',
        'hospital-admissions_smoothed_adj_covid19_from_claims_1_value': 'hospital_admit',
        'doctor-visits_smoothed_adj_cli_2_value': 'doc_visits',
        'indicator-combination_confirmed_incidence_num_3_value': 'ground_truth'
    }
)

# Identifier columns plus the renamed value columns; everything else produced
# by the aggregation is discarded.
keep_list = ['geo_value', 'time_value',
             'change_health', 'hospital_admit',
             'doc_visits', 'ground_truth']

# Take an explicit copy so the assignment below writes to an independent frame
# instead of to the result of a column selection (avoids
# SettingWithCopyWarning and ambiguous write-through).
merged = merged[keep_list].copy()

# Negative daily counts are replaced by their absolute value.
merged['ground_truth'] = merged['ground_truth'].abs()
np.sort(merged['ground_truth'].unique())

array([0.0000e+00, 1.0000e+00, 2.0000e+00, ..., 2.1902e+04, 2.2236e+04,
       2.8549e+04])

Missing values arise because the data sources do not all cover the same counties. For each day, we replace every NaN in a column with that column's mean over the counties that did report on that day.

In [123]:
# Impute missing values with the same-day mean: for every day, each NaN in a
# column is replaced by that column's mean over the rows of that day.
#
# Implemented with a grouped transform instead of a Python loop over dates.
# Besides being faster, this avoids a loop variable named `date`, which would
# shadow the `datetime.date` import and break a re-run of the cell that
# defines `start` and `end`.
impute_cols = ['change_health', 'hospital_admit', 'doc_visits', 'ground_truth']

# Same shape and index as merged[impute_cols]; every row holds its day's mean.
daily_means = merged.groupby('time_value')[impute_cols].transform('mean')

# The ground truth is imputed with the whole-number part of the daily mean.
# np.trunc matches int() truncation but leaves NaN in place when a day has no
# ground truth at all, where int() would raise.
daily_means['ground_truth'] = np.trunc(daily_means['ground_truth'])

# assign() returns a new frame, so no write goes into a possibly shared slice.
merged = merged.assign(
    **{col: merged[col].fillna(daily_means[col]) for col in impute_cols}
).sort_values(['time_value', 'geo_value'])

merged[merged['geo_value']=='06001']

Unnamed: 0,geo_value,time_value,change_health,hospital_admit,doc_visits,ground_truth
0,06001,2020-03-01,0.011162,0.119646,0.000000,0.0
56,06001,2020-03-02,0.013988,0.119067,0.000000,0.0
112,06001,2020-03-03,0.012357,0.119366,0.000000,1.0
168,06001,2020-03-04,0.011438,0.119776,0.019267,0.0
224,06001,2020-03-05,0.010639,0.119895,0.016927,0.0
...,...,...,...,...,...,...
20216,06001,2021-02-25,0.342982,0.122985,7.109562,138.0
20272,06001,2021-02-26,0.325226,0.624193,6.746090,123.0
20328,06001,2021-02-27,0.318205,1.216079,6.585704,122.0
20384,06001,2021-02-28,0.337996,1.768820,6.048529,96.0


In [124]:
# Number of distinct counties in the merged frame. Kept as a notebook variable
# for reference; the shifts below no longer depend on it.
data_shift = len(merged['geo_value'].unique())

# Feature columns and the names of their one-day-lagged counterparts.
# (A two-day lag and the restaurants signal are currently disabled.)
today_list = ['change_health', 'hospital_admit', 'doc_visits']
yesterday_list = ['change_health-1', 'hospital_admit-1', 'doc_visits-1']

# Rows must be in date order within each county for the shifts to mean
# "next row" / "previous row" in time.
merged = merged.sort_values(['time_value', 'geo_value'])

# Shift inside each county rather than by a fixed row offset: a fixed offset
# of `data_shift` rows is only correct when every county has a row on every
# day, and silently pairs different counties otherwise.
# Target: the county's ground truth on its next row.
merged['ground_truth+1'] = merged.groupby('geo_value')['ground_truth'].shift(-1)

# Lagged features: the county's value on its previous row. The loop previously
# also zipped over `before_yesterday_list`, which is only defined in a comment
# and therefore raised NameError on a fresh kernel.
for today, yesterday in zip(today_list, yesterday_list):
    merged[yesterday] = merged.groupby('geo_value')[today].shift(1)

# The first and last row of every county lack a lag or a target; drop them.
time_series = merged.dropna()
time_series[time_series['geo_value']=='06001']

Unnamed: 0,geo_value,time_value,change_health,hospital_admit,doc_visits,ground_truth,ground_truth+1,change_health-1,hospital_admit-1,doc_visits-1
56,06001,2020-03-02,0.013988,0.119067,0.000000,0.0,1.0,0.011162,0.119646,0.000000
112,06001,2020-03-03,0.012357,0.119366,0.000000,1.0,0.0,0.013988,0.119067,0.000000
168,06001,2020-03-04,0.011438,0.119776,0.019267,0.0,0.0,0.012357,0.119366,0.000000
224,06001,2020-03-05,0.010639,0.119895,0.016927,0.0,0.0,0.011438,0.119776,0.019267
280,06001,2020-03-06,0.010242,0.119820,0.014341,0.0,1.0,0.010639,0.119895,0.016927
...,...,...,...,...,...,...,...,...,...,...
20160,06001,2021-02-24,0.344247,0.099222,7.063695,153.0,138.0,0.345074,0.093383,7.354877
20216,06001,2021-02-25,0.342982,0.122985,7.109562,138.0,123.0,0.344247,0.099222,7.063695
20272,06001,2021-02-26,0.325226,0.624193,6.746090,123.0,122.0,0.342982,0.122985,7.109562
20328,06001,2021-02-27,0.318205,1.216079,6.585704,122.0,96.0,0.325226,0.624193,6.746090


In [125]:
# Write the modelling table to disk as a plain CSV file, leaving out the row
# index. (A zip-compressed export was tried earlier and is no longer used.)
time_series.to_csv(path_or_buf='time_series.csv', index=False)

### Drop NaN values

In [111]:
# # df_list = [change_health, hospital_admit, doc_visits, restaurants_prop, indicator_combination]
# df_list = [change_health, hospital_admit, doc_visits, indicator_combination]

# merged = covidcast.aggregate_signals(df_list)

In [112]:
# import numpy as np

# merged = merged.rename(
#     columns={
#         'chng_smoothed_adj_outpatient_covid_0_value': 'change_health',
#         'hospital-admissions_smoothed_adj_covid19_from_claims_1_value': 'hospital_admit',
# #         'fb-survey_smoothed_cli_3_value': 'survey',
#         'doctor-visits_smoothed_adj_cli_2_value': 'doc_visits',
# #         'safegraph_restaurants_visit_prop_3_value': 'restaurants_prop',
#         'indicator-combination_confirmed_incidence_num_3_value': 'ground_truth'
#     }
# )

# # keep_list = ['geo_value', 'time_value',
# #              'change_health', 'hospital_admit',
# #              'doc_visits', 'restaurants_prop', 'ground_truth']
# keep_list = ['geo_value', 'time_value',
#              'change_health', 'hospital_admit',
#              'doc_visits', 'ground_truth']
# merged = merged[keep_list]
# merged = merged.dropna().sort_values(by=['geo_value', 'time_value'])
# merged = merged.drop([4982,5038])
# merged

In [113]:
# today_list = ['change_health', 'hospital_admit', 'doc_visits']
# yesterday_list = ['change_health-1', 'hospital_admit-1', 'doc_visits-1']

# # before_yesterday_list = ['change_health-2', 'hospital_admit-2', 'doc_visits-2', 'restaurants_prop-2']

# merged['ground_truth+1'] = merged['ground_truth'].shift(-1)
# for county in merged['geo_value'].unique():
#     merged.loc[merged['geo_value']==county, 'ground_truth+1'] = merged.loc[merged['geo_value']==county, 'ground_truth'].shift(-1)
#     for i in range(len(today_list)):
#         merged.loc[merged['geo_value']==county, yesterday_list[i]] = merged.loc[merged['geo_value']==county, today_list[i]].shift()
    
# drop_na = merged.drop(columns='ground_truth').dropna()
# drop_na

In [114]:
# # export as a csv
# import pandas as pd
# compression_opts = dict(method='zip',
#                         archive_name='drop_na.csv')
# drop_na.to_csv('drop_na.zip', index=False,
#           compression=compression_opts)