In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model as lm
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import datetime
pd.set_option('display.max_columns', 5000)

In [2]:
# !curl https://raw.githubusercontent.com/JieYingWu/COVID-19_US_County-level_Summaries/master/data/counties.csv  -o ../data/raw/counties_09-11.csv
# !curl https://raw.githubusercontent.com/JieYingWu/COVID-19_US_County-level_Summaries/master/data/deaths_timeseries.csv -o ../data/raw/time_series_covid19_deaths_US_06-19.csv
# !curl https://opendata.arcgis.com/datasets/6ac5e325468c4cb9b905f1728d6fbf0f_0.csv?outSR=%7B%22latestWkid%22%3A3857%2C%22wkid%22%3A102100%7D -o ../data/raw/hospitals.csv
# !curl https://raw.githubusercontent.com/descarteslabs/DL-COVID-19/master/DL-us-m50.csv -o ../data/raw/DL-us-m50.csv
# !curl https://raw.githubusercontent.com/descarteslabs/DL-COVID-19/master/DL-us-m50_index.csv -o ../data/raw/DL-us-m50_index.csv
# !curl https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv -o ../data/raw/time_series_covid19_deaths_US_05-22.csv
# !curl https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv?cachebust=694ae9957380f150 -o ../data/raw/google_mobility_report_06-14-20.csv
# !curl https://raw.githubusercontent.com/HopkinsIDD/hit-covid/master/data/hit-covid-longdata.csv -o ../data/raw/hit_covid_interventions_06-26-20.csv
# !curl https://raw.githubusercontent.com/Keystone-Strategy/covid19-intervention-data/master/complete_npis_raw_policies.csv -o ../data/raw/complete_npis_inherited_policies_keystone_06_26.csv # have to delete first cell/title
# !curl https://raw.githubusercontent.com/COVID19StatePolicy/SocialDistancing/master/data/USstatesCov19distancingpolicy.csv -o ../data/raw/USstatesCov19distancingpolicy_07_01.csv
# !curl https://data.cms.gov/resource/s2uc-8wxp.csv -o ../data/raw/longtermcare_deaths_07_08.csv

In [3]:
# Load the raw hospitals table and list its column names.
hospitals = pd.read_csv('../data/raw/hospitals.csv')
hospitals.columns

Index(['FID', 'ID', 'NAME', 'ADDRESS', 'CITY', 'STATE', 'ZIP', 'ZIP4',
       'TELEPHONE', 'TYPE', 'STATUS', 'POPULATION', 'COUNTY', 'COUNTYFIPS',
       'COUNTRY', 'LATITUDE', 'LONGITUDE', 'NAICS_CODE', 'NAICS_DESC',
       'SOURCE', 'SOURCEDATE', 'VAL_METHOD', 'VAL_DATE', 'WEBSITE', 'STATE_ID',
       'ALT_NAME', 'ST_FIPS', 'OWNER', 'TTL_STAFF', 'BEDS', 'TRAUMA',
       'HELIPAD'],
      dtype='object')

## Pull Hospital Data

### Long-term Care Hospitals

In [4]:
# Lookup table used to attach a FIPS code to each long-term care hospital.
fipscodes = pd.read_csv('../data/raw/countyfipstool2019.csv').filter(['sab', 'cname', 'fips'])

# Raw facility list; expose the bed count under the column name used downstream.
longterm_all = pd.read_csv('../data/raw/Long-_Term_Care_Hospital_-_General_Information.csv')
longterm_all = (
    longterm_all
    .assign(**{'Long-term beds': longterm_all['Total Number of Beds']})
    .drop(columns='Total Number of Beds')
)

# Inner join of ('sab', 'cname') against ('State', 'County Name'); unmatched facilities are dropped.
longterm_all = fipscodes.merge(longterm_all, left_on=['sab', 'cname'], right_on=['State', 'County Name'])
longterm_all = (
    longterm_all
    .assign(FIPS=longterm_all['fips'])
    .drop(columns=['sab', 'cname', 'fips', 'State', 'County Name'])
)

# 'State' and 'County Name' were dropped above; filter() silently skips labels that are absent.
longterm = longterm_all.filter(['FIPS', 'Long-term beds', 'State', 'County Name']).dropna()
longterm.head()

Unnamed: 0,FIPS,Long-term beds
0,1015,38
1,1069,35
2,1073,38
3,1073,45
4,1083,31


In [5]:
# Per-county totals for long-term care hospitals: number of facilities and summed beds.
# A single groupby aggregation replaces the row-by-row DataFrame.append loop
# (append is quadratic and no longer exists in pandas >= 2.0).
fips_groups = longterm.groupby("FIPS")

LT_hosp_data = (
    fips_groups['Long-term beds']
    .agg(**{'Longterm HospCt': 'size', 'Longterm Beds': 'sum'})
    .reset_index()
)

LT_hosp_data.head()

Unnamed: 0,FIPS,Longterm HospCt,Longterm Beds
0,1015,1,38
1,1069,1,35
2,1073,2,83
3,1083,1,31
4,1097,1,22


### Nursing Homes

In [6]:
# Nursing homes that are currently open, reduced to county FIPS and bed count.
nursing_homes = (
    pd.read_csv('../data/raw/Nursing_Homes.csv')
    .filter(['COUNTYFIPS', 'BEDS', 'STATUS'])
    .loc[lambda homes: homes['STATUS'] == 'OPEN']
    .drop(columns='STATUS')
)
nursing_homes.head()

Unnamed: 0,COUNTYFIPS,BEDS
0,54019,58
1,51153,107
2,37051,170
3,25017,-999
4,55035,6


In [7]:
# Per-county nursing-home totals: number of facilities and summed beds.
# One groupby aggregation replaces the row-by-row DataFrame.append loop
# (append is quadratic and no longer exists in pandas >= 2.0).
# NOTE(review): BEDS contains -999 entries (visible in the preview above), which
# look like a missing-value sentinel and are summed as-is here — confirm.
nursing_fips = nursing_homes.groupby('COUNTYFIPS')
nursing = (
    nursing_fips['BEDS']
    .agg(NursingCt='size', NursingBeds='sum')
    .rename_axis('FIPS')
    .reset_index()
)
nursing.head()

Unnamed: 0,FIPS,NursingCt,NursingBeds
0,1001,6,315
1,1003,25,1816
2,1005,3,212
3,1007,1,131
4,1009,9,367


### General Hospitals

In [8]:
# Created by Michael
# Open general acute care hospitals with a positive bed count, aggregated per county.
hospitals = pd.read_csv('../data/raw/hospitals.csv')[['TYPE', 'STATUS', 'COUNTYFIPS', 'BEDS', 'STATE', 'COUNTY']]
print(set(hospitals["TYPE"]))
hospitals = hospitals[hospitals["STATUS"] == 'OPEN']
hospitals = hospitals[hospitals["TYPE"] == 'GENERAL ACUTE CARE']
hospitals = hospitals[hospitals["BEDS"].astype(str).astype(int) > 0]
hospitals["FIPS"] = hospitals["COUNTYFIPS"]
# Rows whose county code is the literal string 'NOT AVAILABLE' cannot be cast to int below.
hospitals = hospitals[hospitals["FIPS"] != 'NOT AVAILABLE']
hospitals = hospitals.drop(["COUNTYFIPS", "STATUS"], axis=1)
hospitals["FIPS"] = hospitals["FIPS"].astype(str).astype(int)

fips_groups = hospitals.groupby("FIPS")

# One aggregation instead of appending a row per county (DataFrame.append is
# quadratic and no longer exists in pandas >= 2.0).
hosp_data = fips_groups["BEDS"].agg(HospCt='size', Beds='sum').reset_index()

hosp_data.head()

{'LONG TERM CARE', 'REHABILITATION', 'PSYCHIATRIC', 'GENERAL ACUTE CARE', 'CHRONIC DISEASE', 'SPECIAL', 'MILITARY', 'CRITICAL ACCESS', 'CHILDREN', 'WOMEN'}


Unnamed: 0,FIPS,HospCt,Beds
0,1001,1,85
1,1003,3,332
2,1005,1,74
3,1007,1,35
4,1011,1,61


## County Data

In [9]:
# County-level covariates, joined with the per-county hospital / long-term care /
# nursing-home aggregates built in the cells above.
county_features = [
    'FIPS',
    'Density per square mile of land area - Population',
    'Rural-urban_Continuum Code_2013',
    'Percent of adults with less than a high school diploma 2014-18',
    "Percent of adults with a bachelor's degree or higher 2014-18",
    'Unemployment_rate_2018',
    'Med_HH_Income_Percent_of_State_Total_2018',
    'Jan Temp AVG / F', 'Feb Temp AVG / F', 'Mar Temp AVG / F', 'Apr Temp AVG / F',
    'May Temp AVG / F', 'Jun Temp AVG / F', 'Jul Temp AVG / F', 'Aug Temp AVG / F',
    'Sep Temp AVG / F', 'Oct Temp AVG / F', 'Nov Temp AVG / F', 'Dec Temp AVG / F',
    'Total_age65plus',
    'Total households!!Average household size',
    'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Nursery school preschool',
    'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Kindergarten',
    'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Elementary school (grades 1-8)',
    'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!High school (grades 9-12)',
    'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!College or graduate school',
    'POP_ESTIMATE_2018',
    'H_MALE', 'H_FEMALE', 'BA_MALE', 'BA_FEMALE',
]
facility_cols = ['HospCt', 'Beds', 'NursingCt', 'NursingBeds', 'Longterm HospCt', 'Longterm Beds']

counties = pd.read_csv('../data/raw/counties_09-11.csv')
# FIPS becomes the index here, so filter() keeps only the remaining listed columns;
# the merges below still join on it by name.
counties = counties.set_index('FIPS')
counties = counties.filter(county_features)
print(counties.shape)

# Left joins keep every county, including those without any facility record.
counties = counties.merge(hosp_data, how='left', on='FIPS')
counties = counties.merge(LT_hosp_data, how='left', on='FIPS')
counties = counties.merge(nursing, how='left', on='FIPS')
print("test:", counties.shape)

# A county missing from a facility table has zero facilities / beds of that kind.
counties[facility_cols] = counties[facility_cols].fillna(value=0)
print(counties.shape)

# From here on 'HospCt' holds the hospital BED count capped at 1000, not a hospital count.
counties = counties.drop(columns=['HospCt'])
counties['HospCt'] = counties['Beds']
# .loc assignment instead of chained indexing, which raised SettingWithCopyWarning.
counties.loc[counties['HospCt'] > 1000, 'HospCt'] = 1000
print('range hosp beds: ', counties['Beds'].min(axis=0), counties['Beds'].max(axis=0))
counties = counties.drop(columns=['Beds'])
print('range hosp beds: ', counties['HospCt'].min(axis=0), counties['HospCt'].max(axis=0))

# Fold long-term care hospitals into the nursing-home count.
print('range Nursing Ct: ', counties['NursingCt'].min(axis=0), np.mean(counties['NursingCt']), np.median(counties['NursingCt']), counties['NursingCt'].max(axis=0))
counties['NursingCt'] = counties['NursingCt'] + counties['Longterm HospCt']
print('range Nursing Ct: ', counties['NursingCt'].min(axis=0), np.mean(counties['NursingCt']), np.median(counties['NursingCt']), counties['NursingCt'].max(axis=0))

counties = counties.drop(columns=['Longterm HospCt'])
counties = counties.drop(columns=['Longterm Beds'])
counties = counties.drop(columns=['NursingBeds'])

# Collapse the male/female splits into one population column per group.
counties['Hispanic Population'] = counties['H_MALE'] + counties['H_FEMALE']
counties = counties.drop(columns=['H_FEMALE', 'H_MALE'])
counties['Black Population'] = counties['BA_MALE'] + counties['BA_FEMALE']
counties = counties.drop(columns=['BA_FEMALE', 'BA_MALE'])
print(counties.shape)
counties.head()

(3273, 30)
(3273, 30)
test: (3273, 37)
(3273, 37)
range hosp beds:  0 24676
range hosp beds:  0 1000
range Nursing Ct:  0 12.047662694775436 4.0 1662
range Nursing Ct:  0 12.15948670944088 4.0 1666
(3273, 31)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,FIPS,Density per square mile of land area - Population,Rural-urban_Continuum Code_2013,Percent of adults with less than a high school diploma 2014-18,Percent of adults with a bachelor's degree or higher 2014-18,Unemployment_rate_2018,Med_HH_Income_Percent_of_State_Total_2018,Jan Temp AVG / F,Feb Temp AVG / F,Mar Temp AVG / F,Apr Temp AVG / F,May Temp AVG / F,Jun Temp AVG / F,Jul Temp AVG / F,Aug Temp AVG / F,Sep Temp AVG / F,Oct Temp AVG / F,Nov Temp AVG / F,Dec Temp AVG / F,Total_age65plus,Total households!!Average household size,SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Nursery school preschool,SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Kindergarten,SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Elementary school (grades 1-8),SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!High school (grades 9-12),SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!College or graduate school,POP_ESTIMATE_2018,NursingCt,HospCt,Hispanic Population,Black Population
0,0,87.4,,12.3,31.5,3.9,,,,,,,,,,,,,,52431193.0,0.0,4300436.0,3397087.0,27916769.0,14550764.0,19757430.0,327167434,0,0,59722584.0,43799095.0
1,1000,94.4,,14.2,24.9,3.9,100.0,46.06,55.57,54.56,63.26,73.92,77.8,80.4,80.34,79.93,67.48,50.28,50.94,,,,,,,,4887871,0,0,,
2,1001,91.8,2.0,11.3,27.7,3.6,119.0,47.0,58.6,56.1,65.0,75.5,79.3,81.5,81.8,81.6,69.3,51.7,52.0,8653.0,,,,,,,55601,6,85,1649.0,10915.0
3,1003,114.6,3.0,9.7,31.3,3.6,115.5,52.0,62.0,59.7,66.0,76.5,81.3,82.5,82.5,82.1,72.3,56.2,56.1,44571.0,2.57,3065.0,1449.0,23254.0,11209.0,6503.0,218022,25,332,10131.0,19492.0
4,1005,31.0,6.0,27.0,12.2,5.2,68.9,49.0,58.5,57.4,64.9,75.2,79.6,81.3,81.1,80.6,70.9,52.8,51.6,4832.0,,,,,,,24881,3,74,1064.0,12042.0


## Deaths Data

In [None]:
# County death time series; the free-text 'Combined_Key' label column is not needed.
deaths_t_series = (
    pd.read_csv('../data/raw/time_series_covid19_deaths_US_06-19.csv')
    .drop(columns=['Combined_Key'])
)

deaths_t_series.head()

In [None]:
# For every county, find the outbreak date (first date with at least `death_thresh`
# deaths) and record the death count `n_days` after it.
dates = deaths_t_series.columns[1:]  # assumes column 0 is FIPS and the rest are dates — TODO confirm
death_thresh = 3
n_days = 21

death_rows = []
for i in range(len(deaths_t_series)):
    county = deaths_t_series.iloc[i]

    # Position of the first date on which the count reaches the threshold.
    onset = next((j for j in range(len(dates)) if county[dates[j]] >= death_thresh), None)

    # Skip counties that never reach the threshold, or that reach it fewer than
    # n_days before the last date in the file. This explicit bounds check replaces
    # a bare `except` that also swallowed every unrelated error.
    if onset is None or onset + n_days >= len(dates):
        continue

    n_days_later = dates[onset + n_days]
    deaths_at_end = county[n_days_later]

    # Make sure the data is bug-free: the count n_days later must not be lower
    # than any count inside the window.
    if any(deaths_at_end < county[dates[onset + k]] for k in range(n_days)):
        continue

    death_rows.append({
        'FIPS': county['FIPS'],
        'Deaths': deaths_at_end,
        'Outbreak_date': datetime.datetime.strptime(dates[onset], '%m/%d/%y'),
    })

# Build the frame once instead of appending row by row (DataFrame.append is
# quadratic and no longer exists in pandas >= 2.0).
death_data = pd.DataFrame(death_rows, columns=['FIPS', 'Deaths', 'Outbreak_date'])
death_data.head()

In [None]:
# Inner join: only counties that have a row in death_data survive; then index by FIPS.
combined_data = counties.merge(death_data, on='FIPS').set_index('FIPS')
print(combined_data.shape)
combined_data.head()

## Longterm Care Deaths

In [None]:
# print(longterm_all.loc[0, ])
# print(longterm_all.shape)
# longterm_all.head()

In [None]:
# longterm_deaths = pd.read_csv('../data/raw/longtermcare_deaths_07_08.csv')
# longterm_deaths['CMS Certification Number (CCN)'] = np.nan
# print(longterm_deaths.loc[0, 'federal_provider_number'])
# print(int(longterm_deaths.loc[0, 'federal_provider_number']))
# for i in longterm_deaths.index:
#     try: 
#         longterm_deaths.loc[i, 'CMS Certification Number (CCN)'] = int(longterm_deaths.loc[i, 'federal_provider_number'])
#     except:
#         longterm_deaths.drop(index=i, inplace=True)
#         continue
# # longterm_deaths = longterm_deaths.dropna()        
# longterm_deaths['CMS Certification Number (CCN)'] = longterm_deaths['CMS Certification Number (CCN)'].astype('int64')
# print(longterm_deaths.shape)
# longterm_deaths.head(15)

In [None]:
# for i in longterm_deaths['CMS Certification Number (CCN)'].values: 
#     if i in longterm_all['CMS Certification Number (CCN)'].values:
#         print(i)
# print(longterm_all['CMS Certification Number (CCN)'].values)

In [None]:
# print(longterm_deaths['CMS Certification Number (CCN)'].values)

In [None]:
# print(longterm_deaths['provider_name'].values)

In [None]:

# longterm_all = longterm_all.merge(longterm_deaths, left_on=['CMS Certification Number (CCN)'], right_on=['CMS Certification Number (CCN)'], how='left')
# print(longterm_all.shape)
# longterm_all = longterm_all.dropna()
# print(longterm_all.shape)
# longterm_all.head()

## Drop Outlier Counties

In [None]:
# Preview only: no rows are dropped in this cell.
combined_data.head()

## Mobility Data

In [None]:
# dl_index = pd.read_csv('../data/raw/DL-us-m50_index_05-22.csv')
# dl_abs_movement = pd.read_csv('../data/raw/DL-us-m50_05-22.csv')
# dl_index = dl_index.dropna()
# dl_abs_movement = dl_abs_movement.dropna()
# dl = dl_index.merge(dl_abs_movement, on='fips')
# print(dl.head())

# baseline1 = pd.DataFrame(dl['2020-03-01_y'] / (.01 * dl['2020-03-01_x']))
# baseline2 = pd.DataFrame(dl['2020-03-02_y'] / (.01 * dl['2020-03-02_x']))
# baseline3 = pd.DataFrame(dl['2020-03-03_y'] / (.01 * dl['2020-03-03_x']))
# baselines = pd.concat((baseline1, baseline2, baseline3), axis=1)

# pd.set_option('display.max_rows', None)
# dl['baseline m50 mobility'] = baselines.mean(axis=1)
# baseline_data = dl[['baseline m50 mobility', 'fips']]
# baseline_data['FIPS'] = baseline_data['fips']
# baseline_data = baseline_data.drop(['fips'], axis=1)
# baseline_data.set_index('FIPS', inplace=True)
# baseline_data.head()

In [None]:
# combined_data = baseline_data.merge(combined_data, left_on='FIPS', right_on='FIPS')
# combined_data.head()

## Safegraph Mobility data 

In [None]:
# Checkpoint the merged county + deaths table (FIPS index included) before mobility is joined.
combined_data.to_csv('../data/intermediates/pre_acpca.csv')

In [None]:
# Load the mobility averages from the processed-data folder.
# NOTE(review): presumably produced by the commented-out SafeGraph cells below — confirm.
sg_mobility = pd.read_csv('../data/processed/safegraph_mobility_avgs.csv')

In [None]:
# safegraph = pd.read_csv('../data/processed/od_mobility_baseline.csv')
# safegraph = safegraph.drop(columns='in_movement')

In [None]:
# combined_data = safegraph.merge(combined_data, left_on='FIPS', right_on='FIPS')
# combined_data = combined_data.set_index('FIPS')

In [None]:
# safegraph_inter = pd.read_csv('../data/processed/od_inter_mobilities_05-20.csv')
# column_change = {}
# for col in safegraph_inter.columns[1:]: 
# #     print(col)
#     date = datetime.datetime.strptime(col+ '-20', '%m-%d-%y')
#     column_change[col] = date.date()
# safegraph_inter = safegraph_inter.rename(columns=column_change)
# print(safegraph_inter.columns[2])
# # print(combined_data.loc[1003, 'Outbreak_date'])
# safegraph_inter = safegraph_inter.set_index('FIPS')
# # print(safegraph_inter.loc[1053])
# safegraph_inter.head()

In [None]:
# sg_mobility = pd.DataFrame(index=combined_data.index, columns=['2wk Prior Mobility', '2wk Onset Mobility', '2wk Post Mobility'])
# # print(sg_mobility.head())
# for i in combined_data.index: 
#     mob_per_day = np.zeros(27)
#     if i not in safegraph_inter.index:
#         continue
#     row = combined_data.loc[i]
# #     outbreak_date = datetime.datetime.strptime(row['Outbreak_date'], '%m/%d/%y')
# #     print(outbreak_date)
#     outbreak_date = row.loc['Outbreak_date']
#     for j in range(-13, 14): 
#         mob_day = outbreak_date + datetime.timedelta(days=j) #.AddDays(j)
#         mob_day = mob_day.date()
#         if mob_day in safegraph_inter.columns:
#             mob_per_day[13+j] = safegraph_inter.loc[i,mob_day]
#     prior_mob_arr = np.array(mob_per_day[:14])
#     onset_mob_arr = np.array(mob_per_day[7:-6])
#     post_mob_arr = np.array(mob_per_day[-14:])
#     prior_mob = np.mean(prior_mob_arr)
#     onset_mob = np.mean(onset_mob_arr)
#     post_mob = np.mean(post_mob_arr)
#     sg_mobility.loc[i,'2wk Prior Mobility'] = prior_mob
#     sg_mobility.loc[i, '2wk Onset Mobility'] = onset_mob
#     sg_mobility.loc[i, '2wk Post Mobility'] = post_mob
# #     if i < combined_data.index[5]:
# #         print(mob_per_day)
# #         print(prior_mob_arr, prior_mob)
# #         print(onset_mob_arr, onset_mob)
# #         print(pos_mob_arr, pos_mob)
# sg_mobility.head()

In [None]:
# Inner join on FIPS; the mobility columns come first in the merged frame.
combined_data = sg_mobility.merge(combined_data, on='FIPS')
print(combined_data.shape)
combined_data.head()

## Filter/Timeseries Temperature Data

In [None]:
# Monthly average temperature columns, ordered January..December so that
# position (month - 1) selects the column for a given calendar month.
month_temps = [
    'Jan Temp AVG / F', 'Feb Temp AVG / F', 'Mar Temp AVG / F', 'Apr Temp AVG / F',
    'May Temp AVG / F', 'Jun Temp AVG / F', 'Jul Temp AVG / F', 'Aug Temp AVG / F',
    'Sep Temp AVG / F', 'Oct Temp AVG / F', 'Nov Temp AVG / F', 'Dec Temp AVG / F',
]

# For each county keep only the average temperature of the month its outbreak began.
# The values are gathered in one positional pass and assigned once, rather than a
# label-based .loc read and write per row (slow, and ambiguous if a label repeats).
outbreak_month_temp = [
    row[month_temps[row['Outbreak_date'].month - 1]]
    for _, row in combined_data.iterrows()
]
combined_data['Outbreak Month Temp AVG / F'] = outbreak_month_temp
combined_data = combined_data.drop(columns=month_temps)

In [None]:
# for i in combined_data.index.values[:5]: 
#     row = combined_data.loc[i]
#     print(row, '\n\n')
# Shape and first rows after the monthly temperature columns were collapsed into one.
print(combined_data.shape)
combined_data.head()

## Interventions

In [None]:
# interventions = pd.read_csv('../data/raw/USstatesCov19distancingpolicy_07_01.csv').filter()
# interventions.head()

In [None]:
# Keystone NPI table reduced to county, policy type and start date; incomplete rows are dropped.
npi_columns = ['fips', 'npi', 'start_date']
interventions = pd.read_csv('../data/raw/complete_npis_inherited_policies_keystone_06_26.csv')
interventions = interventions.filter(npi_columns).dropna()
numRows = len(interventions)
print(set(interventions['npi']))
# all counties in dataset have school closure
interventions = interventions.loc[interventions['npi'] == 'school closure']
interventions.head()

In [10]:
# Find earliest intervention for a given county.
# Dates are parsed once and reduced with a groupby minimum, replacing the manual
# minimum search that appended one row per county (DataFrame.append is quadratic
# and no longer exists in pandas >= 2.0). Rows come out sorted by FIPS.
start_dates = pd.to_datetime(interventions['start_date'], format='%m/%d/%Y')
intervention_dates = (
    start_dates
    .groupby(interventions['fips'])
    .min()
    .rename_axis('FIPS')
    .reset_index(name='Intervention Start Date')
)
print(intervention_dates.shape)
intervention_dates.head()

NameError: name 'interventions' is not defined

In [11]:
# Outer join on FIPS: counties present in only one of the two tables are kept, with NaN
# in the columns of the other.
# NOTE(review): this adds intervention-only counties that have no death data — confirm
# that 'outer' rather than 'left' is intended.
combined_data = combined_data.merge(intervention_dates, how='outer', on='FIPS')
print(combined_data.shape)
combined_data.head()

NameError: name 'combined_data' is not defined

In [12]:
# Get number of days from outbreak date to date of effect of intervention policies.
# Computed column-wise with .dt.days instead of overwriting timedelta cells with
# integers one row at a time; rows with a missing date yield NaN.

# April 9th 2020 - 3 weeks after national intervention
national_effect_date = datetime.datetime(2020, 4, 9)

outbreak_dates = pd.to_datetime(combined_data['Outbreak_date'])
intervention_starts = pd.to_datetime(combined_data['Intervention Start Date'])

# Signed: negative when the intervention started before the outbreak date.
combined_data['Time from outbreak to intervention'] = (intervention_starts - outbreak_dates).dt.days
# The two national offsets are one-sided: each is floored at 0, so at most one is positive.
combined_data['Time from outbreak to national intervention'] = (national_effect_date - outbreak_dates).dt.days.clip(lower=0)
combined_data['Time from national intervention to outbreak'] = (outbreak_dates - national_effect_date).dt.days.clip(lower=0)
combined_data.head()

NameError: name 'combined_data' is not defined

In [13]:
# Drop the raw intervention date column.
combined_data = combined_data.drop(columns=['Intervention Start Date'])
# place Deaths last
col_order = [name for name in combined_data.columns.values if name != 'Deaths'] + ['Deaths']
combined_data = combined_data.reindex(columns=col_order)
print(combined_data.shape)
combined_data.head()

NameError: name 'combined_data' is not defined

In [None]:
# Column-wise maximum and minimum (axis 0 is the default reduction axis).
print(combined_data.max())
print(combined_data.min())

## Normalize by Population

In [None]:
# Print the shape of every column, one per line.
for col in combined_data.columns:
    column = combined_data[col]
    print(col, column.shape)

In [14]:
# List the column names available before the per-capita normalization below.
combined_data.columns.values

NameError: name 'combined_data' is not defined

In [None]:
population = combined_data['POP_ESTIMATE_2018']

# Deaths per capita goes into a new column so the raw 'Deaths' count is kept.
combined_data['normalized_deaths'] = np.true_divide(combined_data['Deaths'], population)

# Count-valued features that are overwritten with their per-capita value.
per_capita_cols = [
    'Total_age65plus',
    'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Nursery school preschool',
    'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Kindergarten',
    'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Elementary school (grades 1-8)',
    'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!High school (grades 9-12)',
    'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!College or graduate school',
    'HospCt',
    'Hispanic Population',
    'Black Population',
    '2wk Prior Inter-Mobility',
    '2wk Onset Inter-Mobility',
    '2wk Post Inter-Mobility',
    '2wk Prior Intra-Mobility',
    '2wk Onset Intra-Mobility',
    '2wk Post Intra-Mobility',
    '1 Month Prior Inter-Mobility',
    '1 Month Prior Intra-Mobility',
    'NursingCt',
]
for col in per_capita_cols:
    combined_data[col] = np.true_divide(combined_data[col], population)

In [15]:
# Inspect the table after per-capita normalization; nothing is dropped in this cell.
# combined_data.dropna()
print(combined_data.shape)
combined_data.head()

NameError: name 'combined_data' is not defined

In [None]:
# combined_data.to_csv('../data/processed/pre_pairwise_acpca.csv', index=False)

In [None]:
# combined_data.drop(columns='POP_ESTIMATE_2018', inplace=True)
# Remove the outbreak date column; POP_ESTIMATE_2018 is kept (its drop above is commented out).
combined_data = combined_data.drop(columns=['Outbreak_date'])

In [None]:
# Column-wise maximum and minimum (axis 0 is the default reduction axis).
print(combined_data.max())
print(combined_data.min())

## Filter by region

In [16]:
# FIPS -> state name lookup; drop_duplicates collapses the repeated (fips, sname) rows
# so that the left merge in the next cell does not multiply rows.
fipscodes = pd.read_csv('../data/raw/countyfipstool2019.csv').filter(['fips', 'sname']).drop_duplicates()
fipscodes.head()

Unnamed: 0,fips,sname
0,1001,Alabama
20,1003,Alabama
40,1005,Alabama
60,1007,Alabama
100,1009,Alabama


In [17]:
print(combined_data.shape)
# Attach each county's state name ('sname') through its FIPS code; unmatched rows get NaN.
combined_data = combined_data.merge(
    fipscodes,
    how='left',
    left_on='FIPS',
    right_on='fips',
)
# States treated as "Northeast" when flagging counties in the next cell.
NE_states = [
    'Connecticut',
    'Maine',
    'Massachusetts',
    'New Hampshire',
    'Rhode Island',
    'New York',
    'New Jersey',
    'Vermont',
    'Maryland',
    'Delaware',
    'District of Columbia',
    'Pennsylvania',
]
print(combined_data.shape)
combined_data.head()

NameError: name 'combined_data' is not defined

In [18]:
# 1 when the county's state name is one of NE_states, otherwise 0.
in_northeast = combined_data['sname'].isin(NE_states)
combined_data['Northeast'] = in_northeast.astype('int64')
combined_data[combined_data['Northeast'] > 0].head()

NameError: name 'combined_data' is not defined

In [19]:
# The lookup columns are no longer needed once the 'Northeast' flag exists.
combined_data = combined_data.drop(columns=['fips', 'sname'])
combined_data.head()

NameError: name 'combined_data' is not defined

In [20]:
# Northeast-only subset; the helper flag is removed from the subset after selecting its rows.
is_northeast = combined_data['Northeast'] > 0
combined_data_NE = combined_data.loc[is_northeast].drop(columns=['Northeast'])
print(combined_data_NE.shape)
combined_data_NE.head()

NameError: name 'combined_data' is not defined

In [None]:
# Drop the helper flag from the full table and save the unscaled features (row index not written).
combined_data = combined_data.drop(columns=['Northeast'])
combined_data.to_csv('../data/processed/multi_var_unscaled.csv', index=False)
print(combined_data.shape)

In [21]:
# Variables exported for the partial-correlation analysis; rows with any missing value are removed.
partial_corr_cols = [
    "Rural-urban_Continuum Code_2013",
    "2wk Prior Intra-Mobility",
    "Total_age65plus",
    "HospCt",
    "normalized_deaths",
]
pre_partial_corr = combined_data.filter(partial_corr_cols).dropna()
print(pre_partial_corr.shape)
pre_partial_corr.to_csv('../data/processed/pre_partial_corr.csv', index=False)
pre_partial_corr.head()

NameError: name 'combined_data' is not defined

In [None]:
# Save the unscaled Northeast subset (row index not written).
combined_data_NE.to_csv('../data/processed/NE_multi_var_unscaled.csv', index=False)
print(combined_data_NE.shape)

In [22]:
# Same partial-correlation export as for the full table, restricted to the Northeast subset.
partial_corr_cols = [
    "Rural-urban_Continuum Code_2013",
    "2wk Prior Intra-Mobility",
    "Total_age65plus",
    "HospCt",
    "normalized_deaths",
]
pre_partial_corr_NE = combined_data_NE.filter(partial_corr_cols).dropna()
print(pre_partial_corr_NE.shape)
pre_partial_corr_NE.to_csv('../data/processed/pre_partial_corr_NE.csv', index=False)
pre_partial_corr_NE.head()

NameError: name 'combined_data_NE' is not defined

In [None]:
# From here on only rows without any missing value are kept, in both tables.
combined_data, combined_data_NE = combined_data.dropna(), combined_data_NE.dropna()
for frame in (combined_data, combined_data_NE):
    print(frame.shape)

## Min/Max Scaling / Normalization 

In [None]:
# scaler = MinMaxScaler() 
# print(combined_data.to_numpy().shape)
# scaled_data = scaler.fit_transform(combined_data)
# for i in range(1, len(combined_data.columns.values)): 
#     col = combined_data.columns.values[i]
#     combined_data[col] = scaled_data[:, i]

In [None]:
# print(combined_data.max(axis=0))
# print(combined_data.min(axis=0))
# combined_data.head()

In [None]:
# pd.set_option('display.max_columns', 5000)
# combined_data[combined_data['FIPS'] == 36061]

In [23]:
#  
# combined_data= combined_data.set_index('FIPS')
# Counties whose rural-urban continuum code equals 1.
is_code_one = combined_data['Rural-urban_Continuum Code_2013'] == 1
cities = combined_data.loc[is_code_one]
cities.head()

NameError: name 'combined_data' is not defined

In [None]:
# combined_data = combined_data[combined_data['Rural-urban_Continuum Code_2013'] > 1]

## Scaling by Standard Deviation

In [None]:
def scale_std(data):
    """Standardize the feature columns of ``data`` to zero mean and unit variance.

    Every column except the first one and the last two is passed, one at a time,
    through a ``StandardScaler``. The skipped positions are meant to hold FIPS,
    deaths and normalized deaths, so the caller must order the columns that way.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns at positions 1 .. -3 are numeric features.

    Returns
    -------
    pandas.DataFrame
        A scaled copy. The input frame is left untouched, so passing a slice of
        another frame neither modifies that frame nor raises
        SettingWithCopyWarning.
    """
    print(data.to_numpy().shape)
    scaler = StandardScaler()
    scaled = data.copy()
    # 1 to -2 so that it scales all except FIPs, deaths, and normalized deaths
    for col in scaled.columns.values[1:-2]:
        column_2d = np.array(scaled[col]).reshape(-1, 1)  # the scaler expects a 2-D array
        scaled[col] = scaler.fit_transform(column_2d).reshape(-1)
    return scaled

In [24]:
# Standardize the feature columns of the full table, then show column-wise extremes as a check.
combined_data = scale_std(combined_data)
print(combined_data.max(axis=0))
print(combined_data.min(axis=0))
combined_data.head()

NameError: name 'scale_std' is not defined

In [25]:
# Standardize the Northeast subset on its own (its scaler is fitted on the subset only).
combined_data_NE = scale_std(combined_data_NE)
print(combined_data_NE.max(axis=0))
print(combined_data_NE.min(axis=0))
combined_data_NE.head()

NameError: name 'scale_std' is not defined

In [None]:
# combined_data=combined_data.set_index('FIPS')
# Save the standardized full table (row index not written) and show its shape.
combined_data.to_csv('../data/processed/pre_pca.csv', index=False)
combined_data.shape

In [None]:
# combined_data=combined_data.set_index('FIPS')
# Save the standardized Northeast table (row index not written) and show its shape.
combined_data_NE.to_csv('../data/processed/NE_pre_pca.csv', index=False)
combined_data_NE.shape

In [26]:
# Scatter of hospital capacity against per-capita deaths.
# 'HospCt' holds the capped hospital bed count divided by population (and standardized
# by scale_std), and 'normalized_deaths' is deaths divided by population, so the axis
# labels describe those quantities rather than raw hospital or death counts.
plt.plot(list(combined_data['HospCt']), list(combined_data['normalized_deaths']), 'bo')
plt.xlabel('hospital beds per capita (standardized)')
plt.ylabel('deaths per capita');

NameError: name 'combined_data' is not defined

In [27]:
# Keep rows with more than 140 deaths: where() blanks every other row to NaN,
# and dropna() then removes each row that contains a NaN.
filter1 = combined_data['Deaths'] > 140
filtered = combined_data.where(filter1).dropna()
filtered.head()

NameError: name 'combined_data' is not defined

In [None]:
# Single-feature linear regression; reports R^2 on the training data.
# NOTE(review): columns are chosen by position (2 -> X, 3 -> Y) — confirm they still
# point at the intended variables after the column changes above.
data_arr = combined_data.to_numpy()
X = data_arr[:, 2].reshape(-1, 1)
Y = data_arr[:, 3].reshape(-1, 1)
reg = lm.LinearRegression()
reg.fit(X, Y)
reg.score(X, Y)

In [None]:
# Multi-feature linear regression; reports R^2 on the training data.
# NOTE(review): columns are chosen by position (2..9 -> X, 10 -> Y) — confirm they
# still point at the intended variables after the column changes above.
data_arr = combined_data.to_numpy()
# The slice is already (n_samples, 8). The previous `.reshape(8, -1).T` did not
# transpose; it re-read the buffer row-major and mixed values across counties and features.
X = data_arr[:, 2:10]
Y = data_arr[:, 10].reshape(-1, 1)
reg = lm.LinearRegression().fit(X, Y)
reg.score(X, Y)

In [28]:
# Number of columns in the final table.
len(combined_data.columns)

NameError: name 'combined_data' is not defined

In [None]:
# Names of the first 20 columns, useful for checking the positional indices used above.
print(combined_data.columns[0:20])

In [29]:
# Final preview of the processed table.
combined_data.head()

NameError: name 'combined_data' is not defined