In [166]:
import numpy as np
import pandas as pd


In [167]:
deaths_t_series = pd.read_csv('../COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv')
deaths_t_series = deaths_t_series.loc[~(deaths_t_series['Lat'] == 0)]
deaths_t_series = deaths_t_series.drop(labels=['UID', 'iso2', 'iso3', 'code3', 'Admin2', 'Province_State', 'Country_Region', 'Lat', 'Long_', "Combined_Key", "Population"], axis=1)


# 3 Death Thresh, 9 Days Out

In [168]:
death_data = pd.DataFrame(columns=['FIPS', 'Deaths'])
dates = deaths_t_series.columns[1:]
death_thresh = 3
n_days = 9

# Get all existing data n_days from the death_thresh death.
for i in range(len(deaths_t_series)):
    county = deaths_t_series.iloc[i]

    # Iterate through dates to find first date with deaths >= death thresh
    for j in range(len(dates)):
        if county[dates[j]] >= death_thresh:

            # See if this happened n_days or more before current day
            try:
                n_days_later = dates[j+n_days]

                # Make sure the data is bug-free
                bad_data=False
                for k in range(n_days):
                    if county[n_days_later] < county[dates[j+k]]:
                        bad_data=True
                if not bad_data:
                    death_data = death_data.append({'FIPS': county['FIPS'], 'Deaths':county[n_days_later]}, ignore_index=True)
                break

            except:
                break 

In [169]:
death_data['Deaths'].max()

328.0

In [170]:
adjacent_counties = pd.read_excel('../adjacent_counties.xlsx', header=None)
adjacent_counties = adjacent_counties[[1, 3]]
adjacent_counties = adjacent_counties[adjacent_counties[1] != adjacent_counties[3]]
pairs = []
for index, row in adjacent_counties.iterrows():
    pair = set([row[1], row[3]])
    if pair not in pairs:
        pairs.append(pair)
np.random.seed(9)
np.random.shuffle(pairs)

In [171]:
print(len(pairs))

9483


# Hospitals Analysis

In [172]:
hospitals = pd.read_csv('../Hospitals.csv')[['TYPE', 'STATUS', 'COUNTYFIPS', 'BEDS']]
static_features = pd.read_csv('../COVID-19_US_County-level_Summaries/data/counties_only.csv')[['FIPS', 'Rural-urban_Continuum Code_2013', 'Density per square mile of land area - Population', 'Percent of adults with less than a high school diploma 2014-18', 'PCTPOV017_2018', 'Unemployment_rate_2018', 'Total_age65plus', 'POP_ESTIMATE_2018']]
static_features = static_features.dropna()
static_features['pop_density'] = static_features['Density per square mile of land area - Population']
static_features['no_dip'] = static_features['Percent of adults with less than a high school diploma 2014-18']
static_features['elderly_ratio'] = static_features['Total_age65plus']/static_features['POP_ESTIMATE_2018']
static_features['RUCC'] = static_features['Rural-urban_Continuum Code_2013']

In [173]:
hospitals = hospitals[hospitals["STATUS"] == 'OPEN']
hospitals = hospitals[hospitals["TYPE"] == 'GENERAL ACUTE CARE']
hospitals = hospitals[hospitals["BEDS"].astype(str).astype(int) > 0]
hospitals["FIPS"] = hospitals["COUNTYFIPS"]
hospitals = hospitals[hospitals["FIPS"] != 'NOT AVAILABLE']
hospitals = hospitals.drop(["COUNTYFIPS", "STATUS"], axis=1)
hospitals["FIPS"] = hospitals["FIPS"].astype(str).astype(int)
fips_groups = hospitals.groupby("FIPS")

hosp_data = pd.DataFrame(columns=['FIPS', 'HospCt', 'Beds'])
for fips_code, grp in fips_groups:
    hosp_data = hosp_data.append({'FIPS': fips_code, 'HospCt': len(grp), 'Beds': sum(grp["BEDS"])}, ignore_index=True)   

hosp_data.head()

Unnamed: 0,FIPS,HospCt,Beds
0,1001,1,85
1,1003,3,332
2,1005,1,74
3,1007,1,35
4,1011,1,61


In [174]:
dataset = hosp_data.merge(death_data, on='FIPS')
dataset = dataset.merge(static_features, on="FIPS")
dataset = dataset.sample(frac=1.0, random_state=9)[['FIPS', 'HospCt', 'Beds', 'Deaths', 'RUCC', 'pop_density', 'no_dip', 'elderly_ratio', 'Unemployment_rate_2018', 'PCTPOV017_2018', 'POP_ESTIMATE_2018']]
dataset.head()

Unnamed: 0,FIPS,HospCt,Beds,Deaths,RUCC,pop_density,no_dip,elderly_ratio,Unemployment_rate_2018,PCTPOV017_2018,POP_ESTIMATE_2018
54,6113,2,156,8.0,1,197.9,13.6,0.124927,4.2,15.2,220408
412,34017,6,1704,92.0,1,13731.4,15.8,0.118214,3.9,22.3,676061
97,12061,2,486,6.0,3,274.5,11.3,0.330462,4.3,17.9,157413
30,6013,9,1731,7.0,1,1465.2,10.6,0.157747,3.2,9.2,1150215
572,42077,5,1698,8.0,2,1012.5,11.4,0.167093,4.6,17.4,368100


In [177]:
training_set = pd.DataFrame(columns=['Hosp_diff', 'Beds_diff', 'RUCC_diff', 'dense_diff', 'no_dip_diff', 'elderly_diff', 'unemp_diff', 'pct_pov_diff', 'pop_diff', 'More_Deaths'])
test_set = pd.DataFrame(columns=['Hosp_diff', 'Beds_diff', 'RUCC_diff', 'dense_diff', 'no_dip_diff', 'elderly_diff', 'unemp_diff', 'pct_pov_diff', 'pop_diff', 'More_Deaths'])
training_set = training_set.apply(pd.to_numeric) 
test_set = test_set.apply(pd.to_numeric) 
dataset.reset_index(drop=True, inplace=True)

for fips1, fips2 in pairs:
    
    row1_df = dataset.loc[dataset["FIPS"] == fips1].reset_index(drop=True)
    row2_df = dataset.loc[dataset["FIPS"] == fips2].reset_index(drop=True)

    if row1_df.empty or row2_df.empty:
        continue
        
    for _, row1 in row1_df.iterrows():
        for _, row2 in row2_df.iterrows():
            if row1['Deaths'] != row2["Deaths"]:

                if len(training_set) < 750:
                    if row1['Deaths']  > row2["Deaths"]:
                        training_set = training_set.append({'Hosp_diff': int(row1['HospCt']) - int(row2['HospCt']),\
                                                            'Beds_diff': int(row1['Beds']) - int(row2['Beds']),\
                                                            'RUCC_diff': int(row1['RUCC']) - int(row2['RUCC']),\
                                                            'dense_diff':float(row1['pop_density']) - float(row2['pop_density']),\
                                                            'no_dip_diff':float(row1['no_dip']) - float(row2['no_dip']),\
                                                            'elderly_diff': float(row1['elderly_ratio']) - float(row2['elderly_ratio']),\
                                                            'unemp_diff': float(row1['Unemployment_rate_2018']) - float(row2['Unemployment_rate_2018']),\
                                                            'pct_pov_diff': float(row1['PCTPOV017_2018']) - float(row2['PCTPOV017_2018']),\
                                                            'pop_diff' : float(row1['POP_ESTIMATE_2018']) - float(row2['POP_ESTIMATE_2018']),
                                                            'More_Deaths':0}, ignore_index=True)   
                    elif row1['Deaths']  < row2["Deaths"]:  
                        training_set = training_set.append({'Hosp_diff': int(row1['HospCt']) - int(row2['HospCt']),\
                                                            'Beds_diff': int(row1['Beds']) - int(row2['Beds']),\
                                                            'RUCC_diff': int(row1['RUCC']) - int(row2['RUCC']),\
                                                            'dense_diff':float(row1['pop_density']) - float(row2['pop_density']),\
                                                            'no_dip_diff':float(row1['no_dip']) - float(row2['no_dip']),\
                                                            'elderly_diff': float(row1['elderly_ratio']) - float(row2['elderly_ratio']),\
                                                            'unemp_diff': float(row1['Unemployment_rate_2018']) - float(row2['Unemployment_rate_2018']),\
                                                            'pct_pov_diff': float(row1['PCTPOV017_2018']) - float(row2['PCTPOV017_2018']),\
                                                            'pop_diff' : float(row1['POP_ESTIMATE_2018']) - float(row2['POP_ESTIMATE_2018']),\
                                                            'More_Deaths':1}, ignore_index=True)  
                else:
                    if row1['Deaths']  > row2["Deaths"]:
                        test_set = test_set.append({'Hosp_diff': int(row1['HospCt']) - int(row2['HospCt']),\
                                                            'Beds_diff': int(row1['Beds']) - int(row2['Beds']),\
                                                            'RUCC_diff': int(row1['RUCC']) - int(row2['RUCC']),\
                                                            'dense_diff':float(row1['pop_density']) - float(row2['pop_density']),\
                                                            'no_dip_diff':float(row1['no_dip']) - float(row2['no_dip']),\
                                                            'elderly_diff': float(row1['elderly_ratio']) - float(row2['elderly_ratio']),\
                                                            'unemp_diff': float(row1['Unemployment_rate_2018']) - float(row2['Unemployment_rate_2018']),\
                                                            'pct_pov_diff': float(row1['PCTPOV017_2018']) - float(row2['PCTPOV017_2018']),\
                                                            'pop_diff' : float(row1['POP_ESTIMATE_2018']) - float(row2['POP_ESTIMATE_2018']),\
                                                            'More_Deaths':0}, ignore_index=True)  
                    elif row1['Deaths'] < row2["Deaths"]:  
                        test_set = test_set.append({'Hosp_diff': int(row1['HospCt']) - int(row2['HospCt']),\
                                                            'Beds_diff': int(row1['Beds']) - int(row2['Beds']),\
                                                            'RUCC_diff': int(row1['RUCC']) - int(row2['RUCC']),\
                                                            'dense_diff':float(row1['pop_density']) - float(row2['pop_density']),\
                                                            'no_dip_diff':float(row1['no_dip']) - float(row2['no_dip']),\
                                                            'elderly_diff': float(row1['elderly_ratio']) - float(row2['elderly_ratio']),\
                                                            'unemp_diff': float(row1['Unemployment_rate_2018']) - float(row2['Unemployment_rate_2018']),\
                                                            'pct_pov_diff': float(row1['PCTPOV017_2018']) - float(row2['PCTPOV017_2018']),\
                                                            'pop_diff' : float(row1['POP_ESTIMATE_2018']) - float(row2['POP_ESTIMATE_2018']),\
                                                            'More_Deaths':1}, ignore_index=True)  


In [178]:
print(len(training_set))
print(len(test_set))

750
215


In [179]:
sum(training_set['More_Deaths'])/len(training_set)

0.5186666666666667

In [180]:
sum(test_set['More_Deaths'])/len(test_set)

0.5348837209302325

In [157]:
np.corrcoef(training_set["RUCC_diff"], training_set["Hosp_diff"])

array([[ 1.        , -0.16142744],
       [-0.16142744,  1.        ]])

In [181]:
training_set = training_set.apply(pd.to_numeric) 
test_set = test_set.apply(pd.to_numeric) 
training_set['Hosp_diff'] = (training_set['Hosp_diff'] - np.min(training_set['Hosp_diff']))/(np.max(training_set['Hosp_diff']) - np.min(training_set['Hosp_diff']))
test_set['Hosp_diff'] = (test_set['Hosp_diff'] - np.min(test_set['Hosp_diff']))/(np.max(test_set['Hosp_diff']) - np.min(test_set['Hosp_diff']))


training_set['Beds_diff'] = (training_set['Beds_diff'] - np.min(training_set['Beds_diff']))/(np.max(training_set['Beds_diff']) - np.min(training_set['Beds_diff']))
test_set['Beds_diff'] = (test_set['Beds_diff'] - np.min(test_set['Beds_diff']))/(np.max(test_set['Beds_diff']) - np.min(test_set['Beds_diff']))


training_set['dense_diff'] = (training_set['dense_diff'] - np.min(training_set['dense_diff']))/(np.max(training_set['dense_diff']) - np.min(training_set['dense_diff']))
test_set['dense_diff'] = (test_set['dense_diff'] - np.min(test_set['dense_diff']))/(np.max(test_set['dense_diff']) - np.min(test_set['dense_diff']))

training_set['RUCC_diff'] = (training_set['RUCC_diff'] - np.min(training_set['RUCC_diff']))/(np.max(training_set['RUCC_diff']) - np.min(training_set['RUCC_diff']))
test_set['RUCC_diff'] = (test_set['RUCC_diff'] - np.min(test_set['RUCC_diff']))/(np.max(test_set['RUCC_diff']) - np.min(test_set['RUCC_diff']))

training_set['no_dip_diff'] = (training_set['no_dip_diff'] - np.min(training_set['no_dip_diff']))/(np.max(training_set['no_dip_diff']) - np.min(training_set['no_dip_diff']))
test_set['no_dip_diff'] = (test_set['no_dip_diff'] - np.min(test_set['no_dip_diff']))/(np.max(test_set['no_dip_diff']) - np.min(test_set['no_dip_diff']))

training_set['elderly_diff'] = (training_set['elderly_diff'] - np.min(training_set['elderly_diff']))/(np.max(training_set['elderly_diff']) - np.min(training_set['elderly_diff']))
test_set['elderly_diff'] = (test_set['elderly_diff'] - np.min(test_set['elderly_diff']))/(np.max(test_set['elderly_diff']) - np.min(test_set['elderly_diff']))

training_set['unemp_diff'] = (training_set['unemp_diff'] - np.min(training_set['unemp_diff']))/(np.max(training_set['unemp_diff']) - np.min(training_set['unemp_diff']))
test_set['unemp_diff'] = (test_set['unemp_diff'] - np.min(test_set['unemp_diff']))/(np.max(test_set['unemp_diff']) - np.min(test_set['unemp_diff']))

training_set['pct_pov_diff'] = (training_set['pct_pov_diff'] - np.min(training_set['pct_pov_diff']))/(np.max(training_set['pct_pov_diff']) - np.min(training_set['pct_pov_diff']))
test_set['pct_pov_diff'] = (test_set['pct_pov_diff'] - np.min(test_set['pct_pov_diff']))/(np.max(test_set['pct_pov_diff']) - np.min(test_set['pct_pov_diff']))

training_set['pop_diff'] = (training_set['pop_diff'] - np.min(training_set['pop_diff']))/(np.max(training_set['pop_diff']) - np.min(training_set['pop_diff']))
test_set['pop_diff'] = (test_set['pop_diff'] - np.min(test_set['pop_diff']))/(np.max(test_set['pop_diff']) - np.min(test_set['pop_diff']))

In [199]:
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression as logreg

def test_features(features):
    train_x = training_set[features].to_numpy()
    train_y = training_set['More_Deaths'].to_numpy()
    test_x = test_set[features].to_numpy()
    test_y = test_set['More_Deaths'].to_numpy()

    train_x, train_y = shuffle(train_x, train_y, random_state=0)
    test_x, test_y = shuffle(test_x, test_y, random_state=0)
    train_y = np.ravel(train_y)
    test_y = np.ravel(test_y)
    
    print(f'Features: {features}')
    print(f'Test Ratio: {sum(test_y)/len(test_y)}')

    lr = logreg().fit(train_x, train_y)
    print(f'Score: {lr.score(test_x, test_y)}')
    print(f'Coefficients: {lr.coef_}\n')

In [205]:
test_features(['Hosp_diff'])
test_features(['Beds_diff'])
test_features(['pop_diff'])
test_features(['dense_diff'])
test_features(['RUCC_diff'])
test_features(['Hosp_diff', 'Beds_diff'])
test_features(['pop_diff', 'Beds_diff'])
test_features(['RUCC_diff', 'Beds_diff'])
test_features(['dense_diff', 'Beds_diff'])
test_features(['RUCC_diff', 'pop_diff', 'dense_diff', 'Beds_diff'])
test_features(['RUCC_diff', 'pop_diff', 'dense_diff', 'Hosp_diff', 'Beds_diff', 'elderly_diff', 'no_dip_diff'])

Features: ['Hosp_diff']
Test Ratio: 0.5348837209302325
Score: 0.5488372093023256
Coefficients: [[-3.33032674]]

Features: ['Beds_diff']
Test Ratio: 0.5348837209302325
Score: 0.5906976744186047
Coefficients: [[-4.02681968]]

Features: ['pop_diff']
Test Ratio: 0.5348837209302325
Score: 0.5581395348837209
Coefficients: [[-3.6468052]]

Features: ['dense_diff']
Test Ratio: 0.5348837209302325
Score: 0.5395348837209303
Coefficients: [[-0.9975392]]

Features: ['RUCC_diff']
Test Ratio: 0.5348837209302325
Score: 0.5441860465116279
Coefficients: [[1.86646828]]

Features: ['Hosp_diff', 'Beds_diff']
Test Ratio: 0.5348837209302325
Score: 0.5720930232558139
Coefficients: [[-2.11506596 -3.33965166]]

Features: ['pop_diff', 'Beds_diff']
Test Ratio: 0.5348837209302325
Score: 0.5767441860465117
Coefficients: [[-2.508315   -3.21914692]]

Features: ['RUCC_diff', 'Beds_diff']
Test Ratio: 0.5348837209302325
Score: 0.6186046511627907
Coefficients: [[ 1.90760678 -4.08584254]]

Features: ['dense_diff', 'Beds_di



# 3 Deaths Thresh, 21 Days out

In [206]:
death_data = pd.DataFrame(columns=['FIPS', 'Deaths'])
dates = deaths_t_series.columns[1:]
death_thresh = 3
n_days = 21

# Get all existing data n_days from the death_thresh death.
for i in range(len(deaths_t_series)):
    county = deaths_t_series.iloc[i]

    # Iterate through dates to find first date with deaths >= death thresh
    for j in range(len(dates)):
        if county[dates[j]] >= death_thresh:

            # See if this happened n_days or more before current day
            try:
                n_days_later = dates[j+n_days]

                # Make sure the data is bug-free
                bad_data=False
                for k in range(n_days):
                    if county[n_days_later] < county[dates[j+k]]:
                        bad_data=True
                if not bad_data:
                    death_data = death_data.append({'FIPS': county['FIPS'], 'Deaths':county[n_days_later]}, ignore_index=True)
                break

            except:
                break 

In [207]:
dataset = hosp_data.merge(death_data, on='FIPS')
dataset = dataset.merge(static_features, on="FIPS")
dataset = dataset.sample(frac=1.0, random_state=9)[['FIPS', 'HospCt', 'Beds', 'Deaths', 'RUCC', 'pop_density', 'no_dip', 'elderly_ratio', 'Unemployment_rate_2018', 'PCTPOV017_2018', 'POP_ESTIMATE_2018']]
dataset.head()

Unnamed: 0,FIPS,HospCt,Beds,Deaths,RUCC,pop_density,no_dip,elderly_ratio,Unemployment_rate_2018,PCTPOV017_2018,POP_ESTIMATE_2018
91,12095,10,3794,21.0,1,1268.5,11.5,0.119379,3.2,22.8,1380645
40,6085,13,3760,43.0,1,1381.0,11.9,0.134772,2.6,7.1,1937570
70,12009,7,1466,8.0,2,535.0,8.0,0.236819,3.5,16.5,596849
106,13015,1,80,27.0,1,217.9,17.0,0.140177,3.8,17.1,106408
97,12109,1,335,4.0,1,316.4,5.4,0.202469,2.9,7.3,254261


In [210]:
training_set = pd.DataFrame(columns=['Hosp_diff', 'Beds_diff', 'RUCC_diff', 'dense_diff', 'no_dip_diff', 'elderly_diff', 'unemp_diff', 'pct_pov_diff', 'pop_diff', 'More_Deaths'])
test_set = pd.DataFrame(columns=['Hosp_diff', 'Beds_diff', 'RUCC_diff', 'dense_diff', 'no_dip_diff', 'elderly_diff', 'unemp_diff', 'pct_pov_diff', 'pop_diff', 'More_Deaths'])
training_set = training_set.apply(pd.to_numeric) 
test_set = test_set.apply(pd.to_numeric) 
dataset.reset_index(drop=True, inplace=True)

for fips1, fips2 in pairs:
    
    row1_df = dataset.loc[dataset["FIPS"] == fips1].reset_index(drop=True)
    row2_df = dataset.loc[dataset["FIPS"] == fips2].reset_index(drop=True)

    if row1_df.empty or row2_df.empty:
        continue
        
    for _, row1 in row1_df.iterrows():
        for _, row2 in row2_df.iterrows():
            if row1['Deaths'] != row2["Deaths"]:

                if len(training_set) < 550:
                    if row1['Deaths']  > row2["Deaths"]:
                        training_set = training_set.append({'Hosp_diff': int(row1['HospCt']) - int(row2['HospCt']),\
                                                            'Beds_diff': int(row1['Beds']) - int(row2['Beds']),\
                                                            'RUCC_diff': int(row1['RUCC']) - int(row2['RUCC']),\
                                                            'dense_diff':float(row1['pop_density']) - float(row2['pop_density']),\
                                                            'no_dip_diff':float(row1['no_dip']) - float(row2['no_dip']),\
                                                            'elderly_diff': float(row1['elderly_ratio']) - float(row2['elderly_ratio']),\
                                                            'unemp_diff': float(row1['Unemployment_rate_2018']) - float(row2['Unemployment_rate_2018']),\
                                                            'pct_pov_diff': float(row1['PCTPOV017_2018']) - float(row2['PCTPOV017_2018']),\
                                                            'pop_diff' : float(row1['POP_ESTIMATE_2018']) - float(row2['POP_ESTIMATE_2018']),
                                                            'More_Deaths':0}, ignore_index=True)   
                    elif row1['Deaths']  < row2["Deaths"]:  
                        training_set = training_set.append({'Hosp_diff': int(row1['HospCt']) - int(row2['HospCt']),\
                                                            'Beds_diff': int(row1['Beds']) - int(row2['Beds']),\
                                                            'RUCC_diff': int(row1['RUCC']) - int(row2['RUCC']),\
                                                            'dense_diff':float(row1['pop_density']) - float(row2['pop_density']),\
                                                            'no_dip_diff':float(row1['no_dip']) - float(row2['no_dip']),\
                                                            'elderly_diff': float(row1['elderly_ratio']) - float(row2['elderly_ratio']),\
                                                            'unemp_diff': float(row1['Unemployment_rate_2018']) - float(row2['Unemployment_rate_2018']),\
                                                            'pct_pov_diff': float(row1['PCTPOV017_2018']) - float(row2['PCTPOV017_2018']),\
                                                            'pop_diff' : float(row1['POP_ESTIMATE_2018']) - float(row2['POP_ESTIMATE_2018']),\
                                                            'More_Deaths':1}, ignore_index=True)  
                else:
                    if row1['Deaths']  > row2["Deaths"]:
                        test_set = test_set.append({'Hosp_diff': int(row1['HospCt']) - int(row2['HospCt']),\
                                                            'Beds_diff': int(row1['Beds']) - int(row2['Beds']),\
                                                            'RUCC_diff': int(row1['RUCC']) - int(row2['RUCC']),\
                                                            'dense_diff':float(row1['pop_density']) - float(row2['pop_density']),\
                                                            'no_dip_diff':float(row1['no_dip']) - float(row2['no_dip']),\
                                                            'elderly_diff': float(row1['elderly_ratio']) - float(row2['elderly_ratio']),\
                                                            'unemp_diff': float(row1['Unemployment_rate_2018']) - float(row2['Unemployment_rate_2018']),\
                                                            'pct_pov_diff': float(row1['PCTPOV017_2018']) - float(row2['PCTPOV017_2018']),\
                                                            'pop_diff' : float(row1['POP_ESTIMATE_2018']) - float(row2['POP_ESTIMATE_2018']),\
                                                            'More_Deaths':0}, ignore_index=True)  
                    elif row1['Deaths'] < row2["Deaths"]:  
                        test_set = test_set.append({'Hosp_diff': int(row1['HospCt']) - int(row2['HospCt']),\
                                                            'Beds_diff': int(row1['Beds']) - int(row2['Beds']),\
                                                            'RUCC_diff': int(row1['RUCC']) - int(row2['RUCC']),\
                                                            'dense_diff':float(row1['pop_density']) - float(row2['pop_density']),\
                                                            'no_dip_diff':float(row1['no_dip']) - float(row2['no_dip']),\
                                                            'elderly_diff': float(row1['elderly_ratio']) - float(row2['elderly_ratio']),\
                                                            'unemp_diff': float(row1['Unemployment_rate_2018']) - float(row2['Unemployment_rate_2018']),\
                                                            'pct_pov_diff': float(row1['PCTPOV017_2018']) - float(row2['PCTPOV017_2018']),\
                                                            'pop_diff' : float(row1['POP_ESTIMATE_2018']) - float(row2['POP_ESTIMATE_2018']),\
                                                            'More_Deaths':1}, ignore_index=True)  



In [211]:
print(len(training_set))
print(len(test_set))
print(sum(training_set['More_Deaths'])/len(training_set))
print(sum(test_set['More_Deaths'])/len(test_set))

550
202


In [214]:
training_set = training_set.apply(pd.to_numeric) 
test_set = test_set.apply(pd.to_numeric) 
training_set['Hosp_diff'] = (training_set['Hosp_diff'] - np.min(training_set['Hosp_diff']))/(np.max(training_set['Hosp_diff']) - np.min(training_set['Hosp_diff']))
test_set['Hosp_diff'] = (test_set['Hosp_diff'] - np.min(test_set['Hosp_diff']))/(np.max(test_set['Hosp_diff']) - np.min(test_set['Hosp_diff']))


training_set['Beds_diff'] = (training_set['Beds_diff'] - np.min(training_set['Beds_diff']))/(np.max(training_set['Beds_diff']) - np.min(training_set['Beds_diff']))
test_set['Beds_diff'] = (test_set['Beds_diff'] - np.min(test_set['Beds_diff']))/(np.max(test_set['Beds_diff']) - np.min(test_set['Beds_diff']))


training_set['dense_diff'] = (training_set['dense_diff'] - np.min(training_set['dense_diff']))/(np.max(training_set['dense_diff']) - np.min(training_set['dense_diff']))
test_set['dense_diff'] = (test_set['dense_diff'] - np.min(test_set['dense_diff']))/(np.max(test_set['dense_diff']) - np.min(test_set['dense_diff']))

training_set['RUCC_diff'] = (training_set['RUCC_diff'] - np.min(training_set['RUCC_diff']))/(np.max(training_set['RUCC_diff']) - np.min(training_set['RUCC_diff']))
test_set['RUCC_diff'] = (test_set['RUCC_diff'] - np.min(test_set['RUCC_diff']))/(np.max(test_set['RUCC_diff']) - np.min(test_set['RUCC_diff']))

training_set['no_dip_diff'] = (training_set['no_dip_diff'] - np.min(training_set['no_dip_diff']))/(np.max(training_set['no_dip_diff']) - np.min(training_set['no_dip_diff']))
test_set['no_dip_diff'] = (test_set['no_dip_diff'] - np.min(test_set['no_dip_diff']))/(np.max(test_set['no_dip_diff']) - np.min(test_set['no_dip_diff']))

training_set['elderly_diff'] = (training_set['elderly_diff'] - np.min(training_set['elderly_diff']))/(np.max(training_set['elderly_diff']) - np.min(training_set['elderly_diff']))
test_set['elderly_diff'] = (test_set['elderly_diff'] - np.min(test_set['elderly_diff']))/(np.max(test_set['elderly_diff']) - np.min(test_set['elderly_diff']))

training_set['unemp_diff'] = (training_set['unemp_diff'] - np.min(training_set['unemp_diff']))/(np.max(training_set['unemp_diff']) - np.min(training_set['unemp_diff']))
test_set['unemp_diff'] = (test_set['unemp_diff'] - np.min(test_set['unemp_diff']))/(np.max(test_set['unemp_diff']) - np.min(test_set['unemp_diff']))

training_set['pct_pov_diff'] = (training_set['pct_pov_diff'] - np.min(training_set['pct_pov_diff']))/(np.max(training_set['pct_pov_diff']) - np.min(training_set['pct_pov_diff']))
test_set['pct_pov_diff'] = (test_set['pct_pov_diff'] - np.min(test_set['pct_pov_diff']))/(np.max(test_set['pct_pov_diff']) - np.min(test_set['pct_pov_diff']))

training_set['pop_diff'] = (training_set['pop_diff'] - np.min(training_set['pop_diff']))/(np.max(training_set['pop_diff']) - np.min(training_set['pop_diff']))
test_set['pop_diff'] = (test_set['pop_diff'] - np.min(test_set['pop_diff']))/(np.max(test_set['pop_diff']) - np.min(test_set['pop_diff']))

In [215]:
test_features(['Hosp_diff'])
test_features(['Beds_diff'])
test_features(['pop_diff'])
test_features(['dense_diff'])
test_features(['RUCC_diff'])
test_features(['Hosp_diff', 'Beds_diff'])
test_features(['pop_diff', 'Beds_diff'])
test_features(['RUCC_diff', 'Beds_diff'])
test_features(['dense_diff', 'Beds_diff'])
test_features(['RUCC_diff', 'pop_diff', 'dense_diff', 'Beds_diff'])
test_features(['RUCC_diff', 'pop_diff', 'dense_diff', 'Hosp_diff', 'Beds_diff', 'elderly_diff', 'no_dip_diff'])

Features: ['Hosp_diff']
Test Ratio: 0.5346534653465347
Score: 0.5495049504950495
Coefficients: [[-3.19873223]]

Features: ['Beds_diff']
Test Ratio: 0.5346534653465347
Score: 0.594059405940594
Coefficients: [[-4.03996849]]

Features: ['pop_diff']
Test Ratio: 0.5346534653465347
Score: 0.5693069306930693
Coefficients: [[-3.52048669]]

Features: ['dense_diff']
Test Ratio: 0.5346534653465347
Score: 0.5346534653465347
Coefficients: [[-1.30132819]]

Features: ['RUCC_diff']
Test Ratio: 0.5346534653465347
Score: 0.5594059405940595
Coefficients: [[2.43194667]]

Features: ['Hosp_diff', 'Beds_diff']
Test Ratio: 0.5346534653465347
Score: 0.5693069306930693
Coefficients: [[-2.00333692 -3.41221823]]

Features: ['pop_diff', 'Beds_diff']
Test Ratio: 0.5346534653465347
Score: 0.5742574257425742
Coefficients: [[-2.39500829 -3.29401113]]

Features: ['RUCC_diff', 'Beds_diff']
Test Ratio: 0.5346534653465347
Score: 0.6435643564356436
Coefficients: [[ 2.52143261 -4.16268084]]

Features: ['dense_diff', 'Beds_d



# 3 Deaths Thresh 28 Days out

In [216]:
death_data = pd.DataFrame(columns=['FIPS', 'Deaths'])
dates = deaths_t_series.columns[1:]
death_thresh = 3
n_days = 28

# Get all existing data n_days from the death_thresh death.
for i in range(len(deaths_t_series)):
    county = deaths_t_series.iloc[i]

    # Iterate through dates to find first date with deaths >= death thresh
    for j in range(len(dates)):
        if county[dates[j]] >= death_thresh:

            # See if this happened n_days or more before current day
            try:
                n_days_later = dates[j+n_days]

                # Make sure the data is bug-free
                bad_data=False
                for k in range(n_days):
                    if county[n_days_later] < county[dates[j+k]]:
                        bad_data=True
                if not bad_data:
                    death_data = death_data.append({'FIPS': county['FIPS'], 'Deaths':county[n_days_later]}, ignore_index=True)
                break

            except:
                break 

In [217]:
dataset = hosp_data.merge(death_data, on='FIPS')
dataset = dataset.merge(static_features, on="FIPS")
dataset = dataset.sample(frac=1.0, random_state=9)[['FIPS', 'HospCt', 'Beds', 'Deaths', 'RUCC', 'pop_density', 'no_dip', 'elderly_ratio', 'Unemployment_rate_2018', 'PCTPOV017_2018', 'POP_ESTIMATE_2018']]
dataset.head()

Unnamed: 0,FIPS,HospCt,Beds,Deaths,RUCC,pop_density,no_dip,elderly_ratio,Unemployment_rate_2018,PCTPOV017_2018,POP_ESTIMATE_2018
393,48303,4,1256,43.0,2,311.3,14.2,0.124263,3.1,20.3,307412
228,27037,3,259,11.0,1,709.0,5.1,0.141699,2.5,7.9,425423
371,45085,1,283,11.0,3,161.6,16.4,0.164451,4.0,28.2,106512
345,42017,8,1245,148.0,1,1034.7,6.1,0.186343,3.7,6.6,628195
166,22017,5,1598,93.0,2,290.2,13.4,0.171273,5.5,35.6,242922


In [220]:
training_set = pd.DataFrame(columns=['Hosp_diff', 'Beds_diff', 'RUCC_diff', 'dense_diff', 'no_dip_diff', 'elderly_diff', 'unemp_diff', 'pct_pov_diff', 'pop_diff', 'More_Deaths'])
test_set = pd.DataFrame(columns=['Hosp_diff', 'Beds_diff', 'RUCC_diff', 'dense_diff', 'no_dip_diff', 'elderly_diff', 'unemp_diff', 'pct_pov_diff', 'pop_diff', 'More_Deaths'])
training_set = training_set.apply(pd.to_numeric) 
test_set = test_set.apply(pd.to_numeric) 
dataset.reset_index(drop=True, inplace=True)

for fips1, fips2 in pairs:
    
    row1_df = dataset.loc[dataset["FIPS"] == fips1].reset_index(drop=True)
    row2_df = dataset.loc[dataset["FIPS"] == fips2].reset_index(drop=True)

    if row1_df.empty or row2_df.empty:
        continue
        
    for _, row1 in row1_df.iterrows():
        for _, row2 in row2_df.iterrows():
            if row1['Deaths'] != row2["Deaths"]:

                if len(training_set) < 400:
                    if row1['Deaths']  > row2["Deaths"]:
                        training_set = training_set.append({'Hosp_diff': int(row1['HospCt']) - int(row2['HospCt']),\
                                                            'Beds_diff': int(row1['Beds']) - int(row2['Beds']),\
                                                            'RUCC_diff': int(row1['RUCC']) - int(row2['RUCC']),\
                                                            'dense_diff':float(row1['pop_density']) - float(row2['pop_density']),\
                                                            'no_dip_diff':float(row1['no_dip']) - float(row2['no_dip']),\
                                                            'elderly_diff': float(row1['elderly_ratio']) - float(row2['elderly_ratio']),\
                                                            'unemp_diff': float(row1['Unemployment_rate_2018']) - float(row2['Unemployment_rate_2018']),\
                                                            'pct_pov_diff': float(row1['PCTPOV017_2018']) - float(row2['PCTPOV017_2018']),\
                                                            'pop_diff' : float(row1['POP_ESTIMATE_2018']) - float(row2['POP_ESTIMATE_2018']),
                                                            'More_Deaths':0}, ignore_index=True)   
                    elif row1['Deaths']  < row2["Deaths"]:  
                        training_set = training_set.append({'Hosp_diff': int(row1['HospCt']) - int(row2['HospCt']),\
                                                            'Beds_diff': int(row1['Beds']) - int(row2['Beds']),\
                                                            'RUCC_diff': int(row1['RUCC']) - int(row2['RUCC']),\
                                                            'dense_diff':float(row1['pop_density']) - float(row2['pop_density']),\
                                                            'no_dip_diff':float(row1['no_dip']) - float(row2['no_dip']),\
                                                            'elderly_diff': float(row1['elderly_ratio']) - float(row2['elderly_ratio']),\
                                                            'unemp_diff': float(row1['Unemployment_rate_2018']) - float(row2['Unemployment_rate_2018']),\
                                                            'pct_pov_diff': float(row1['PCTPOV017_2018']) - float(row2['PCTPOV017_2018']),\
                                                            'pop_diff' : float(row1['POP_ESTIMATE_2018']) - float(row2['POP_ESTIMATE_2018']),\
                                                            'More_Deaths':1}, ignore_index=True)  
                else:
                    if row1['Deaths']  > row2["Deaths"]:
                        test_set = test_set.append({'Hosp_diff': int(row1['HospCt']) - int(row2['HospCt']),\
                                                            'Beds_diff': int(row1['Beds']) - int(row2['Beds']),\
                                                            'RUCC_diff': int(row1['RUCC']) - int(row2['RUCC']),\
                                                            'dense_diff':float(row1['pop_density']) - float(row2['pop_density']),\
                                                            'no_dip_diff':float(row1['no_dip']) - float(row2['no_dip']),\
                                                            'elderly_diff': float(row1['elderly_ratio']) - float(row2['elderly_ratio']),\
                                                            'unemp_diff': float(row1['Unemployment_rate_2018']) - float(row2['Unemployment_rate_2018']),\
                                                            'pct_pov_diff': float(row1['PCTPOV017_2018']) - float(row2['PCTPOV017_2018']),\
                                                            'pop_diff' : float(row1['POP_ESTIMATE_2018']) - float(row2['POP_ESTIMATE_2018']),\
                                                            'More_Deaths':0}, ignore_index=True)  
                    elif row1['Deaths'] < row2["Deaths"]:  
                        test_set = test_set.append({'Hosp_diff': int(row1['HospCt']) - int(row2['HospCt']),\
                                                            'Beds_diff': int(row1['Beds']) - int(row2['Beds']),\
                                                            'RUCC_diff': int(row1['RUCC']) - int(row2['RUCC']),\
                                                            'dense_diff':float(row1['pop_density']) - float(row2['pop_density']),\
                                                            'no_dip_diff':float(row1['no_dip']) - float(row2['no_dip']),\
                                                            'elderly_diff': float(row1['elderly_ratio']) - float(row2['elderly_ratio']),\
                                                            'unemp_diff': float(row1['Unemployment_rate_2018']) - float(row2['Unemployment_rate_2018']),\
                                                            'pct_pov_diff': float(row1['PCTPOV017_2018']) - float(row2['PCTPOV017_2018']),\
                                                            'pop_diff' : float(row1['POP_ESTIMATE_2018']) - float(row2['POP_ESTIMATE_2018']),\
                                                            'More_Deaths':1}, ignore_index=True)  




In [221]:
print(len(training_set))
print(len(test_set))
print(sum(training_set['More_Deaths'])/len(training_set))
print(sum(test_set['More_Deaths'])/len(test_set))

400
125
0.53
0.504


In [222]:
training_set = training_set.apply(pd.to_numeric) 
test_set = test_set.apply(pd.to_numeric) 
training_set['Hosp_diff'] = (training_set['Hosp_diff'] - np.min(training_set['Hosp_diff']))/(np.max(training_set['Hosp_diff']) - np.min(training_set['Hosp_diff']))
test_set['Hosp_diff'] = (test_set['Hosp_diff'] - np.min(test_set['Hosp_diff']))/(np.max(test_set['Hosp_diff']) - np.min(test_set['Hosp_diff']))


training_set['Beds_diff'] = (training_set['Beds_diff'] - np.min(training_set['Beds_diff']))/(np.max(training_set['Beds_diff']) - np.min(training_set['Beds_diff']))
test_set['Beds_diff'] = (test_set['Beds_diff'] - np.min(test_set['Beds_diff']))/(np.max(test_set['Beds_diff']) - np.min(test_set['Beds_diff']))


training_set['dense_diff'] = (training_set['dense_diff'] - np.min(training_set['dense_diff']))/(np.max(training_set['dense_diff']) - np.min(training_set['dense_diff']))
test_set['dense_diff'] = (test_set['dense_diff'] - np.min(test_set['dense_diff']))/(np.max(test_set['dense_diff']) - np.min(test_set['dense_diff']))

training_set['RUCC_diff'] = (training_set['RUCC_diff'] - np.min(training_set['RUCC_diff']))/(np.max(training_set['RUCC_diff']) - np.min(training_set['RUCC_diff']))
test_set['RUCC_diff'] = (test_set['RUCC_diff'] - np.min(test_set['RUCC_diff']))/(np.max(test_set['RUCC_diff']) - np.min(test_set['RUCC_diff']))

training_set['no_dip_diff'] = (training_set['no_dip_diff'] - np.min(training_set['no_dip_diff']))/(np.max(training_set['no_dip_diff']) - np.min(training_set['no_dip_diff']))
test_set['no_dip_diff'] = (test_set['no_dip_diff'] - np.min(test_set['no_dip_diff']))/(np.max(test_set['no_dip_diff']) - np.min(test_set['no_dip_diff']))

training_set['elderly_diff'] = (training_set['elderly_diff'] - np.min(training_set['elderly_diff']))/(np.max(training_set['elderly_diff']) - np.min(training_set['elderly_diff']))
test_set['elderly_diff'] = (test_set['elderly_diff'] - np.min(test_set['elderly_diff']))/(np.max(test_set['elderly_diff']) - np.min(test_set['elderly_diff']))

training_set['unemp_diff'] = (training_set['unemp_diff'] - np.min(training_set['unemp_diff']))/(np.max(training_set['unemp_diff']) - np.min(training_set['unemp_diff']))
test_set['unemp_diff'] = (test_set['unemp_diff'] - np.min(test_set['unemp_diff']))/(np.max(test_set['unemp_diff']) - np.min(test_set['unemp_diff']))

training_set['pct_pov_diff'] = (training_set['pct_pov_diff'] - np.min(training_set['pct_pov_diff']))/(np.max(training_set['pct_pov_diff']) - np.min(training_set['pct_pov_diff']))
test_set['pct_pov_diff'] = (test_set['pct_pov_diff'] - np.min(test_set['pct_pov_diff']))/(np.max(test_set['pct_pov_diff']) - np.min(test_set['pct_pov_diff']))

training_set['pop_diff'] = (training_set['pop_diff'] - np.min(training_set['pop_diff']))/(np.max(training_set['pop_diff']) - np.min(training_set['pop_diff']))
test_set['pop_diff'] = (test_set['pop_diff'] - np.min(test_set['pop_diff']))/(np.max(test_set['pop_diff']) - np.min(test_set['pop_diff']))

In [223]:
test_features(['Hosp_diff'])
test_features(['Beds_diff'])
test_features(['pop_diff'])
test_features(['dense_diff'])
test_features(['RUCC_diff'])
test_features(['Hosp_diff', 'Beds_diff'])
test_features(['pop_diff', 'Beds_diff'])
test_features(['RUCC_diff', 'Beds_diff'])
test_features(['dense_diff', 'Beds_diff'])
test_features(['RUCC_diff', 'pop_diff', 'dense_diff', 'Beds_diff'])
test_features(['RUCC_diff', 'pop_diff', 'dense_diff', 'Hosp_diff', 'Beds_diff', 'elderly_diff', 'no_dip_diff'])

Features: ['Hosp_diff']
Test Ratio: 0.504
Score: 0.52
Coefficients: [[-2.69153826]]

Features: ['Beds_diff']
Test Ratio: 0.504
Score: 0.584
Coefficients: [[-3.51259627]]

Features: ['pop_diff']
Test Ratio: 0.504
Score: 0.528
Coefficients: [[-2.98924183]]

Features: ['dense_diff']
Test Ratio: 0.504
Score: 0.512
Coefficients: [[-1.08013311]]

Features: ['RUCC_diff']
Test Ratio: 0.504
Score: 0.56
Coefficients: [[2.31061998]]

Features: ['Hosp_diff', 'Beds_diff']
Test Ratio: 0.504
Score: 0.536
Coefficients: [[-1.65249099 -2.99811941]]

Features: ['pop_diff', 'Beds_diff']
Test Ratio: 0.504
Score: 0.552
Coefficients: [[-2.01332239 -2.89508675]]

Features: ['RUCC_diff', 'Beds_diff']
Test Ratio: 0.504
Score: 0.656
Coefficients: [[ 2.47925627 -3.70749292]]

Features: ['dense_diff', 'Beds_diff']
Test Ratio: 0.504
Score: 0.56
Coefficients: [[-0.15460593 -3.46173912]]

Features: ['RUCC_diff', 'pop_diff', 'dense_diff', 'Beds_diff']
Test Ratio: 0.504
Score: 0.616
Coefficients: [[ 2.61130722 -2.16106

