In [1]:
import csv
import urllib.request
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import plotly
import datetime

In [None]:
## CREATE MAIN DATAFRAME COMBINING NYT AND POPULATION

In [2]:
nycounties = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv')
nycounties = nycounties.sort_values(by = ['county','date'])
popdata = pd.read_csv('population.csv').drop(['Unnamed: 0'], axis = 1)
popdata.head(5)

Unnamed: 0,county,state,2010_pop,2019_pop,area,density
0,Autauga,Alabama,54571,55869,594.436066,93.986558
1,Baldwin,Alabama,182265,223234,1589.78405,140.417813
2,Barbour,Alabama,27457,24686,884.876288,27.897685
3,Bibb,Alabama,22915,22394,622.58234,35.969539
4,Blount,Alabama,57322,57826,644.775904,89.683873


In [6]:
df = popdata.merge(nycounties, on=['county','state'])
df.head(5)

Unnamed: 0,county,state,2010_pop,2019_pop,area,density,date,fips,cases,deaths
0,Autauga,Alabama,54571,55869,594.436066,93.986558,2020-03-24,1001.0,1,0
1,Autauga,Alabama,54571,55869,594.436066,93.986558,2020-03-25,1001.0,4,0
2,Autauga,Alabama,54571,55869,594.436066,93.986558,2020-03-26,1001.0,6,0
3,Autauga,Alabama,54571,55869,594.436066,93.986558,2020-03-27,1001.0,6,0
4,Autauga,Alabama,54571,55869,594.436066,93.986558,2020-03-28,1001.0,6,0


In [3]:
mobility = pd.read_csv('https://raw.githubusercontent.com/vitorbaptista/google-covid19-mobility-reports/master/data/processed/mobility_reports.csv',
                      dtype = {'region':'str', 'subregion':'str'})
mobility = mobility[mobility.updated_at == max(mobility.updated_at)]
mobility['subregion'] = mobility['subregion'].str.replace(' County', '')
mobility = mobility.drop(['retail_and_recreation_not_enough_data','grocery_and_pharmacy_not_enough_data','parks_not_enough_data'
                         ,'transit_stations_not_enough_data','workplaces_not_enough_data','residential_not_enough_data'], axis = 1)
mobility.head(5)

Unnamed: 0,region,subregion,updated_at,retail_and_recreation,grocery_and_pharmacy,parks,transit_stations,workplaces,residential
2,Afghanistan,,2020-04-11,-0.46,-0.32,-0.12,-0.5,-0.38,0.16
5,Alabama,,2020-04-11,-0.39,0.03,0.11,-0.35,-0.29,0.1
8,Alabama,Autauga,2020-04-11,-0.37,0.16,-0.14,,-0.3,0.2
11,Alabama,Baldwin,2020-04-11,-0.44,-0.1,-0.42,-0.39,-0.28,0.11
14,Alabama,Barbour,2020-04-11,-0.18,-0.2,,,-0.31,


In [7]:
df = df.merge(mobility, left_on=['county','state'], right_on=['subregion','region'], how='left')
df = df.drop(['region','subregion','updated_at','grocery_and_pharmacy','transit_stations','residential'], axis = 1)
df.head(5)

Unnamed: 0,county,state,2010_pop,2019_pop,area,density,date,fips,cases,deaths,retail_and_recreation,parks,workplaces
0,Autauga,Alabama,54571,55869,594.436066,93.986558,2020-03-24,1001.0,1,0,-0.37,-0.14,-0.3
1,Autauga,Alabama,54571,55869,594.436066,93.986558,2020-03-25,1001.0,4,0,-0.37,-0.14,-0.3
2,Autauga,Alabama,54571,55869,594.436066,93.986558,2020-03-26,1001.0,6,0,-0.37,-0.14,-0.3
3,Autauga,Alabama,54571,55869,594.436066,93.986558,2020-03-27,1001.0,6,0,-0.37,-0.14,-0.3
4,Autauga,Alabama,54571,55869,594.436066,93.986558,2020-03-28,1001.0,6,0,-0.37,-0.14,-0.3


In [None]:
## CREATE CASES AND DEATHS DATAFRAMES THAT INDEX BY DAY

In [38]:
newd = df.set_index(['county', 'state', '2019_pop', 'density','retail_and_recreation','workplaces',
                     'date'])['cases'].unstack().reset_index()
cases_df = pd.DataFrame()
for i in range(0,len(newd)):
    row = newd.iloc[i,0:6].append(newd.iloc[i,6:].dropna()).reset_index().iloc[:, 1]
    cases_df = cases_df.append(row)

cases_df.rename(columns=lambda x: str(x-6), inplace=True)
cases_df = cases_df.rename({"-6":'county', "-5":'state', "-4":'population', "-3":'density','-2':'mob_rnr',
                           '-1':'mob_work'}, axis = 1)
cases_df.head(5)

Unnamed: 0,county,state,population,density,mob_rnr,mob_work,0,1,2,3,...,93,94,95,96,97,98,99,100,101,102
0,Abbeville,South Carolina,24527.0,50.005713,-0.3,-0.25,1.0,1.0,1.0,1.0,...,,,,,,,,,,
1,Accomack,Virginia,32316.0,71.893852,-0.24,-0.19,1.0,1.0,1.0,1.0,...,,,,,,,,,,
2,Ada,Idaho,481587.0,457.531999,-0.45,-0.4,1.0,2.0,2.0,2.0,...,,,,,,,,,,
3,Adair,Iowa,7152.0,12.563444,-0.44,-0.43,1.0,1.0,1.0,1.0,...,,,,,,,,,,
4,Adair,Kentucky,19202.0,47.379272,-0.28,-0.52,3.0,3.0,4.0,4.0,...,,,,,,,,,,


In [40]:
newd1 = df.set_index(['county', 'state', '2019_pop', 'density','retail_and_recreation','workplaces',
                     'date'])['deaths'].unstack().reset_index()
deaths_df = pd.DataFrame()
for i in range(0,len(newd1)):
    row = newd1.iloc[i,0:6].append(newd.iloc[i,6:].dropna()).reset_index().iloc[:, 1]
    deaths_df = deaths_df.append(row)

deaths_df.rename(columns=lambda x: str(x-6), inplace=True)
deaths_df = deaths_df.rename({"-6":'county', "-5":'state', "-4":'population', "-3":'density','-2':'mob_rnr',
                           '-1':'mob_work'}, axis = 1)
deaths_df.head(5)

Unnamed: 0,county,state,population,density,mob_rnr,mob_work,0,1,2,3,...,93,94,95,96,97,98,99,100,101,102
0,Abbeville,South Carolina,24527.0,50.005713,-0.3,-0.25,1.0,1.0,1.0,1.0,...,,,,,,,,,,
1,Accomack,Virginia,32316.0,71.893852,-0.24,-0.19,1.0,1.0,1.0,1.0,...,,,,,,,,,,
2,Ada,Idaho,481587.0,457.531999,-0.45,-0.4,1.0,2.0,2.0,2.0,...,,,,,,,,,,
3,Adair,Iowa,7152.0,12.563444,-0.44,-0.43,1.0,1.0,1.0,1.0,...,,,,,,,,,,
4,Adair,Kentucky,19202.0,47.379272,-0.28,-0.52,3.0,3.0,4.0,4.0,...,,,,,,,,,,


In [None]:
## DEFINE DISTANCE AND PREDICTION FUNCTION, CURRENTLY 1-DAY FUTURE PREDICTIONS

In [51]:
def dist_rows(df, x, cols):
    dists = []
    for i in range(0, len(df)):
        dist = (abs(x[cols]-df.iloc[i][cols]) / x[cols]).sum() / len((x[cols]-df.iloc[i][cols]).dropna())
        # divides distance by column values to normalize
        dists += [dist]
    return(dists)


def pred(df, row, k):
    n = int(row.dropna().index[-1])
    #what day the specified row is on
    cols = ['density','population','mob_rnr','mob_work'] + list(map(str, range(n-20,n)))
    df = df[df[str(n+1)] >= 0]
    dists = np.array(dist_rows(df, row, cols))
    #only use instances that have as many days as row
    nbrs = list(dists.argsort()[1:k+1])
    #take off 1st bc that's the row itself
    preds = df.iloc[nbrs][str(n+1)]
    wgt_preds = [x*y for x,y in zip(preds, [1/(nbrs.index(x)+1) for x in nbrs])]
    #get predicted values for neighbors
    return(sum(wgt_preds) / sum([1/(nbrs.index(x)+1) for x in nbrs]))


In [None]:
## GET PREDICTIONS FOR PENNSYLVANIA COUNTIES

In [89]:
penn_ind = cases_df.index[cases_df.state == 'Pennsylvania']
cases_pred_list, deaths_pred_list = [], []
k=20
for row in penn_ind:
    cases_prd = pred(cases_df, cases_df.iloc[row], k)
    deaths_prd = pred(deaths_df, deaths_df.iloc[row], k)
    #print(prd)
    cases_pred_list += [cases_prd]
    deaths_pred_list += [deaths_prd]


In [90]:
#len(pred_list), len(cases_df.iloc[penn_ind].population)
#cases_pred_ls = [x*y for x,y in zip(cases_pred_list, cases_df.iloc[penn_ind].population)]
#deaths_pred_ls = [x*y for x,y in zip(deaths_pred_list, cases_df.iloc[penn_ind].population)]
dates = [(datetime.datetime.today() + datetime.timedelta(days=1)).strftime('%m/%d/%y')] * len(pred_list)
pred_df = pd.DataFrame({'County' : cases_df.iloc[penn_ind].county, 'State' : cases_df.iloc[penn_ind].state,
                        'Cumulative Cases Prediction' : [float(x) for x in cases_pred_list],
                       'Cumulative Deaths Prediction' : [float(x) for x in deaths_pred_list],
                       'Date' : dates})

pred_df = pred_df.reset_index().drop(['index'], axis = 1)
pred_df.head(10)

Unnamed: 0,County,State,Cumulative Cases Prediction,Cumulative Deaths Prediction,Date
0,Adams,Pennsylvania,277.0,60.0,05/01/20
1,Allegheny,Pennsylvania,75.25,27.166667,05/01/20
2,Armstrong,Pennsylvania,751.25,5.25,05/01/20
3,Beaver,Pennsylvania,785.5,43.142857,05/01/20
4,Bedford,Pennsylvania,126.0,6.666667,05/01/20
5,Berks,Pennsylvania,585.0,34.8,05/01/20
6,Blair,Pennsylvania,148.6,8.4,05/01/20
7,Bradford,Pennsylvania,742.0,17.444444,05/01/20
8,Bucks,Pennsylvania,93.142857,41.25,05/01/20
9,Butler,Pennsylvania,382.857143,28.5,05/01/20


In [8]:
#USE THIS TO GET CSV OF PREDICTIONS

'''date = datetime.today().strftime('%m%d')
pred_df.to_csv('pennpreds'+str(date)+'.csv')'''

In [None]:
## MAY ADD MOBILITY AS VARIABLE IF TIME PERMITS

In [None]:
"""mobility = pd.read_csv('https://raw.githubusercontent.com/vitorbaptista/google-covid19-mobility-reports/master/data/processed/mobility_reports.csv',
                      dtype = {'region':'str', 'subregion':'str'})
mobility = mobility[mobility.updated_at == max(mobility.updated_at)]
mobility['subregion'] = mobility['subregion'].str.replace(' County', '')
mobility = mobility.drop(['retail_and_recreation_not_enough_data','grocery_and_pharmacy_not_enough_data','parks_not_enough_data'
                         ,'transit_stations_not_enough_data','workplaces_not_enough_data','residential_not_enough_data'], axis = 1)
mobility.head(5)"""

In [None]:
"""df = df.merge(mobility, left_on=['county','state'], right_on=['subregion','region'], how='left')
df = df.drop(['region','subregion'], axis = 1)
df.dtypes"""