In [None]:
import os
import numpy as np
import pandas as pd

import sklearn.linear_model
import sklearn.metrics

# import plotting libraries
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
plt.style.use('seaborn') # pretty matplotlib plots

import seaborn as sns
sns.set('notebook', style='whitegrid', font_scale=1.25)

In [None]:
#DATA_DIR = 'data/'
#x_tr_M = np.loadtxt(os.path.join(DATA_DIR, 'SVI2018_US_COUNTY.csv'), delimiter=',', skiprows=1)

data_svi2018_us_per_county = pd.read_csv("data/SVI2018_US_COUNTY.csv") 
data_svi2018_us_per_county.head()

In [129]:
data_covid_cases_deaths_county = pd.read_csv("data/covid_cases_deaths_county.csv", dtype={"fips": str}, parse_dates=['date']) 
data_covid_cases_deaths_county.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061,1,0.0
1,2020-01-22,Snohomish,Washington,53061,1,0.0
2,2020-01-23,Snohomish,Washington,53061,1,0.0
3,2020-01-24,Cook,Illinois,17031,1,0.0
4,2020-01-24,Snohomish,Washington,53061,1,0.0


In [63]:
np.count_nonzero(np.asarray([data_covid_cases_deaths_county['deaths']>10000.0]))


323

In [64]:
rslt_df = data_covid_cases_deaths_county[data_covid_cases_deaths_county['date'] == '2021-01-20']
  
rslt_df

Unnamed: 0,date,county,state,fips,cases,deaths
946480,2021-01-20,Autauga,Alabama,01001,5257,55.0
946481,2021-01-20,Baldwin,Alabama,01003,16653,184.0
946482,2021-01-20,Barbour,Alabama,01005,1760,36.0
946483,2021-01-20,Bibb,Alabama,01007,2188,47.0
946484,2021-01-20,Blount,Alabama,01009,5376,83.0
...,...,...,...,...,...,...
949721,2021-01-20,Sweetwater,Wyoming,56037,3347,28.0
949722,2021-01-20,Teton,Wyoming,56039,2788,5.0
949723,2021-01-20,Uinta,Wyoming,56041,1852,10.0
949724,2021-01-20,Washakie,Wyoming,56043,845,24.0


In [65]:
data_covid_cases_deaths_county['date']

0        2020-01-21
1        2020-01-22
2        2020-01-23
3        2020-01-24
4        2020-01-24
            ...    
978926   2021-01-29
978927   2021-01-29
978928   2021-01-29
978929   2021-01-29
978930   2021-01-29
Name: date, Length: 978931, dtype: datetime64[ns]

# GNNs

In [41]:
# load nodes included info will be an id that starts from 0, the fips identificator, and the number of deaths.
# first we create a dataframe with all the county
df_fips = {fips: data_covid_cases_deaths_county[data_covid_cases_deaths_county['fips'] == fips] for fips in data_covid_cases_deaths_county['fips'].unique()}

# load county adjacency info
df_fips

{'53061':               date     county       state   fips  cases  deaths
 0       2020-01-21  Snohomish  Washington  53061      1     0.0
 1       2020-01-22  Snohomish  Washington  53061      1     0.0
 2       2020-01-23  Snohomish  Washington  53061      1     0.0
 4       2020-01-24  Snohomish  Washington  53061      1     0.0
 7       2020-01-25  Snohomish  Washington  53061      1     0.0
 ...            ...        ...         ...    ...    ...     ...
 965793  2021-01-25  Snohomish  Washington  53061  27995   470.0
 969037  2021-01-26  Snohomish  Washington  53061  28058   472.0
 972282  2021-01-27  Snohomish  Washington  53061  28174   478.0
 975527  2021-01-28  Snohomish  Washington  53061  28328   483.0
 978772  2021-01-29  Snohomish  Washington  53061  28373   488.0
 
 [375 rows x 6 columns],
 '17031':               date county     state   fips   cases  deaths
 3       2020-01-24   Cook  Illinois  17031       1     0.0
 6       2020-01-25   Cook  Illinois  17031       1    

In [59]:
arr = np.asarray(list(df_fips.keys()))
arr.shape

(3219,)

In [58]:
# aparently there are nans in this list but I cannot find them using numpy.isnan(myarray).any() so I remove them by doing:
arr2 = np.asarray([int(x) for x in arr if str(x) != 'nan'])
arr2.shape

(3218,)

In [75]:
fst_day = max(data_covid_cases_deaths_county['date']) # the earliest day in the dataset
fst_day

Timestamp('2021-01-29 00:00:00')

In [84]:
unique_fips_fst_day = np.asarray([fips for fips in rslt_fst_day_df['fips'].unique()])
print(unique_fips_fst_day.shape)
unique_fips_fst_day= np.asarray([x for x in unique_fips_fst_day if str(x) != 'nan']) # converting to int and removing nans
print(unique_fips_fst_day.shape)

(3219,)
(3218,)


In [85]:
unique_fips_fst_day

array(['01001', '01003', '01005', ..., '56041', '56043', '56045'],
      dtype='<U5')

In [86]:
# now adding a column with integer identificator starting from 0
ids_nodes = np.arange(unique_fips_fst_day.shape[0])
ids_nodes

array([   0,    1,    2, ..., 3215, 3216, 3217])

In [98]:
death_per_county = list()
for _id in unique_fips_fst_day:
    death_per_county.append(rslt_fst_day_df.loc[rslt_fst_day_df['fips'] == _id, 'deaths'].to_numpy()[0])

death_per_county = np.asarray(death_per_county, dtype = np.float64)
death_per_county

array([ 69., 224.,  40., ...,  12.,  25.,   4.])

In [94]:
rslt_fst_day_df.loc[rslt_fst_day_df['fips'] == '01005', 'deaths'].to_numpy()[0]

40.0

In [123]:
# now we can put together id, fips and deaths in a dataframe
stack = np.hstack([ids_nodes.reshape((ids_nodes.shape[0],1)), unique_fips_fst_day.reshape((ids_nodes.shape[0],1)), death_per_county.reshape((ids_nodes.shape[0],1))])

In [124]:
stack.shape

(3218, 3)

In [118]:
ids_nodes.reshape((ids_nodes.shape[0],1))

array([[   0],
       [   1],
       [   2],
       ...,
       [3215],
       [3216],
       [3217]])

In [125]:
df = pd.DataFrame(data=stack, columns=["id", "fips", "deaths"])
print(df)

        id   fips deaths
0        0  01001   69.0
1        1  01003  224.0
2        2  01005   40.0
3        3  01007   51.0
4        4  01009   98.0
...    ...    ...    ...
3213  3213  56037   32.0
3214  3214  56039    6.0
3215  3215  56041   12.0
3216  3216  56043   25.0
3217  3217  56045    4.0

[3218 rows x 3 columns]


In [126]:
# so the above is the data we are going to use for our nodes (1st layer).
# now let's construct the edges data. We will take it from the counties adjacency data

county_adjacency_df = pd.read_csv("data/county_adjacency2010.csv", dtype={"fips": str}) 
county_adjacency_df

Unnamed: 0,countyname,fipscounty,neighborname,fipsneighbor
0,"Autauga County, AL",1001,"Autauga County, AL",1001
1,"Autauga County, AL",1001,"Chilton County, AL",1021
2,"Autauga County, AL",1001,"Dallas County, AL",1047
3,"Autauga County, AL",1001,"Elmore County, AL",1051
4,"Autauga County, AL",1001,"Lowndes County, AL",1085
...,...,...,...,...
22195,"St. Croix Island, VI",78010,"St. Croix Island, VI",78010
22196,"St. John Island, VI",78020,"St. John Island, VI",78020
22197,"St. John Island, VI",78020,"St. Thomas Island, VI",78030
22198,"St. Thomas Island, VI",78030,"St. John Island, VI",78020
