In [None]:
import os
import numpy as np
import pandas as pd

import sklearn.linear_model
import sklearn.metrics

# import plotting libraries
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
plt.style.use('seaborn') # pretty matplotlib plots

import seaborn as sns
sns.set('notebook', style='whitegrid', font_scale=1.25)

In [None]:
#DATA_DIR = 'data/'
#x_tr_M = np.loadtxt(os.path.join(DATA_DIR, 'SVI2018_US_COUNTY.csv'), delimiter=',', skiprows=1)

# Load the county-level SVI 2018 table from the local data/ folder and preview its first rows.
data_svi2018_us_per_county = pd.read_csv("data/SVI2018_US_COUNTY.csv") 
data_svi2018_us_per_county.head()

In [None]:
# Load the per-county COVID cases/deaths table and preview its first rows.
# 'fips' is read as a string (presumably to keep leading zeros -- TODO confirm)
# and 'date' is parsed into datetimes so it can be compared and sorted as dates.
data_covid_cases_deaths_county = pd.read_csv("data/covid_cases_deaths_county.csv", dtype={"fips": str}, parse_dates=['date']) 
data_covid_cases_deaths_county.head()

In [None]:
# Count the rows that report more than 10,000 deaths (missing values never satisfy the comparison).
int((data_covid_cases_deaths_county['deaths'] > 10000.0).sum())


In [None]:
# Keep only the rows recorded on 2021-01-20 and display them.
rslt_df = data_covid_cases_deaths_county.loc[
    data_covid_cases_deaths_county['date'].eq('2021-01-20')
]

rslt_df

In [None]:
# Inspect the parsed 'date' column.
data_covid_cases_deaths_county['date']

# GNNs

In [None]:
# NOTE: despite its name, `fst_day` holds the *latest* (most recent) date in the
# dataset, because it is the maximum of the 'date' column. The name is kept
# because later cells reference it.
# Series.max() is vectorised and skips missing dates (NaT), unlike the builtin max().
fst_day = data_covid_cases_deaths_county['date'].max()
fst_day

In [None]:
# Snapshot of every row whose date equals `fst_day`.
rslt_fst_day_df = data_covid_cases_deaths_county.loc[
    data_covid_cases_deaths_county['date'].eq(fst_day)
]
rslt_fst_day_df

In [None]:
# Distinct FIPS codes present in the snapshot; the first shape still counts a missing (NaN) code if there is one.
unique_fips_fst_day = np.asarray(rslt_fst_day_df['fips'].unique())
print(unique_fips_fst_day.shape)
# Drop the missing code and keep the remaining codes as a NumPy string array
# (the conversion to integers happens in a later cell, not here).
unique_fips_fst_day = np.asarray(unique_fips_fst_day[pd.notna(unique_fips_fst_day)], dtype=str)
print(unique_fips_fst_day.shape)

In [None]:
# Inspect the cleaned array of FIPS codes.
unique_fips_fst_day

In [None]:
# Give every county a consecutive integer node identifier, starting from 0.
ids_nodes = np.arange(len(unique_fips_fst_day))
ids_nodes

In [None]:
# Deaths reported for each county in `unique_fips_fst_day`, in the same order.
# A single lookup table (FIPS -> deaths of the first matching snapshot row) replaces
# re-filtering the whole snapshot DataFrame once per county.
first_row_per_fips = rslt_fst_day_df.drop_duplicates(subset='fips', keep='first')
deaths_by_fips = dict(zip(first_row_per_fips['fips'], first_row_per_fips['deaths']))

# Counties with no row in the snapshot get 0.0; a missing (NaN) deaths value is kept as NaN.
death_per_county = np.asarray(
    [deaths_by_fips.get(_id, 0.0) for _id in unique_fips_fst_day],
    dtype=np.float64,
)
death_per_county

In [None]:
# Spot check: deaths reported in the snapshot for the county with FIPS '1001'.
# NOTE(review): 'fips' is loaded as a string, so if the CSV zero-pads its codes
# (e.g. '01001') this lookup returns an empty Series -- confirm the expected format.
rslt_fst_day_df.loc[rslt_fst_day_df['fips'] == '1001', 'deaths']

In [None]:
# Convert the FIPS codes from strings to int64 so the edges can be managed more easily later.
# This rebinds `unique_fips_fst_day`; from here on it holds integers, not strings.
unique_fips_fst_day = unique_fips_fst_day.astype(np.int64)
unique_fips_fst_day

In [None]:
# Place id, fips and deaths side by side as the three columns of one 2-D array.
# NumPy promotes the integer and float inputs to a single common dtype.
stack = np.column_stack([ids_nodes, unique_fips_fst_day, death_per_county])

In [None]:
# Sanity check: one row per county, three columns (id, fips, deaths).
stack.shape

In [None]:
# Scratch check: the node ids viewed as a single-column 2-D array.
ids_nodes.reshape((ids_nodes.shape[0],1))

In [None]:
# Node table: one row per county with its node id, FIPS code and death count.
# It is built column by column so that each column keeps its own dtype; going through
# the combined `stack` array would turn the integer 'id' and 'fips' columns into floats.
nodes_df = pd.DataFrame({
    "id": ids_nodes,
    "fips": unique_fips_fst_day,
    "deaths": death_per_county,
})
print(nodes_df)

In [None]:
# The table above is the data we are going to use for our nodes (1st layer).
# Now let's construct the edge data, taken from the county adjacency table.
# NOTE(review): the dtype mapping targets a column named 'fips', but the columns used
# below are 'fipscounty' and 'fipsneighbor' -- it may have no effect on them; confirm.

county_adjacency_df = pd.read_csv("data/county_adjacency2010.csv", dtype={"fips": str}) 
county_adjacency_df

In [None]:
# Keep only the two FIPS columns that describe an adjacency (county -> neighbour).
county_adjacency_only_fips = county_adjacency_df.loc[:, ["fipscounty", "fipsneighbor"]]

In [None]:
# Keep only the edges that link two different counties (i.e. drop self-loops).
county_adjacency_only_fips = county_adjacency_only_fips.loc[
    county_adjacency_only_fips['fipscounty'].ne(county_adjacency_only_fips['fipsneighbor'])
]
county_adjacency_only_fips

In [None]:
# Source-side FIPS code of every remaining adjacency row, as a NumPy array.
# (The variable name is a typo of "fipscounty"; it is kept because a later cell uses it.)
fipscouty = county_adjacency_only_fips['fipscounty'].to_numpy()
fipscouty.shape

In [None]:
# Neighbour-side FIPS code of every remaining adjacency row, aligned position by position with `fipscouty`.
fipsneighbor = county_adjacency_only_fips['fipsneighbor'].to_numpy()
fipsneighbor.shape

In [None]:
# Translate each (county, neighbour) FIPS pair into the node ids assigned in nodes_df.
# A dict (FIPS -> id of the first matching node row) gives constant-time lookups,
# instead of filtering nodes_df twice for every edge.
first_node_per_fips = nodes_df.drop_duplicates(subset='fips', keep='first')
fips_to_node_id = dict(zip(first_node_per_fips['fips'], first_node_per_fips['id']))

ids_src_nodes = list()
ids_dst_nodes = list()
for src_fips, dst_fips in zip(fipscouty, fipsneighbor):
    src_id = fips_to_node_id.get(src_fips)
    dst_id = fips_to_node_id.get(dst_fips)

    # Skip adjacency rows where either endpoint has no matching node.
    if src_id is not None and dst_id is not None:
        ids_src_nodes.append(src_id)
        ids_dst_nodes.append(dst_id)

# Show the edge count and a short preview rather than dumping the full lists.
print(f"{len(ids_src_nodes)} edges matched")
print(ids_src_nodes[:10])
print('--')
print(ids_dst_nodes[:10])

In [None]:
# Convert both endpoint lists to int64 arrays.
ids_src_nodes = np.asarray(ids_src_nodes, dtype=np.int64)
ids_dst_nodes = np.asarray(ids_dst_nodes, dtype=np.int64)

# Pair them up as the two columns of the edge table: one row per (src, dst) edge.
stack_edges = np.column_stack((ids_src_nodes, ids_dst_nodes))
edges_df = pd.DataFrame(stack_edges, columns=["src", "dst"])
edges_df

In [None]:
# We end up with fewer edges than rows in the adjacency dataframe, because some county
# ids in the adjacency dataframe didn't match any county in the nodes dataset.

In [None]:
import torch
import dgl

# now let's create the dgl graph
src = edges_df['src'].to_numpy()
dst = edges_df['dst'].to_numpy()

# Create a DGL graph from a pair of numpy arrays.
# num_nodes is passed explicitly: otherwise DGL infers it from the largest node id that
# appears in an edge, so counties without any edge at the end of nodes_df would be
# missing and node features built from nodes_df would no longer match the graph size.
g = dgl.graph((src, dst), num_nodes=len(nodes_df))

# Printing a graph gives some meta information such as the number of nodes and edges.
print(g)

In [None]:
import networkx as nx
# The DGL graph stores directed edges; collapse it to an undirected NetworkX graph,
# which is all the visualisation needs.
nx_g = g.to_networkx().to_undirected()
# The Kamada-Kawai layout usually looks good for arbitrary graphs.
pos = nx.kamada_kawai_layout(nx_g)
nx.draw(
    nx_g,
    pos,
    with_labels=True,
    node_color=[[.7, .7, .7]],
)

In [None]:
# Scratch check of the raw deaths values (marked for removal by the author).
nodes_df['deaths'].to_numpy() # to remove

In [None]:
# now I will load the node features, that is, the number of deaths.
import torch.nn.functional as F

# Prepare the deaths node feature: each county's death count divided by the largest
# count, with NaNs ignored when taking that maximum.
# NOTE(review): if the maximum is 0 the division produces NaN/inf -- confirm that cannot happen.
max_n_deaths = np.nanmax(nodes_df['deaths'].to_numpy())
deaths = torch.tensor(nodes_df['deaths'].to_numpy()).float() / max_n_deaths
print(deaths)

In [None]:
# Feed the features to the graph
g.ndata['deaths'] = deaths # store the scaled deaths tensor created above under the 'deaths' key of the ndata dictionary.
print(g) # in the output, shape=() is the per-node feature shape: here each county carries a single float, its scaled death count.

In [None]:
# Plan: take a copy of the nodes df and add multiple layers of temporal data.
# Don't take the first day when covid appeared, but some later date
# (TODO: the original note was left unfinished here -- decide which date).
nodes_df

In [None]:
# we're gonna create the nodes from day Sept 14 and then add on top of that dataframe



In [None]:
# Days for which temporal node layers will be built.
# NOTE(review): 2020-09-18 is absent from this list -- confirm whether the gap is intentional.
days_df = pd.to_datetime(
    [
        '2020-09-15',
        '2020-09-16',
        '2020-09-17',
        '2020-09-19',
    ]
)
# Same dates as a NumPy datetime64 array (note that `days_df` is a DatetimeIndex, not a DataFrame).
days_np_arr = np.array(days_df, dtype=np.datetime64)

days_np_arr

In [None]:
# Print the rows recorded on each of the selected days, one day at a time.
for day in days_np_arr:
    rslt_day_i_df = data_covid_cases_deaths_county.loc[
        data_covid_cases_deaths_county['date'].eq(day)
    ]
    print(rslt_day_i_df)



In [None]:
# more ideas:
# add also number of cases! not only deaths
# in order to give the doctors intuition on where they should send personnel
# and resources to treat the patients.
