In [1]:
START_DATE = "02/29/2020"
END_DATE = "05/30/2020"
TRAIN_SPLIT_IDX = 120
TIME_WINDOW_SIZE = 7
DS_LABEL = "test_gen"

In [2]:
import random
import torch
import math
import pandas as pd
import pathlib


def getDateRange(start, end):
    start_date  = pd.to_datetime(start)
    end_date    = pd.to_datetime(end)
    dates = pd.date_range(start=start_date, end=end_date, freq='D')
    return dates



In [3]:
# by borough

death_df = pd.read_csv("https://raw.githubusercontent.com/nychealth/coronavirus-data/master/trends/deaths-by-day.csv")
case_df = pd.read_csv("https://raw.githubusercontent.com/nychealth/coronavirus-data/master/trends/cases-by-day.csv")

death_df['date_of_interest'] = pd.to_datetime(death_df['date_of_interest'])
case_df['date_of_interest'] = pd.to_datetime(case_df['date_of_interest'])

In [4]:
BOROUGH_FIPS_MAP = {
    'BX' : 36005,
    'BK' : 36047,
    'MN' : 36061,
    'QN' : 36081,
    'SI' : 36085,
}

In [5]:
subset_cols = ['date_of_interest', 'BX_DEATH_COUNT', 'BX_DEATH_COUNT_7DAY_AVG',
       'BK_DEATH_COUNT', 'BK_DEATH_COUNT_7DAY_AVG', 'MN_DEATH_COUNT',
       'MN_DEATH_COUNT_7DAY_AVG', 'QN_DEATH_COUNT', 'QN_DEATH_COUNT_7DAY_AVG',
       'SI_DEATH_COUNT', 'SI_DEATH_COUNT_7DAY_AVG']
death_subset_df = death_df.loc[(START_DATE <= death_df['date_of_interest']) & (death_df['date_of_interest'] <= END_DATE), subset_cols]
death_subset_df = pd.melt(death_subset_df, id_vars=['date_of_interest'])
death_subset_df[['borough', 'metric']] = death_subset_df['variable'].str.split('_', n=1, expand=True)
death_subset_df = death_subset_df.pivot(index=['date_of_interest','borough'], columns='metric', values='value').reset_index()

death_subset_df['FIPS'] = death_subset_df['borough'].map(BOROUGH_FIPS_MAP)

death_subset_df['node_key'] = death_subset_df['date_of_interest'].astype('str') + "-" + death_subset_df['FIPS'].astype(str)
long_cols = ['date_of_interest', 'FIPS', 'node_key', 'DEATH_COUNT', 'DEATH_COUNT_7DAY_AVG']
death_subset_df = death_subset_df[long_cols]
death_subset_df.head()

metric,date_of_interest,FIPS,node_key,DEATH_COUNT,DEATH_COUNT_7DAY_AVG
0,2020-02-29,36047,2020-02-29-36047,0,0
1,2020-02-29,36005,2020-02-29-36005,0,0
2,2020-02-29,36061,2020-02-29-36061,0,0
3,2020-02-29,36081,2020-02-29-36081,0,0
4,2020-02-29,36085,2020-02-29-36085,0,0


In [6]:
# compute deltas
death_subset_df = death_subset_df.sort_values(by=['date_of_interest', 'FIPS'])

death_subset_df['DEATH_DELTA'] = death_subset_df.groupby(['FIPS'])['DEATH_COUNT'].diff().fillna(0)
death_subset_df


metric,date_of_interest,FIPS,node_key,DEATH_COUNT,DEATH_COUNT_7DAY_AVG,DEATH_DELTA
1,2020-02-29,36005,2020-02-29-36005,0,0,0.0
0,2020-02-29,36047,2020-02-29-36047,0,0,0.0
2,2020-02-29,36061,2020-02-29-36061,0,0,0.0
3,2020-02-29,36081,2020-02-29-36081,0,0,0.0
4,2020-02-29,36085,2020-02-29-36085,0,0,0.0
...,...,...,...,...,...,...
456,2020-05-30,36005,2020-05-30-36005,14,11,5.0
455,2020-05-30,36047,2020-05-30-36047,10,14,-8.0
457,2020-05-30,36061,2020-05-30-36061,14,9,8.0
458,2020-05-30,36081,2020-05-30-36081,17,14,3.0


In [7]:
subset_cols = ['date_of_interest',
               'BX_CASE_COUNT', 'BX_CASE_COUNT_7DAY_AVG',
               'BK_CASE_COUNT', 'BK_CASE_COUNT_7DAY_AVG',
               'MN_CASE_COUNT', 'MN_CASE_COUNT_7DAY_AVG',
               'QN_CASE_COUNT', 'QN_CASE_COUNT_7DAY_AVG',
               'SI_CASE_COUNT', 'SI_CASE_COUNT_7DAY_AVG']
case_subset_df = case_df.loc[(START_DATE <= case_df['date_of_interest']) & (case_df['date_of_interest'] <= END_DATE), subset_cols]
case_subset_df = pd.melt(case_subset_df, id_vars=['date_of_interest'])
case_subset_df[['borough', 'metric']] = case_subset_df['variable'].str.split('_', n=1, expand=True)
case_subset_df = case_subset_df.pivot(index=['date_of_interest','borough'], columns='metric', values='value').reset_index()

case_subset_df['FIPS'] = case_subset_df['borough'].map(BOROUGH_FIPS_MAP)
case_subset_df['node_key'] = case_subset_df['date_of_interest'].astype('str') + "-" + case_subset_df['FIPS'].astype(str)
long_cols = ['date_of_interest', 'FIPS', 'node_key', 'CASE_COUNT', 'CASE_COUNT_7DAY_AVG']

case_subset_df = case_subset_df[long_cols]
case_subset_df.head()

metric,date_of_interest,FIPS,node_key,CASE_COUNT,CASE_COUNT_7DAY_AVG
0,2020-02-29,36047,2020-02-29-36047,0,0
1,2020-02-29,36005,2020-02-29-36005,0,0
2,2020-02-29,36061,2020-02-29-36061,1,0
3,2020-02-29,36081,2020-02-29-36081,0,0
4,2020-02-29,36085,2020-02-29-36085,0,0


In [8]:
# compute deltas
case_subset_df = case_subset_df.sort_values(by=['date_of_interest', 'FIPS'])

case_subset_df['CASE_DELTA'] = case_subset_df.groupby(['FIPS'])['CASE_COUNT'].diff().fillna(0)
case_subset_df.head()

metric,date_of_interest,FIPS,node_key,CASE_COUNT,CASE_COUNT_7DAY_AVG,CASE_DELTA
1,2020-02-29,36005,2020-02-29-36005,0,0,0.0
0,2020-02-29,36047,2020-02-29-36047,0,0,0.0
2,2020-02-29,36061,2020-02-29-36061,1,0,0.0
3,2020-02-29,36081,2020-02-29-36081,0,0,0.0
4,2020-02-29,36085,2020-02-29-36085,0,0,0.0


In [76]:
dates = getDateRange(START_DATE, END_DATE)
fips_list = list(BOROUGH_FIPS_MAP.values())

node_dict = dict()

curr_idx = 0
for f in fips_list:
    for d in dates:
        key_str = f"{f}-{d.strftime('%Y-%m-%d')}"
        node_dict[key_str] = curr_idx
        curr_idx += 1

pd.DataFrame.from_dict(node_dict, orient='index')

Unnamed: 0,0
36005-2020-02-29,0
36005-2020-03-01,1
36005-2020-03-02,2
36005-2020-03-03,3
36005-2020-03-04,4
...,...
36085-2020-05-26,455
36085-2020-05-27,456
36085-2020-05-28,457
36085-2020-05-29,458


In [55]:
deltaT = pd.Timedelta(value=1, unit="D") 

for f in list(BOROUGH_FIPS_MAP.values()):
    for d in dates:
        for dd in range(TIME_WINDOW_SIZE-1):
            prev = pd.to_datetime(d) - deltaT * (dd + 1)
            
            selection_current = (case_subset_df['FIPS'] == f) & (case_subset_df['date_of_interest'] == d)
            selection_prev = (case_subset_df['FIPS'] == f) & (case_subset_df['date_of_interest'] == prev)
            if prev < pd.to_datetime(START_DATE):
                prev_cases = 0
            else:
                prev_cases = case_subset_df.loc[selection_prev, 'CASE_COUNT'].values[0]
            case_subset_df.loc[selection_current, f'CASE_COUNT_PREV_{dd}'] = prev_cases

            selection_current = (death_subset_df['FIPS'] == f) & (death_subset_df['date_of_interest'] == d)
            selection_prev = (death_subset_df['FIPS'] == f) & (death_subset_df['date_of_interest'] == prev)
            if prev < pd.to_datetime(START_DATE):
                prev_deaths = 0
            else:
                prev_deaths = case_subset_df.loc[selection_prev, 'CASE_COUNT'].values[0]
            death_subset_df.loc[selection_current, f'DEATH_COUNT_PREV_{dd}'] = prev_deaths


In [62]:
x_t = case_subset_df.merge(death_subset_df, on=['date_of_interest','FIPS', 'node_key'])
x_t_cols = ['CASE_COUNT',
       'CASE_COUNT_7DAY_AVG', 'CASE_COUNT_PREV_0',
       'CASE_COUNT_PREV_1', 'CASE_COUNT_PREV_2', 'CASE_COUNT_PREV_3',
       'CASE_COUNT_PREV_4', 'CASE_COUNT_PREV_5', 'CASE_COUNT_PREV_6',
       'DEATH_COUNT', 'DEATH_COUNT_7DAY_AVG',
       'DEATH_COUNT_PREV_0', 'DEATH_COUNT_PREV_1', 'DEATH_COUNT_PREV_2',
       'DEATH_COUNT_PREV_3', 'DEATH_COUNT_PREV_4', 'DEATH_COUNT_PREV_5']
x_t = torch.tensor(x_t[x_t_cols].values)
y_t = torch.tensor(case_subset_df['CASE_DELTA'].values)

In [63]:
import torch_geometric

torch_geometric.data.Data(
    x=x_t,
    y=y_t
    )

ModuleNotFoundError: No module named 'torch_geometric'

In [None]:
def getPyTorchGeoData(label):
    with open(X_FN.format(label), "r") as x_f:
        x_df = pd.read_csv(x_f, header=None)

    with open(Y_FN.format(label), "r") as y_f:
        y_df = pd.read_csv(y_f, header=None)

    with open(Y_P_FN.format(label), "r") as y_p_f:
        y_p_df = pd.read_csv(y_p_f, header=None)
        y_p_df.fillna(0, inplace=True)

    with open(COO_FN.format(label), "r") as coo_f:
        coo_df = pd.read_csv(coo_f, header=None)

    with open(TRAIN_M_FN.format(label), "r") as train_m_f:
        train_m_df = pd.read_csv(train_m_f, header=None)

    with open(TEST_M_FN.format(label), "r") as test_m_f:
        test_m_df = pd.read_csv(test_m_f, header=None)

    # COO Edge List Tensor. Reshaped.
    coo_t = torch.tensor(coo_df.values, dtype=torch.long)
    coo_t = coo_t.reshape((2, len(coo_df.values)))

    # Features and Target Tensors.
    x_t = torch.tensor(x_df.values, dtype=torch.float)
    y_t = torch.tensor(y_df.values, dtype=torch.float)
    y_p_t = torch.tensor(y_p_df.values, dtype=torch.float)

    # Test/Train Mask Tensors.
    train_t = torch.tensor(train_m_df.values, dtype=torch.long)
    test_t = torch.tensor(test_m_df.values, dtype=torch.long)
    return Data(
        x=x_t,
        y=y_t,
        edge_index=coo_t,
        train_mask=train_t,
        test_mask=test_t,
        priors=y_p_t,
    )

## Mobility Data

In [117]:
DTYPE = {
    'census_fips_code':'Int64',
    'date':'str',
}
mobility_report_df = pd.read_csv("../data/raw/2020_US_Region_Mobility_Report.csv", dtype=DTYPE)
mobility_report_df['date'] = pd.to_datetime(mobility_report_df['date'])

In [124]:
counties = ['Bronx County', 'Kings County', 'New York County', 'Queens County', 'Richmond County']
subset_cols = [
    "census_fips_code",
    "date",
    "retail_and_recreation_percent_change_from_baseline",
    "grocery_and_pharmacy_percent_change_from_baseline",
    "parks_percent_change_from_baseline",
    "transit_stations_percent_change_from_baseline",
    "workplaces_percent_change_from_baseline",
    "residential_percent_change_from_baseline"]

nyc_mobility_report_df = mobility_report_df.loc[(mobility_report_df.sub_region_1 == "New York") & (mobility_report_df.sub_region_2.isin(counties)) & (START_DATE <= mobility_report_df['date'])& (mobility_report_df['date'] <= END_DATE), subset_cols]


In [126]:
nyc_mobility_report_df.rename(columns={"census_fips_code": "FIPS"})

Unnamed: 0,FIPS,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
466922,36005,2020-02-29,10.0,-1.0,-15.0,8.0,4.0,-1.0
466923,36005,2020-03-01,9.0,2.0,-6.0,3.0,0.0,0.0
466924,36005,2020-03-02,13.0,6.0,-7.0,8.0,6.0,-1.0
466925,36005,2020-03-03,8.0,7.0,6.0,4.0,4.0,0.0
466926,36005,2020-03-04,11.0,7.0,16.0,2.0,2.0,-1.0
...,...,...,...,...,...,...,...,...
479379,36085,2020-05-26,-43.0,-13.0,28.0,-54.0,-50.0,20.0
479380,36085,2020-05-27,-41.0,-11.0,19.0,-55.0,-50.0,21.0
479381,36085,2020-05-28,-43.0,-8.0,1.0,-54.0,-51.0,21.0
479382,36085,2020-05-29,-46.0,-9.0,7.0,-53.0,-50.0,23.0


## Adjency list

In [67]:
fips_list = list(BOROUGH_FIPS_MAP.values())
dates

DatetimeIndex(['2020-02-29', '2020-03-01', '2020-03-02', '2020-03-03',
               '2020-03-04', '2020-03-05', '2020-03-06', '2020-03-07',
               '2020-03-08', '2020-03-09', '2020-03-10', '2020-03-11',
               '2020-03-12', '2020-03-13', '2020-03-14', '2020-03-15',
               '2020-03-16', '2020-03-17', '2020-03-18', '2020-03-19',
               '2020-03-20', '2020-03-21', '2020-03-22', '2020-03-23',
               '2020-03-24', '2020-03-25', '2020-03-26', '2020-03-27',
               '2020-03-28', '2020-03-29', '2020-03-30', '2020-03-31',
               '2020-04-01', '2020-04-02', '2020-04-03', '2020-04-04',
               '2020-04-05', '2020-04-06', '2020-04-07', '2020-04-08',
               '2020-04-09', '2020-04-10', '2020-04-11', '2020-04-12',
               '2020-04-13', '2020-04-14', '2020-04-15', '2020-04-16',
               '2020-04-17', '2020-04-18', '2020-04-19', '2020-04-20',
               '2020-04-21', '2020-04-22', '2020-04-23', '2020-04-24',
      

In [87]:
coo_list   = []

# create spatial edges (all boroughs are connected to each other)
for d in dates:
    for u in fips_list:
        for v in fips_list:
            u_key = f"{u}-{d.strftime('%Y-%m-%d')}"
            v_key = f"{v}-{d.strftime('%Y-%m-%d')}"
            u_idx = node_dict[u_key]
            v_idx = node_dict[v_key]
            coo_list.append([u_idx, v_idx])
print(len(coo_list), 'spatial edges')

# create temporal edges
temp_count = 0
for base_day_idx in range(0, len(dates)-TIME_WINDOW_SIZE):
    base_day = dates[base_day_idx]
    base_str = base_day.strftime('%Y-%m-%d')
    for future_day in dates[base_day_idx+1 : base_day_idx+TIME_WINDOW_SIZE+1]:
        future_str = future_day.strftime('%Y-%m-%d')

        # iterate over each county fips
        for f in fips_list:

            # Need a link from base_day to future_day
            u_key = f"{f}-{base_str}"
            v_key = f"{f}-{future_str}"
            
            u_idx = node_dict[u_key]
            v_idx = node_dict[v_key]
            # Only add past->future link. 
            coo_list.append([u_idx, v_idx])
            temp_count += 1
                
print(temp_count, 'temporal edges')

# print("{} adj links, {} temp links in cool list".format(adj_count, temp_count))
# print("{} key errors while generating coo list".format(key_errors))
# return coo_list

2300 spatial edges
2975 temporal edges


In [89]:

save_df = pd.DataFrame(coo_list)
save_df.to_csv(
# save_df.to_csv(filepath, header=False, index=False)


Unnamed: 0,0,1
0,0,0
1,0,92
2,0,184
3,0,276
4,0,368
...,...,...
5270,84,91
5271,176,183
5272,268,275
5273,360,367
