In [1]:
import pickle

### Impute missing values according to horizon

In [2]:
# Window length, in time steps, for the rolling-mean imputation sketched below.
horizon = 12

# Disabled imputation (kept for reference). As written it would:
#   1. load the raw readings from ./data/bay_2020.pkl,
#   2. compute a trailing rolling mean over `horizon` steps,
#   3. fill gaps in that rolling mean with its own column means,
#   4. fill the missing raw readings from the rolling mean.
# NOTE(review): presumably d2020 is a (time x sensor) DataFrame - confirm before re-enabling.
# d2020 = pickle.load(open('./data/bay_2020.pkl','rb'))

# rolling_data = d2020.rolling(horizon,min_periods=1).mean()
# rolling_data.fillna(rolling_data.mean(),inplace = True)
# d2020.fillna(rolling_data[d2020.isnull()],inplace = True)

### Class that builds DCRNN data with NetworkX graph as an input

In [3]:
import numpy as np
import os
import torch
import pandas as pd
import networkx

class create_dcrnn_data():
    '''
    Builds the files consumed by the DCRNN model from a networkx graph: the
    windowed input/target arrays (.npz) and the pickled adjacency matrix.

    Each node is expected to carry two attributes:
      * 'values': a pandas Series indexed by datetime (one reading per time step)
      * 'sensor': the sensor identifier of the node

    Keyword arguments:
      outdir          -- directory the output files are written to (default '')
      horizon         -- number of input steps and of predicted steps (default 12)
      data            -- optional pre-built DataFrame (rows = time, columns = nodes);
                         when given it is used instead of the node 'values' series
      add_time_in_day -- append the fraction-of-day feature (default True)
      add_day_in_week -- append the one-hot day-of-week feature (default False)
    '''

    def __init__(self,graph,**kwargs):
        self.graph = graph
        self.outdir = kwargs.get('outdir','')
        self.horizon = kwargs.get('horizon',12)
        self.df = kwargs.get('data',None)
        # Feature switches; the defaults reproduce the previously hard-coded values.
        self.add_time_in_day = kwargs.get('add_time_in_day',True)
        self.add_day_in_week = kwargs.get('add_day_in_week',False)

    def _createDataFrame(self):
        """Return one DataFrame with a column per node, rows sorted by timestamp."""
        # Read from the graph handed to the constructor, not from a notebook-level
        # global, so the class works with whatever graph it was built with.
        frame = [self.graph.nodes[i]['values'] for i in self.graph.nodes]
        return pd.concat(frame,axis = 1).sort_index()

    def generateAdjacencyMatrix(self):
        """
        Pickle [sensor ids, sensor id -> matrix index, adjacency matrix] to
        <outdir>/newest_adj_max.pkl.

        A failure to write the file is reported with print() rather than raised.
        """
        nodes = [self.graph.nodes[i]['sensor'] for i in self.graph.nodes]
        # networkx.adjacency_matrix orders rows/columns like graph.nodes, so the
        # index of a sensor is its position in that order. The node label only
        # coincides with it when labels are exactly 0..n-1 (not true for subgraphs).
        sensor_ind = {sensor: position for position, sensor in enumerate(nodes)}
        matrix = [nodes,sensor_ind,networkx.adjacency_matrix(self.graph)]
        try:
            with open(os.path.join(self.outdir,'newest_adj_max.pkl'),'wb') as out_file:
                pickle.dump(matrix,out_file)
        except (OSError, pickle.PicklingError) as e:
            print(e)

    def generate_graph_seq2seq_io_data(self,df, x_offsets, y_offsets, add_time_in_day=False, add_day_in_week=False, scaler=None):
        """
        Slice a (time x nodes) DataFrame into model input and target windows.

        Args:
            df: DataFrame with a DatetimeIndex; one column per node.
            x_offsets: integer offsets, relative to t, of the input steps.
            y_offsets: integer offsets, relative to t, of the target steps.
            add_time_in_day: append the time of day as a fraction of a day.
            add_day_in_week: append a 7-wide one-hot encoding of the weekday.
            scaler: unused; kept so existing callers keep working.

        Return:
        # x: (epoch_size, input_length, num_nodes, input_dim)
        # y: (epoch_size, output_length, num_nodes, output_dim)

        Raises:
            ValueError: if df has too few rows to build a single window.
        """

        num_samples, num_nodes = df.shape
        data = np.expand_dims(df.values, axis=-1)
        data_list = [data]

        if add_time_in_day:
            time_ind = (df.index.values - df.index.values.astype("datetime64[D]")) / np.timedelta64(1, "D")
            time_in_day = np.tile(time_ind, [1, num_nodes, 1]).transpose((2, 1, 0))
            data_list.append(time_in_day)
        if add_day_in_week:
            day_in_week = np.zeros(shape=(num_samples, num_nodes, 7))
            day_in_week[np.arange(num_samples), :, df.index.dayofweek] = 1
            data_list.append(day_in_week)

        data = np.concatenate(data_list, axis=-1)

        x_offsets = np.asarray(x_offsets)
        y_offsets = np.asarray(y_offsets)

        # t is the index of the last observation.
        min_t = abs(int(x_offsets.min()))
        # Exclusive. No outer abs(): a negative value means "not enough rows" and
        # must not be folded back into a positive, out-of-range bound.
        max_t = num_samples - abs(int(y_offsets.max()))

        if max_t <= min_t:
            raise ValueError(
                "Not enough rows (%d) to build a window: need at least %d"
                % (num_samples, min_t + abs(int(y_offsets.max())) + 1)
            )

        # One gather per array instead of a Python loop: row k of the index grid
        # holds the absolute positions t_k + offsets.
        t = np.arange(min_t, max_t)[:, None]
        x = data[t + x_offsets]
        y = data[t + y_offsets]
        return x, y

    def _build_windows(self):
        # Build (x, y, x_offsets, y_offsets) for the configured horizon and features.
        if self.horizon < 1:
            raise ValueError("horizon must be >= 1, got %r" % (self.horizon,))

        if isinstance(self.df,pd.DataFrame):
            df = self.df
        else:
            df = self._createDataFrame()

        # Inputs are the `horizon` steps ending at t, targets the `horizon` steps after t.
        x_offsets = np.arange(1 - self.horizon, 1, 1)
        y_offsets = np.arange(1, self.horizon + 1, 1)

        # x: (num_samples, input_length, num_nodes, input_dim)
        # y: (num_samples, output_length, num_nodes, output_dim)
        x, y = self.generate_graph_seq2seq_io_data(
            df,
            x_offsets=x_offsets,
            y_offsets=y_offsets,
            add_time_in_day=self.add_time_in_day,
            add_day_in_week=self.add_day_in_week,
        )

        print("x shape: ", x.shape, ", y shape: ", y.shape)
        return x, y, x_offsets, y_offsets

    def _save_split(self, cat, x, y, x_offsets, y_offsets):
        # Report one split and write it to <outdir>/<cat>.npz.
        print(cat, "x: ", x.shape, "y:", y.shape)
        np.savez_compressed(
            os.path.join(self.outdir, "%s.npz" % cat),
            x=x,
            y=y,
            x_offsets=x_offsets.reshape(-1, 1),
            y_offsets=y_offsets.reshape(-1, 1),
        )

    def generate_train_val_test(self):
        """
        Write train.npz, val.npz and test.npz (chronological 70% / ~10% / 20%
        split of the windows) into outdir. Returns None.
        """
        x, y, x_offsets, y_offsets = self._build_windows()

        num_samples = x.shape[0]
        num_test = round(num_samples * 0.2)
        num_train = round(num_samples * 0.7)
        test_start = num_samples - num_test  # == num_train + num_val

        # Explicit start indices: x[-num_test:] would return *every* window
        # when num_test is 0, duplicating the training data into the test file.
        splits = [
            ("train", slice(0, num_train)),
            ("val", slice(num_train, test_start)),
            ("test", slice(test_start, num_samples)),
        ]
        for cat, rows in splits:
            self._save_split(cat, x[rows], y[rows], x_offsets, y_offsets)

        return None

    def generate_dataset(self):
        """
        Write every window into a single test.npz in outdir (no train/val
        split). Returns None.
        """
        x, y, x_offsets, y_offsets = self._build_windows()
        self._save_split("test", x, y, x_offsets, y_offsets)
        return None
    
    

### Load and create graph

In [4]:
!python --version

Python 3.8.8


In [5]:
# Load the pickled sensor graph. The context manager closes the file handle,
# which the bare open() call left open for the lifetime of the kernel.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('./data/full_graph.pkl','rb') as graph_file:
    g = pickle.load(graph_file)

In [6]:
min(g.nodes[1]['values'].index), max(g.nodes[1]['values'].index) 

(Timestamp('2020-01-01 00:05:00'), Timestamp('2020-06-30 00:00:00'))

In [7]:
type(g)

networkx.classes.graph.Graph

In [6]:
len(g)

320

In [9]:
g.nodes[0]['values']

0
2020-01-01 00:05:00    71.7
2020-01-01 00:10:00    71.7
2020-01-01 00:15:00    71.8
2020-01-01 00:20:00    71.6
2020-01-01 00:25:00    71.3
                       ... 
2020-06-29 23:40:00    71.3
2020-06-29 23:45:00    71.5
2020-06-29 23:50:00    71.3
2020-06-29 23:55:00    69.5
2020-06-30 00:00:00    70.4
Name: 400001, Length: 52113, dtype: float64

In [10]:
g.nodes[0]['sensor']

'400001'

In [82]:
# sample_25: sensor ids of the "25%" sample - 80 ids, i.e. a quarter of the
# 320 nodes in the graph. Ids are ints here, whereas the node attribute
# 'sensor' is a string, so lookups must cast before comparing.

sample_25 = [
    400017,
400040,
400052,
400059,
400065,
400073,
400085,
400100,
400148,
400178,
400221,
400240,
400257,
400258,
400268,
400278,
400280,
400298,
400336,
400418,
400436,
400457,
400461,
400485,
400519,
400637,
400649,
400654,
400664,
400690,
400700,
400713,
400714,
400715,
400717,
400750,
400772,
400792,
400822,
400832,
400837,
400842,
400916,
400934,
400995,
401129,
401154,
401163,
401167,
401327,
401388,
401400,
401457,
401495,
401555,
401567,
401597,
401606,
401611,
401942,
401943,
401957,
401958,
402067,
403329,
403414,
403419,
404370,
404451,
404452,
404453,
404461,
404462,
405613,
405619,
405701,
407710,
407711,
414284,
414694
]

In [32]:
# sample_50: sensor ids of the "50%" sample - 160 ids, i.e. half of the
# 320 nodes in the graph. Ids are ints here, whereas the node attribute
# 'sensor' is a string, so lookups must cast before comparing.
sample_50 = [
    400017,
400040,
400052,
400057,
400059,
400065,
400073,
400084,
400085,
400097,
400100,
400147,
400148,
400178,
400185,
400209,
400213,
400221,
400222,
400236,
400240,
400257,
400258,
400268,
400278,
400280,
400292,
400298,
400336,
400400,
400414,
400418,
400429,
400436,
400457,
400461,
400464,
400485,
400499,
400507,
400514,
400519,
400560,
400563,
400582,
400637,
400648,
400649,
400654,
400664,
400665,
400673,
400677,
400688,
400690,
400700,
400709,
400713,
400714,
400715,
400717,
400723,
400750,
400772,
400792,
400799,
400804,
400822,
400823,
400828,
400832,
400837,
400842,
400907,
400916,
400934,
400951,
400952,
400953,
400995,
401129,
401154,
401163,
401167,
401210,
401224,
401327,
401388,
401391,
401400,
401403,
401457,
401495,
401555,
401567,
401597,
401606,
401611,
401655,
401808,
401809,
401810,
401811,
401845,
401846,
401890,
401891,
401906,
401908,
401942,
401943,
401957,
401958,
402056,
402057,
402058,
402059,
402067,
402359,
402360,
402361,
403329,
403401,
403402,
403404,
403406,
403409,
403412,
403414,
403419,
404370,
404434,
404435,
404444,
404451,
404452,
404453,
404461,
404462,
404640,
405613,
405619,
405701,
407165,
407172,
407173,
407194,
407321,
407331,
407344,
407370,
407373,
407710,
407711,
408907,
408911,
413877,
413878,
414284,
414694
]

In [152]:
# sample_75: sensor ids of the "75%" sample - 240 ids, i.e. three quarters of
# the 320 nodes in the graph. Ids are ints here, whereas the node attribute
# 'sensor' is a string, so lookups must cast before comparing.
sample_75 = [
400001,
400017,
400030,
400040,
400045,
400052,
400057,
400059,
400065,
400069,
400073,
400084,
400085,
400097,
400100,
400109,
400122,
400147,
400148,
400160,
400172,
400178,
400185,
400206,
400209,
400213,
400221,
400222,
400227,
400236,
400240,
400246,
400253,
400257,
400258,
400268,
400278,
400280,
400292,
400298,
400336,
400343,
400353,
400372,
400394,
400400,
400414,
400418,
400429,
400436,
400440,
400457,
400461,
400464,
400479,
400485,
400499,
400507,
400508,
400514,
400519,
400528,
400560,
400563,
400582,
400586,
400637,
400648,
400649,
400654,
400664,
400665,
400668,
400673,
400677,
400688,
400690,
400700,
400709,
400713,
400714,
400715,
400717,
400723,
400750,
400760,
400772,
400792,
400794,
400799,
400804,
400822,
400823,
400828,
400832,
400837,
400842,
400863,
400895,
400907,
400911,
400916,
400922,
400934,
400951,
400952,
400953,
400964,
400965,
400971,
400995,
400996,
401129,
401154,
401163,
401167,
401210,
401224,
401327,
401388,
401391,
401400,
401403,
401440,
401457,
401495,
401541,
401555,
401560,
401567,
401597,
401606,
401611,
401655,
401808,
401809,
401810,
401811,
401816,
401817,
401845,
401846,
401890,
401891,
401906,
401908,
401942,
401943,
401957,
401958,
401994,
402056,
402057,
402058,
402059,
402060,
402061,
402067,
402117,
402118,
402121,
402281,
402282,
402283,
402284,
402359,
402360,
402361,
402362,
402363,
402364,
402365,
402366,
402367,
403225,
403265,
403329,
403401,
403402,
403404,
403406,
403409,
403412,
403414,
403419,
404370,
404434,
404435,
404444,
404451,
404452,
404453,
404461,
404462,
404521,
404640,
404753,
404759,
405613,
405619,
405701,
407165,
407172,
407173,
407194,
407321,
407323,
407325,
407328,
407331,
407335,
407336,
407337,
407339,
407341,
407342,
407344,
407348,
407352,
407360,
407361,
407364,
407367,
407370,
407372,
407373,
407374,
407710,
407711,
408907,
408911,
409524,
409525,
409526,
409528,
409529,
413877,
413878,
414284,
414694,
]

In [126]:
len(set(sample_75) - set(sample_50)) 

NameError: name 'sample_50' is not defined

In [127]:
len(set(sample_75) - set(sample_25)) 

160

### Create Sub graph (Masked Graph)

In [153]:
g.nodes[0]['sensor']

'400001'

In [154]:
# sample 25: graph node ids whose sensor id is listed in sample_25
# (sensor ids are stored as strings on the nodes, hence the int() cast)
nodes = [
    node_id
    for node_id, attrs in g.nodes(data=True)
    if int(attrs['sensor']) in sample_25
]
print(len(nodes))
print(nodes)

80
[1, 3, 5, 7, 8, 10, 12, 16, 21, 28, 34, 39, 42, 43, 44, 46, 47, 50, 52, 59, 62, 65, 66, 69, 74, 83, 86, 87, 88, 95, 96, 98, 99, 100, 101, 104, 106, 108, 112, 115, 116, 117, 125, 127, 136, 139, 140, 141, 142, 145, 147, 149, 152, 155, 159, 161, 162, 163, 164, 181, 182, 184, 185, 196, 224, 231, 232, 233, 237, 238, 239, 240, 241, 250, 251, 252, 305, 306, 318, 319]


In [130]:
# sample 50: graph node ids whose sensor id is listed in sample_50
# (sensor ids are stored as strings on the nodes, hence the int() cast)
nodes = [
    node_id
    for node_id, attrs in g.nodes(data=True)
    if int(attrs['sensor']) in sample_50
]
print(len(nodes))
print(nodes)

NameError: name 'sample_50' is not defined

In [155]:
# sample 75: graph node ids whose sensor id is listed in sample_75
# (sensor ids are stored as strings on the nodes, hence the int() cast)
nodes = [
    node_id
    for node_id, attrs in g.nodes(data=True)
    if int(attrs['sensor']) in sample_75
]
print(len(nodes))
print(nodes)

240
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 18, 19, 20, 21, 24, 26, 28, 29, 31, 32, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44, 46, 47, 48, 50, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 77, 78, 81, 82, 83, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 96, 97, 98, 99, 100, 101, 102, 104, 105, 106, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 121, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 134, 136, 137, 139, 140, 141, 142, 143, 144, 145, 147, 148, 149, 150, 151, 152, 155, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 181, 182, 184, 185, 186, 190, 191, 192, 193, 194, 195, 196, 197, 198, 201, 202, 203, 204, 205, 209, 210, 211, 212, 213, 214, 215, 216, 217, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 247, 248, 249, 250, 251, 252, 260, 261, 262, 275, 281, 282, 283, 284, 285, 287, 288, 289, 290, 291, 292, 293, 2

In [7]:
'''
Replace with your masking logic
'''
# Masking is currently disabled: the two commented lines below would restrict
# the graph to the node ids in `nodes`.
# nodes = list(range(len(g)))

# subgraph = g.subgraph(nodes)
# `subgraph` is bound to the very same object as `g` (no copy is made), so any
# later in-place change to the attributes of `subgraph` also changes `g`.
subgraph = g

In [8]:
len(subgraph.nodes)

320

In [35]:
subgraph

<networkx.classes.graph.Graph at 0x187ebccafa0>

In [36]:
type(subgraph.nodes[1]['values'])

pandas.core.series.Series

In [14]:
subgraph.nodes[1]['values']

0
2020-01-01 00:05:00    68.0
2020-01-01 00:10:00    68.0
2020-01-01 00:15:00    67.8
2020-01-01 00:20:00    67.9
2020-01-01 00:25:00    67.8
                       ... 
2020-06-29 23:40:00    61.7
2020-06-29 23:45:00    61.8
2020-06-29 23:50:00    62.0
2020-06-29 23:55:00    62.1
2020-06-30 00:00:00    62.2
Name: 400017, Length: 52113, dtype: float64

In [15]:
subgraph.nodes[1]['values'].index

DatetimeIndex(['2020-01-01 00:05:00', '2020-01-01 00:10:00',
               '2020-01-01 00:15:00', '2020-01-01 00:20:00',
               '2020-01-01 00:25:00', '2020-01-01 00:30:00',
               '2020-01-01 00:35:00', '2020-01-01 00:40:00',
               '2020-01-01 00:45:00', '2020-01-01 00:50:00',
               ...
               '2020-06-29 23:15:00', '2020-06-29 23:20:00',
               '2020-06-29 23:25:00', '2020-06-29 23:30:00',
               '2020-06-29 23:35:00', '2020-06-29 23:40:00',
               '2020-06-29 23:45:00', '2020-06-29 23:50:00',
               '2020-06-29 23:55:00', '2020-06-30 00:00:00'],
              dtype='datetime64[ns]', name=0, length=52113, freq=None)

In [16]:
len(subgraph.nodes[1]['values'].index)

52113

In [17]:
min(subgraph.nodes[1]['values'].index), max(subgraph.nodes[1]['values'].index) 

(Timestamp('2020-01-01 00:05:00'), Timestamp('2020-06-30 00:00:00'))

In [18]:
subgraph.nodes[1]['values'].values

array([68. , 68. , 67.8, ..., 62. , 62.1, 62.2])

In [19]:
len(subgraph.nodes[1]['values'].values)

52113

In [20]:
subgraph.nodes[1]['sensor']

'400017'

### Flush out Input data for DCRNN

In [21]:
# list(subgraph.nodes)

In [22]:
len(subgraph.nodes[1]['values'].index)

52113

In [23]:
from datetime import datetime

# Inclusive bounds of the time window kept from every node's series.
start_date, end_date = (
    datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')
    for stamp in ('2020-01-01 00:35:00', '2020-01-05 00:35:00')
)


In [24]:
%%time

# Trim every node's series to the [start_date, end_date] window (both ends
# inclusive). The node attribute is overwritten in place.
for i in subgraph.nodes:
    node_attrs = subgraph.nodes[i]
    timestamps = node_attrs['values'].index
    filter_ = (timestamps >= start_date) & (timestamps <= end_date)
    node_attrs['values'] = node_attrs['values'][filter_]

Wall time: 216 ms


In [25]:
len(subgraph.nodes[1]['values'].index)

1153

In [9]:
%%time
# Directory the generated .npz / .pkl files are written to ('' = current directory).
outdir = ''

# Pass the notebook-level settings through instead of re-typing their values,
# so changing `outdir` or `horizon` in one place takes effect here.
c = create_dcrnn_data(subgraph, outdir = outdir, horizon = horizon)


Wall time: 0 ns


In [10]:
c._createDataFrame()

Unnamed: 0_level_0,400001,400017,400030,400040,400045,400052,400057,400059,400065,400069,...,409525,409526,409528,409529,413026,413845,413877,413878,414284,414694
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-01 00:05:00,71.7,68.0,66.7,68.4,68.0,68.1,66.3,67.1,66.9,71.6,...,68.1,68.2,67.2,66.8,72.0,68.2,68.9,66.0,70.3,67.0
2020-01-01 00:10:00,71.7,68.0,67.2,68.3,68.0,68.2,66.5,67.1,66.6,72.3,...,68.6,68.5,67.1,66.5,72.2,68.8,69.2,66.6,70.3,67.5
2020-01-01 00:15:00,71.8,67.8,67.4,67.4,67.8,68.0,67.0,67.6,66.2,71.2,...,68.8,68.4,66.9,66.8,72.0,68.7,69.4,66.5,70.9,67.6
2020-01-01 00:20:00,71.6,67.9,66.4,67.3,67.9,67.7,66.6,67.4,66.2,71.3,...,68.8,68.4,67.3,66.8,72.1,67.3,69.7,66.7,70.7,67.1
2020-01-01 00:25:00,71.3,67.8,66.0,67.3,67.8,67.5,66.9,67.6,65.9,70.5,...,68.7,68.3,67.9,66.9,71.6,67.6,69.8,66.5,70.5,67.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-29 23:40:00,71.3,61.7,68.3,67.4,62.2,66.4,67.0,67.2,65.7,64.7,...,62.6,67.8,66.1,61.2,70.6,65.6,70.0,67.1,71.3,67.3
2020-06-29 23:45:00,71.5,61.8,67.4,67.1,62.5,66.0,65.6,67.4,65.6,64.2,...,64.0,67.8,65.8,61.4,70.3,66.7,69.8,66.9,71.2,66.6
2020-06-29 23:50:00,71.3,62.0,67.7,67.6,62.5,67.7,67.5,67.3,65.9,64.6,...,63.2,67.7,66.1,60.9,70.9,67.7,69.2,66.9,70.4,65.9
2020-06-29 23:55:00,69.5,62.1,68.2,67.0,62.3,66.0,66.5,67.2,66.3,64.7,...,62.5,67.7,65.7,61.0,71.6,68.4,69.3,67.2,71.0,67.5


In [11]:
%%time
# Alternative (disabled): write every window into a single test.npz, without a train/val split.
# c.generate_dataset()
# Writes train.npz, val.npz and test.npz into the builder's outdir.
c.generate_train_val_test()
# Writes newest_adj_max.pkl (sensor ids, sensor index map, adjacency matrix) into the same outdir.
c.generateAdjacencyMatrix()
# NOTE(review): the recorded output below reports 52090 windows, which matches the
# full 52113-step series rather than the 1153-step date-filtered one - it looks like
# the filter cell was not executed in the session that produced it; confirm.

x shape:  (52090, 12, 320, 2) , y shape:  (52090, 12, 320, 2)
train x:  (36463, 12, 320, 2) y: (36463, 12, 320, 2)
val x:  (5209, 12, 320, 2) y: (5209, 12, 320, 2)
test x:  (10418, 12, 320, 2) y: (10418, 12, 320, 2)
Wall time: 9min 2s


### Now train DCRNN using this data