In [1]:
import sqlalchemy as sal
import pandas as pd
from os.path import join
import numpy as np

import torch

In [2]:
%%time

#parameters of the AWS database
endpoint = "capstone.clihskgj8i7s.us-west-2.rds.amazonaws.com"
user="group3"
db="db1"
#pw=getpass.getpass("Enter database password")
pw=open(r'C:\Users\rmartinez4\OneDrive - Illumina, Inc\Desktop\password.txt',"r").read().rstrip()

engine = sal.create_engine('postgresql://%s:%s@%s/%s' % (user, pw, endpoint, db))

# query="""
# select timestamp, station, total_flow
# from traffic_train
# ;
# """

query="""
select timestamp, station, avg_speed, total_flow
from traffic_train
where station in (400714, 400743, 400001)
;
"""

df_query_raw=pd.read_sql(query, engine)

print(df_query_raw.shape)
df_query_raw.head()

(156328, 4)
Wall time: 2.97 s


Unnamed: 0,timestamp,station,avg_speed,total_flow
0,2020-01-01 00:05:00,400001,71.7,57.0
1,2020-01-01 00:05:00,400714,71.0,61.0
2,2020-01-01 00:05:00,400743,68.8,111.0
3,2020-01-01 00:10:00,400001,71.7,47.0
4,2020-01-01 00:10:00,400714,70.6,63.0


In [3]:
df_query_raw.isna().sum()

timestamp      0
station        0
avg_speed     17
total_flow    17
dtype: int64

In [4]:
# fill na values with rolling mean
df_query_cleaned = df_query_raw.fillna(df_query_raw.rolling(window=6,min_periods=1).mean())

In [5]:
df_query_cleaned.isna().sum()

timestamp     0
station       0
avg_speed     0
total_flow    0
dtype: int64

In [6]:
def generate_seq2seq_data_4dim(df, horizon, window, features):
    
    idx_cols = ['station','timestamp']

    df = df.set_index(idx_cols).sort_values(by=idx_cols)

    features_tensor_list = []
    for f in features:
        print(f)
        
        ts_seq_list = []
        for s in df.index.unique(level=0):
            print(s)
            values = df.loc[s][f].values

            for i in range(len(values)-horizon*2):
                arr = np.array(values[i:i+horizon*2])
                ts_seq_list.append(torch.from_numpy(arr.reshape(horizon*2,1)))

        sequence_tensor = torch.stack(ts_seq_list, dim=0)
        sequence_tensor = torch.reshape(sequence_tensor, tuple(sequence_tensor.shape)+(1,))

        features_tensor_list.append(sequence_tensor)

    return torch.cat(features_tensor_list, dim=3)

In [18]:
def generate_seq2seq_data(df, horizon, window, features):
    
    idx_cols = ['station','timestamp']

    df = df.set_index(idx_cols).sort_values(by=idx_cols)

    features_tensor_list = []
    for f in features:
        print(f)
        
        ts_seq_list = []
        for s in df.index.unique(level=0):
            print(s)
            values = df.loc[s][f].values

            for i in range(len(values)-horizon*2):
                arr = np.array(values[i:i+horizon*2])
                ts_seq_list.append(torch.from_numpy(arr.reshape(horizon*2,1)))

        sequence_tensor = torch.stack(ts_seq_list, dim=0)

        features_tensor_list.append(sequence_tensor)

    return torch.cat(features_tensor_list, dim=2)

In [21]:
%%time
data_seq2seq = generate_seq2seq_data(df_query_cleaned,
                                     horizon=12, 
                                     window=1, 
#                                     features=['avg_speed', 'total_flow'])
                                     features=['avg_speed'])

print(data_seq2seq.shape)

avg_speed
400001
400714
400743
torch.Size([156256, 24, 1])
Wall time: 1.12 s


In [22]:
x = data_seq2seq[:, :12, :]
y = data_seq2seq[:, 12:, :]

print(x.shape, y.shape)

torch.Size([156256, 12, 1]) torch.Size([156256, 12, 1])


In [23]:
num_samples = x.shape[0]
num_test = round(num_samples * 0.2)
num_train = round(num_samples * 0.7)
num_val = num_samples - num_test - num_train

print(num_samples, num_test, num_train, num_val)

156256 31251 109379 15626


In [24]:
# train
x_train, y_train = x[:num_train], y[:num_train]

# val
x_val, y_val = (
    x[num_train: num_train + num_val],
    y[num_train: num_train + num_val],
)

# test
x_test, y_test = x[-num_test:], y[-num_test:]

print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)
print(x_test.shape, y_test.shape)

torch.Size([109379, 12, 1]) torch.Size([109379, 12, 1])
torch.Size([15626, 12, 1]) torch.Size([15626, 12, 1])
torch.Size([31251, 12, 1]) torch.Size([31251, 12, 1])


In [25]:
output_dir = r'C:\Users\rmartinez4\Box\Personal Git\Nautilus-seq2seq\boiler_plate_seq2seq'

for cat in ["train", "val", "test"]:
    _x, _y = locals()["x_" + cat], locals()["y_" + cat]
    print(cat, "x: ", _x.shape, "y:", _y.shape)
    np.savez_compressed(
        join(output_dir, "%s.npz" % cat),
        x=_x,
        y=_y
    )

train x:  torch.Size([109379, 12, 1]) y: torch.Size([109379, 12, 1])
val x:  torch.Size([15626, 12, 1]) y: torch.Size([15626, 12, 1])
test x:  torch.Size([31251, 12, 1]) y: torch.Size([31251, 12, 1])


In [15]:
# locals()