In [1]:
import os
import pickle

import numpy as np
import pandas as pd

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler

In [2]:
config = {
    "N_PREV": 60,
    "N_FWD": 30,
    
    "TEST_SIZE": 0.15
}

## **Load DataFrame**

In [3]:
df = pd.read_csv('data/venezia.csv')
df['datetime'] = pd.to_datetime(df['datetime'], infer_datetime_format=True)
df.sort_values(by='datetime', inplace=True, ignore_index=True)
df.head()

Unnamed: 0,datetime,level
0,1983-01-01 01:00:00,44.0
1,1983-01-01 02:00:00,35.0
2,1983-01-01 03:00:00,23.0
3,1983-01-01 04:00:00,10.0
4,1983-01-01 05:00:00,1.0


## **Analying Time Increments**

In [4]:
values = df.datetime.values.astype('int64') // 10**9
ranges = values[1:]-values[:-1]
vals, counts = np.unique(ranges, return_counts=True)
print("TIME INCREMENTS")
for val, count in zip(vals, counts):
    print(f"{val} - {count}")

TIME INCREMENTS
0 - 5
3600 - 289261
7200 - 5


In [5]:
bad_indecies = [[x, x+1] for x in np.where(ranges!=3600)[0]]
print("BAD INDECIES")
for [idx1, idx2] in bad_indecies:
    print(f"INCREMENT: {values[idx2] - values[idx1]} FROM {idx1}: {df.level.values[idx1]} TO {idx2}: {df.level.values[idx1]}")

BAD INDECIES
INCREMENT: 0 FROM 140255: 57.0 TO 140256: 57.0
INCREMENT: 7200 FROM 148991: 6.0 TO 148992: 6.0
INCREMENT: 0 FROM 198815: 54.0 TO 198816: 54.0
INCREMENT: 7200 FROM 198839: 51.0 TO 198840: 51.0
INCREMENT: 0 FROM 219143: 2.0 TO 219144: 2.0
INCREMENT: 7200 FROM 227903: 19.0 TO 227904: 19.0
INCREMENT: 0 FROM 242639: 37.0 TO 242640: 37.0
INCREMENT: 7200 FROM 242663: 56.0 TO 242664: 56.0
INCREMENT: 0 FROM 251399: -1.0 TO 251400: -1.0
INCREMENT: 7200 FROM 251423: 8.0 TO 251424: 8.0


## **Clean Zero Increments**

In [6]:
df = df.drop(axis=0, index=[x[0] for x in bad_indecies if values[x[1]] - values[x[0]] == 0]).reset_index(drop=True)
df.head()

Unnamed: 0,datetime,level
0,1983-01-01 01:00:00,44.0
1,1983-01-01 02:00:00,35.0
2,1983-01-01 03:00:00,23.0
3,1983-01-01 04:00:00,10.0
4,1983-01-01 05:00:00,1.0


In [7]:
values = df.datetime.values.astype('int64') // 10**9
ranges = values[1:]-values[:-1]
vals, counts = np.unique(ranges, return_counts=True)
print("TIME INCREMENTS")
for val, count in zip(vals, counts):
    print(f"{val} - {count}")

TIME INCREMENTS
3600 - 289261
7200 - 5


In [8]:
bad_indecies = [[x, x+1] for x in np.where(ranges!=3600)[0]]
print("BAD INDECIES")
for [idx1, idx2] in bad_indecies:
    print(f"INCREMENT: {values[idx2] - values[idx1]} FROM {idx1}: {df.level.values[idx1]} TO {idx2}: {df.level.values[idx1]}")

BAD INDECIES
INCREMENT: 7200 FROM 148990: 6.0 TO 148991: 6.0
INCREMENT: 7200 FROM 198837: 51.0 TO 198838: 51.0
INCREMENT: 7200 FROM 227900: 19.0 TO 227901: 19.0
INCREMENT: 7200 FROM 242659: 56.0 TO 242660: 56.0
INCREMENT: 7200 FROM 251418: 8.0 TO 251419: 8.0


## **Data Functions**

In [9]:
def get_splits():
    data = df.values
    splits = []
    tss = TimeSeriesSplit(n_splits=3, test_size=int(df.shape[0]*config["TEST_SIZE"]))
    for train_idxs, test_idxs in tss.split(data):
        final_train_sample = train_idxs[-1]
        final_test_sample = test_idxs[-1]
        splits.append([final_train_sample, final_test_sample])
    return splits

SPLITS = get_splits()

In [10]:
def get_data(split):
    data = df.level.values
    [final_train_sample, final_test_sample] = split
    normalizer = StandardScaler()
    normalizer.fit(data[:final_train_sample].reshape(-1,1))
    data = normalizer.transform(data.reshape(-1,1)).flatten()
    X_train, y_train = [], []
    X_val, y_val = [], []
    for i in range(final_train_sample-config["N_FWD"]-config["N_PREV"]):
        X_train.append(data[i:i+config["N_PREV"]])
        y_train.append(data[i+config["N_PREV"]:i+config["N_PREV"]+config["N_FWD"]])
    for i in range(final_train_sample, final_test_sample-config["N_FWD"]-config["N_PREV"]):
        X_val.append(data[i:i+config["N_PREV"]])
        y_val.append(data[i+config["N_PREV"]:i+config["N_PREV"]+config["N_FWD"]])
    return np.array(X_train), np.array(y_train), np.array(X_val), np.array(y_val), normalizer

In [11]:
for fold, split in enumerate(SPLITS):
    X_train, y_train, X_val, y_val, normalizer = get_data(split)
    with open(f'./data/fold{fold+1}_data_v2.npy', mode='wb') as f:
        np.save(f, X_train)
        np.save(f, y_train)
        np.save(f, X_val)
        np.save(f, y_val)
    with open(f'./data/fold{fold+1}_normalizer_v2.pkl', mode='wb') as f:
        pickle.dump(normalizer, f)