In [1]:
import numpy as np
import pandas as pd
import sys

from copy import deepcopy
from tqdm.auto import tqdm, trange

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from IPython.display import SVG, display
import plotly.graph_objects as go
import plotly.express as px

%matplotlib inline 
%config InlineBackend.figure_format='retina'
%load_ext autoreload 
%autoreload 2

def imshow(fig):
    return display(SVG(fig.to_image(format="svg")))

%matplotlib inline
%config InlineBackend.figure_format='retina'

In [4]:
def summarize(data):
    print(f'number of data: {len(data)}')
    
    seq_lens = [len(seq) for seq in data]
    print(f'sequence length range: {min(seq_lens)} ~ {max(seq_lens)}')
    
    ranges = []
    for i in range(3):
        start = min([seq[0, i]  for seq in data])
        end   = max([seq[-1, i] for seq in data])
        ranges.append((start, end))

    print(f'time range: {ranges[0][0]:.3f} ~ {ranges[0][1]:.3f}')
    print(f's1 range:   {ranges[1][0]:.3f} ~ {ranges[1][1]:.3f}')
    print(f's2 range:   {ranges[2][0]:.3f} ~ {ranges[2][1]:.3f}')
        
    fig = go.Figure(data=[go.Histogram(x=seq_lens, histnorm='probability', name='Sequence Lengths')])
    fig.show()

In [2]:
dataset = np.load("boston_violent_crimes.npz")
dates = dict()
for f in dataset.files:
    dates[f[:8]] = 1
dates = list(dates.keys())
exclude_from_train = (dates[::27] + dates[1::27] + dates[2::27]
                              + dates[3::27] + dates[4::27] + dates[5::27]
                              + dates[6::27] + dates[7::27])
val_dates = dates[2::27]
test_dates = dates[5::27]
train_dates = set(dates).difference(exclude_from_train)
date_splits = {"train": train_dates, "val": val_dates, "test": test_dates}

d2f = lambda dates : [f for f in dataset.files if f[:8] in dates]

val_files, test_files, train_files = d2f(val_dates), d2f(test_dates), d2f(train_dates)
file_splits = {"train": train_files, "val": val_files, "test": test_files}

for key, value in file_splits.items():
    print(f'{key} set contains {len(value)} sequences')

train set contains 1935 sequences
val set contains 102 sequences
test set contains 102 sequences


In [5]:
data = [dataset[f] for f in train_files + test_files + val_files]
summarize(data)

number of data: 2139
sequence length range: 10 ~ 159
time range: 0.000 ~ 168.060
s1 range:   -71.179 ~ -71.005
s2 range:   42.237 ~ 42.393


In [6]:
with open('processed/boston_crimes.npz', 'wb') as f:
    np.savez(f, 
             train=np.array([dataset[f] for f in train_files],dtype=object),
             test =np.array([dataset[f] for f in test_files],dtype=object),
             val  =np.array([dataset[f] for f in val_files],dtype=object),)