In [1]:
import os 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import random

In [2]:
# `data` and `losses` are imported while the working directory is the parent of
# this notebook's folder, so move there only for the duration of the imports.
_notebook_dir = os.getcwd()
os.chdir('../')
try:
    import data 
    import losses
finally:
    # Always return to the directory the notebook started in, even if an import
    # fails, and without assuming that directory is literally named 'notebooks'.
    # The relative '../../data/...' paths used later depend on this.
    os.chdir(_notebook_dir)

2023-03-05 16:25:15.703880: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Inferring Station Closure

Previously I defined a station as closed if it had no transactions in the 15-min period. However, this seems to be a very restrictive assumption. There seem to be periods in which the closure had already happened and there are still a small number of transactions. The objective here is to select a less restrictive threshold to signal that a station is closed.

In [None]:
# Load the cleaned transactions table and drop its last eight columns in one
# chained expression (presumably the trailing columns are not station series —
# TODO confirm against data.clean_data).
target = data.clean_data('../../data/transactions.parquet').iloc[:, :-8]

In [None]:
# Candidate cut-off: transaction counts below this value are replaced by zero
# in the histogram comparison.
treshold = 5 
# Draw the ten stations from a dedicated, seeded generator so the same sample
# (and therefore the same figures and conclusions) is reproduced on every
# "Restart & Run All", without touching the global `random` state.
stations = random.Random(42).sample(list(target.columns), k = 10)

In [None]:
# Side-by-side histograms per sampled station: raw counts (left column) versus
# counts with every value below `treshold` replaced by 0 (right column).
n_stations = len(stations)  # grid follows the sample size instead of a hard-coded 10
fig, axs = plt.subplots(
    n_stations, 2, sharey = True, figsize = (15, 5 * n_stations), squeeze = False
)

for row, station in enumerate(stations):

    # Keep only observations whose value lies in [0, 100] (bounds inclusive).
    series = target[station]
    series = series[series.between(0,100)]

    # Left panel: unfiltered distribution.
    series.plot.hist(bins = 50, ax = axs[row][0])
    axs[row][0].set_title(f"{station} \n No filter")

    # Right panel: `mask` already returns a new Series, so no explicit copy is needed.
    series_mask = series.mask(series < treshold, 0)
    series_mask.plot.hist(bins = 50, ax = axs[row][1])
    axs[row][1].set_title(f"{station} \n Treshold : {treshold}")

plt.show()

It seems like the threshold of 5 is borderline for small stations, but good for big stations. I'm wondering if most of these small transaction counts happen before 6 am or after 10 pm.

In [None]:
# Box plots of transaction counts by time of day, one panel per sampled station.
n_stations = len(stations)  # panel count follows the sample size instead of a hard-coded 10
fig, axs = plt.subplots(
    n_stations, sharey = False, figsize = (15, 6 * n_stations), squeeze = False
)
for ax, station in zip(axs[:, 0], stations):

    series = target[station]

    # Fractional hour of the day taken from the index (e.g. 4.25 for 04:15).
    df = pd.DataFrame({'hour':series.index.hour + series.index.minute/60, 'value':series.values})
    sns.boxplot(x='hour', y='value', data=df, ax = ax)
    ax.tick_params(axis='x', rotation=90)
    ax.set_title(station)
    ax.set_xlabel("")

plt.show()

In [None]:
# Box plots of transaction counts by time of day, restricted to observations
# with a value in [0, 20] (bounds inclusive), one panel per sampled station.
n_stations = len(stations)  # panel count follows the sample size instead of a hard-coded 10
fig, axs = plt.subplots(
    n_stations, sharey = False, figsize = (15, 6 * n_stations), squeeze = False
)
for ax, station in zip(axs[:, 0], stations):

    series = target[station]
    series = series[series.between(0,20)]

    # Fractional hour of the day taken from the index (e.g. 4.25 for 04:15).
    df = pd.DataFrame({'hour':series.index.hour + series.index.minute/60, 'value':series.values})
    sns.boxplot(data = df, x = 'hour', y = 'value', ax = ax)
    ax.tick_params(axis='x', rotation=90)
    ax.set_title(station)
    ax.set_xlabel("")

plt.show()

5 seems to be a reasonable threshold for most stations. I will use this threshold to infer that a station was closed.

For that I will modify the input itself, because I want to avoid comparing a zero value with some prediction, which would increase the error by a lot. 

In [None]:
# Reload the cleaned transactions so the closure-inferred copy can be written to
# a separate file, keeping both versions on disk.
target = data.clean_data('../../data/transactions.parquet')
# The two lines below are disabled. When enabled, they replace every value <= 5
# in all but the last eight columns with 0 and save the result to a new parquet.
# NOTE(review): this uses `<= 5`, while the histogram cell masks `< treshold`
# (strictly less than) — confirm which boundary is intended.
# target.iloc[:,:-8] = target.iloc[:,:-8].mask(target.iloc[:,:-8] <= 5, 0)
# target.to_parquet('../../data/transactions_closures_infered.parquet')

# Reducing the lookback window 

Remove the 8th to 10th hour lookback window. This could be useful for the fully defined transformer, but not really for the GNN + Transformer.

The 8th and 10th try to look at shifted spatio-temporal correlations (this could be another research problem).

In [None]:
# Inputs for data.tf_data: parquet locations first, then the run settings.
transactions_path, stations_path, adj_path = (
    '../../data/transactions.parquet',
    '../../data/stations_DB.parquet',
    '../../data/adjacency_matrix.parquet',
)
aggregation = "15-mins"
train_date = '2015-08-20'  # alternative kept for reference: '2018-08-01'
# NOTE(review): max_transactions is passed as the string '1500' — confirm that
# data.tf_data expects a string rather than an integer.
max_transactions, max_stations = '1500', None

# Arguments are passed positionally, so their order matters.
train_data, test_data, adj_matrix, metadata = data.tf_data(
    transactions_path, stations_path, adj_path,
    aggregation, train_date, max_transactions, max_stations,
)

# Including a function to get data for a range of dates

In [None]:
# Rebind `target` to the output of data.clean_data for the transactions parquet
# (all columns; no trailing columns are dropped here).
target = data.clean_data('../../data/transactions.parquet')

In [15]:
# Inputs for data.tf_data: parquet locations first, then the run settings.
transactions_path, stations_path, adj_path = (
    '../../data/transactions.parquet',
    '../../data/stations_DB.parquet',
    '../../data/adjacency_matrix.parquet',
)
aggregation = "15-mins"
train_date = '2018-08-29'  # alternative kept for reference: '2018-08-01'
max_transactions, max_stations = None, None
# New trailing argument for this call; presumably limits the data to this
# [start, end] date window — TODO confirm against data.tf_data.
date_range = ["2018-08-01", '2018-08-30']

# Arguments are passed positionally, so their order matters.
train_data, test_data, adj_matrix, metadata = data.tf_data(
    transactions_path, stations_path, adj_path,
    aggregation, train_date, max_transactions, max_stations,
    date_range,
)


Train features shape: (1062, 10, 147)
Test features shape: (154, 10, 147)

Train time_embeddings shape: (1062, 11, 8)
Test time_embeddings shape: (154, 11, 8)

Train spatial_embeddings shape: (1062, 147, 2)
Test spatial_embeddings shape: (154, 147, 2)

Train labels shape: (1062, 147)
Test labels shape: (154, 147)

Train status shape: (1062, 11, 147)
Test status shape: (154, 11, 147)

Adj Matrix Shape: (1, 147, 10, 147)


In [16]:
# Display the timestamps stored under metadata['train_date_index'].
metadata['train_date_index']

DatetimeIndex(['2018-08-15 04:00:00', '2018-08-15 04:15:00',
               '2018-08-15 04:30:00', '2018-08-15 04:45:00',
               '2018-08-15 05:00:00', '2018-08-15 05:15:00',
               '2018-08-15 05:30:00', '2018-08-15 05:45:00',
               '2018-08-15 06:00:00', '2018-08-15 06:15:00',
               ...
               '2018-08-28 20:00:00', '2018-08-28 20:15:00',
               '2018-08-28 20:30:00', '2018-08-28 20:45:00',
               '2018-08-28 21:00:00', '2018-08-28 21:15:00',
               '2018-08-28 21:30:00', '2018-08-28 21:45:00',
               '2018-08-28 22:00:00', '2018-08-28 22:15:00'],
              dtype='datetime64[ns]', name='timestamp', length=1062, freq=None)

In [17]:
# Display the timestamps stored under metadata['test_date_index'].
metadata['test_date_index']

DatetimeIndex(['2018-08-28 22:30:00', '2018-08-28 22:45:00',
               '2018-08-29 04:00:00', '2018-08-29 04:15:00',
               '2018-08-29 04:30:00', '2018-08-29 04:45:00',
               '2018-08-29 05:00:00', '2018-08-29 05:15:00',
               '2018-08-29 05:30:00', '2018-08-29 05:45:00',
               ...
               '2018-08-30 20:30:00', '2018-08-30 20:45:00',
               '2018-08-30 21:00:00', '2018-08-30 21:15:00',
               '2018-08-30 21:30:00', '2018-08-30 21:45:00',
               '2018-08-30 22:00:00', '2018-08-30 22:15:00',
               '2018-08-30 22:30:00', '2018-08-30 22:45:00'],
              dtype='datetime64[ns]', name='timestamp', length=154, freq=None)