In [1]:
import sys
import os
sys.path.append(os.path.abspath('.'))

In [None]:
import lib as grin

In [None]:
from lib import datasets

In [None]:
from lib.utils import numpy_metrics
from lib.utils.parser_utils import str_to_bool

# Metric name -> masked implementation looked up on lib.utils.numpy_metrics.
metrics = {
    name: getattr(numpy_metrics, f'masked_{name}')
    for name in ('mae', 'mse', 'mre', 'mape')
}


In [None]:
import pandas as pd
# Open the store read-only: HDFStore's default mode ('a') opens the file for
# reading *and* writing and creates it when missing, which is not wanted for
# a cell that only inspects the data. The path is relative to the notebook,
# like the other data paths used further down.
df_air = pd.HDFStore('../../data/grin-data/air_quality/small36.h5', mode='r')

In [None]:
# List the groups stored in the opened HDF5 store.
df_air.groups()

In [None]:
# Read and display the object stored under the 'eval_mask' key.
df_air['eval_mask']

In [None]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
from pathlib import Path

data_path = Path('../../data/open-data/HK2012-2018/')
# Read the Australia table, parse the 'date' column as datetimes and promote
# it to the index, all in one chain so re-running the cell is idempotent.
df_au = (
    pd.read_csv(data_path / 'Australia.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)
df_au.head()

## Dataset Test

In [None]:
import numpy as np
import pandas as pd
import lib as grin
from lib.datasets.pd_dataset import PandasDataset
from lib.utils import compute_mean
# CSV read by ArrivalDataset.load_raw as the data frame, and the .npy array it
# loads as the evaluation mask.
dataset_path = '../../data/open-data/HK2012-2018/Australia.csv'
mask_path = '../../data/masks/block5.npy'

# NOTE(review): `country` is not read by the class defined in this cell — confirm it is still needed.
country = 'au'

class ArrivalDataset(PandasDataset):
    """Arrival time series wrapped as a ``PandasDataset`` for imputation runs.

    The data frame is read from the module-level ``dataset_path`` CSV and the
    evaluation mask from the module-level ``mask_path`` ``.npy`` file.

    Attributes:
        eval_mask: uint8 array loaded from ``mask_path``; 1 marks an entry
            used as ground truth for imputation, 0 otherwise.
        random: NumPy random generator seeded with ``SEED``.
    """
    SEED = 1145
    # Number of trailing samples held out by `splitter` for validation / test.
    VAL_LEN = 12
    TEST_LEN = 24

    def __init__(self, impute_nans=True, small=False, freq='MS'):
        """Load the data and hand it to ``PandasDataset``.

        Args:
            impute_nans: if True, NaNs in the frame are filled with the values
                returned by ``compute_mean``.
            small: forwarded to ``load``; currently has no effect there.
            freq: frequency string forwarded to ``PandasDataset``.
        """
        self.random = np.random.default_rng(self.SEED)
        self.eval_mask = None
        df, mask = self.load(impute_nans=impute_nans, small=small)
        df = df.astype('float32')
        super().__init__(dataframe=df, u=None, mask=mask, name='arrival', freq=freq, aggr='nearest')

    def load_raw(self, small=False):
        """Read the raw frame and the evaluation mask from disk.

        Args:
            small: accepted for interface compatibility; not used.

        Returns:
            ``(df, mask)``: the CSV parsed with its first column as a date
            index, and the array stored at ``mask_path``.
        """
        df = pd.read_csv(dataset_path, index_col=0, parse_dates=True)
        mask = np.load(mask_path)
        return df, mask

    def load(self, impute_nans=True, small=False):
        """Load the data, build the observation mask and store the eval mask.

        Args:
            impute_nans: if True, fill NaNs with ``compute_mean(df)``.
            small: forwarded to ``load_raw``.

        Returns:
            ``(df, mask)`` where ``mask`` is a uint8 array with 1 where the
            raw value is not NaN and 0 otherwise.
        """
        df, eval_mask = self.load_raw(small)
        # Observation mask: 1 if the value is not NaN else 0.
        mask = (~np.isnan(df.values)).astype('uint8')

        # Evaluation mask: 1 if the value is ground truth for imputation else 0.
        self.eval_mask = eval_mask.astype('uint8')
        # Optionally replace NaNs with the fill values computed by compute_mean.
        if impute_nans:
            df = df.fillna(compute_mean(df))
        return df, mask

    def splitter(self, dataset, val_len=1., in_sample=False, window=0):
        """Split sample indices chronologically into train / val / test.

        The last ``TEST_LEN`` samples are the test set and the ``VAL_LEN``
        samples before them the validation set. With ``in_sample=True`` the
        training set spans every sample; otherwise it is everything before
        the validation set.

        Args:
            dataset: any sized object; only ``len(dataset)`` is used.
            val_len: accepted for interface compatibility; ignored, the
                validation length is fixed to ``VAL_LEN``.
            in_sample: whether training indices cover the whole dataset.
            window: accepted for interface compatibility; ignored.

        Returns:
            ``[train_idxs, val_idxs, test_idxs]`` as integer arrays.
        """
        n_samples = len(dataset)
        test_start = n_samples - self.TEST_LEN
        # Clamp at 0 so a dataset shorter than the hold-out yields an empty
        # training range rather than a negative validation start.
        val_start = max(n_samples - self.VAL_LEN - self.TEST_LEN, 0)
        # Validation and test ranges are defined in both modes, so the return
        # below never refers to an unbound name.
        val_idxs = np.arange(val_start, test_start)
        test_idxs = np.arange(test_start, n_samples)
        if in_sample:
            train_idxs = np.arange(n_samples)
        else:
            train_idxs = np.arange(val_start)
        return [train_idxs, val_idxs, test_idxs]

    def get_similarity(self):
        """Return a dense matrix with ones off the diagonal and zeros on it."""
        # NOTE(review): N is the number of rows of self.df. If the similarity
        # is meant to be between series (columns), this should presumably be
        # self.df.shape[1] — confirm against the consumer of this matrix.
        N = len(self.df)
        # np.ones takes the shape as one tuple; a second positional argument
        # would be interpreted as the dtype.
        return np.ones((N, N)) - np.identity(N)

    @property
    def mask(self):
        """Observation mask held in ``self._mask``."""
        return self._mask

    @property
    def training_mask(self):
        """Observation mask with the evaluation entries switched off."""
        return self._mask if self.eval_mask is None else (self._mask & (1 - self.eval_mask))

    def test_interval_mask(self, dtype=bool, squeeze=True):
        """Flag the rows of ``self.df`` whose month is in ``self.test_months``.

        Args:
            dtype: dtype of the returned array.
            squeeze: if True return a 1-D array, otherwise a column vector.
        """
        # NOTE(review): `test_months` is not defined anywhere in this class;
        # unless a base class provides it this raises AttributeError.
        m = np.isin(self.df.index.month, self.test_months).astype(dtype)
        if squeeze:
            return m
        return m[:, None]

In [None]:
# Build the dataset with the constructor defaults (impute_nans=True, small=False, freq='MS').
arrival = ArrivalDataset()

In [None]:
from lib.datasets import AirQuality
# Instantiate the library's AirQuality dataset with its default arguments.
air = AirQuality()

In [None]:
# dtype of the array returned by numpy(); presumably float32 given the cast in
# ArrivalDataset.__init__ — confirm.
arrival.numpy().dtype

## Splitter Test

In [None]:
# Alias used by the following cells of this section.
dataset = arrival

In [None]:
from lib.data.imputation_dataset import ImputationDataset, GraphImputationDataset
from lib.nn import models
from lib import fillers, datasets, config

def has_graph_support(model_cls):
    """Return True when ``model_cls`` is one of the graph-based model classes."""
    graph_models = (models.GRINet, models.MPGRUNet, models.BiMPGRUNet)
    return model_cls in graph_models
model_cls = models.GRINet
filler_cls = fillers.GraphFiller
# Graph-capable models get the graph variant of the imputation dataset.
if has_graph_support(model_cls):
    dataset_cls = GraphImputationDataset
else:
    dataset_cls = ImputationDataset
torch_dataset = dataset_cls(
    *dataset.numpy(return_idx=True),
    mask=dataset.training_mask,
    eval_mask=dataset.eval_mask,
    window=12,
    stride=1,
)

# Train / validation / test indices over the windowed dataset.
idxs = dataset.splitter(torch_dataset, val_len=0.1, in_sample=False, window=0)

In [None]:
# Shape of the 'x' entry in the first element of the first dataset item.
torch_dataset[0][0]['x'].shape

In [None]:
# Show the index arrays returned by the splitter.
idxs

In [None]:
# Show the `data` attribute held by the windowed dataset.
torch_dataset.data

## Mask Test

In [4]:
import lib as grin
from lib.utils import sample_mask
import pandas as pd
import numpy as np
# Load the Australia table with its first column parsed as the date index.
# The path is relative to the notebook, matching the other cells that read
# this same CSV, instead of pointing into one user's home directory.
# NOTE(review): this rebinds `arrival`, which an earlier cell bound to an
# ArrivalDataset instance.
arrival = pd.read_csv('../../data/open-data/HK2012-2018/Australia.csv', index_col=0, parse_dates=True)
n, m = arrival.shape
# One row fewer than the data: concate_mask prepends an all-zero row, so the
# final mask has the same number of rows as the frame.
mask_shape = (n - 1, m)

In [5]:
# Summarize the index, the columns, their non-null counts and dtypes.
arrival.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 84 entries, 2012-01-01 to 2018-12-01
Data columns (total 97 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   arrival                                     84 non-null     int64
 1   Hong kong                                   84 non-null     int64
 2   Hong kong dollar                            84 non-null     int64
 3   Sheung Wan                                  84 non-null     int64
 4   Tai Ping Shan Street                        84 non-null     int64
 5   Hong kong central                           84 non-null     int64
 6   Hong Kong Disneyland                        84 non-null     int64
 7   hong kong dollar to rmb                     84 non-null     int64
 8   hkd to usd                                  84 non-null     int64
 9   Hong Kong cuisine                           84 non-null     int64
 10  hong kong food      

In [16]:
def concate_mask(mask_shape, p = 0.05, min_seq=1, max_seq=1):
    """Sample a mask with ``sample_mask`` and prepend one all-zero row.

    Args:
        mask_shape: shape passed to ``sample_mask``.
        p: forwarded to ``sample_mask``.
        min_seq: forwarded to ``sample_mask``.
        max_seq: forwarded to ``sample_mask``.

    Returns:
        The sampled mask stacked under a single uint8 row of zeros, so the
        result has one more row than the sampled mask and its first row is
        never masked.
    """
    mask = sample_mask(mask_shape, p, min_seq=min_seq, max_seq=max_seq)
    # Take the row width from the sampled mask itself rather than from the
    # notebook-level global `m`, so the function works for any shape.
    first_row = np.zeros((1, mask.shape[1]), dtype='uint8')
    return np.vstack([first_row, mask])

In [28]:
# Four evaluation masks: two sampled with min_seq=max_seq=1 and two with
# min_seq=6, max_seq=12; the p values differ per mask.
# NOTE(review): no seed is set in this cell — if sample_mask draws from an
# unseeded generator the masks change on every run; confirm.
random5  = concate_mask(mask_shape, p = 0.05, min_seq=1, max_seq=1)
random10 = concate_mask(mask_shape, p = 0.10, min_seq=1, max_seq=1)
block5   = concate_mask(mask_shape, p = 0.006, min_seq=6, max_seq=12)
block10  = concate_mask(mask_shape, p = 0.012, min_seq=6, max_seq=12)
# Number of masked (non-zero) entries in each mask.
print(random5.sum(), random10.sum(), block5.sum(), block10.sum())

396 797 398 785


In [31]:
# Display the mask generated with p = 0.05 and min_seq = max_seq = 1.
random5

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [29]:
import numpy as np
# Write each evaluation mask to its own .npy file, in the same order as before.
for target_path, mask_array in (
    ('../../data/masks/random5.npy', random5),
    ('../../data/masks/random10.npy', random10),
    ('../../data/masks/block5.npy', block5),
    ('../../data/masks/block10.npy', block10),
):
    np.save(target_path, mask_array)

## Export datasets with missing values

In [19]:
import numpy as np
import pandas as pd

# Mask name -> boolean array (True marks an entry to be blanked out).
masks = {
    'random5': np.load('../../data/masks/random5.npy').astype(bool),
    'random10': np.load('../../data/masks/random10.npy').astype(bool),
    'block5': np.load('../../data/masks/block5.npy').astype(bool),
    'block10': np.load('../../data/masks/block10.npy').astype(bool),
}
# Country code -> CSV path.
# NOTE(review): this name shadows the `datasets` module imported from `lib`
# in earlier cells.
datasets = dict(
    AU='../../data/open-data/HK2012-2018/Australia.csv',
    PH='../../data/open-data/HK2012-2018/Philippine.csv',
    SG='../../data/open-data/HK2012-2018/Singapore.csv',
    TH='../../data/open-data/HK2012-2018/Thailand.csv',
    UK='../../data/open-data/HK2012-2018/United_Kingdom.csv',
    US='../../data/open-data/HK2012-2018/United_States.csv',
)

In [20]:
import os

from sklearn.preprocessing import MinMaxScaler

# Number of trailing rows dropped from every exported file.
# NOTE(review): ArrivalDataset.splitter holds out 24 test samples — confirm
# that 12 is the intended length here.
test_len = 12
out_dir = '../../data/miss_data'
# DataFrame.to_csv does not create missing directories.
os.makedirs(out_dir, exist_ok=True)

for country, _path in datasets.items():
    data = pd.read_csv(_path, index_col=0, parse_dates=True)
    # NOTE(review): the scaler is fit on the full series, including the last
    # `test_len` rows that are dropped from the export below — confirm.
    norlizer = MinMaxScaler().fit(data)
    # Scaling does not depend on the mask, so do it once per country. Passing
    # the DataFrame (not a bare array) keeps the input consistent with what
    # the scaler was fit on and avoids scikit-learn's feature-name warning.
    scaled = norlizer.transform(data)
    for key, mask in masks.items():
        # Work on a copy so each mask is applied to the clean scaled values.
        miss_data = scaled.copy()
        miss_data[mask] = np.nan
        pd.DataFrame(miss_data[:-test_len], columns=data.columns).to_csv(
            f'{out_dir}/{country.lower()}-{key}.csv', index=False)