In [2]:
import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from raw_data import air_quality_train_data

dt = air_quality_train_data("/home/hoang/Documents/CodeSpace/air-quality-forecasting/data/data-train")

In [35]:
feats = []

for key in ("input", "output"):
    for station in dt[key].values():
        feats += station["data"]["humidity"].dropna().tolist()

In [3]:
x = torch.tensor([1, 2, 3, 4])
x[0:None]

tensor([1, 2, 3, 4])

In [36]:
from sklearn.preprocessing import StandardScaler

feats = np.array(feats).reshape(-1, 1)
scaler = StandardScaler()
scaled_feats = scaler.fit_transform(feats)

In [37]:
import math
scaler.mean_, math.sqrt(scaler.var_)

(array([75.76537115]), 15.434225331097021)

In [38]:
feats.mean(), feats.std()

(75.76537114905305, 15.434225331097021)

In [28]:
L = 24
x = []

for i in range(375):
    _x = []
    for station in dt["input"].values():
        stdt = station["data"]["PM2.5"]
        start_idx = L * i
        end_idx = start_idx + L

        frame = stdt[start_idx : end_idx]

        if frame.isna().sum() == 0:
            _x.append(frame.tolist())

    x.append(torch.tensor(_x))

In [None]:
x[-1]

In [30]:
class MissingDataset(Dataset):
    def __init__(self, path: str, frame_size: int, usecol: str = "PM2.5"):
        self.data = self._preprocessing(path, usecol, frame_size)

    def _preprocessing(self, path: str, usecol: str, frame_size: int):
        raw = air_quality_train_data(path)
        data_len = int(9000 / frame_size)
        
        x = []
        for i in range(data_len):
            _x = []
            for k in ("input", "output"):
                for station in dt[k].values():
                    stdt = station["data"][usecol]
                    start_idx = frame_size * i
                    end_idx = start_idx + frame_size

                    frame = stdt[start_idx : end_idx]

                    if frame.isna().sum() == 0:
                        _x.append(frame.tolist())

            x.append(torch.tensor(_x))

        return x

    def __getitem__(self, index):
        return x[index]

    def __len__(self):
        return len(x)

In [31]:
dts = MissingDataset(
    "/home/hoang/Documents/CodeSpace/air-quality-forecasting/data/data-train",
    frame_size=24,
)

In [35]:
dts[1].shape

torch.Size([10, 24])