In [2]:
import pandas as pd
import numpy as np
import os

In [None]:
# Collect the 2021 CSV file names from the source folder.
# os.listdir returns entries in arbitrary order, so the result is sorted to keep
# every later step (splitting, tile numbering) stable from run to run.
# NOTE(review): later cells read these files from "./NO2/", not "./NO2_2021/" —
# confirm both folders hold the same file names.
file_list = sorted(
    name
    for name in os.listdir('./NO2_2021/')
    if name.endswith('.csv') and '2021' in name
)
print(len(file_list))

In [3]:
def sliding_window(matrix, window_size, steps=1):
    """Yield every square ``window_size`` x ``window_size`` tile of a grid.

    Tiles are produced in row-major order: the window moves ``steps`` columns
    at a time along a row band, then ``steps`` rows down.

    Parameters
    ----------
    matrix : array-like with at least two dimensions
        Grid to tile. NumPy arrays are used as-is; anything else (for example
        nested lists) is converted with ``np.asarray``.
    window_size : int
        Side length of each tile, measured along the first two axes.
    steps : int, default 1
        Stride of the window along both axes.

    Yields
    ------
    numpy.ndarray
        The slice ``matrix[i:i + window_size, j:j + window_size]``. For ndarray
        input this is a view, so copy it before modifying it.

    Raises
    ------
    ValueError
        If ``matrix`` is non-empty but has fewer than two dimensions.

    Notes
    -----
    Nothing is yielded when the grid is empty or smaller than the window.
    """
    grid = matrix if isinstance(matrix, np.ndarray) else np.asarray(matrix)
    if grid.size == 0:
        # An empty grid has no tiles; the previous len(matrix[0]) raised here.
        return
    if grid.ndim < 2:
        raise ValueError("sliding_window expects an input with at least 2 dimensions")

    n_rows, n_cols = grid.shape[:2]
    # Last valid top-left corner is (n_rows - window_size, n_cols - window_size);
    # a non-positive stop simply yields an empty range.
    for top in range(0, n_rows - window_size + 1, steps):
        for left in range(0, n_cols - window_size + 1, steps):
            yield grid[top:top + window_size, left:left + window_size]

In [4]:
# Side length (in grid cells) of the square tiles cut by sliding_window; the same
# value is also used as a sub-folder name under ./train_no2/.
window_size = 96

In [5]:
# File-level split: 20% of the files are held out, and a sample equal to 10% of
# all files is drawn from that hold-out as the test set. What remains of the
# hold-out is the validation set; everything else is training data.
# A seeded generator over a sorted file list makes the split reproducible.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

num_samples = len(file_list)
valid_list = split_rng.choice(
    sorted(file_list), size=int(num_samples * 0.2), replace=False
)
held_out_names = set(valid_list)  # set lookup instead of scanning the array per file
train_list = [x for x in file_list if x not in held_out_names]

test_list = split_rng.choice(valid_list, size=int(num_samples * 0.1), replace=False)
test_names = set(test_list)
val_list = [x for x in valid_list if x not in test_names]

In [None]:
# Number of files in each split, in (train, validation, test) order.
tuple(len(split) for split in (train_list, val_list, test_list))

In [7]:
# Cut every file into window_size x window_size tiles (stride 4) and save each
# tile that contains no NaN into the folder of the split its source file is in.
# NOTE(review): file_list is built from "./NO2_2021/" but files are read from
# "./NO2/" here — confirm this is intended.
for file in file_list:
    # The split depends only on the file, so resolve it once per file rather
    # than once per window.
    if file in train_list:
        split_name = 'train'
    elif file in val_list:
        split_name = 'valid'
    else:
        split_name = 'test'
    out_dir = f'./train_no2/{window_size}/{split_name}'
    # np.save does not create missing folders.
    os.makedirs(out_dir, exist_ok=True)

    data = pd.read_csv(f"./NO2/{file}").values
    file_id = file.split('.')[0]
    for ind, mat in enumerate(sliding_window(data, window_size, 4)):
        # Skip windows with any missing value. `ind` counts every window,
        # skipped or not, so saved tile numbers can have gaps.
        if np.isnan(mat).any():
            continue
        np.save(f'{out_dir}/{file_id}-{ind}.npy', mat)

In [None]:
# Number of entries currently in the training output folder for this window size.
len(os.listdir(f'./train_no2/{window_size}/train/'))

In [None]:
# Number of entries currently in the validation output folder for this window size.
len(os.listdir(f'./train_no2/{window_size}/valid/'))

In [None]:
# Number of entries currently in the test output folder for this window size.
len(os.listdir(f'./train_no2/{window_size}/test/'))

In [11]:
import cv2

In [12]:
# Build a bank of missing-data masks. For each window whose missing rate rounds
# to exactly 10, 20, 30 or 40 percent, write a mask (1 = observed, 0 = NaN) into
# a folder named after that percentage. mask_list maps the folder name to the
# number of masks written.
# NOTE(review): the masks hold 0/1 values and are stored as JPEG, which is a
# lossy format — confirm downstream readers threshold the values on load.
mask_list = {}
for file in file_list:
    data = pd.read_csv(f"./NO2/{file}").values
    file_id = file.split('.')[0]
    for ind, mat in enumerate(sliding_window(data, window_size, 4)):
        cur_no2 = np.isnan(mat[:, :])
        # Integer percentage of missing cells; integer comparisons avoid
        # relying on float modulo for the bucket test.
        miss_pct = int(np.rint(cur_no2.sum() / (window_size ** 2) * 100))
        if miss_pct == 0 or miss_pct > 40 or miss_pct % 10 != 0:
            continue

        fold_path = str(miss_pct)
        mask_dir = f"./train_no2/{window_size}/mask/{fold_path}"
        # makedirs also creates missing parent folders and tolerates re-runs.
        os.makedirs(mask_dir, exist_ok=True)

        # uint8 is accepted by the image writer on every platform; the values
        # are the same 0/1 as before.
        msk = (~cur_no2).astype(np.uint8)
        mask_path = f'{mask_dir}/{file_id}-{ind}.jpg'
        # cv2.imwrite signals failure through its return value, not an exception.
        if not cv2.imwrite(mask_path, msk):
            raise OSError(f"cv2.imwrite could not write {mask_path}")
        mask_list[fold_path] = mask_list.get(fold_path, 0) + 1

In [None]:
# Number of entries currently in the 20%-missing mask folder for this window size.
len(os.listdir(f'./train_no2/{window_size}/mask/20/'))