In [88]:
import torch
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import pickle as pkl
import os
import math
import random
import datetime
from tqdm import tqdm
%matplotlib widget

In [18]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU, plus all CUDA devices when CUDA is available) so that the
    random draws made later in the notebook are repeatable.

    :param seed: integer seed shared by all generators
    """
    random.seed(seed)
    # The variable must be spelled exactly PYTHONHASHSEED to be recognised.
    # It is read at interpreter start-up, so it only influences Python
    # processes launched after this call, not the already running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        # Auto-tuning may pick different kernels between runs; keep it off
        torch.backends.cudnn.benchmark = False

seed_everything(seed=10086)

# 数据集处理

In [59]:
# Output directory for the processed arrays and masks
output_dir = 'processed'
# np.save does not create missing parent directories, so make sure the
# directory exists before any later cell writes into it
os.makedirs(output_dir, exist_ok=True)

## 缺失值设置

缺失比例设置为[10%, 30%, 50%]

In [60]:
# Missing rates in percent; the mask cells divide each by 100 to get a probability
miss_rates = [10, 30, 50]

**mask矩阵 0->missing ; 1->observed**

In [61]:
# (MCAR) Missing completely at random
def generate_mcar_mask(samples, ratio=0.1):
    """Draw a random MCAR mask with the same shape as ``samples``.

    Each entry is 0 (missing) with probability ``ratio`` and 1 (observed)
    with probability ``1 - ratio``. The values come from NumPy's global
    random generator.

    :param samples: numpy array whose shape the mask copies
    :param ratio: probability that an entry is marked missing
    :return: array of zeros and ones shaped like ``samples``
    """
    assert isinstance(samples, np.ndarray)

    flags = [0, 1]
    probabilities = [ratio, 1 - ratio]
    return np.random.choice(flags, size=samples.shape, p=probabilities)

## 训练集/测试集 划分设置

训练集为整个数据集的前2/3，剩余为测试集

In [62]:
# Fraction of the data set used for training (the leading 2/3)
train_proportion = 2. / 3
# Whatever is left over forms the test fraction
test_proportion = 1. - train_proportion

## GHG数据集

In [6]:
# Folder holding the GHG files
ghg_dir = 'GHG'
# Counts every entry in the folder, whatever its name or type
num_files = len(os.listdir(ghg_dir))
print(f'GHG文件夹中有{num_files}个子文件')

GHG文件夹中有2921个子文件


In [21]:
# Randomly pick the given number of files to act as devices
num_devices = 7
# Draw distinct integers from 0 .. num_files - 1, then sort them ascending.
# NOTE(review): the loading cell turns these integers into the 4-digit site
# number of the file name; confirm the site numbering really spans
# 0 .. num_files - 1 (e.g. that it does not start at 1).
indexes = np.random.choice(num_files, num_devices, replace=False)
indexes.sort()
indexes

array([  82,   93,  312,  645, 1573, 2099, 2853])

In [49]:
# Read each selected .dat file and collect it as a numpy array
ghg_list = []
for index in indexes:
    filepath = os.path.join(ghg_dir, 'ghg.gid.site' + str(index).zfill(4) + '.dat')
    df = pd.read_csv(filepath, sep=" ", header=None, dtype=np.float32)
    # Swap the two axes so the columns of the file become the leading axis
    data = df.values.T
    ghg_list.append(data)

# Stack the per-file arrays along a new axis at position 1 (one slot per device)
ghg_arr = np.stack(ghg_list, axis=1)

In [53]:
# Axis order of the stacked array: (seq_len, num_devices, num_features)
ghg_arr.shape

(327, 7, 16)

In [55]:
# Save the stacked GHG array as an .npy file inside output_dir
np.save(os.path.join(output_dir, 'ghg.npy'), ghg_arr)

In [58]:
# Load the array back from disk and show its shape
ghg_arr = np.load(os.path.join(output_dir, 'ghg.npy'))
print(ghg_arr.shape)

(327, 7, 16)


In [64]:
# Generate one MCAR mask per missing rate and save each to its own file
for rate in miss_rates:
    # rate is a percentage; generate_mcar_mask takes a probability
    mask = generate_mcar_mask(ghg_arr, ratio=rate / 100.)
    np.save(os.path.join(output_dir, f'ghg_mask_{rate}.npy'), mask)

## PRSA数据集

In [87]:
def temporal_KNN(seq, k_num):
    """
    Temporal KNN imputation for a 1-D series.

    Every NaN entry is replaced by the inverse-distance weighted mean of its
    nearest observed values, where distance is the index offset. Only values
    observed in the input serve as neighbours, never values filled in by this
    call. Entries without any observed neighbour stay NaN.

    :param seq: 1-D time series containing NaN; it is left unmodified
    :param k_num: number of nearest neighbours to average
    :return: completed copy of the series
    """
    ret = np.copy(seq)
    length = len(seq)
    for index in range(length):
        if not np.isnan(seq[index]):
            continue

        candi = []

        # Largest offset that still lands inside the series on at least one
        # side: `index` steps to the left, `length - 1 - index` to the right.
        max_offset = max(index, length - 1 - index)
        offset = 1
        while offset <= max_offset and len(candi) < k_num:
            left = index - offset
            right = index + offset
            if left >= 0 and not np.isnan(seq[left]):
                candi.append((offset, seq[left]))
            if right < length and not np.isnan(seq[right]):
                candi.append((offset, seq[right]))
            offset += 1

        if candi:
            # Candidates were appended in increasing offset order, so the
            # first k_num entries are the nearest ones (left wins a tie).
            nearest = candi[:k_num]
            # Inverse-distance weighting: closer neighbours count more
            weights = sum(1. / dist for dist, _ in nearest)
            p = sum(value / dist for dist, value in nearest)
            ret[index] = p / weights

    return ret

In [68]:
# Folder holding the PRSA files
prsa_dir = 'PRSA'
# NOTE: rebinds num_files, which the GHG section set to the GHG file count
num_files = len(os.listdir(prsa_dir))
print(f'PRSA文件夹中有{num_files}个子文件')

PRSA文件夹中有12个子文件


In [116]:
# Pre-process: fill the NaNs of every monitored series with temporal KNN
# os.listdir returns entries in arbitrary order; sort them so the station axis
# of the stacked array is the same on every run and machine.
prsa_files = sorted(name for name in os.listdir(prsa_dir) if name.startswith('PRSA'))
feature_cols = slice(5, 11)  # column positions of the monitored series
num_features = feature_cols.stop - feature_cols.start

monitor_list = []
# One progress tick per (file, series) pair
with tqdm(total=len(prsa_files) * num_features) as pbar:
    for filename in prsa_files:
        df = pd.read_csv(os.path.join(prsa_dir, filename), sep=',')
        # Explicit copy: the array is filled in place below, so it must be
        # writable and must not share memory with df
        monitor_arr = df.iloc[:, feature_cols].to_numpy(copy=True)
        for i in range(monitor_arr.shape[1]):
            monitor_arr[:, i] = temporal_KNN(monitor_arr[:, i], k_num=2)
            pbar.update(1)

        monitor_list.append(monitor_arr)

# Stack the per-file arrays along a new axis at position 1
monitor_data = np.stack(monitor_list, axis=1)
print(monitor_data.shape)
print('处理后的数据含有的缺失值数量:', np.isnan(monitor_data).sum())
        

100%|██████████| 72/72 [00:15<00:00,  4.56it/s]

(35064, 12, 6)
处理后的数据含有的缺失值数量: 0





In [125]:
# Save the imputed PRSA array as an .npy file inside output_dir
np.save(os.path.join(output_dir, 'prsa.npy'), monitor_data)

In [126]:
# Generate one MCAR mask per missing rate and save each to its own file
for rate in miss_rates:
    # rate is a percentage; generate_mcar_mask takes a probability
    mask = generate_mcar_mask(monitor_data, ratio=rate / 100.)
    np.save(os.path.join(output_dir, f'prsa_mask_{rate}.npy'), mask)