In [1]:
import sys
import os
sys.path += [os.path.abspath("./Autoformer")]
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import random
import argparse
import torch
import argparse
from Autoformer.data_provider.data_loader import CustomPM
from Autoformer.exp.exp_main import Exp_Main

In [2]:
# Maps a zero-based year index to a calendar-year string.
# NOTE(review): no cell visible in this notebook reads this module-level dict;
# merge_n_nearest works with a mapping that starts at '2017' rather than
# '2016'. Confirm which mapping matches the data before relying on either.
year_map = {0: '2016', 1: '2017', 2: '2018', 3: '2019', 4: '2020'}

In [3]:
def _load_interpolated_csvs(pattern: str) -> pd.DataFrame:
    """Read every CSV matching ``pattern``, fill numeric gaps, and stack the rows.

    Each file is interpolated on its own (gaps are never filled across file
    boundaries) and the frames are concatenated with a fresh 0..N-1 index.

    Args:
        pattern: glob pattern selecting the CSV files to load.

    Returns:
        One DataFrame holding the rows of all matched files, in sorted
        filename order.

    Raises:
        ValueError: if ``pattern`` matches no file.
    """
    # glob() returns matches in arbitrary, OS-dependent order; sorting makes
    # the concatenated row order reproducible between runs and machines.
    paths = sorted(glob(pattern))
    if not paths:
        # pd.concat([]) would otherwise fail with the unhelpful
        # "No objects to concatenate"; keep the exception type, fix the message.
        raise ValueError(f"No CSV files matched {pattern!r}")

    frames = []
    for path in paths:
        frame = pd.read_csv(path)
        # Interpolate numeric columns only: text columns cannot be
        # interpolated, and calling interpolate() on object-dtype data is
        # deprecated in recent pandas releases.
        numeric_cols = frame.select_dtypes(include='number').columns
        if len(numeric_cols) > 0:
            frame[numeric_cols] = frame[numeric_cols].interpolate()
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)


def get_pm_files():
    """Load and stack every CSV under ``./dataset/TRAIN`` with numeric gaps interpolated."""
    return _load_interpolated_csvs('./dataset/TRAIN/*.csv')


def get_aws_files():
    """Load and stack every CSV under ``./dataset/TRAIN_AWS`` with numeric gaps interpolated."""
    return _load_interpolated_csvs('./dataset/TRAIN_AWS/*.csv')

def get_aws_map(path: str) -> dict:
    """Build a lookup table from the CSV stored at ``path``.

    For every row, the value in the first column becomes the key and the
    values in the second and third columns become a 2-tuple. If a key occurs
    more than once, the last row wins.
    """
    table = pd.read_csv(path)
    # index=False puts the first CSV column at position 0 of each record.
    return {
        record[0]: (record[1], record[2])
        for record in table.itertuples(index=False, name=None)
    }

def get_pm_map(path: str) -> dict:
    """Read the CSV at ``path`` into a ``{first column: (second, third)}`` dict.

    Rows are processed top to bottom, so a repeated key keeps the values of
    its last occurrence.
    """
    records = pd.read_csv(path).itertuples(index=False, name=None)
    return dict((fields[0], (fields[1], fields[2])) for fields in records)


In [4]:
def find_n_nearest_points(point_name: str, n: int, aws_map: dict, pm_map:dict) -> list:
    """Rank the entries of ``aws_map`` by distance to ``pm_map[point_name]``.

    Distance is the Euclidean norm of the difference between the two
    coordinate tuples. Returns a list of ``(aws_key, distance)`` pairs in
    ascending distance order (ties keep ``aws_map`` iteration order), cut
    down with ``[:n]``.
    """
    origin = np.array(pm_map[point_name])
    ranked = sorted(
        (
            (aws_key, np.linalg.norm(origin - np.array(coords)))
            for aws_key, coords in aws_map.items()
        ),
        key=lambda pair: pair[1],
    )
    return ranked[:n]

In [5]:
def merge_n_nearest(aws: pd.DataFrame, pm: pd.DataFrame, n: int, aws_map: dict, pm_map: dict, year_map: dict = None):
    """Attach the readings of the ``n`` nearest AWS stations to each PM station.

    For every key of ``pm_map`` the rows of ``pm`` whose '측정소' equals that
    key are right-merged, on ('연도', '일시'), with the selected columns of each
    of the ``n`` nearest AWS stations (as ranked by ``find_n_nearest_points``).
    The two key columns are then replaced by a leading ``date`` datetime
    column and remaining gaps are back-filled.

    Args:
        aws: AWS readings; must contain '지점' plus the selected columns below.
        pm: PM readings; must contain '측정소', '연도' and '일시'.
        n: number of nearest AWS stations to merge per PM station.
        aws_map: AWS station key -> coordinate tuple.
        pm_map: PM station key -> coordinate tuple.
        year_map: optional mapping from the '연도' index to a four-digit year
            string. Defaults to ``{0: '2017', 1: '2018', 2: '2019', 3: '2020'}``,
            the mapping this function has always used.

    Returns:
        A list with one DataFrame per key of ``pm_map``, in ``pm_map`` order.

    Raises:
        KeyError: if a '연도' value is missing from ``year_map``.

    NOTE(review): pandas applies ``suffixes`` only to column names that collide
    at the moment of each merge, so for n > 1 the ``_aws{i}`` / ``_pm{i}``
    labels do not reliably tell which station a column came from (e.g. the
    ``_pm1`` weather columns hold the readings of AWS station 0). Names and
    order are deliberately left as they were, because previously exported CSVs
    and the downstream loader may depend on them; confirm before renaming.
    """
    selected_columns = ['연도', '일시', '기온(°C)', '풍향(deg)', '풍속(m/s)', '강수량(mm)', '습도(%)', 'lat', 'lon']
    if year_map is None:
        year_map = {0: '2017', 1: '2018', 2: '2019', 3: '2020'}

    pms = []
    for key in pm_map:
        distances = find_n_nearest_points(key, n, aws_map, pm_map)
        # Copy the filtered rows so the column assignments below never write
        # into a slice of the caller's frame (this happens when n == 0 and no
        # merge has produced a fresh frame yet).
        sub_pm = pm[pm['측정소'] == key].copy()
        for i, (dist_name, _) in enumerate(distances):
            sub_aws = aws.loc[aws['지점'] == dist_name, selected_columns]
            # how='right' keeps every PM row even when the AWS station has no
            # reading for that timestamp.
            sub_pm = pd.merge(sub_aws, sub_pm, on=['연도', '일시'], how='right', suffixes=(f'_aws{i}', f'_pm{i}'))

        # Index year_map directly so an unknown year index fails loudly
        # instead of silently producing NaN dates.
        sub_pm['date'] = sub_pm['연도'].map(lambda x: year_map[x]) + "-" + sub_pm['일시']
        sub_pm['date'] = pd.to_datetime(sub_pm['date'], format='%Y-%m-%d %H:%M')
        sub_pm = sub_pm.drop(columns=['연도', '일시'])
        sub_pm.insert(0, 'date', sub_pm.pop('date'))
        sub_pm = sub_pm.bfill()

        pms.append(sub_pm)

    return pms


In [6]:
# aws = get_aws_files()
# pm = get_pm_files()
# aws_map = get_aws_map("./dataset/META/awsmap.csv")
# pm_map = get_pm_map("./dataset/META/pmmap.csv")
# aws['lat'] = aws['지점'].apply(lambda x: aws_map[x][0])
# aws['lon'] = aws['지점'].apply(lambda x: aws_map[x][1])
# pm['lat'] = pm['측정소'].apply(lambda x: pm_map[x][0])
# pm['lon'] = pm['측정소'].apply(lambda x: pm_map[x][1])
# x_plus = y[['연도', '일시', '측정소']]
# x = pd.merge(x, x_plus, on=['연도', '일시'])
# y = y['PM2.5']

In [7]:
# pms = merge_n_nearest(aws, pm, 3, aws_map, pm_map)

In [8]:
# for i, pm in enumerate(pms):
#     pm.to_csv(f'./dataset/pm_for_train/pm_{i}.csv', index=False)

In [9]:
# Build the project's CustomPM dataset from the CSV directory that the
# commented-out export cell above writes to, selecting the split with flag='val'.
dataset = CustomPM(data_path='./dataset/pm_for_train', flag='val')

In [10]:
# Length of element 3 of the last sample. The recorded output (96) equals
# label_len + pred_len (24 + 72) from the args cell; the role of this element
# is defined by CustomPM.__getitem__, which is not visible here (TODO confirm).
len(dataset[-1][3])

96

In [11]:
# `list_len` attribute of CustomPM (10449 for this split). Presumably the
# length of each per-station series; TODO confirm against the loader.
dataset.list_len

10449

In [12]:
# Total number of samples. The recorded 175610 equals
# 17 * (10449 - 48 - 72 + 1), which is consistent with 17 series of length
# list_len windowed with seq_len=48 and pred_len=72 (TODO confirm against
# CustomPM.__len__).
len(dataset)

175610

In [13]:
# Shape of element 0 of the first sample. The second dimension (24 in the
# recorded output) is the value the args cell uses for enc_in / dec_in.
dataset[0][0].shape

(48, 24)

In [14]:
class DotDict(dict):
    """dict whose items can also be read, written and deleted as attributes.

    ``d.key`` is equivalent to ``d['key']``. A missing key raises
    AttributeError on attribute access (not KeyError), which is what the
    attribute protocol requires: ``hasattr``, ``getattr(obj, name, default)``
    and ``copy.deepcopy`` all rely on AttributeError to detect an absent
    attribute.
    """

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails, so real dict
        # methods such as .get() / .items() are never shadowed by keys.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            ) from None

    # Attribute assignment stores an item in the dict itself.
    __setattr__ = dict.__setitem__

    def __delattr__(self, name):
        try:
            del self[name]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            ) from None

In [15]:
# Seed every random number generator used in this notebook so repeated runs
# start from the same state.
fix_seed = 2021
random.seed(fix_seed)
torch.manual_seed(fix_seed)
np.random.seed(fix_seed)

# Experiment configuration handed to Exp_Main; attribute-style access is
# provided by DotDict.
args = DotDict({
    #basic
    'is_training': True,
    'model_id': 'test',
    'model': 'Autoformer',
    'data': 'customPM',
    'root_path': './dataset',
    'data_path': 'pm_for_train',
    'features': 'MS',
    'target': 'PM2.5',
    'freq': 'h',
    'checkpoints': './checkpoints',

    #forecasting task
    'seq_len': 48, # seq_len + label_len must equal pred_len.
    'label_len': 24,
    'pred_len': 72,
    # Read by the training cell after train/test; it was missing here, which
    # made that cell fail on `args.do_predict`.
    'do_predict': False,

    #model config
    'bucket_size':4,
    'n_hashes': 4,
    # Input width is taken from the data itself: second dimension of the
    # first sample's first element.
    'enc_in': dataset[0][0].shape[1],
    'dec_in': dataset[0][0].shape[1],
    'c_out': 1,
    'd_model': 512,
    'n_heads': 8,
    'e_layers': 2,
    'd_layers': 2,
    'd_ff': 2048,
    'moving_avg': 25,
    'factor': 1,
    'distil': True,
    'dropout': 0.05,
    'embed': 'timeF',
    'activation': 'gelu',
    'output_attention': False,

    #optimization
    'num_workers': 8,
    'itr': 2,
    'train_epochs': 100,
    'batch_size': 8,
    'patience': 3,
    'learning_rate': 0.0001,
    'des': 'test',
    'loss': 'mse',
    'lradj': 'type1',
    'use_amp': False,

    #GPU
    'use_gpu': False,
    'gpu': 0,
    'use_multi_gpu': False,
    'devices': '0,1,2,3',
})

In [16]:
# Same check as the earlier cell: shape of element 0 of the first sample.
dataset[0][0].shape

(48, 24)

In [17]:
from Autoformer.layers.Embed import DataEmbedding, DataEmbedding_wo_pos


In [20]:
Exp = Exp_Main


def _make_setting(args, ii):
    """Build the identifier string that tags run number ``ii`` of this configuration."""
    return '{}_{}_{}_ft{}_sl{}_ll{}_pl{}_dm{}_nh{}_el{}_dl{}_df{}_fc{}_eb{}_dt{}_{}_{}'.format(
        args.model_id,
        args.model,
        args.data,
        args.features,
        args.seq_len,
        args.label_len,
        args.pred_len,
        args.d_model,
        args.n_heads,
        args.e_layers,
        args.d_layers,
        args.d_ff,
        args.factor,
        args.embed,
        args.distil,
        args.des, ii)


if args.is_training:
    # Train and evaluate `itr` independent repetitions of the same configuration.
    for ii in range(args.itr):
        # setting record of experiments
        setting = _make_setting(args, ii)

        exp = Exp(args)  # set experiments
        print('>>>>>>>start training : {}>>>>>>>>>>>>>>>>>>>>>>>>>>'.format(setting))
        exp.train(setting)

        print('>>>>>>>testing : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting))
        exp.test(setting)

        # The configuration may not define `do_predict`; plain attribute
        # access on a missing key raises after training has already finished,
        # so treat an absent key as False.
        if args.get('do_predict', False):
            print('>>>>>>>predicting : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting))
            exp.predict(setting, True)

        torch.cuda.empty_cache()
else:
    # Evaluation only: run the test pass for repetition 0.
    ii = 0
    setting = _make_setting(args, ii)

    exp = Exp(args)  # set experiments
    print('>>>>>>>testing : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting))
    exp.test(setting, test=1)
    torch.cuda.empty_cache()

Use CPU
>>>>>>>start training : test_Autoformer_customPM_ftMS_sl48_ll24_pl72_dm512_nh8_el2_dl2_df2048_fc1_ebtimeF_dtTrue_test_0>>>>>>>>>>>>>>>>>>>>>>>>>>
	iters: 100, epoch: 1 | loss: 0.3080468
	speed: 0.3566s/iter; left time: 1841772.5957s
	iters: 200, epoch: 1 | loss: 0.2709801
	speed: 0.2714s/iter; left time: 1401883.1241s
	iters: 300, epoch: 1 | loss: 0.2229000
	speed: 0.2653s/iter; left time: 1370396.9569s
	iters: 400, epoch: 1 | loss: 0.1909914
	speed: 0.2649s/iter; left time: 1368009.9503s


In [None]:
# independent variables

# year (int): start from 0
# datetime (str): 'MM-DD HH:MM' format
# point (str): categorical value, 30 unique values
# temp (float): temperature in celsius. range from 0 to 1
# degree (float): wind direction in degree. range from 0 to 1
# speed (float): wind speed in m/s. range from 0 to 1
# rain (float): rain in mm. range from 0 to 1
# humidity (float): relative humidity in %. range from 0 to 1
# station (str): categorical value, 17 unique values, place where the data was collected

# dependent variables

# PM2.5 (float): PM2.5 concentration in ug/m3. range from 0 to 1

In [None]:
# dataset.xs holds 17 lists,
# each with 24544 items,
# so the total length = 17 * 24544 = 417248.
# NOTE: the recorded NameError below means this cell was last run on a kernel
# where `dataset` had not been created; run the CustomPM cell first.
len(dataset.xs[0])

NameError: name 'dataset' is not defined