In [1]:
import sys
import os
sys.path += [os.path.abspath("./Autoformer")]
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import random
import argparse
import torch
import argparse
from Autoformer.data_provider.data_loader import CustomPM
from Autoformer.exp.exp_main import Exp_Main
import datetime
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, Dataset

In [2]:
def get_pm_files(path: str, interpolate=True):
    """Load every ``*.csv`` file directly under ``path`` into one DataFrame.

    Files are read in sorted path order and stacked row-wise; the result
    gets a fresh 0..n-1 index.

    Args:
        path: Directory that is searched for ``*.csv`` files.
        interpolate: When true, ``DataFrame.interpolate`` is applied to each
            file on its own before the files are concatenated, so values are
            never interpolated across a file boundary.

    Returns:
        The row-wise concatenation of all loaded frames.
    """
    frames = []
    for csv_file in sorted(glob(os.path.join(path, '*.csv'))):
        frame = pd.read_csv(csv_file)
        if interpolate:
            frame.interpolate(inplace=True)
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)

def get_aws_files(path: str, interpolate=True):
    """Read all ``*.csv`` files found directly in ``path`` and stack them.

    Args:
        path: Directory containing the CSV files.
        interpolate: If true, each file has ``DataFrame.interpolate`` applied
            to it individually right after it is read.

    Returns:
        A single DataFrame holding the rows of every file, in sorted file
        path order, re-indexed from 0.
    """
    def _read(csv_file):
        table = pd.read_csv(csv_file)
        if interpolate:
            table.interpolate(inplace=True)
        return table

    pattern = os.path.join(path, '*.csv')
    return pd.concat([_read(csv_file) for csv_file in sorted(glob(pattern))], ignore_index=True)

def get_aws_map(path: str) -> dict:
    """Read a CSV and map its first column to a pair built from the next two.

    Args:
        path: CSV file with at least three columns.

    Returns:
        ``{first_column_value: (second_column_value, third_column_value)}``.
        Elsewhere in this notebook the pair is read as (latitude, longitude).
        If a key occurs more than once, the last row wins.
    """
    table = pd.read_csv(path)
    return {
        record[0]: (record[1], record[2])
        for record in table.itertuples(index=False, name=None)
    }

def get_pm_map(path: str) -> dict:
    """Build a lookup from the first CSV column to (second, third) column values.

    Args:
        path: CSV file with at least three columns.

    Returns:
        A dict keyed by the first column; each value is a 2-tuple of the
        second and third columns. Elsewhere in this notebook the tuple is
        read as (latitude, longitude). Later rows overwrite earlier rows
        that share a key.
    """
    rows = pd.read_csv(path).itertuples(index=False, name=None)
    return dict((row[0], (row[1], row[2])) for row in rows)


In [3]:
def find_n_nearest_points(point_name: str, n: int, aws_map: dict, pm_map:dict) -> list:
    """Return the ``n`` entries of ``aws_map`` closest to one ``pm_map`` entry.

    Distance is the plain Euclidean norm of the coordinate difference; no
    geographic correction is applied to the coordinate values.

    Args:
        point_name: Key into ``pm_map`` giving the reference coordinates.
        n: Number of nearest entries to keep (used as a list slice bound).
        aws_map: Mapping of name -> coordinate tuple for the candidates.
        pm_map: Mapping of name -> coordinate tuple containing ``point_name``.

    Returns:
        A list of ``(name, distance)`` tuples in ascending distance order.
    """
    origin = np.array(pm_map[point_name])
    ranked = sorted(
        ((name, np.linalg.norm(origin - np.array(coords))) for name, coords in aws_map.items()),
        key=lambda pair: pair[1],
    )
    return ranked[:n]

In [4]:
# Bounds used for min-max scaling of latitude and longitude.
lat_min, lat_max = 36.0625, 37.0106
lon_min, lon_max = 125.5595, 127.4938


In [5]:
def merge_all(aws: pd.DataFrame, pm: pd.DataFrame, pm_map: dict):
    """Build one merged weather + PM frame per PM station.

    For every station key in ``pm_map`` (sorted), the station's PM rows are
    successively right-merged with the selected weather columns of every AWS
    point (sorted), joining on the ``'연도'`` / ``'일시'`` columns. The PM rows
    are therefore the ones that are kept. Afterwards a ``date`` column is built
    from those two columns and placed first, remaining gaps are back-filled,
    and the join/station columns are dropped.

    ``lat`` / ``lon`` are min-max scaled with the module-level
    ``lat_min`` / ``lat_max`` / ``lon_min`` / ``lon_max`` bounds. The scaling
    is done on copies, so the caller's ``aws`` and ``pm`` frames are not
    modified and calling this function twice does not scale twice.

    NOTE(review): the merges are chained, and pandas applies the
    ``_aws{i}`` / ``_pm{i}`` suffixes only to names that collide in that
    particular merge. The suffix on a column therefore does not reliably
    identify the AWS point it came from. Column names and order are kept
    exactly as before because downstream code may depend on them.

    Args:
        aws: Weather observations; must contain ``'지점'``, ``'lat'``,
            ``'lon'`` and the columns listed in ``selected_columns``.
        pm: PM observations; must contain ``'측정소'``, ``'연도'``, ``'일시'``,
            ``'lat'`` and ``'lon'``.
        pm_map: Mapping whose keys are the station names to process.

    Returns:
        A list with one DataFrame per station, in sorted station order.

    Raises:
        KeyError: If a ``'연도'`` value is not one of the keys of ``year_map``.
    """
    selected_columns = ['연도', '일시', '기온(°C)', '풍향(deg)', '풍속(m/s)', '강수량(mm)', '습도(%)', 'lat', 'lon']
    year_map = {0: '2017', 1: '2018', 2: '2019', 3: '2020', 4: '2021'}

    # assign() returns new frames, leaving the caller's data unscaled.
    aws = aws.assign(
        lat=(aws['lat'] - lat_min) / (lat_max - lat_min),
        lon=(aws['lon'] - lon_min) / (lon_max - lon_min),
    )
    pm = pm.assign(
        lat=(pm['lat'] - lat_min) / (lat_max - lat_min),
        lon=(pm['lon'] - lon_min) / (lon_max - lon_min),
    )

    stations = sorted(pm_map.keys())
    points = sorted(aws['지점'].unique().tolist())

    # The per-point weather slices do not depend on the station, so they are
    # computed once instead of once per station.
    aws_by_point = [aws.loc[aws['지점'] == point, selected_columns] for point in points]

    pms = []
    for key in stations:
        sub_pm = pm[pm['측정소'] == key]
        for i, sub_aws in enumerate(aws_by_point):
            # how='right' keeps every PM row even when the AWS point has no
            # observation for that timestamp.
            sub_pm = pd.merge(sub_aws, sub_pm, on=['연도', '일시'], how='right', suffixes=(f'_aws{i}', f'_pm{i}'))
            sub_pm = sub_pm.sort_values(by=['연도', '일시'])

        # The year index is mapped through a lambda (not the dict itself) so an
        # unknown year fails loudly instead of silently producing NaT.
        date_text = sub_pm['연도'].map(lambda x: year_map[x]) + "-" + sub_pm['일시']
        dates = pd.to_datetime(date_text, format='%Y-%m-%d %H:%M')

        sub_pm = sub_pm.drop(['연도', '일시'], axis=1)
        sub_pm.insert(0, 'date', dates)
        sub_pm = sub_pm.bfill()
        sub_pm = sub_pm.drop(['측정소'], axis=1)

        pms.append(sub_pm)

    return pms


In [6]:
def add_lat_long_to_pm(df: pd.DataFrame, pm_map: dict):
    """Attach ``lat`` and ``lon`` columns looked up from each row's ``'측정소'``.

    The frame is modified in place and also returned.

    Args:
        df: Frame with a ``'측정소'`` column whose values are keys of ``pm_map``.
        pm_map: Mapping of station name -> ``(lat, lon)`` pair.

    Returns:
        The same ``df`` object, now with ``lat`` and ``lon`` columns.

    Raises:
        KeyError: If a station name is missing from ``pm_map``.
    """
    for column, position in (('lat', 0), ('lon', 1)):
        # ``position`` is bound as a default so each lambda keeps its own index.
        df[column] = df['측정소'].map(lambda station, position=position: pm_map[station][position])
    return df


def add_lat_long_to_aws(df: pd.DataFrame, aws_map: dict):
    """Add ``lat`` / ``lon`` columns by looking up each row's ``'지점'`` value.

    Mutates ``df`` and returns that same object.

    Args:
        df: Frame with a ``'지점'`` column whose values are keys of ``aws_map``.
        aws_map: Mapping of point name -> ``(lat, lon)`` pair.

    Returns:
        ``df`` itself, with the two coordinate columns set.

    Raises:
        KeyError: If a point name is missing from ``aws_map``.
    """
    def _coordinate(axis):
        # Returns a lookup that picks one element of the coordinate pair.
        return lambda point: aws_map[point][axis]

    df['lat'] = df['지점'].map(_coordinate(0))
    df['lon'] = df['지점'].map(_coordinate(1))
    return df

In [7]:
def make_train_data(aws: pd.DataFrame, pm: pd.DataFrame, pm_map: dict, aws_map: dict, save_path: str = './dataset/pm_for_train'):
    """Build the per-station training frames and write each one to CSV.

    Coordinates are attached to ``aws`` and ``pm`` (both are modified in place
    by the ``add_lat_long_*`` helpers), the frames are merged per station with
    ``merge_all``, and station ``i`` is saved as ``pm_{i}.csv`` under
    ``save_path``.

    Args:
        aws: Weather observations with a ``'지점'`` column.
        pm: PM observations with a ``'측정소'`` column.
        pm_map: Station name -> ``(lat, lon)`` mapping.
        aws_map: AWS point name -> ``(lat, lon)`` mapping.
        save_path: Output directory; created if it does not exist yet.

    Returns:
        The list of per-station DataFrames that were written.
    """
    aws = add_lat_long_to_aws(aws, aws_map)
    pm = add_lat_long_to_pm(pm, pm_map)
    data = merge_all(aws, pm, pm_map)

    # to_csv does not create missing parent directories, so make sure the
    # target exists before writing.
    os.makedirs(save_path, exist_ok=True)
    for i, df in enumerate(data):
        df.to_csv(os.path.join(save_path, f'pm_{i}.csv'), index=False)
    return data


In [8]:
def make_test_data(aws: pd.DataFrame, pm: pd.DataFrame, pm_map: dict, aws_map: dict):
    """Turn the raw test frames into fixed-size model input windows.

    Rows with missing values are dropped (on copies, so the caller's frames
    keep all their rows), coordinates are attached, and the frames are merged
    per station with ``merge_all``. The last 48 rows of every station frame
    are discarded, the rest is concatenated, and the result is cut into
    consecutive 48-row windows.

    Args:
        aws: Weather observations with a ``'지점'`` column.
        pm: PM observations with a ``'측정소'`` column.
        pm_map: Station name -> ``(lat, lon)`` mapping.
        aws_map: AWS point name -> ``(lat, lon)`` mapping.

    Returns:
        A list of ``(x, x_mark, dec_inp, y_mark)`` tuples, one per window:

        * ``x``: numpy array of every non-date column, with ``PM2.5`` last.
        * ``x_mark``: int64 array of (month, day, weekday, hour) for the
          first 48 time stamps.
        * ``dec_inp``: torch tensor made of ``x[24:]`` followed by 48 zero rows.
        * ``y_mark``: the same time features from position 24 through the
          72 hourly stamps that follow the window.
    """
    seq_len = 48      # rows per input window
    label_len = 24    # trailing window rows reused as decoder context
    pred_len = 72     # hourly time stamps generated after each window
    dec_pad_len = 48  # zero rows appended to the decoder input

    # dropna() returns new frames; the explicit copy keeps the later column
    # assignments from touching (or warning about) the caller's data.
    aws = add_lat_long_to_aws(aws.dropna().copy(), aws_map)
    pm = add_lat_long_to_pm(pm.dropna().copy(), pm_map)
    per_station = merge_all(aws, pm, pm_map)

    data = pd.concat([df.iloc[:-seq_len] for df in per_station], ignore_index=True)
    cols = list(data.columns)
    cols.remove('PM2.5')
    cols.remove('date')
    # Column order: date first, target (PM2.5) last.
    data = data[["date"] + cols + ['PM2.5']]
    print(len(data))

    x_data = np.array(data[data.columns[1:]])

    chunks = []
    for start in range(0, len(x_data), seq_len):
        x = x_data[start:start + seq_len]

        window_dates = data['date'].iloc[start:start + seq_len]
        last_date = window_dates.iloc[-1]
        future_dates = pd.Series(
            [last_date + datetime.timedelta(hours=h) for h in range(1, pred_len + 1)]
        )
        dates = pd.concat([window_dates, future_dates], ignore_index=True)

        # The .dt accessor replaces Series.apply(func, 1): the stray positional
        # 1 landed in apply's deprecated ``convert_dtype`` slot. The int64 cast
        # keeps the dtype that apply() used to produce.
        marks = pd.DataFrame({
            'month': dates.dt.month,
            'day': dates.dt.day,
            'weekday': dates.dt.weekday,
            'hour': dates.dt.hour,
        }).astype('int64').values

        x_mark = marks[:seq_len]
        y_mark = marks[seq_len - label_len:]

        # NOTE(review): for a full window dec_inp has 24 + 48 = 72 rows while
        # y_mark has 24 + 72 = 96 rows. Kept as-is; confirm that the model only
        # uses the decoder input's batch/feature dimensions.
        dec_inp = torch.cat(
            [torch.tensor(x[seq_len - label_len:, :]), torch.zeros(dec_pad_len, x.shape[-1])],
            dim=0,
        )

        chunks.append((x, x_mark, dec_inp, y_mark))

    return chunks

In [9]:
# Name -> coordinate lookups read from the metadata CSVs.
aws_map = get_aws_map('./dataset/META/awsmap.csv')
pm_map = get_pm_map('./dataset/META/pmmap.csv')
# Training CSVs are interpolated file by file while they are loaded.
train_pm = get_pm_files('./dataset/TRAIN', interpolate=True)
train_aws = get_aws_files('./dataset/TRAIN_AWS', interpolate=True)
# Test CSVs are loaded without interpolation; make_test_data() drops rows
# with missing values instead.
test_pm = get_pm_files('./dataset/TEST_INPUT', interpolate=False)
test_aws = get_aws_files('./dataset/TEST_AWS', interpolate=False)

In [10]:
# Build the per-station training frames (also written as pm_<i>.csv under the
# default save_path './dataset/pm_for_train') and the list of windowed test
# samples. make_test_data() prints the number of concatenated test rows.
train_data = make_train_data(train_aws, train_pm, pm_map, aws_map)
test_data = make_test_data(test_aws, test_pm, pm_map, aws_map)

52224


In [11]:
class TestDataset(Dataset):
    """Map-style ``Dataset`` that serves items of an indexable collection as-is."""

    def __init__(self, data):
        # The length is captured once here; later changes to ``data`` are not
        # reflected by ``len()``.
        self.data, self.len = data, len(data)

    def __len__(self):
        """Return the number of samples recorded at construction time."""
        return self.len

    def __getitem__(self, index):
        """Return the sample stored at ``index`` without any transformation."""
        sample = self.data[index]
        return sample

In [12]:
# Wrap the test windows in a loader: one window per batch, original order kept.
test_dataset = TestDataset(test_data)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [13]:
# Training dataset from the project's data loader, pointed at the directory
# that make_train_data() writes to by default.
dataset = CustomPM(data_path='./dataset/pm_for_train')

In [16]:
# Inspect the first element of the first training sample; its second
# dimension is used below as the model's input feature count.
dataset[0][0]

array([[0.25118482, 0.01277778, 0.18158568, ..., 0.40549648, 0.80634284,
        0.056     ],
       [0.2543444 , 0.00972222, 0.1892583 , ..., 0.40549648, 0.80634284,
        0.06      ],
       [0.25592417, 0.01638889, 0.1892583 , ..., 0.40549648, 0.80634284,
        0.068     ],
       ...,
       [0.37598735, 0.3061111 , 0.09207161, ..., 0.40549648, 0.80634284,
        0.332     ],
       [0.3728278 , 0.32472223, 0.08951407, ..., 0.40549648, 0.80634284,
        0.264     ],
       [0.36966825, 0.29777777, 0.10485934, ..., 0.40549648, 0.80634284,
        0.268     ]], dtype=float32)

In [18]:
class DotDict(dict):
    """``dict`` whose items can also be read, set and deleted as attributes.

    ``d.key`` reads ``d['key']``, ``d.key = v`` stores ``d['key'] = v`` and
    ``del d.key`` removes ``d['key']``.
    """

    def __getattr__(self, name):
        """Return ``self[name]``; raise ``AttributeError`` if the key is absent.

        ``__getattr__`` only runs after normal attribute lookup has failed.
        Reporting a missing key as ``AttributeError`` (not ``KeyError``) is
        what ``hasattr``, ``getattr(obj, name, default)`` and ``copy.deepcopy``
        rely on to detect an absent attribute.
        """
        try:
            return self[name]
        except KeyError:
            raise AttributeError(
                f"'{type(self).__name__}' object has no attribute '{name}'"
            ) from None

    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

In [19]:
# Seed Python's, PyTorch's and NumPy's random number generators with one value.
fix_seed = 2021
random.seed(fix_seed)
torch.manual_seed(fix_seed)
np.random.seed(fix_seed)

# Experiment configuration handed to Exp_Main; DotDict lets it be read with
# attribute access (args.seq_len, ...).
args = DotDict({
    # basic settings
    'is_training': True,
    'model_id': 'test',
    'model': 'Autoformer',
    'data': 'customPM',
    'root_path': './dataset',
    'data_path': 'pm_for_train',
    'features': 'MS',
    'target': 'PM2.5',
    'freq': 'h',
    'checkpoints': './checkpoints',
    # forecasting task
    'seq_len': 48, # author's note: seq_len + label_len must equal pred_len.
    'label_len': 24,
    'pred_len': 72,
    # model config
    'bucket_size':4,
    'n_hashes': 4,
    # Input width is taken from the second dimension of the first training sample.
    'enc_in': dataset[0][0].shape[1],
    'dec_in': dataset[0][0].shape[1],
    'c_out': 1,
    'd_model': 512,
    'n_heads': 8,
    'e_layers': 2,
    'd_layers': 2,
    'd_ff': 2048,
    'moving_avg': 25,
    'factor': 1,
    'distil': True,
    'dropout': 0.05,
    'embed': 'timeF',
    'activation': 'gelu',
    'output_attention': False,
    # optimization
    'num_workers': 8,
    'itr': 1,
    'train_epochs': 2, # number of training epochs
    'batch_size': 32,
    'patience': 3,
    'learning_rate': 0.0001,
    'des': 'test',
    'loss': 'mse',
    'lradj': 'type1',
    'use_amp': False,

    # GPU
    'use_gpu': False,
    'gpu': 0,
    'use_multi_gpu': False,
    'devices': '0,1,2,3',  
})

In [20]:
# Instantiate the experiment wrapper with the configuration above.
Exp = Exp_Main
exp = Exp(args)  

# Run identifier assembled from the main hyper-parameters; it is passed to
# exp.train() / exp.predict() below. The trailing literal 1 fills the last
# placeholder (presumably the iteration index — confirm against Exp_Main).
setting = '{}_{}_{}_ft{}_sl{}_ll{}_pl{}_dm{}_nh{}_el{}_dl{}_df{}_fc{}_eb{}_dt{}_{}_{}'.format(
    args.model_id,
    args.model,
    args.data,
    args.features,
    args.seq_len,
    args.label_len,
    args.pred_len,
    args.d_model,
    args.n_heads,
    args.e_layers,
    args.d_layers,
    args.d_ff,
    args.factor,
    args.embed,
    args.distil,
    args.des, 1)

Use CPU


In [23]:
# Train the model.
exp.train(setting)
# NOTE(review): args sets use_gpu=False, so this CUDA cache release is likely
# a no-op in this configuration — confirm.
torch.cuda.empty_cache()

>>>>>>>start training : test_Autoformer_customPM_ftMS_sl48_ll24_pl72_dm512_nh8_el2_dl2_df2048_fc1_ebtimeF_dtTrue_test_1>>>>>>>>>>>>>>>>>>>>>>>>>>
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8, 96, 24])
torch.Size([8,

In [17]:
# Predict the test data with the trained model. Per the original author's
# note, the result is saved as a DataFrame.
exp.predict(setting, test_loader, load=False )

In [18]:
# independent variables

# year (int): start from 0
# datetime (str): 'MM-DD HH:MM' format
# point (str): categorical value, 30 unique values
# temp (float): temperature in celsius. range from 0 to 1
# degree (float): wind direction in degree. range from 0 to 1
# speed (float): wind speed in m/s. range from 0 to 1
# rain (float): rain in mm. range from 0 to 1
# humidity (float): relative humidity in %. range from 0 to 1
# station (str): categorical value, 17 unique values, place where the data was collected

# dependent variables

# PM2.5 (float): PM2.5 concentration in ug/m3. range from 0 to 1

In [21]:
# Load weights from a saved checkpoint (mapped onto the CPU) into the model,
# then run prediction again.
# NOTE(review): load=False presumably keeps predict() from loading its own
# checkpoint over these weights — confirm in Exp_Main.predict.
exp.model.load_state_dict(torch.load('./checkpoints/checkpoint2.pth', map_location=torch.device('cpu')))
exp.predict(setting, test_loader, load=False )

In [22]:
import re
def merge_answer(answer_format: str, prediction: str) -> pd.DataFrame:
    """Fill the submission template's ``PM2.5`` column with clamped predictions.

    Args:
        answer_format: Path to the submission template CSV.
        prediction: Path to a CSV whose ``'0'`` column holds the predicted
            values, one per template row.

    Returns:
        The template frame with ``PM2.5`` replaced by the predictions, where
        every value that is not greater than zero is replaced by ``0``.

    Raises:
        ValueError: Raised by pandas when the number of predictions differs
            from the number of template rows.
    """
    submission = pd.read_csv(answer_format)
    predicted = pd.read_csv(prediction)['0'].to_list()
    submission['PM2.5'] = [max(0, value) for value in predicted]
    return submission

In [32]:
# Run the model over every test window and collect the forecasts of the last
# output channel.
# Switch to inference mode first: without this, dropout stays active whenever
# the model was last left in training mode, and the predictions then depend
# on which cell happened to run before this one.
exp.model.eval()

answers = []
with torch.no_grad():
    for batch_x, batch_x_mark, batch_dec_inp, batch_y_mark in test_loader:
        outputs = exp.model(
            batch_x.float(),
            batch_x_mark.float(),
            batch_dec_inp.float(),
            batch_y_mark.float(),
        )
        # Flatten (batch, steps) in order; unlike squeeze() this also works
        # for batch sizes other than 1.
        answers.extend(outputs[:, :, -1].reshape(-1).cpu().numpy().tolist())

# Negative predictions are clamped to zero.
answers = [max(0, a) for a in answers]

In [34]:
# Put the clamped predictions into the submission template and save it.
# The assignment requires len(answers) to equal the number of template rows.
answer_form = pd.read_csv('./dataset/answer_sample.csv')
answer_form['PM2.5'] = answers
answer_form.to_csv('./lb4.csv', index=False)


In [23]:
# Alternative path: fill the template from the prediction CSV of a saved run.
# NOTE(review): the recorded execution of this cell failed with a length
# mismatch between that CSV and the template.
a = merge_answer('./dataset/answer_sample.csv', 'results/2023-05-04_21-29-02/real_prediction.csv')

ValueError: Length of values (16685568) does not match length of index (78336)

In [50]:
# Writes to the same './lb4.csv' path as the manual-inference cell, replacing
# that file.
# NOTE(review): relies on `a` from a successful merge_answer() call — the
# recorded run of that cell raised, so verify `a` before executing this.
a.to_csv('./lb4.csv', index=False)