**RandomForest Classifier 직접 배포해보기**

1. 데이터 입력 받기
2. 데이터 전처리
    - 이전 데이터 분포도 로드
    - 결측값이 있다면 평균치로 채워주기
    - NO2, CO, SO2, pressure를 standardization (이전 10년간의 분포를 참고해서)
    - wind_direction, overall, pm25_cat encoding
    - X 벡터 구성
3. 모델 Load
4. Classify
5. 결과 출력

## 데이터 입력 받기

In [2]:
import pandas as pd

# Load the raw observations and take one row as the sample to classify.
test = pd.read_csv('testset.csv')
# Column order must match the positional layout that fill_false() and
# process() index into: numeric features first (pressure at index 7), then
# wind_direction at index 10 and overall at index 11. The previous order had
# wind_direction at index 5, which shifted every later column by one.
use = ['no2', 'co', 'so2', 'pm25_con', 'temp', 'cloud', 'precipitation',
       'pressure', 'wind_speed', 'gust', 'wind_direction', 'overall']
# A plain list keeps integer indexing strictly positional.
test_sample = test[use].iloc[4].tolist()

FileNotFoundError: [Errno 2] File testset.csv does not exist: 'testset.csv'

In [3]:
# Layout of the input vector: the numeric columns come first, followed by the
# two categorical ones. Incoming values are arranged in exactly this order.
non_cat = ['no2', 'co', 'so2', 'pm25_con', 'temp', 'cloud', 'precipitation',
           'pressure', 'wind_speed', 'gust']  # non-categorical
feature = non_cat + ['wind_direction', 'overall']
# One hand-entered observation laid out according to `feature`.
test_sample = [0.031, 0.5, 0.004, 15.0, 3.0, 6.0, 0.0, 1031.0, 3.0, 4.0, 'NNE', 'Sunny']
print(test_sample)

[0.031, 0.5, 0.004, 15.0, 3.0, 6.0, 0.0, 1031.0, 3.0, 4.0, 'NNE', 'Sunny']


## 데이터 전처리

### 이전 데이터 분포 입력 받기

In [4]:
# Load the saved summary statistics of the historical data.
# NOTE(review): the next cell reads row 1 as the mean and row 2 as the std —
# confirm the CSV keeps that row order.
df = pd.read_csv('distribution.csv')
df

FileNotFoundError: [Errno 2] File distribution.csv does not exist: 'distribution.csv'

In [118]:
from collections import defaultdict

# Collect [mean, std] for every numeric feature from the summary table.
dist_dict = defaultdict(list)
for col in non_cat:
    # Row labels 1 and 2 of the summary table hold the mean and the std.
    dist_dict[col].extend([df.loc[1, col], df.loc[2, col]])
print(dist_dict)

# Persist as "name,mean,std" lines so that inference only needs this small file.
# A separate handle name avoids rebinding the loop variable used above.
with open('dist.txt', 'w') as out:
    out.writelines(
        ','.join([name, str(mean), str(std)]) + '\n'
        for name, (mean, std) in dist_dict.items()
    )

defaultdict(<function <lambda> at 0x7fbfcf4581f0>, {'no2': [0.03243439691926833, 0.015167630117481656], 'co': [0.5384231880071517, 0.2404888549704333], 'so2': [0.005165211112639252, 0.002012093893840232], 'pm25_con': [24.943267776096825, 16.482869412642348], 'temp': [12.587367624810893, 10.928954439561892], 'cloud': [33.30607894374914, 34.34003921577874], 'precipitation': [0.4681096135332141, 2.180623895687632], 'pressure': [1016.3186631825058, 8.056032133544594], 'wind_speed': [5.971943336542429, 4.269604098872862], 'gust': [8.502406821620134, 5.851627487582354]})


여기서부터는 위에서 저장한 dist.txt 파일에서 각 변수의 평균과 표준편차를 읽어 와서 사용한다.

In [5]:
def load_distribution(file):
    """Read per-feature mean/std pairs from a ``name,mean,std`` text file.

    Args:
        file: Path of the text file, one feature per line.

    Returns:
        dict mapping each feature name to ``[mean, std]`` (floats), in the
        order the features appear in the file.

    Raises:
        ValueError: If a non-blank line does not consist of exactly three
            comma-separated fields, or its numbers cannot be parsed.
    """
    result = {}
    with open(file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no data.
                continue
            key, mean, std = line.split(',')
            result[key] = [float(mean), float(std)]
    return result

In [6]:
# Load the stored statistics; `keys` fixes the feature order that
# fill_false() pairs with sample positions.
dist = load_distribution('dist.txt')
keys = [*dist]
dist

{'no2': [0.03243439691926833, 0.015167630117481656],
 'co': [0.5384231880071517, 0.2404888549704333],
 'so2': [0.005165211112639252, 0.002012093893840232],
 'pm25_con': [24.943267776096825, 16.482869412642348],
 'temp': [12.587367624810893, 10.928954439561892],
 'cloud': [33.30607894374914, 34.34003921577874],
 'precipitation': [0.4681096135332141, 2.180623895687632],
 'pressure': [1016.3186631825058, 8.056032133544594],
 'wind_speed': [5.971943336542429, 4.269604098872862],
 'gust': [8.502406821620134, 5.851627487582354]}

### 결측 값이 있다면 평균치로 채워주기 

In [7]:
test_sample

[0.031, 0.5, 0.004, 15.0, 3.0, 6.0, 0.0, 1031.0, 3.0, 4.0, 'NNE', 'Sunny']

In [8]:
def fill_false(sample):
    """Replace missing numeric entries of ``sample`` with the stored feature mean.

    Only the leading ``len(keys)`` positions are inspected; position ``i`` is
    paired with ``keys[i]``. An entry counts as missing when it is NaN, or when
    it is falsy without being equal to zero (``None``, ``''``), so a genuine
    ``0`` reading is kept. Reads the module-level ``dist`` and ``keys``.

    Args:
        sample: Mutable sequence of feature values; modified in place.

    Returns:
        The same ``sample`` object.
    """
    import math

    for i, key in enumerate(keys):
        value = sample[i]
        # NaN is truthy, so the falsy check alone would let it through.
        is_nan = isinstance(value, float) and math.isnan(value)
        if is_nan or (not value and value != 0):
            sample[i] = dist[key][0]  # fill with the mean
    return sample

In [9]:
test_sample = fill_false(test_sample)
print(test_sample)

[0.031, 0.5, 0.004, 15.0, 3.0, 6.0, 0.0, 1031.0, 3.0, 4.0, 'NNE', 'Sunny']


### 전처리
1. NO2, CO, SO2, pressure를 standardization
2. wind_direction, overall, pm25_cat encoding

In [10]:
# Encoding helpers (these are meant to live in a separate file).
# Wind: collapse the 16 compass labels into four coarse sectors.
wind_map = dict(
    NW='W', WSW='W', WNW='W', W='W', SW='W', SSW='S', E='E', ENE='E',
    ESE='E', NNW='N', SE='E', S='S', SSE='S', NE='E', NNE='N', N='N',
)

def cat_wind(val):
    """Return the coarse sector ('W', 'E', 'S' or 'N') for compass label ``val``.

    Raises KeyError when ``val`` is not a key of ``wind_map``.
    """
    sector = wind_map[val]
    return sector

# Integer code for each coarse wind sector.
wind_map_int = {sector: code for code, sector in enumerate(('W', 'E', 'S', 'N'))}
def encode_wind(val):
    """Return the integer code of sector ``val``; KeyError if it is unknown."""
    code = wind_map_int[val]
    return code

# Weather: words that mark a description as precipitation.
rain = {'drizzle', 'rain', 'snow', 'sleet'}
def cat_overall(val):
    """Return 1 if any whitespace-separated word of ``val`` is in ``rain``, else 0.

    The comparison is case-insensitive.
    """
    words = val.lower().split()
    return 0 if rain.isdisjoint(words) else 1

# Bin a PM2.5 concentration into four ordinal classes.
def cat_pm25(val):
    """Return the ordinal PM2.5 class of concentration ``val``.

    0: val < 15, 1: 15 <= val < 35, 2: 35 <= val < 75, 3: everything else.
    """
    if val < 15:
        return 0
    if val < 35:
        return 1
    # The third bin used to start at 36, so readings in [35, 36) fell
    # through to class 3.
    if val < 75:
        return 2
    return 3

In [11]:
def process(sample, dist):
    """Turn one raw sample into the numeric vector layout used for prediction.

    Positions 0, 1, 2 and 7 (NO2, CO, SO2, pressure) are replaced by their
    z-scores, position 11 (weather description) is encoded with
    ``cat_overall`` when it is still a string, and position 10 (wind
    direction label) is mapped to its integer sector code.

    Args:
        sample: Mutable sequence of at least 12 values; modified in place.
        dist: Mapping of feature name to ``[mean, std]``; must contain
            'no2', 'co', 'so2' and 'pressure'.

    Returns:
        The same ``sample`` object.

    Raises:
        KeyError: If the wind label at position 10 is not a known direction.
    """
    # (sample position, statistics key) pairs to standardize. Looking the
    # statistics up by name keeps the result independent of dict key order.
    standardized = ((0, 'no2'), (1, 'co'), (2, 'so2'), (7, 'pressure'))
    for idx, name in standardized:
        mean, std = dist[name][0], dist[name][1]
        sample[idx] = (sample[idx] - mean) / std  # z-score
    # Weather description
    if isinstance(sample[11], str):
        sample[11] = cat_overall(sample[11])
    # Wind direction
    sample[10] = encode_wind(cat_wind(sample[10]))
    return sample

In [12]:
# Preprocess the sample and wrap it as a one-row batch for prediction.
X = [process(test_sample, dist)]
X

[[-0.0945696135888163,
  -0.15977117946640657,
  -0.5791037467020784,
  15.0,
  3.0,
  6.0,
  0.0,
  1.8224029614234551,
  3.0,
  4.0,
  3,
  0]]

## 모델 Load

In [14]:
import joblib

# Load the serialized classifier from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files from a trusted source.
model_path = 'AiR_Predictor_RF.pkl'
model = joblib.load(model_path)

In [16]:
y = model.predict(X)

In [18]:
y.item()

1

In [179]:
# Readable labels for the four integer classes (0-3).
# NOTE(review): not applied to `y` in the cells shown; presumably meant to
# translate y.item() into a label — confirm.
class_map = {0: 'good', 1: 'moderate', 2: 'bad', 3: 'worst'}

In [12]:
from easydict import EasyDict
import torch
import os, sys
from model.air_predictor import AiR_predictor, AiR_predictor_att

# Options object handed to the sequence model below.
opt = EasyDict()
opt.features = ['no2', 'co', 'so2', 'pm25_con', 'temp', 'wind_direction', 'cloud', 'precipitation',
                'pressure', 'wind_speed', 'gust', 'overall_int', 'pm25_cat']
opt.seed = 42
opt.dataset = 2 # 1 for ml(past pm), 2 for dl(seq)
opt.seq_length = 5 # 3, 5, 10 how many timesteps to use for prediction
opt.test_ratio = 0.2 # 0.2 for dl models 0.3 for ml models
opt.val_ratio = 0.2 # for dl models
opt.batch_size = 16 # for dl models
opt.num_epochs = 15 # for dl models
opt.log_steps = 2000 # for dl models
opt.patience = 5 # for dl models
# Use the GPU when one is available, otherwise fall back to the CPU.
opt.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

opt.model_name = 'full' # ml: [dt, rf], dl: [full, each] (encoder)
opt.num_classes = 4 # for dl models
opt.num_layers = None # for dl models
opt.dir = ''

# Build the network and restore the saved weights onto the CPU.
# NOTE(review): input_dim is 16 while opt.features lists 13 names — confirm
# how the features expand to 16 inputs.
model = AiR_predictor(input_dim=16, embed_dim=256, rnn_dim=256, fc_dim=128, num_classes=4, bidirectional=False,
                     opt=opt)
weight = 'BEST_full_val_acc_81.31_'
path = os.path.join('state_dict', weight)
# NOTE(review): torch.load unpickles the checkpoint — only load trusted files.
model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))
#loss, acc, f1, truth, pred = evaluate(test_loader, model=model, criterion=criterion, opt=opt)
#print('test loss: {:.3f} | test_acc: {:.2f}% | test_f1: {:.2f}'.format(loss, acc*100, f1))

<All keys matched successfully>

In [13]:
model

AiR_predictor(
  (embedding): Linear(in_features=16, out_features=256, bias=True)
  (rnn): LSTM(256, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=4, bias=True)
)

## 데이터 변환

In [16]:
import pandas as pd
df_before = pd.read_csv('dataset/for_Seq.csv')

In [21]:
df_before.describe()

Unnamed: 0,no2,co,so2,pm25_con,temp,wind_direction,cloud,precipitation,pressure,wind_speed,gust,overall_int,pm25_cat
count,29084.0,29084.0,29084.0,29084.0,29084.0,29084.0,29084.0,29084.0,29084.0,29084.0,29084.0,29084.0,29084.0
mean,-3.361666e-16,-1.172674e-16,-1.954457e-16,-8.990501e-17,-1.11404e-16,0.987072,-1.1726740000000001e-17,5.4724790000000005e-17,3.662652e-15,-7.426935e-17,8.59961e-17,0.188523,0.962454
std,1.000017,1.000017,1.000017,1.000017,1.000017,1.069068,1.000017,1.000017,1.000017,1.000017,1.000017,0.391136,0.750042
min,-1.94064,-1.407254,-1.57312,-1.39197,-3.164801,0.0,-0.9699072,-0.2146714,-4.508334,-1.398735,-1.453024,0.0,0.0
25%,-0.8198124,-0.5756008,-0.5791137,-0.6639291,-0.8772598,0.0,-0.8825441,-0.2146714,-0.7843529,-0.696082,-0.7694414,0.0,0.0
50%,-0.1605023,-0.1597739,-0.08211046,-0.2392384,0.1292582,1.0,-0.4166077,-0.2146714,0.08457619,-0.2276464,-0.2567547,0.0,1.0
75%,0.6306699,0.2560529,0.4148928,0.4281326,0.8612712,2.0,0.7482334,-0.2146714,0.8293725,0.4750069,0.4268276,0.0,1.0
max,5.245841,8.57259,14.82799,24.27148,2.416799,3.0,1.942195,39.82043,2.691363,7.267323,7.26265,1.0,3.0


In [34]:
test = pd.read_csv('testset.csv')
print(test.columns)
test.head(3)


Index(['time', 'district', 'pm10_con', 'pm25_con', 'o3', 'no2', 'co', 'so2',
       'pm10_aqi', 'pm25_aqi', 'overall', 'temp', 'feels', 'wind_speed',
       'wind_direction', 'gust', 'cloud', 'humidity', 'precipitation',
       'pressure', 'loc', 'hour'],
      dtype='object')


Unnamed: 0,time,district,pm10_con,pm25_con,o3,no2,co,so2,pm10_aqi,pm25_aqi,...,feels,wind_speed,wind_direction,gust,cloud,humidity,precipitation,pressure,loc,hour
0,2011-12-05 00:00:00,0,15.0,10.0,0.019,0.021,0.3,0.003,14.0,42.0,...,1.0,7.0,NNW,8.0,0.0,58.0,0.0,1029.0,,0
1,2011-12-05 03:00:00,0,18.0,11.0,0.021,0.014,0.4,0.003,17.0,46.0,...,0.0,6.0,NNW,7.0,0.0,56.0,0.0,1029.0,,3
2,2011-12-05 06:00:00,0,21.0,13.0,0.012,0.025,0.4,0.003,19.0,53.0,...,0.0,4.0,NNW,5.0,0.0,54.0,0.0,1030.0,,6


In [39]:
# Columns kept for the sequence model. data_input() indexes rows in this
# order (pm25_con at 3, wind_direction at 5, overall at 11).
use = ['no2', 'co', 'so2', 'pm25_con', 'temp', 'wind_direction', 'cloud', 'precipitation',
       'pressure', 'wind_speed', 'gust', 'overall']
use_test = test[use]

In [42]:
use_test

Unnamed: 0,no2,co,so2,pm25_con,temp,wind_direction,cloud,precipitation,pressure,wind_speed,gust,overall
0,0.021,0.3,0.003,10.0,4.0,NNW,0.0,0.0,1029.0,7.0,8.0,Clear
1,0.014,0.4,0.003,11.0,3.0,NNW,0.0,0.0,1029.0,6.0,7.0,Clear
2,0.025,0.4,0.003,13.0,2.0,NNW,0.0,0.0,1030.0,4.0,5.0,Sunny
3,0.042,0.7,0.004,16.0,3.0,N,0.0,0.0,1031.0,4.0,5.0,Sunny
4,0.031,0.5,0.004,15.0,3.0,NNE,6.0,0.0,1031.0,3.0,4.0,Sunny


In [43]:
# Read the CSV as raw text: the header line first, then every remaining line.
with open('testset.csv', 'r') as fh:
    cols = next(fh, '')
    lines = list(fh)

In [52]:
import numpy as np
# Split every raw line into fields, dropping the leading index column.
dataset = [raw.strip().split(',')[1:] for raw in lines]
print(dataset)
print(len(dataset[0]))

[['0.021', '0.3', '0.003', '10.0', '4.0', 'NNW', '0.0', '0.0', '1029.0', '7.0', '8.0', 'Clear'], ['0.014', '0.4', '0.003', '11.0', '3.0', 'NNW', '0.0', '0.0', '1029.0', '6.0', '7.0', 'Clear'], ['0.025', '0.4', '0.003', '13.0', '2.0', 'NNW', '0.0', '0.0', '1030.0', '4.0', '5.0', 'Sunny'], ['0.042', '0.7', '0.004', '16.0', '3.0', 'N', '0.0', '0.0', '1031.0', '4.0', '5.0', 'Sunny'], ['0.031', '0.5', '0.004', '15.0', '3.0', 'NNE', '6.0', '0.0', '1031.0', '3.0', '4.0', 'Sunny']]
12


In [84]:
# Per-variable distribution table; only the non-categorical variables need to be standardized.
# NOTE(review): this rebinds `dist`, which an earlier cell bound to the dict
# returned by load_distribution().
dist = pd.read_csv('distribution.csv')
dist

Unnamed: 0.1,Unnamed: 0,no2,co,so2,pm25_con,temp,cloud,precipitation,pressure,wind_speed,gust
0,count,29084.0,29084.0,29084.0,29084.0,29084.0,29084.0,29084.0,29084.0,29084.0,29084.0
1,mean,0.032434,0.538423,0.005165,24.943268,12.587368,33.306079,0.46811,1016.318663,5.971943,8.502407
2,std,0.015168,0.240489,0.002012,16.482869,10.928954,34.340039,2.180624,8.056032,4.269604,5.851627
3,min,0.003,0.2,0.002,2.0,-22.0,0.0,0.0,980.0,0.0,0.0
4,25%,0.02,0.4,0.004,14.0,3.0,3.0,0.0,1010.0,3.0,4.0
5,50%,0.03,0.5,0.005,21.0,14.0,19.0,0.0,1017.0,5.0,7.0
6,75%,0.042,0.6,0.006,32.0,22.0,59.0,0.0,1023.0,8.0,11.0
7,max,0.112,2.6,0.035,425.0,39.0,100.0,87.3,1038.0,37.0,51.0


In [85]:
# Keep only the rows at positions 1 and 2 (mean and std in the table above)
# for the numeric columns, and save them without the index.
# NOTE(review): `non_categorical` is defined in a cell that appears further
# down in this notebook; that cell has to run first.
dist[non_categorical].iloc[1:3].to_csv('non_categorical_distribute.csv', index=False)

In [86]:
# Store the mean and standard deviation in a dict for every column except #5 and #11.
dist = pd.read_csv('non_categorical_distribute.csv')
dist

Unnamed: 0,no2,co,so2,pm25_con,temp,cloud,precipitation,pressure,wind_speed,gust
0,0.032434,0.538423,0.005165,24.943268,12.587368,33.306079,0.46811,1016.318663,5.971943,8.502407
1,0.015168,0.240489,0.002012,16.482869,10.928954,34.340039,2.180624,8.056032,4.269604,5.851627


In [73]:
# Column groups: numeric columns and categorical columns.
categorical = ['wind_direction', 'overall_int']
non_categorical = [
    'no2', 'co', 'so2', 'pm25_con', 'temp',
    'cloud', 'precipitation', 'pressure', 'wind_speed', 'gust',
]

In [None]:
# index 3 (pm25_con): apply the PM2.5 categorization (cat_pm25)
# index 5: apply the wind_direction encoding
# index 11: encode the overall weather description


In [71]:
import pandas as pd
# use_feature = ['no2', 'co', 'so2', 'pm25_con', 'temp', 'wind_direction', 'cloud', 'precipitation',
#                'pressure', 'wind_speed', 'gust', 'overall_int', 'target(pm25_cat)']


def data_input(file: str, form: str='csv', header: bool=True):
    """Parse a comma-separated sample file into rows of floats.

    The first column of every row is dropped. Within the remaining fields,
    position 3 is read as the PM2.5 concentration, position 5 as the wind
    direction label and position 11 as the weather description. Each output
    row is the encoded fields followed by the PM2.5 class from ``cat_pm25``.

    Args:
        file: Path of the file to read.
        form: Not used by this function; kept so existing calls keep working.
        header: When true, the first line is skipped as a header.

    Returns:
        list of rows, each a list of floats.

    Raises:
        KeyError: If a wind direction label is unknown.
        ValueError: If a numeric field cannot be converted to float.
    """
    import csv

    # csv.reader handles quoted fields that contain commas, which a plain
    # split(',') would break apart.
    with open(file, 'r', newline='') as f:
        reader = csv.reader(f)
        if header:
            next(reader, None)  # discard the column names
        # Skip blank / whitespace-only lines instead of failing on them.
        rows = [row[1:] for row in reader if any(field.strip() for field in row)]

    new_set = []
    for sample in rows:
        sample.append(cat_pm25(float(sample[3])))
        sample[5] = encode_wind(cat_wind(sample[5]))
        sample[11] = cat_overall(sample[11])
        new_set.append([float(value) for value in sample])

    return new_set

In [72]:
t = data_input('testset.csv')
print(t)

[[0.021, 0.3, 0.003, 10.0, 4.0, 3.0, 0.0, 0.0, 1029.0, 7.0, 8.0, 0.0, 0.0], [0.014, 0.4, 0.003, 11.0, 3.0, 3.0, 0.0, 0.0, 1029.0, 6.0, 7.0, 0.0, 0.0], [0.025, 0.4, 0.003, 13.0, 2.0, 3.0, 0.0, 0.0, 1030.0, 4.0, 5.0, 0.0, 0.0], [0.042, 0.7, 0.004, 16.0, 3.0, 3.0, 0.0, 0.0, 1031.0, 4.0, 5.0, 0.0, 1.0], [0.031, 0.5, 0.004, 15.0, 3.0, 3.0, 6.0, 0.0, 1031.0, 3.0, 4.0, 0.0, 1.0]]


In [22]:
def cat_pm25(val):
    """Return the ordinal PM2.5 class of concentration ``val``.

    0: val < 15, 1: 15 <= val < 35, 2: 35 <= val < 75, 3: everything else.
    """
    if val < 15:
        return 0
    if val < 35:
        return 1
    # The third bin used to start at 36, so readings in [35, 36) fell
    # through to class 3.
    if val < 75:
        return 2
    return 3

In [28]:
# Wind: collapse the 16 compass labels into four coarse sectors.
wind_map = dict(
    NW='W', WSW='W', WNW='W', W='W', SW='W', SSW='S', E='E', ENE='E',
    ESE='E', NNW='N', SE='E', S='S', SSE='S', NE='E', NNE='N', N='N',
)

def cat_wind(val):
    """Return the coarse sector ('W', 'E', 'S' or 'N') for compass label ``val``.

    Raises KeyError when ``val`` is not a key of ``wind_map``.
    """
    sector = wind_map[val]
    return sector

# Integer code for each coarse wind sector.
wind_map_int = {sector: code for code, sector in enumerate(('W', 'E', 'S', 'N'))}
def encode_wind(val):
    """Return the integer code of sector ``val``; KeyError if it is unknown."""
    code = wind_map_int[val]
    return code

# Weather: words that mark a description as precipitation.
rain = {'drizzle', 'rain', 'snow', 'sleet'}
def cat_overall(val):
    """Return 1 if any whitespace-separated word of ``val`` is in ``rain``, else 0.

    The comparison is case-insensitive.
    """
    words = val.lower().split()
    return 0 if rain.isdisjoint(words) else 1