## Colab setup

In [1]:
# Optional, currently disabled: install colab-ssh and open an SSH tunnel into
# the Colab VM through cloudflared. Nothing in this cell executes as written.
# NOTE(review): if this is re-enabled, do not keep a hardcoded password such as
# '0000' in the notebook — read it from getpass/an environment variable — and
# pin the colab-ssh version instead of using --upgrade.
# !pip install colab-ssh --upgrade

# from colab_ssh import launch_ssh_cloudflared, init_git_cloudflared
# launch_ssh_cloudflared(password='0000')

In [2]:
from google.colab import drive

# Mount Google Drive inside the Colab VM so the project folder can be read as
# ordinary files. force_remount=True is passed so that re-running this cell
# mounts again rather than relying on a mount left over from an earlier run.
drive.mount('/content/drive', force_remount=True)
# Base directory for every project path; opt.dir is built from it in the
# configuration cell.
GDRIVE_HOME = '/content/drive/MyDrive'

Mounted at /content/drive


## Set arguments

In [3]:
import pandas as pd
import numpy as np
import torch
from easydict import EasyDict
import os

# Experiment configuration. Every tunable value lives on `opt`, which is
# passed to the project helpers (data split, model, trainer, evaluate) below.
opt = EasyDict()

# Columns kept from the CSV; 'pm25_cat' is the prediction target.
opt.features = ['no2', 'co', 'so2', 'pm25_con', 'temp', 'wind_direction', 'cloud', 'precipitation',
                'pressure', 'wind_speed', 'gust', 'overall_int', 'pm25_cat']
opt.seed = 42
opt.dataset = 2 # 1 for ml(past pm), 2 for dl(seq)
opt.seq_length = 5 # 3, 5, 10 how many timesteps to use for prediction
opt.test_ratio = 0.2 # 0.2 for dl models 0.3 for ml models
opt.val_ratio = 0.2 # for dl models
opt.batch_size = 16 # for dl models
opt.num_epochs = 15 # for dl models
opt.log_steps = 2000 # for dl models
opt.patience = 5 # for dl models
opt.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

opt.model_name = 'full' # ml: [dt, rf], dl: [full, each] (encoder)
opt.num_classes = 4 # for dl models
opt.num_layers = None # for dl models
opt.dir = os.path.join(GDRIVE_HOME, 'project_aqi')

# Apply the configured seed to the global NumPy and PyTorch generators.
# Without this, opt.seed only reaches code that receives it explicitly, and
# model weight initialisation differs on every run.
np.random.seed(opt.seed)
torch.manual_seed(opt.seed)

print(opt.device)

cuda


## Prepare datasets for Sequential NN models

In [4]:
import os, sys
# Put the project folder on sys.path so the notebook's local modules
# (imported in later cells) can be found. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry each time.
project_root = opt.dir
if project_root not in sys.path:
    sys.path.append(project_root)

# The tree models ('dt', 'rf') read the ML table; every other model name
# reads the sequence table.
csv_name = 'dataset/for_ML.csv' if opt.model_name in ['dt', 'rf'] else 'dataset/for_Seq.csv'
df = pd.read_csv(os.path.join(project_root, csv_name))

# Keep only the configured feature/target columns, in the configured order.
df = df[opt.features]
print(df.shape)
df.head(3)

(29084, 13)


Unnamed: 0,no2,co,so2,pm25_con,temp,wind_direction,cloud,precipitation,pressure,wind_speed,gust,overall_int,pm25_cat
0,-0.094571,-0.575601,-0.579114,-0.603259,0.76977,2,0.777354,0.427358,-1.529149,0.475007,0.768619,0,1
1,-0.556088,-0.575601,-0.579114,-0.663929,0.678268,2,1.21417,1.390401,-1.653282,0.475007,0.597723,0,0
2,-0.622019,-0.575601,-1.076117,-0.724599,0.678268,2,1.883953,0.060484,-1.529149,0.475007,0.597723,1,0


### Transform for sequential learning

In [5]:
from data_utils import make_seq, Custom_Dataset, Custom_Random_Split, Custom_Loader

# Feature columns handed to make_seq, split by how they are declared here:
# continuous measurements vs. categorical codes. 'pm25_cat' is the target.
non_categorical = [
    'no2', 'co', 'so2', 'pm25_con', 'temp',
    'cloud', 'precipitation', 'pressure', 'wind_speed', 'gust',
]
categorical = ['wind_direction', 'overall_int']

# Build windows of opt.seq_length timesteps together with their targets.
inputs, targets = make_seq(
    df=df,
    non_cat=non_categorical,
    cat=categorical,
    target='pm25_cat',
    seq_length=opt.seq_length,
)

# Wrap the arrays in a dataset, split it with the configured ratios and seed,
# then create one loader per split.
dataset = Custom_Dataset(input_arr=inputs, target_arr=targets)
train_set, val_set, test_set = Custom_Random_Split(
    dataset=dataset,
    val_ratio=opt.val_ratio,
    test_ratio=opt.test_ratio,
    random_seed=opt.seed,
)
train_loader, val_loader, test_loader = Custom_Loader(
    train=train_set,
    val=val_set,
    test=test_set,
    batch_size=opt.batch_size,
)

shape of sequential input data: (29079, 5, 16)
get 29,079 samples
Number of datasets 17,449 : 5,815 : 5,815


## Model

In [6]:
from model.air_predictor import AiR_predictor, AiR_predictor_att
from model.parameters import get_parameters

# Build the sequence classifier. The number of output classes is read from
# the configuration instead of being repeated as a literal, so the model and
# `opt` cannot drift apart. input_dim=16 matches the last dimension of the
# sequential input shape printed by the data-preparation cell.
model = AiR_predictor(
    input_dim=16,
    embed_dim=256,
    rnn_dim=256,
    fc_dim=128,
    num_classes=opt.num_classes,
    bidirectional=False,
    opt=opt,
)
# get_parameters returns two values; `params` is what the optimizer is built
# from in the next cell.
total, params = get_parameters(model)

531,716 total parameters in this model
531,716 trainable parameters in this model


In [7]:
from trainer import trainer, evaluate
import torch.nn as nn
import torch.optim as optim
#from Custom_Trainer import trainer

# Loss for the multi-class target, and AdamW over the parameters returned by
# get_parameters(model).
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(params, lr=2e-5)

# No learning-rate scheduler is used (scheduler=False). A StepLR variant is
# left here, disabled:
# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.8)

# Run training with validation. Only the third return value is kept; the next
# cell uses it as the checkpoint path to load.
_, _, path = trainer(
    train_loader,
    val_loader,
    model,
    criterion,
    optimizer,
    scheduler=False,
    opt=opt,
)

Epoch: 01 | Val Loss: 0.042 | Val Acc: 72.83%
  global step: 2,000 | train loss: 0.644 | train acc: 73.53
Epoch: 02 | Val Loss: 0.038 | Val Acc: 75.17%
>> saved: /content/drive/MyDrive/project_aqi/state_dict/full_epoch_3_val_acc_76.77%
Epoch: 03 | Val Loss: 0.036 | Val Acc: 76.77%
  global step: 4,000 | train loss: 0.572 | train acc: 76.95
>> saved: /content/drive/MyDrive/project_aqi/state_dict/full_epoch_4_val_acc_77.95%
Epoch: 04 | Val Loss: 0.034 | Val Acc: 77.95%
>> saved: /content/drive/MyDrive/project_aqi/state_dict/full_epoch_5_val_acc_79.29%
Epoch: 05 | Val Loss: 0.033 | Val Acc: 79.29%
  global step: 6,000 | train loss: 0.533 | train acc: 79.58
>> saved: /content/drive/MyDrive/project_aqi/state_dict/full_epoch_6_val_acc_79.5%
Epoch: 06 | Val Loss: 0.033 | Val Acc: 79.50%
>> saved: /content/drive/MyDrive/project_aqi/state_dict/full_epoch_7_val_acc_80.03%
Epoch: 07 | Val Loss: 0.032 | Val Acc: 80.03%
  global step: 8,000 | train loss: 0.513 | train acc: 79.91
>> saved: /content/

In [8]:
# Resolve the checkpoint path returned by trainer(). os.path.join returns
# `path` unchanged when it is already absolute, so this only affects a
# relative path.
path = os.path.join('/content', path)

# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime still loads in a CPU-only session.
model.load_state_dict(torch.load(path, map_location=opt.device))

# Score the restored weights on the held-out test split.
loss, acc, f1, truth, pred = evaluate(test_loader, model=model, criterion=criterion, opt=opt)
print('test loss: {:.3f} | test_acc: {:.2f}% | test_f1: {:.2f}'.format(loss, acc*100, f1))

test loss: 0.030 | test_acc: 81.60% | test_f1: 70.49


## Analyze

In [9]:
from metric import custom_metric, get_scores
# Compute summary scores from the test-set labels and predictions produced by
# evaluate(). Note that this rebinds `acc` and `f1` from the previous cell.
recall_bad, acc, f1, cf = get_scores(y_test=truth, predicted=pred)
# Leave the confusion table as the last expression so it renders as the
# cell's output.
cf

 >> recall_bad: 75.04%
 >> total acc.: 81.60%
 >> total F1: 70.49


Unnamed: 0,pr_good,pr_moderate,pr_bad,pr_worst,truth_total
truth_good,1241,287,2,0,1530
truth_moderate,217,2706,165,3,3091
truth_bad,4,254,751,9,1018
truth_worst,1,39,89,47,176
