In [22]:
import pandas as pd
import numpy as np
import torch
from easydict import EasyDict

## Set Args

In [55]:
# Experiment configuration, gathered in one attribute-style dict.
# Entries marked "DL only" are not read by the ML (tree-based) cells of this notebook.
# NOTE(review): model_name is 'dt', yet the model cells below fit a
# RandomForestClassifier -- confirm which value is intended.
opt = EasyDict(
    # Columns kept from the ML dataset; 'pm25_cat' is the classification target.
    features=[
        'no2', 'co', 'so2', 'pm25_con', 'temp', 'wind_direction', 'cloud',
        'precipitation', 'pressure', 'wind_speed_rs', 'gust_rs', 'overall_int',
        'pm25_cat',
    ],
    seed=42,
    dataset=1,          # 1 for ml(past pm), 2 for dl(seq)
    test_ratio=0.3,     # 0.2 for dl models, 0.3 for ml models
    val_ratio=0.2,      # DL only
    batch_size=None,    # DL only
    num_epochs=None,    # DL only
    log_steps=None,     # DL only
    patience=5,         # DL only
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    model_name='dt',    # ml: [dt, rf], dl: [full, each] (encoder)
    num_classes=4,      # DL only
    num_layers=None,    # DL only
)

print(opt.device)

cpu


## Load Dataset for ML

In [56]:
# Pick the CSV that matches the model family, then read it once.
use_ml_dataset = opt.model_name in ('dt', 'rf')
dataset_path = 'dataset/for_ML.csv' if use_ml_dataset else 'dataset/for_Seq.csv'
df = pd.read_csv(dataset_path)
if use_ml_dataset:
    # Only the ML dataset is restricted to the configured feature columns.
    # NOTE(review): the sequence dataset is used as loaded, without this filter -- confirm intended.
    df = df[opt.features]

In [57]:
# Sanity check of the loaded frame: print (rows, columns), then preview the first three rows.
print(df.shape)
df.head(3)

(29083, 13)


Unnamed: 0,no2,co,so2,pm25_con,temp,wind_direction,cloud,precipitation,pressure,wind_speed_rs,gust_rs,overall_int,pm25_cat
0,-0.094571,-0.575601,-0.579114,15.0,20.0,2,75.0,3.5,-1.653282,8.0,12.0,0,0
1,-0.556088,-0.575601,-0.579114,14.0,20.0,2,98.0,0.6,-1.529149,8.0,12.0,1,0
2,-0.622019,-0.575601,-1.076117,13.0,20.0,2,100.0,1.3,-1.529149,9.0,13.0,1,1


In [58]:
# Display name for each integer class id; the values are later used as report labels.
class_map = {0: 'good', 1: 'moderate', 2: 'bad', 3: 'worst'}

In [59]:
# Show per-column dtypes and non-null counts to confirm nothing is missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29083 entries, 0 to 29082
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   no2             29083 non-null  float64
 1   co              29083 non-null  float64
 2   so2             29083 non-null  float64
 3   pm25_con        29083 non-null  float64
 4   temp            29083 non-null  float64
 5   wind_direction  29083 non-null  int64  
 6   cloud           29083 non-null  float64
 7   precipitation   29083 non-null  float64
 8   pressure        29083 non-null  float64
 9   wind_speed_rs   29083 non-null  float64
 10  gust_rs         29083 non-null  float64
 11  overall_int     29083 non-null  int64  
 12  pm25_cat        29083 non-null  int64  
dtypes: float64(10), int64(3)
memory usage: 2.9 MB


In [60]:
# Columns holding discrete codes are stored with pandas' 'category' dtype.
category = ['wind_direction', 'overall_int']
df = df.astype({column: 'category' for column in category})

## Split Dataset

In [61]:
import os, sys
import random
import numpy as np

# Fix the random generators used on the ML path so the split and the forest are repeatable.
SEED = opt.seed
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here probably only affects child processes -- confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
random.seed(SEED)
# torch seeding is left disabled in this notebook; enable these lines for the DL models:
# torch.manual_seed(SEED)
# torch.cuda.manual_seed(SEED)
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False

In [62]:
from sklearn.model_selection import train_test_split

# Hold out opt.test_ratio of the rows for testing; 'pm25_cat' is the target column.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (see the per-class support in the evaluation report) -- confirm this is intended.
y = df['pm25_cat']
X = df.drop(columns='pm25_cat')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=opt.test_ratio, random_state=SEED
)
print('X_train: {:,} \nX_test: {:,}\ny_train: {:,} \ny_test: {:,}'.format(
    *map(len, (X_train, X_test, y_train, y_test))
))

X_train: 20,358 
X_test: 8,725
y_train: 20,358 
y_test: 8,725


## Model

In [63]:
from sklearn.ensemble import RandomForestClassifier
# Baseline forest: classes 2 and 3 get larger weights (3 and 5) than classes 0 and 1.
clf = RandomForestClassifier(
    random_state=SEED,
    max_depth=12,
    class_weight={0: 1, 1: 1, 2: 3, 3: 5},
)
clf.fit(X_train, y_train)

RandomForestClassifier(class_weight={0: 1, 1: 1, 2: 3, 3: 5}, max_depth=12,
                       random_state=42)

In [64]:
# Class predictions of the fitted forest on the held-out test rows.
predicted = clf.predict(X_test)

## Evaluation

In [65]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1, labelled with the names from class_map.
class_names = class_map.values()
report_text = classification_report(y_test, predicted, target_names=class_names)
print(report_text)

              precision    recall  f1-score   support

        good       0.85      0.78      0.82      2368
    moderate       0.83      0.84      0.83      4573
         bad       0.69      0.83      0.75      1510
       worst       0.77      0.32      0.45       274

    accuracy                           0.80      8725
   macro avg       0.78      0.69      0.71      8725
weighted avg       0.81      0.80      0.80      8725



In [72]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, roc_auc_score
# Labelled confusion matrix: 'truth_*' rows against 'pr_*' (predicted) columns.
cf = pd.DataFrame(
    confusion_matrix(y_test, predicted),
    index=['truth_good', 'truth_moderate', 'truth_bad', 'truth_worst'],
    columns=['pr_good', 'pr_moderate', 'pr_bad', 'pr_worst'],
)
cf

Unnamed: 0,pr_good,pr_moderate,pr_bad,pr_worst
truth_good,1851,506,11,0
truth_moderate,314,3828,429,2
truth_bad,3,230,1253,24
truth_worst,1,56,130,87


In [77]:
# Positional slice of the 'truth_bad' row: every entry except the last one.
cf.loc['truth_bad'].iloc[:-1]

pr_good           3
pr_moderate     230
pr_bad         1253
Name: truth_bad, dtype: int64

In [67]:
def custom_metric(matrix):
    """Strict recall over the truly 'bad' and 'worst' samples.

    A sample counts as correct only when its exact class is predicted
    ('truth_bad' -> 'pr_bad', 'truth_worst' -> 'pr_worst').

    NOTE: a later cell of this notebook redefines `custom_metric` with a
    lenient variant; whichever definition ran last is the one in effect.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Confusion matrix with rows 'truth_bad' and 'truth_worst' and columns
        that include 'pr_bad' and 'pr_worst'. A 'truth_total' column, if
        present, is ignored.

    Returns
    -------
    float
        (correct bad + correct worst) / (all truly bad + all truly worst),
        rounded to 4 decimals.
    """
    # Drop the row-total column when present; otherwise it would be summed
    # into the denominator and double every row count.
    bad = matrix.loc['truth_bad'].drop(labels='truth_total', errors='ignore')
    worst = matrix.loc['truth_worst'].drop(labels='truth_total', errors='ignore')
    t_bad = sum(bad)
    t_worst = sum(worst)
    right_bad = bad['pr_bad']
    right_worst = worst['pr_worst']
    return round((right_bad + right_worst) / (t_bad + t_worst), 4)

In [68]:
# Evaluate the metric defined in the previous cell on the confusion matrix `cf`.
custom_metric(cf)

0.7511

## Multiple runs (grid over max_depth and class_weight)

In [81]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score
from sklearn.metrics import classification_report

def custom_metric(matrix):
    """Lenient recall for bad air quality ('bad' and 'worst' merged).

    A truly 'bad' or 'worst' sample counts as correct when it is predicted as
    either 'bad' or 'worst'.

    NOTE: this replaces the stricter `custom_metric` defined in an earlier
    cell of this notebook.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Confusion matrix with rows 'truth_bad' and 'truth_worst' and columns
        that include 'pr_bad' and 'pr_worst'. A 'truth_total' column, if
        present, is ignored.

    Returns
    -------
    float
        Correct / total over the two rows, rounded to 4 decimals.
    """
    # Remove the row-total column by name instead of by position: slicing off
    # "the last column" drops 'pr_worst' when no 'truth_total' column exists.
    bad = matrix.loc['truth_bad'].drop(labels='truth_total', errors='ignore')
    worst = matrix.loc['truth_worst'].drop(labels='truth_total', errors='ignore')
    t_bad = sum(bad)
    t_worst = sum(worst)
    right_bad = bad['pr_bad'] + bad['pr_worst']
    right_worst = worst['pr_worst'] + worst['pr_bad']
    return round(((right_bad + right_worst) / (t_bad + t_worst)), 4)

def modeling(depth, weight, X_train, y_train, X_test):
    """Fit a random forest and return its predictions for `X_test`.

    Parameters
    ----------
    depth : passed to RandomForestClassifier as `max_depth`.
    weight : passed to RandomForestClassifier as `class_weight`.
    X_train, y_train : training features and labels.
    X_test : features to predict.

    Returns
    -------
    The output of `predict(X_test)` from the fitted forest.

    The forest is seeded with the notebook-level `SEED`.
    """
    forest = RandomForestClassifier(
        random_state=SEED,
        class_weight=weight,
        max_depth=depth,
    )
    return forest.fit(X_train, y_train).predict(X_test)

def get_scores(y_test, predicted):
    """Print and return the evaluation scores for one set of predictions.

    Parameters
    ----------
    y_test : true class ids (0..3).
    predicted : predicted class ids (0..3).

    Returns
    -------
    tuple
        (recall_bad, acc, f1, cf) where `recall_bad` is the value of
        `custom_metric(cf)`, `acc` is the accuracy and `f1` the macro F1
        (both rounded to 4 decimals), and `cf` is the labelled confusion
        matrix with an extra 'truth_total' column.
    """
    class_names = ['good', 'moderate', 'bad', 'worst']
    pred_columns = ['pr_' + name for name in class_names]
    # Pin the label set so the matrix is always 4x4 and lines up with the
    # hard-coded names, even if a class is absent from both inputs.
    cf = pd.DataFrame(
        confusion_matrix(y_test, predicted, labels=[0, 1, 2, 3]),
        index=['truth_' + name for name in class_names],
        columns=pred_columns,
    )
    # Keep 'truth_total' as the last column; custom_metric is called with it present.
    cf['truth_total'] = cf[pred_columns].sum(axis=1)
    recall_bad = custom_metric(cf)
    acc = round(accuracy_score(y_test, predicted), 4)
    f1 = round(f1_score(y_test, predicted, average='macro'), 4)

    print(' >> recall_bad: {:.02f}%'.format(recall_bad*100))
    print(' >> total acc.: {:.02f}%'.format(acc*100))
    print(' >> total F1: {:.02f}'.format(f1*100))
    return recall_bad, acc, f1, cf

def run(X_train, y_train, X_test, y_test, depth, weight):
    """Train one forest configuration and print its scores.

    Fits via `modeling`, prints the configuration header, then lets
    `get_scores` print the metrics. Nothing is returned.
    """
    test_predictions = modeling(depth, weight, X_train, y_train, X_test)
    print('max_depth: {:} | class_weight: {:}'.format(depth, weight))
    # Only the printed output is wanted here; the returned scores are discarded.
    get_scores(y_test, test_predictions)

In [82]:
# Hyper-parameter grid for the runs below.
# weight: class_weight candidates (None = no class weighting passed to the forest).
weight = [None, {0:1, 1:1, 2:2, 3:3}, {0:1, 1:1, 2:3, 3:5}]
# depth: max_depth candidates.
depth = [8, 10, 12, 15]

In [83]:
# Evaluate every (class_weight, max_depth) combination, class weights in the outer loop.
for class_weight in weight:
    for max_depth in depth:
        run(X_train, y_train, X_test, y_test, max_depth, class_weight)

max_depth: 8 | class_weight: None
 >> recall_bad: 73.26%
 >> total acc.: 80.56%
 >> total F1: 67.88
max_depth: 10 | class_weight: None
 >> recall_bad: 73.32%
 >> total acc.: 81.00%
 >> total F1: 70.13
max_depth: 12 | class_weight: None
 >> recall_bad: 73.88%
 >> total acc.: 80.80%
 >> total F1: 70.47
max_depth: 15 | class_weight: None
 >> recall_bad: 74.22%
 >> total acc.: 80.77%
 >> total F1: 70.66
max_depth: 8 | class_weight: {0: 1, 1: 1, 2: 2, 3: 3}
 >> recall_bad: 81.67%
 >> total acc.: 80.78%
 >> total F1: 71.27
max_depth: 10 | class_weight: {0: 1, 1: 1, 2: 2, 3: 3}
 >> recall_bad: 80.77%
 >> total acc.: 81.02%
 >> total F1: 71.70
max_depth: 12 | class_weight: {0: 1, 1: 1, 2: 2, 3: 3}
 >> recall_bad: 79.65%
 >> total acc.: 81.07%
 >> total F1: 71.60
max_depth: 15 | class_weight: {0: 1, 1: 1, 2: 2, 3: 3}
 >> recall_bad: 77.69%
 >> total acc.: 80.99%
 >> total F1: 71.22
max_depth: 8 | class_weight: {0: 1, 1: 1, 2: 3, 3: 5}
 >> recall_bad: 86.60%
 >> total acc.: 79.51%
 >> total F1: 

## Conclusion
**The larger the class weights on the 'bad' and 'worst' classes, the higher the recall for bad air quality**  

**Recall_bad: 86.60% | Acc: 79.51% | F1: 70.49** when max_depth=8, class_weight={0:1, 1:1, 2:3, 3:5}  
Best total accuracy: 81.07% when max_depth=12, class_weight={0:1, 1:1, 2:2, 3:3}

In [85]:
# Re-fit the configuration reported in the conclusion and show its confusion matrix.
# This rebinds `predicted` and `cf` from the earlier evaluation cells.
predicted = modeling(
    depth=8,
    weight={0: 1, 1: 1, 2: 3, 3: 5},
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
)
recall_bad, acc, f1, cf = get_scores(y_test, predicted)
cf

 >> recall_bad: 86.60%
 >> total acc.: 79.51%
 >> total F1: 70.49


Unnamed: 0,pr_good,pr_moderate,pr_bad,pr_worst,truth_total
truth_good,1813,537,18,0,2368
truth_moderate,291,3739,541,2,4573
truth_bad,3,186,1300,21,1510
truth_worst,1,49,139,85,274
