In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as mt
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix 
import joblib

In [2]:
# 1. Load the raw data and use the 'time' column as the row index.
model_data = pd.read_csv(
    "C:/Users/yunoa/SW/dataset/data_강남구_f.csv",
    encoding="utf-8",
).set_index('time')
model_data

Unnamed: 0_level_0,SO2,CO,O3,NO2,PM10,temp,deg_sin,deg_cos,spd,rain,humi
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2010-01-01 1:00,0.005,0.4,0.011,0.027,좋음,-10.1,0.146083,0.989272,1.3,0.0,64.4
2010-01-01 2:00,0.005,0.5,0.006,0.034,보통,-9.8,0.648120,0.761538,0.8,0.0,38.5
2010-01-01 3:00,0.005,0.5,0.008,0.030,보통,-10.7,0.928486,0.371368,1.9,0.0,49.0
2010-01-01 4:00,0.005,0.5,0.008,0.030,보통,-11.1,0.873772,0.486335,0.7,0.0,54.2
2010-01-01 5:00,0.005,0.4,0.009,0.027,보통,-11.1,0.887815,0.460200,1.9,0.0,13.0
...,...,...,...,...,...,...,...,...,...,...,...
2020-12-31 20:00,0.004,0.5,0.013,0.030,좋음,-5.4,-0.004847,-0.016767,1.6,0.0,48.9
2020-12-31 21:00,0.004,0.5,0.014,0.027,좋음,-5.6,-0.017272,0.002509,1.5,0.0,51.9
2020-12-31 22:00,0.004,0.5,0.018,0.022,보통,-5.8,0.014219,-0.010121,2.1,0.0,54.5
2020-12-31 23:00,0.004,0.5,0.020,0.020,보통,-5.6,-0.013408,0.011173,1.9,0.0,54.2


In [3]:
# Number of rows in the loaded dataset.
model_data.shape[0]

96432

In [4]:
# 2. Build the full feature matrix and label vector.
# PM10 is the target column; every other column is used as a feature.
y = model_data["PM10"]
X = model_data.drop(columns="PM10")

In [5]:
# Splitter for a single stratified 80/20 shuffle split with a fixed seed.
sss = StratifiedShuffleSplit(
    n_splits=1,
    random_state=42,
    test_size=0.2,
)

In [6]:
# 3. Stratified train/test split.
# sss.split(X, y) yields one (train_index, test_index) pair per configured split
# (n_splits); after the loop the variables hold the last pair produced.
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    # The split indices are integer positions, while y is indexed by the 'time'
    # labels. Plain y[train_index] relies on pandas' positional fallback for
    # integer keys on a non-integer index, which is deprecated; use .iloc so the
    # labels are selected by position exactly like the features above.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [None]:
# ===== Random forest: main =====
# 4. Fine-tune the model: search for the best hyperparameters.
rnd_clf = RandomForestClassifier(random_state=1, n_jobs=-1, n_estimators=100)

# Candidate values for each hyperparameter explored by the randomized search.
param_dist_rf = dict(
    n_estimators=[10, 50, 100, 150, 200, 300],
    max_leaf_nodes=[5, 10, 15, 20, 30, 40],
    min_samples_leaf=[5, 10, 15, 20, 30, 40],
    min_samples_split=[5, 10, 15, 20, 30, 40],
    max_depth=[5, 10, 15, 20, 25, 30],
)

# Randomized search with 10-fold cross-validation and a fixed seed.
rnd_search = RandomizedSearchCV(
    estimator=rnd_clf,
    param_distributions=param_dist_rf,
    cv=10,
    random_state=1,
)
rnd_search.fit(X_train, y_train)
print(rnd_search.best_params_)

In [22]:
# 5. Train and evaluate with K-fold cross-validation.
# NOTE(review): the hyperparameters below are hard-coded; presumably copied from
# an earlier hyperparameter-search run — confirm they match its best_params_.
# max_features: 'auto' was deprecated and later removed from scikit-learn; for
# classifiers it meant sqrt(n_features), so 'sqrt' keeps the same model while
# remaining valid on current versions.
rnd_clf = RandomForestClassifier(n_estimators=200, min_samples_split=15, min_samples_leaf=15, max_leaf_nodes=40,
                                 max_features='sqrt', max_depth=20, n_jobs=-1, random_state=1)
# Accuracy of the classifier on each of the 10 folds of the training split.
rnd_scores = cross_val_score(rnd_clf, X_train, y_train, scoring="accuracy", cv=10)
print("\n<10-fold cross-validation>")
print("accuracy score mean: ", rnd_scores.mean())


<10-fold cross-validation>
accuracy score mean:  0.7025344312577093


In [23]:
# 6. Train the final model on the training split, then report accuracy on
# both the training and the held-out test split.
rnd_clf.fit(X_train, y_train)
print("\n<AI model: machine learning done >")
for split_caption, split_features, split_labels in (
    ("accuracy_score of train data(0.8 of sample): ", X_train, y_train),
    ("accuracy_score of test data(0.2 of sample): ", X_test, y_test),
):
    print(split_caption, rnd_clf.score(split_features, split_labels))


<AI model: machine learning done >
accuracy_score of train data(0.8 of sample):  0.7042063646380193
accuracy_score of test data(0.2 of sample):  0.7072121117851402


In [24]:
# 7. Inspect the confusion matrix on the test split.
# The PM10 grade labels fix the row/column order of the matrix and are also
# printed as a header line above it.
pm10_grades = ["좋음", "보통", "나쁨", "매우나쁨"]
y_test_pred = rnd_clf.predict(X_test)
cm1 = confusion_matrix(y_true=y_test, y_pred=y_test_pred, labels=pm10_grades)
print("\n<Confusion matrix>")
print("(of test)")
print(*pm10_grades)
print(cm1)


<Confusion matrix>
(of test)
좋음 보통 나쁨 매우나쁨
[[4671 2409    0    0]
 [1505 8854   15    0]
 [  46 1443  115    0]
 [  20  170   39    0]]


In [25]:
# 8. Check feature importances: one line per feature column with the
# importance the fitted forest assigned to it.
print("\n<Feature importance>")
importances = rnd_clf.feature_importances_
for column, weight in zip(X.columns, importances):
    print(column, ": ", weight)


<Feature importance>
SO2 :  0.1838853508479851
CO :  0.375207860165281
O3 :  0.07626355927166313
NO2 :  0.1934912346928316
temp :  0.02694125363451027
deg_sin :  0.08496452589444371
deg_cos :  0.005845625816061271
spd :  0.003922483969654498
rain :  0.0191621715974142
humi :  0.03031593411015523


In [26]:
# 9. Generate predictions over the whole dataset for backtesting.
# NOTE(review): X includes the rows the model was trained on, so these are not
# purely out-of-sample predictions — confirm this is intended.
y_prediction = rnd_clf.predict(X)
# Re-attach the original row index so predictions line up with y.
y_pred = pd.Series(
    data=y_prediction,
    index=y.index,
)

In [27]:
# 10. Save the trained model to disk.
model_path = "C:/Users/yunoa/SW/model/forecast_model_강남구.pkl"
joblib.dump(rnd_clf, model_path)
print("\n< AI model: save >")


< AI model: save >
