## 上一次實驗成果並未完全排除風的因素，因為還留有風向屬性

In [1]:
# 引入模組
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas_profiling

In [2]:
# 載入資料與空值處理
df = pd.read_csv('a.csv')
print(f'原本:{len(df)}')
df = df.dropna(axis=0, how='any')
print(f'刪除後:{len(df)}')

原本:43824
刪除後:41757


In [3]:
# 前處理
def TEMP_def(row):
    return row['TEMP'] - row['DEWP']

a = df[['PRES', 'TEMP', 'DEWP', 'Is', 'Ir']].astype(int)

a.loc[df.month >= 0, 'season'] = int(0)
a.loc[df.month >= 2, 'season'] = int(1)
a.loc[df.month >= 5, 'season'] = int(2)
a.loc[df.month >= 8, 'season'] = int(3)
a.loc[df.month >= 11,'season'] = int(0)

a.loc[df['pm2.5'] > 0, 'PM'] = int(0)       # 對人體無害
a.loc[df['pm2.5'] > 150, 'PM'] = int(1)      # 對人體有害

# 創建新欄位 : "DOTP" 用以紀錄露點溫度差(depression of the dew point)
a.insert(2, "DOTP", np.ones(len(a)))
a['DOTP'] = a.apply(TEMP_def, axis=1)

a

Unnamed: 0,PRES,TEMP,DOTP,DEWP,Is,Ir,season,PM
24,1020,-4,12.0,-16,0,0,0.0,0.0
25,1020,-4,11.0,-15,0,0,0.0,0.0
26,1021,-5,6.0,-11,0,0,0.0,1.0
27,1022,-5,2.0,-7,1,0,0.0,1.0
28,1022,-5,2.0,-7,2,0,0.0,0.0
...,...,...,...,...,...,...,...,...
43819,1034,-2,21.0,-23,0,0,0.0,0.0
43820,1034,-3,19.0,-22,0,0,0.0,0.0
43821,1034,-3,19.0,-22,0,0,0.0,0.0
43822,1034,-4,18.0,-22,0,0,0.0,0.0


In [4]:
# 清除空值
a = a.dropna()
# 取出PM 做label
y = a[['PM']]
# 將剩餘屬性作x
x = a.drop(["PM", "TEMP", "DEWP"], axis = 1)
x

Unnamed: 0,PRES,DOTP,Is,Ir,season
24,1020,12.0,0,0,0.0
25,1020,11.0,0,0,0.0
26,1021,6.0,0,0,0.0
27,1022,2.0,1,0,0.0
28,1022,2.0,2,0,0.0
...,...,...,...,...,...
43819,1034,21.0,0,0,0.0
43820,1034,19.0,0,0,0.0
43821,1034,19.0,0,0,0.0
43822,1034,18.0,0,0,0.0


In [5]:
# 製作訓練集與測試集
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1241)

In [10]:
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
train_list=[]
test_list=[]
tree = []
max_acc = 0
node = 0
for i in range(0,15):
    clf = DecisionTreeClassifier(criterion = "entropy", max_depth=i+1)
    clf.fit(X_train, y_train)
    train_score=clf.score(X_train, y_train)
    test_score=clf.score(X_test,y_test)
    train_list.append(train_score)
    test_list.append(test_score)
    tree.append(clf)
    if max_acc < test_score:
        max_acc = test_score
        node = i



relust={
    'Train_score':train_list,
    'Test_score':test_list
}
result_df=pd.DataFrame(relust)

print(result_df)
print(f'Top Score：\n{result_df.max()}')
print(f'最佳節點數: {node}')

    Train_score  Test_score
0      0.785377    0.783029
1      0.785377    0.783029
2      0.792186    0.795083
3      0.798105    0.797238
4      0.798105    0.797238
5      0.804331    0.803704
6      0.806555    0.806817
7      0.810148    0.808174
8      0.814117    0.811288
9      0.818051    0.812325
10     0.821575    0.811767
11     0.824347    0.810010
12     0.826844    0.810170
13     0.829718    0.808733
14     0.832455    0.808174
Top Score：
Train_score    0.832455
Test_score     0.812325
dtype: float64
最佳節點數: 9


## 結果顯示，排除風的影響後，準確度不無影響

In [7]:
# 計算重要性並排序
tree[9].feature_importances_.tolist()
df_gini = pd.DataFrame({'feature':x.columns,'feature_importance':clf.feature_importances_.tolist()})
df_gini = df_gini.sort_values(by=['feature_importance'],ascending=False).reset_index(drop=True)
df_gini=df_gini.loc[(df_gini.feature_importance!=0)]
df_gini.head(15)

Unnamed: 0,feature,feature_importance
0,DOTP,0.511645
1,PRES,0.29213
2,season,0.130873
3,Ir,0.047253
4,Is,0.018099


In [8]:
clf = tree[9]
pm_class = ['0', '1']

In [9]:
#建構樹狀圖
from sklearn.tree import export_graphviz
import graphviz
import os
os.environ["PATH"] += os.pathsep + 'C:\Program Files (x86)\Graphviz2.38\\bin'
#視覺化
dot_data = export_graphviz(clf, out_file=None, label='all', impurity=False, proportion=True, feature_names=list(X_train), class_names=pm_class,filled=True, rounded=True)
graph = graphviz.Source(dot_data)
graph
graph.render('第六次探勘', view=True)  

'第六次探勘.pdf'