## 為了瞭解決策樹中，是否有其他屬性干擾露點溫度的作用，於是此次實驗將露點、溫度、露點溫度差這三個屬性都捨去掉，丟入運算，希望了解除去這三個屬性外的重要影響因素。
結果在下面

In [1]:
# 引入模組
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas_profiling

In [2]:
# 載入資料與空值處理
df = pd.read_csv('a.csv')
print(f'原本:{len(df)}')
df = df.dropna(axis=0, how='any')
print(f'刪除後:{len(df)}')

原本:43824
刪除後:41757


In [3]:
# 前處理
a = df[['PRES', 'Iws', 'Is','Ir']].astype(int)

a = a.join(pd.get_dummies(df[['cbwd']]))

a.loc[df.month >= 0, 'season'] = int(0)
a.loc[df.month >= 2, 'season'] = int(1)
a.loc[df.month >= 5, 'season'] = int(2)
a.loc[df.month >= 8, 'season'] = int(3)
a.loc[df.month >= 11,'season'] = int(0)

a.loc[df['pm2.5'] > 0, 'PM'] = int(0)       # 對人體無害
a.loc[df['pm2.5'] > 150, 'PM'] = int(1)      # 對人體有害


In [4]:
# 清除空值
a = a.dropna()
# 取出PM 做label
y = a[['PM']]
# 將剩餘屬性作x
x = a.drop("PM", axis = 1)
x

Unnamed: 0,PRES,Iws,Is,Ir,cbwd_NE,cbwd_NW,cbwd_SE,cbwd_cv,season
24,1020,1,0,0,0,0,1,0,0.0
25,1020,2,0,0,0,0,1,0,0.0
26,1021,3,0,0,0,0,1,0,0.0
27,1022,5,1,0,0,0,1,0,0.0
28,1022,6,2,0,0,0,1,0,0.0
...,...,...,...,...,...,...,...,...,...
43819,1034,231,0,0,0,1,0,0,0.0
43820,1034,237,0,0,0,1,0,0,0.0
43821,1034,242,0,0,0,1,0,0,0.0
43822,1034,246,0,0,0,1,0,0,0.0


In [5]:
# 製作訓練集與測試集
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1241)

In [6]:
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
train_list=[]
test_list=[]
tree = []
max_acc = 0
node = 0
for i in range(0,15):
    clf = DecisionTreeClassifier(criterion = "entropy", max_depth=i+1)
    clf.fit(X_train, y_train)
    train_score=clf.score(X_train, y_train)
    test_score=clf.score(X_test,y_test)
    train_list.append(train_score)
    test_list.append(test_score)
    tree.append(clf)
    if max_acc < test_score:
        max_acc = test_score
        node = i
relust={
    'Train_score':train_list,
    'Test_score':test_list
}
result_df=pd.DataFrame(relust)

print(result_df)
print(f'Top Score：\n{result_df.max()}')
print(f'最佳節點數: {node}')

    Train_score  Test_score
0      0.785377    0.783029
1      0.785377    0.783029
2      0.785377    0.783029
3      0.786711    0.782310
4      0.791707    0.787659
5      0.792801    0.788617
6      0.793349    0.788776
7      0.795025    0.790692
8      0.798926    0.790453
9      0.802005    0.791011
10     0.805837    0.788537
11     0.809806    0.786302
12     0.813569    0.785024
13     0.818428    0.780315
14     0.822773    0.779436
Top Score：
Train_score    0.822773
Test_score     0.791011
dtype: float64
最佳節點數: 9


## 結果顯示，除去三個屬性後，累積風速成為最重要的影響因素


## 並且，雖然準確度有所下滑，但依然在7成8以上，代表風可能才是最大原因

In [7]:
# 計算重要性並排序
tree[9].feature_importances_.tolist()
df_gini = pd.DataFrame({'feature':x.columns,'feature_importance':clf.feature_importances_.tolist()})
df_gini = df_gini.sort_values(by=['feature_importance'],ascending=False).reset_index(drop=True)
df_gini=df_gini.loc[(df_gini.feature_importance!=0)]
df_gini.head(15)

Unnamed: 0,feature,feature_importance
0,Iws,0.360113
1,PRES,0.304759
2,season,0.115849
3,cbwd_SE,0.077107
4,Ir,0.046619
5,Is,0.036324
6,cbwd_NE,0.024729
7,cbwd_cv,0.017254
8,cbwd_NW,0.017245


In [8]:
clf = tree[9]
pm_class = ['0', '1']

In [9]:
#建構樹狀圖
from sklearn.tree import export_graphviz
import graphviz
import os
os.environ["PATH"] += os.pathsep + 'C:\Program Files (x86)\Graphviz2.38\\bin'
#視覺化
dot_data = export_graphviz(clf, out_file=None, label='all', impurity=False, proportion=True, feature_names=list(X_train), class_names=pm_class,filled=True, rounded=True)
graph = graphviz.Source(dot_data)
graph
graph.render('第四次探勘', view=True)  

'第四次探勘.pdf'