### 第二次實驗
由於PM種類過多，且PM2.5三級（輕度污染）以上的數量僅9000筆
所過多的分類並無法提高決策樹效果，意義也不大。
因此，此次實驗主要目的為減少PM種類，並實測效果

In [1]:
# 引入模組
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import pandas_profiling

In [2]:
# 載入資料
df = pd.read_csv('a.csv')
df

Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
0,1,2010,1,1,0,,-21,-11.0,1021.0,NW,1.79,0,0
1,2,2010,1,1,1,,-21,-12.0,1020.0,NW,4.92,0,0
2,3,2010,1,1,2,,-21,-11.0,1019.0,NW,6.71,0,0
3,4,2010,1,1,3,,-21,-14.0,1019.0,NW,9.84,0,0
4,5,2010,1,1,4,,-20,-12.0,1018.0,NW,12.97,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
43819,43820,2014,12,31,19,8.0,-23,-2.0,1034.0,NW,231.97,0,0
43820,43821,2014,12,31,20,10.0,-22,-3.0,1034.0,NW,237.78,0,0
43821,43822,2014,12,31,21,10.0,-22,-3.0,1034.0,NW,242.70,0,0
43822,43823,2014,12,31,22,8.0,-22,-4.0,1034.0,NW,246.72,0,0


In [3]:
# 空值處理
print(f'原本:{len(df)}')
df = df.dropna(axis=0, how='any')
print(f'刪除後:{len(df)}')

原本:43824
刪除後:41757


In [4]:
# 前處理
a = df[['PRES', 'TEMP', 'Iws','DEWP','Is','Ir']].astype(int)

a = a.join(pd.get_dummies(df[['cbwd']]))

a.loc[df.month >= 0, 'season'] = int(0)
a.loc[df.month >= 2, 'season'] = int(1)
a.loc[df.month >= 5, 'season'] = int(2)
a.loc[df.month >= 8, 'season'] = int(3)
a.loc[df.month >= 11,'season'] = int(0)

a

Unnamed: 0,PRES,TEMP,Iws,DEWP,Is,Ir,cbwd_NE,cbwd_NW,cbwd_SE,cbwd_cv,season
24,1020,-4,1,-16,0,0,0,0,1,0,0.0
25,1020,-4,2,-15,0,0,0,0,1,0,0.0
26,1021,-5,3,-11,0,0,0,0,1,0,0.0
27,1022,-5,5,-7,1,0,0,0,1,0,0.0
28,1022,-5,6,-7,2,0,0,0,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
43819,1034,-2,231,-23,0,0,0,1,0,0,0.0
43820,1034,-3,237,-22,0,0,0,1,0,0,0.0
43821,1034,-3,242,-22,0,0,0,1,0,0,0.0
43822,1034,-4,246,-22,0,0,0,1,0,0,0.0


In [5]:
# 由於結果種類過多不利於決策樹，因此將PM2.5更改為兩類 (對人體無害 : 對人體有害)

a.loc[df['pm2.5'] > 0, 'PM'] = int(0)       # 對人體無害
a.loc[df['pm2.5'] > 150, 'PM'] = int(1)      # 對人體有害
a

Unnamed: 0,PRES,TEMP,Iws,DEWP,Is,Ir,cbwd_NE,cbwd_NW,cbwd_SE,cbwd_cv,season,PM
24,1020,-4,1,-16,0,0,0,0,1,0,0.0,0.0
25,1020,-4,2,-15,0,0,0,0,1,0,0.0,0.0
26,1021,-5,3,-11,0,0,0,0,1,0,0.0,1.0
27,1022,-5,5,-7,1,0,0,0,1,0,0.0,1.0
28,1022,-5,6,-7,2,0,0,0,1,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
43819,1034,-2,231,-23,0,0,0,1,0,0,0.0,0.0
43820,1034,-3,237,-22,0,0,0,1,0,0,0.0,0.0
43821,1034,-3,242,-22,0,0,0,1,0,0,0.0,0.0
43822,1034,-4,246,-22,0,0,0,1,0,0,0.0,0.0


In [6]:
# 查看圖表
a.profile_report()

Summarize dataset: 100%|██████████| 76/76 [00:09<00:00,  7.92it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:03<00:00,  3.11s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.39s/it]




In [7]:
# 清除空值
a = a.dropna()
# 取出PM 做label
y = a[['PM']]
# 將剩餘屬性作x
x = a.drop("PM", axis = 1)

In [8]:
# 製作訓練集與測試集
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1241)

In [9]:
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
train_list=[]
test_list=[]
tree = []
max_acc = 0
node = 0
for i in range(0,15):
    clf = DecisionTreeClassifier(criterion = "entropy", max_depth=i+1)
    clf.fit(X_train, y_train)
    train_score=clf.score(X_train, y_train)
    test_score=clf.score(X_test,y_test)
    train_list.append(train_score)
    test_list.append(test_score)
    tree.append(clf)
    if max_acc < test_score:
        max_acc = test_score
        node = i
relust={
    'Train_score':train_list,
    'Test_score':test_list
}
result_df=pd.DataFrame(relust)

print(result_df)
print(f'Top Score：\n{result_df.max()}')
print(f'最佳節點數: {node}')

    Train_score  Test_score
0      0.785377    0.783029
1      0.785377    0.783029
2      0.787909    0.786461
3      0.797660    0.796679
4      0.803989    0.802107
5      0.805495    0.801229
6      0.810114    0.803305
7      0.816991    0.810968
8      0.825373    0.813603
9      0.832524    0.814002
10     0.840735    0.820787
11     0.848638    0.820707
12     0.860237    0.814720
13     0.870467    0.819510
14     0.884836    0.815678
Top Score：
Train_score    0.884836
Test_score     0.820787
dtype: float64
最佳節點數: 10


note : 由上面結果可以看到，準確度大幅提升，因此將PM分為兩部分為正確方向。

In [10]:
# 計算重要性並排序
tree[11].feature_importances_.tolist()
df_gini = pd.DataFrame({'feature':x.columns,'feature_importance':clf.feature_importances_.tolist()})
df_gini = df_gini.sort_values(by=['feature_importance'],ascending=False).reset_index(drop=True)
df_gini=df_gini.loc[(df_gini.feature_importance!=0)]
df_gini.head(15)

Unnamed: 0,feature,feature_importance
0,DEWP,0.286236
1,TEMP,0.21427
2,Iws,0.175571
3,PRES,0.172011
4,season,0.06297
5,cbwd_NW,0.035567
6,Ir,0.014182
7,cbwd_SE,0.013191
8,cbwd_NE,0.012959
9,cbwd_cv,0.00827


In [11]:
clf = tree[11]
pm_class = ['0', '1']

In [12]:
#建構樹狀圖
from sklearn.tree import export_graphviz
import graphviz
import os
os.environ["PATH"] += os.pathsep + 'C:\Program Files (x86)\Graphviz2.38\\bin'
#視覺化
dot_data = export_graphviz(clf, out_file=None, label='all', impurity=False, proportion=True, feature_names=list(X_train), class_names=pm_class,filled=True, rounded=True)
graph = graphviz.Source(dot_data)
graph
graph.render('第二次探勘', view=True)  

'第二次探勘.pdf'