In [4]:
#先導入資料處理會用到的模組
import numpy as np
import scipy as sp
import pandas as pd
from pandas_profiling import ProfileReport

# 可視化模組
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

# 機器學習模組
import sklearn
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib
# Display option: do not truncate the column list when a DataFrame is shown.
pd.set_option('display.max_columns', None)

In [42]:
# Read the observation table (first CSV column becomes the index) and keep
# only the rows whose target column 'TmrDayoff' is not missing.
# NOTE(review): the path is an absolute, machine-specific location.
trainset = pd.read_csv(
    '/Users/shyanechang/Desktop/AI_Class/專題/data/station_observation/data_ver_4_DC.csv',
    index_col=0,
).dropna(subset=['TmrDayoff'])
trainset.shape

(3573, 42)

In [53]:
# Class label to predict.
y = trainset['TmrDayoff']
# Everything except the label is a candidate training variable.
X = trainset.drop(columns=['TmrDayoff'])
X.columns

Index(['ObsDate', 'Region', 'County', 'TyID', 'TyNameCh', 'TyName', 'route',
       'intensity', 'hpa', 'TyWS', 'X7_radius', 'X10_radius', 'alert_num',
       'born_spotN', 'born_spotE', 'Dayoff', 'StnPres', 'SeaPres',
       'StnPresMax', 'StnPresMin', 'Temperature', 'T.Max', 'T.Min',
       'Td.dew.point', 'RH', 'RHMin', 'Precp', 'PrecpHour', 'PrecpMax10',
       'PrecpMax60', 'SunShine', 'SunShineRate', 'GloblRad', 'VisbMean',
       'EvapA', 'UVI.Max', 'Cloud.Amount', 'WD_vector_x', 'WD_vector_y',
       'WDGust_vector_x', 'WDGust_vector_y'],
      dtype='object')

In [54]:
# Remove the six leading columns (ObsDate ... TyName) by name instead of by
# position. The positional form `X.drop(X.columns[:6], axis=1)` strips six
# MORE columns every time this cell is re-run; dropping by name with
# errors='ignore' gives the same result on the first run and is a no-op on
# any later run.
X = X.drop(
    columns=['ObsDate', 'Region', 'County', 'TyID', 'TyNameCh', 'TyName'],
    errors='ignore',
)
X.columns

1       0.0
2       1.0
3       1.0
4       0.0
5       0.0
       ... 
3544    1.0
3545    0.0
3546    1.0
3547    1.0
3548    1.0
Name: TmrDayoff, Length: 1827, dtype: float64

In [56]:
# One-hot encode the remaining categorical columns (e.g. route, intensity).
X = pd.get_dummies(data=X)

In [57]:
# Inspect the feature names produced by the one-hot encoding step.
X.columns

Index(['hpa', 'TyWS', 'X7_radius', 'X10_radius', 'alert_num', 'born_spotN',
       'born_spotE', 'Dayoff', 'StnPres', 'SeaPres', 'StnPresMax',
       'StnPresMin', 'Temperature', 'T.Max', 'T.Min', 'Td.dew.point', 'RH',
       'RHMin', 'Precp', 'PrecpHour', 'PrecpMax10', 'PrecpMax60', 'SunShine',
       'SunShineRate', 'GloblRad', 'VisbMean', 'EvapA', 'UVI.Max',
       'Cloud.Amount', 'WD_vector_x', 'WD_vector_y', 'WDGust_vector_x',
       'WDGust_vector_y', 'route_--', 'route_1', 'route_2', 'route_3',
       'route_4', 'route_5', 'route_6', 'route_7', 'route_9', 'route_特殊',
       'intensity_intense', 'intensity_moderate', 'intensity_tropical'],
      dtype='object')

In [58]:
# Rescale every feature to the [0, 1] range. fit_transform() fits the scaler
# itself, so the separate fit() call that used to precede it was redundant.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split performed later, so held-out rows influence the scaling;
# consider fitting on the training split only.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[0.69072165, 0.33898305, 0.70833333, ..., 0.        , 1.        ,
        0.        ],
       [0.69072165, 0.33898305, 0.70833333, ..., 0.        , 1.        ,
        0.        ],
       [0.69072165, 0.33898305, 0.70833333, ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.82474227, 0.20338983, 0.70833333, ..., 0.        , 0.        ,
        1.        ],
       [0.82474227, 0.20338983, 0.70833333, ..., 0.        , 0.        ,
        1.        ],
       [0.82474227, 0.20338983, 0.70833333, ..., 0.        , 0.        ,
        1.        ]])

In [59]:
# Fill missing values from the 5 nearest neighbours, computed on the scaled
# features, then wrap the result back into a DataFrame.
imputer = KNNImputer(n_neighbors=5)
X_imputed = pd.DataFrame(
    imputer.fit_transform(X_scaled),
    columns=X.columns,
    # Reuse X's row labels: without this the frame gets a fresh 0..n-1 index
    # while y keeps the labels loaded from the CSV, so the two would no
    # longer line up in any label-based operation.
    index=X.index,
)

In [None]:
# Fit a random forest on several random train/test splits and keep the model
# that scores best on its own held-out 20 %.
#
# A new estimator is created for every trial. Calling fit() repeatedly on one
# shared estimator returns that same object each time, so `model_best` would
# end up pointing at the LAST fit rather than the best-scoring one.
#
# NOTE(review): the loop runs 10 trials; earlier comments mentioned 100 --
# confirm the intended count.
# NOTE(review): no random seed is set, so results differ between runs, and
# choosing the model by its test score makes `score` an optimistic estimate.
n_trials = 10
score = 0
for i in range(n_trials):
    # Hold out a random 20 % of the rows to score this trial's model.
    X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2)
    rf = RandomForestClassifier(n_estimators=500)  # forest of 500 trees
    model = rf.fit(X_train, y_train)
    model_score = model.score(X_test, y_test)
    if model_score > score:  # keep the highest-scoring model seen so far
        score = model_score
        model_best = model
        print(model_score)

0.7622950819672131


In [None]:
# Visualize the feature importances of the fitted random forest.
def showFeatureImportance(model):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``feature_importances_``. If it also exposes
        ``feature_names_in_`` those names label the bars; otherwise the bars
        are labelled ``x0, x1, ...`` so the function still works for models
        fitted on plain arrays.

    Returns
    -------
    None
        The chart is shown as a side effect.
    """
    importances = model.feature_importances_
    names = getattr(model, 'feature_names_in_', None)
    if names is None:
        names = [f'x{i}' for i in range(len(importances))]
    df = pd.DataFrame({'Importance': importances}, index=names)
    # Ascending sort puts the most important feature at the top of the chart.
    df = df.sort_values(by='Importance')
    plt.figure(figsize=(10, 10))
    plt.barh(df.index, df.Importance)  # horizontal bar plot
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.title('Feature importance')
    plt.show()
showFeatureImportance(model_best)

In [45]:
# Raw importance values of the selected model (one value per training feature).
model_best.feature_importances_

array([0.0190937 , 0.01689147, 0.01572023, 0.01149008, 0.02084181,
       0.02478825, 0.02132785, 0.02307182, 0.03657707, 0.02163568,
       0.03142581, 0.02183795, 0.02438617, 0.02116288, 0.02181254,
       0.02285761, 0.02244796, 0.09147147, 0.06277055, 0.04273092,
       0.05655546, 0.02457634, 0.02371503, 0.05133667, 0.03044431,
       0.0212376 , 0.03789928, 0.03047478, 0.02036646, 0.0289264 ,
       0.02205666, 0.02686028, 0.00571548, 0.00227798, 0.00827723,
       0.00760528, 0.00210637, 0.00095747, 0.00367653, 0.00614368,
       0.00028464, 0.00337108, 0.00164328, 0.00388309, 0.00246683,
       0.00279998])

In [None]:
plt.figure()