In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm 

### 資料讀檔&前處理

In [2]:
air_quality = pd.read_csv('./AirQualityUCI.csv')
air_quality['Date'] = pd.to_datetime(air_quality['Date'])
air_quality['Date'] = (air_quality['Date'] - air_quality['Date'].min()).dt.total_seconds()
air_quality['Time'] = [int(x[:2]) for x in air_quality['Time']]
air_quality.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,23587200.0,18,2,6,1360,150,11,9,1046,166,1056,113,1692,1268,13
1,23587200.0,19,2,1292,112,9,4,955,103,1174,92,1559,972,13,3
2,23587200.0,20,2,2,1402,88,9,0,939,131,1140,114,1555,1074,11
3,23587200.0,21,2,2,1376,80,9,2,948,172,1092,122,1584,1203,11
4,23587200.0,22,1,6,1272,51,6,5,836,131,1205,116,1490,1110,11


In [3]:
labels = air_quality['PT08.S5(O3)'] # 若有做"特徵重要性"(step 4)再拿出來即可
air_quality = air_quality.drop(columns = 'PT08.S5(O3)')

In [4]:
# 決定是否把哪些feature刪掉的步驟->思路參考
# 把每個步驟要刪掉的feature記錄在to_drop中
data = air_quality
to_drop = [] 

### step 1(feature的nan比例太高的刪掉)

In [5]:
feature_nan = data.isnull().sum() / data.shape[0]
feature_nan_drop = feature_nan[feature_nan > 0.5] # missing_threshold : 0.5
to_drop.extend(list(feature_nan_drop.index))

### step 2(feature只有1種的刪掉)

In [6]:
feature_nunique = data.nunique()
feature_nunique_drop = feature_nunique[feature_nunique == 1]
to_drop.extend(list(feature_nunique_drop.index))

### step 3(feature的相關係數太高的刪掉)

In [7]:
corr_matrix = data.corr()
feature_corr = pd.DataFrame(columns = [0 , 1 , 2])
for i in range(0 , corr_matrix.shape[0]):
    for j in range(i , corr_matrix.shape[1]):
        if i != j:
            if abs(corr_matrix.iloc[i , j]) > 0.7: # 將兩兩特徵相關係數大於0.7的特徵記錄起來
                temp = [corr_matrix.index[i] , corr_matrix.columns[j] , corr_matrix.iloc[i , j]]
                temp = pd.DataFrame(temp).T
                feature_corr = pd.concat([feature_corr , temp] , axis = 0)

# 感覺這個方法比較合理，總不能把feature刪太多
if len(set(feature_corr.iloc[: , 0])) >= len(set(feature_corr.iloc[: , 1])):
    to_drop.extend(list(set(feature_corr.iloc[: , 1]))) 
else:
    to_drop.extend(list(set(feature_corr.iloc[: , 0]))) 

### step 4(用一個方法建模計算feature的重要性，feature重要性太低的刪掉)

In [8]:
early_stopping = True
task = 'regression'
eval_metric = '12'
n_iterations = 10

if early_stopping and eval_metric is None:
    raise ValueError('Eval metric must be provided with early stopping. Examples include auc for classification or l2 for regression.')   
if labels is None:
    raise ValueError('No training labels provided.')

# One hot encoding
features = pd.get_dummies(data)
one_hot_features = []
for column in features.columns:
    if column not in list(data.columns):
        one_hot_features.append(column) 

# 將One hot encoding的數據加入原本的數據中
data_all = pd.concat([features[one_hot_features] , data] , axis = 1)

# 拿到所有特徵的名字
feature_names = list(features.columns)

# 將數據轉換成array的形式
features = np.array(features)
labels = np.array(labels).reshape((-1, ))

# 創造一個陣列來存feature_importance
feature_importance_values = np.zeros(len(feature_names))

In [9]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# 跑個n_iterations次，計算每一次數據的特徵重要性，再將這些特徵重要性做平均，決定那些特徵比較重要
for i in tqdm(range(0 , n_iterations)):     

    if task == 'classification':
        model = lgb.LGBMClassifier(n_estimators = 1000 , learning_rate = 0.05 , verbose = 0)

    elif task == 'regression':
        model = lgb.LGBMRegressor(n_estimators = 1000 , learning_rate = 0.05 , verbose = 0)

    else:
        raise ValueError('Task must be either classification or regression')
        
    if early_stopping:       
        train_features , valid_features , train_labels , valid_labels = train_test_split(features , labels , test_size = 0.15)

        model.fit(train_features, 
                  train_labels, 
                  eval_metric = eval_metric ,
                  eval_set = [(valid_features , valid_labels)],
                  early_stopping_rounds = 100 , 
                  verbose = 0)
        
        # 刪除以下資料以免記憶體爆掉
        del train_features, train_labels, valid_features, valid_labels
        
    else:
        model.fit(features, labels)

    # 將每一次的特徵重要性的數值加總再平均
    feature_importance_values += model.feature_importances_ / n_iterations

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.61s/it]


In [10]:
feature_importances = pd.DataFrame({'feature': feature_names , 'importance': feature_importance_values})

# 依據特徵重要性形排列
feature_importances = feature_importances.sort_values(by = 'importance', ascending = False).reset_index(drop = True)

# 對特徵重要性的數值做Normalization
feature_importances['normalized_importance'] = feature_importances['importance'] / feature_importances['importance'].sum()

# 對特徵重要性做累加
feature_importances['cumulative_importance'] = np.cumsum(feature_importances['normalized_importance'])

# 將累計特徵重要性大於9成以上的刪除
record_low_importance = feature_importances[feature_importances['cumulative_importance'] > 0.9]

to_drop.extend(list(record_low_importance['feature']))

# 最後to_drop就是刪掉的特徵
to_drop = list(set(to_drop))

### 展示要刪除的特徵

In [11]:
to_drop

['C6H6(GT)',
 'PT08.S1(CO)',
 'NMHC(GT)',
 'PT08.S3(NOx)',
 'PT08.S2(NMHC)',
 'NO2(GT)',
 'PT08.S4(NO2)',
 'CO(GT)',
 'NOx(GT)']