In [None]:
# Mount Google Drive (Colab) at /content/drive so files stored there can be read
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# データの読み込みと確認

In [None]:
# ライブラリのインポート
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import (
    chi2, f_classif, f_regression, mutual_info_classif, 
    mutual_info_regression, SelectKBest, SelectPercentile
)
from sklearn.preprocessing import StandardScaler

# Load the training data from the mounted Google Drive
train = pd.read_csv('/content/drive/MyDrive/pm/data/input/train.csv')

In [None]:
# Check the data type of every column in the training data
print(train.dtypes)

id                   int64
year                 int64
month                int64
day                  int64
Country             object
City                object
lat                float64
lon                float64
co_cnt               int64
co_min             float64
co_mid             float64
co_max             float64
co_var             float64
o3_cnt               int64
o3_min             float64
o3_mid             float64
o3_max             float64
o3_var             float64
so2_cnt              int64
so2_min            float64
so2_mid            float64
so2_max            float64
so2_var            float64
no2_cnt              int64
no2_min            float64
no2_mid            float64
no2_max            float64
no2_var            float64
temperature_cnt      int64
temperature_min    float64
temperature_mid    float64
temperature_max    float64
temperature_var    float64
humidity_cnt         int64
humidity_min       float64
humidity_mid       float64
humidity_max       float64
h

In [None]:
# Remove the columns that are not used as explanatory variables
# (this list also contains the target column 'pm25_mid').
del_columns = [
    'id',
    'year', 'month', 'day',
    'Country', 'City',
    'pm25_mid',
]
X_train = train.drop(columns=del_columns)

In [None]:
# Target variable
y = train['pm25_mid']

# Standardize the explanatory variables (fit the scaler and transform in one step)
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)

# 単変量特徴量選択

In [None]:
# Wrapper class for univariate feature selection
class UnivariateFeatureSelection:
    """Thin wrapper around scikit-learn's univariate feature selectors.

    Depending on the type of ``n_features`` it builds either a
    ``SelectKBest`` (absolute number of features) or a ``SelectPercentile``
    (fraction of features) with the requested scoring function. The
    underlying selector is exposed as ``self.selection``.
    """

    def __init__(self, n_features, problem_type, scoring):
        """Create the underlying selector.

        Args:
            n_features: ``int`` -> number of features to keep
                (``SelectKBest(k=n_features)``); ``float`` in [0.0, 1.0] ->
                fraction of features to keep, converted to a percentile for
                ``SelectPercentile``.
            problem_type: ``'classification'`` enables the classification
                scorers; any other value enables the regression scorers.
            scoring: Name of the scoring function. For classification:
                ``'f_classif'``, ``'chi2'``, ``'mutual_info_classif'``.
                Otherwise: ``'f_regression'``, ``'mutual_info_regression'``.

        Raises:
            ValueError: If ``scoring`` is not available for the problem type,
                or a float ``n_features`` lies outside [0.0, 1.0].
            TypeError: If ``n_features`` is neither an ``int`` nor a ``float``
                (``bool`` is rejected as well).
        """
        if problem_type == 'classification':
            valid_scoring = {
                'f_classif': f_classif,
                'chi2': chi2,
                'mutual_info_classif': mutual_info_classif
            }
        else:
            valid_scoring = {
                'f_regression': f_regression,
                'mutual_info_regression': mutual_info_regression
            }

        # Reject scoring functions that are not available for this problem type.
        if scoring not in valid_scoring:
            raise ValueError('Invalid scoring function')

        # bool is a subclass of int; True/False is never a meaningful feature count.
        if isinstance(n_features, bool):
            raise TypeError('Invalid type of feature')

        if isinstance(n_features, int):
            self.selection = SelectKBest(
                valid_scoring[scoring],
                k=n_features
            )
        elif isinstance(n_features, float):
            if not 0.0 <= n_features <= 1.0:
                raise ValueError(
                    'n_features given as a float must be within [0.0, 1.0]'
                )
            # round() rather than int(): int() truncates floating-point error,
            # e.g. 0.29 * 100 == 28.999999999999996 would become 28, not 29.
            self.selection = SelectPercentile(
                valid_scoring[scoring],
                percentile=round(n_features * 100)
            )
        else:
            raise TypeError('Invalid type of feature')

    def fit(self, X, y):
        """Fit the underlying selector on ``X`` and ``y``.

        Returns whatever the underlying selector's ``fit`` returns.
        """
        return self.selection.fit(X, y)

    def transform(self, X):
        """Reduce ``X`` with the fitted underlying selector and return the result."""
        return self.selection.transform(X)

    def fit_transform(self, X, y):
        """Fit the underlying selector on ``X`` and ``y`` and return the reduced ``X``."""
        return self.selection.fit_transform(X, y)

In [None]:
# Build a selector that keeps 20 features, scored with f_regression
ufs = UnivariateFeatureSelection(
    n_features=20,
    problem_type='regression',
    scoring='f_regression',
)
# Fit on the standardized features and reduce them in one call
X_transformed = ufs.fit_transform(X_train_std, y)
# Boolean mask over the input features: True = selected, False = not selected
mask = ufs.selection.get_support()

In [None]:
# Show the selection mask (True = feature selected, False = not selected)
print(mask)

[False False  True  True  True  True False False  True False  True  True
  True False  True  True False False  True  True  True  True False False
 False False False False False False False False False  True  True  True
  True False False  True  True False False False False False False]


In [None]:
# Check the result (input: feature name, selected: whether the feature was selected)
selection_result = pd.DataFrame(
    {
        'input': X_train.columns,
        'selected': mask,
    }
)
print(selection_result)

              input  selected
0               lat     False
1               lon     False
2            co_cnt      True
3            co_min      True
4            co_mid      True
5            co_max      True
6            co_var     False
7            o3_cnt     False
8            o3_min      True
9            o3_mid     False
10           o3_max      True
11           o3_var      True
12          so2_cnt      True
13          so2_min     False
14          so2_mid      True
15          so2_max      True
16          so2_var     False
17          no2_cnt     False
18          no2_min      True
19          no2_mid      True
20          no2_max      True
21          no2_var      True
22  temperature_cnt     False
23  temperature_min     False
24  temperature_mid     False
25  temperature_max     False
26  temperature_var     False
27     humidity_cnt     False
28     humidity_min     False
29     humidity_mid     False
30     humidity_max     False
31     humidity_var     False
32     pre

単変量特徴量選択（`f_regression` のスコア上位20個を選択）の結果、以下に関する特徴量が選ばれた。<br>
・一酸化炭素<br>
・オゾン<br>
・二酸化硫黄<br>
・二酸化窒素<br>
・気圧<br>
・風速<br>
これらは PM2.5 との単変量での線形な関連が強く、予測に効果的な説明変数の候補だということがわかった。