In [8]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression


In [9]:
class FRUFS:
    """Rank features by how strongly each one helps to predict the others.

    For every column of ``df`` a model is fitted that predicts that column
    from all remaining columns. The per-model weights (absolute coefficients
    for ``"linear"``, ``feature_importances_`` for ``"lgb"``) are stored row
    by row in the square matrix ``W`` and then averaged per predictor column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are each used once as the target and otherwise
        as predictors.
    method : {"lgb", "linear"}, default "lgb"
        ``"lgb"`` fits an ``LGBMRegressor``; ``"linear"`` fits a
        ``LinearRegression``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """

    def __init__(self, df, method="lgb"):
        # Reject unknown methods immediately. Previously any other value
        # silently selected LinearRegression and only failed inside
        # calc_coef (after fitting a model) because no branch assigned `coef`.
        if method not in ("lgb", "linear"):
            raise ValueError(
                f"Unsupported method {method!r}; expected 'lgb' or 'linear'."
            )
        self.df = df
        self.method = method
        self.model_func = LGBMRegressor if method == "lgb" else LinearRegression
        # Weight/importance matrix: row i holds the weights of the model that
        # predicts column i; the diagonal is never written and stays 0.
        self.W = np.zeros((df.shape[1], df.shape[1]))

    def calc_coef(self):
        """Fit one model per column and fill ``W``, ``W_average`` and ``average_coef_df``.

        Raises
        ------
        ValueError
            If ``self.method`` was changed to an unsupported value after
            construction.
        """
        X = self.df.values
        n_features = X.shape[1]
        all_indices = np.arange(n_features)

        for i in tqdm(range(n_features), total=n_features, desc="Calculating coefficient/importances"):
            # Column positions of every feature except the i-th one.
            others = np.delete(all_indices, i)

            train_X = X[:, others]  # all features with the i-th one removed
            train_y = X[:, i]  # the i-th feature is the regression target

            # Learn to express the i-th feature through the other features.
            model = self.model_func()
            model.fit(train_X, train_y)

            # Extract the model's weights/importances for this target.
            if self.method == "linear":
                coef = np.absolute(model.coef_)
            elif self.method == "lgb":
                coef = model.feature_importances_
            else:
                raise ValueError(
                    f"Unsupported method {self.method!r}; expected 'lgb' or 'linear'."
                )

            # Store them in row i at the positions of the predictor columns.
            self.W[i, others] = np.asarray(coef).reshape(-1)

        # Column-wise mean = average weight a feature receives when it is used
        # to predict the other features (the zero diagonal is included in the mean).
        self.W_average = self.W.mean(axis=0)
        self.average_coef_df = pd.DataFrame({"columns": self.df.columns.values, "importances": self.W_average})

    def get_feature_importance(self):
        """Return the averaged importances as a DataFrame, largest first.

        Returns
        -------
        pandas.DataFrame
            Columns ``"columns"`` and ``"importances"``, sorted by
            ``"importances"`` in descending order.

        Raises
        ------
        AttributeError
            If ``calc_coef`` has not been run yet.
        """
        if getattr(self, "average_coef_df", None) is None:
            raise AttributeError(
                "average_coef_df is not available; call calc_coef() before get_feature_importance()."
            )
        return self.average_coef_df.sort_values(by="importances", ascending=False)


In [10]:
# Load the combined dataset.
all_df = pd.read_csv("all_data.csv")

# Split the data frame into train and test: keep only the rows whose
# attendance differs from -1 (presumably -1 marks the test rows -- confirm).
df = all_df.loc[all_df["attendance"].ne(-1)]

# Remove the object/category-typed columns together with the "id" column.
# NOTE(review): "attendance" itself is not dropped here, so it stays in `df`
# as an ordinary column -- confirm that this is intended.
categorical_cols = list(df.select_dtypes(include=["object", "category"]).columns)
df = df.drop(columns=[*categorical_cols, "id"])

In [11]:
# Build the selector on the prepared frame using the "lgb" method
# (method="linear" is also accepted).
frufs = FRUFS(df=df, method="lgb")
frufs.calc_coef()

# Importance table, sorted with the largest importance first.
importances = frufs.get_feature_importance()
print(importances)

Calculating coefficient/importances: 100%|██████████| 142/142 [00:57<00:00,  2.47it/s]

                         columns  importances
8                       capacity   112.394366
0                        section    81.316901
65                           day    69.676056
5                     attendance    57.661972
139  venue_prefecture_target_enc    57.415493
..                           ...          ...
104        capacity_min_by_venue     0.000000
14                          DAZN     0.000000
119             home_same_region     0.000000
105        capacity_var_by_venue     0.000000
106       capacity_mean_by_venue     0.000000

[142 rows x 2 columns]





In [17]:
# Show the names of the features whose averaged importance is strictly positive.
print(importances[importances["importances"].gt(0)]["columns"].to_list())

['capacity', 'section', 'day', 'attendance', 'venue_prefecture_target_enc', 'year', 'temperature', 'home_team_conceded', 'humidity', 'temperature_max_by_venue', 'discomfort_index_sum_by_venue', 'home_team_scored', 'away_team_conceded', 'temperature_min_by_venue', 'day_sin', 'rank_diff', 'away_prefecture_target_enc', 'discomfort_index', 'away_team_scored', 'knn_avg_dist_3', 'distance', 'venue_region_target_enc', 'last_year_rank_diff', 'knn_avg_dist_10', 'away_team_avg_conceded_last_3', 'home_team_avg_scored_last_5', 'away_team_avg_scored_last_5', 'section_sin', 'away_team_lon', 'temperature_var_by_venue', 'away_team_last_year_rank', 'diff_score', 'away_team_avg_scored_last_3', 'home_team_rank', 'home_team_avg_conceded_last_3', 'home_team_avg_scored_last_3', 'away_team_rank', 'home_team_last_year_rank', 'knn_avg_dist_9', 'knn_avg_dist_4', 'temperature_mean_by_venue', 'knn_avg_dist_7', 'knn_avg_dist_2', 'knn_avg_dist_6', 'knn_avg_dist_8', 'humidity_var_by_venue', 'knn_avg_dist_5', 'away_t