In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('input/sample-data/train.csv')

data['date'] = pd.to_datetime(data['date'], format='%Y/%m/%d')
data = data.sort_values('date').reset_index()

train_x, test_x = train_test_split(data, train_size=0.8, shuffle=False)

train_y = train_x['target']
train_x = train_x.drop('target', axis=1)

test_y = test_x['target']
test_x = test_x.drop('target', axis=1)

train_x = train_x.select_dtypes(include=[float, int])

## 相関係数

In [11]:
import scipy.stats as st

# 相関係数
corrs = []
for c in train_x.columns:
    corr = np.corrcoef(train_x[c], train_y)[0, 1]
    corrs.append(corr)
corrs = np.array(corrs)

# スピアマンの順位相関係数
corrs_sp = []
for c in train_x.columns:
    corr_sp = st.spearmanr(train_x[c], train_y).correlation
    corrs_sp.append(corr_sp)
corrs_sp = np.array(corrs_sp)

# 重要度の出力
idx = np.argsort(np.abs(corrs))[::-1]
top_cols, top_importances = train_x.columns.values[idx][:5], corrs[idx][:5]
print(top_cols, top_importances)

idx = np.argsort(np.abs(corrs_sp))[::-1]
top_cols, top_importances = train_x.columns.values[idx][:5], corrs_sp[idx][:5]
print(top_cols, top_importances)

['medical_info_c1' 'medical_info_c2' 'medical_keyword_5' 'medical_info_a1'
 'medical_keyword_4'] [       nan        nan 0.22001428 0.21082613 0.16699822]
['medical_info_c1' 'medical_info_c2' 'medical_keyword_5' 'medical_info_a1'
 'medical_keyword_4'] [       nan        nan 0.22001428 0.21286561 0.16699822]


## カイ二乗統計量

In [20]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

x = MinMaxScaler().fit_transform(train_x)
x = np.nan_to_num(x, nan=np.nanmean(x))
c2, _ = chi2(x, train_y)

idx = np.argsort(np.abs(c2))[::-1]
top_cols, top_importances = train_x.columns.values[idx][:5], c2[idx][:5]
print(top_cols, top_importances)

['medical_keyword_5' 'medical_keyword_4' 'medical_keyword_3'
 'medical_keyword_2' 'age'] [379.65048469 212.28654019 170.18834223  49.61304054  28.46407823]


## 相互情報量

In [22]:
from sklearn.feature_selection import mutual_info_classif

x = train_x.select_dtypes(include=int)

mi = mutual_info_classif(x, train_y)

idx = np.argsort(np.abs(mi))[::-1]
top_cols, top_importances = train_x.columns.values[idx][:5], mi[idx][:5]
print(top_cols, top_importances)

['weight' 'age' 'medical_keyword_1' 'medical_info_c1' 'medical_info_a1'] [0.04493014 0.02425523 0.02205501 0.01261546 0.00925141]


## ランダムフォレストの特徴量の重要度

In [26]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10, random_state=71)
x = train_x.fillna(train_x.mean())
clf.fit(x, train_y)
fi = clf.feature_importances_

idx = np.argsort(fi)[::-1]
top_cols, top_importances = train_x.columns.values[idx][:5], fi[idx][:5]
print(top_cols, top_importances)

['weight' 'medical_info_a1' 'age' 'medical_info_a2' 'height'] [0.15420126 0.14974574 0.10180357 0.08542085 0.07978597]


## gbdtの特徴量の重要度

In [28]:
%pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.1-py3-none-manylinux2014_aarch64.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.1
[0mNote: you may need to restart the kernel to use updated packages.


In [29]:
import xgboost as xgb

dtrain = xgb.DMatrix(train_x, label=train_y)
params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71}
num_round = 50
model = xgb.train(params, dtrain, num_round)

fscore = model.get_score(importance_type='total_gain')
fscore = sorted([(k, v) for k, v in fscore.items()], key=lambda tpl: tpl[1], reverse=True)
print(fscore[:5])

Parameters: { "silent" } are not used.

[('weight', 2283.56103515625), ('medical_info_a1', 1778.5419921875), ('height', 1696.2003173828125), ('age', 1156.5743408203125), ('medical_info_a2', 983.0481567382812)]
