### 令和6年7月3日(水)
feature_selectを修正

In [3]:
# ライブラリの読み込み
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
# LGBMClassifier
import lightgbm as lgbm
from lightgbm import LGBMClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from hyperopt import hp, fmin, tpe, Trials
import pickle
import os
import sys
sys.path.append('../')
from my_utils import MyUtils
from params import Params
import warnings
warnings.filterwarnings('ignore')
INPUT_DIR = "../input/"
RANDOM_STATE = 10
OUTPUT_PATH = "C:/Users/gwsgs/workSpace/GCIcomp2/02.（公開）コンペ2/output/"
remove_outliers_columns = [ "AMT_INCOME_TOTAL", "AMT_ANNUITY", "DAYS_ID_PUBLISH",]
exel_path = 'score_record.xlsx' 
utils = MyUtils(input_path=INPUT_DIR, output_path=OUTPUT_PATH, exel_path=exel_path) 
my_utils = MyUtils(input_path=INPUT_DIR, output_path=OUTPUT_PATH, exel_path=exel_path)
my_params = Params() 

In [4]:
# データの読み込み
# INPUT_DIRにtrain.csvなどのデータを置いているディレクトリを指定してください。
INPUT_DIR = "../input/"
import datetime
time = datetime.datetime.now().strftime('%Y%m%d%H%M')
train = pd.read_csv(INPUT_DIR + "train.csv")
test = pd.read_csv(INPUT_DIR + "test.csv")
sample_sub = pd.read_csv(INPUT_DIR + "sample_submission.csv")
X_train = train.drop('TARGET',axis=1)
y_train = train['TARGET']

## 2. 前処理と特徴量作成
ここでは、上記の可視化と分析でわかったことを踏まえて、前処理と特徴量の作成を行います。

In [5]:
# 欠損値の補完を行う
# FLAF_OWN_CARの欠損値はY=1, N=0, Nan=2で補完 
train["FLAG_OWN_CAR"].fillna(2, inplace=True)
train["FLAG_OWN_CAR"] = train["FLAG_OWN_CAR"].map({"Y": 1, "N": 0, 2: 2})
test["FLAG_OWN_CAR"].fillna(2, inplace=True)
test["FLAG_OWN_CAR"] = test["FLAG_OWN_CAR"].map({"Y": 1, "N": 0, 2: 2})
# FLAG_OWN_REALTYの欠損値はY=1, N=0, Nan=2で補完
train["FLAG_OWN_REALTY"].fillna(2, inplace=True)
train["FLAG_OWN_REALTY"] = train["FLAG_OWN_REALTY"].map({"Y": 1, "N": 0, 2: 2})
test["FLAG_OWN_REALTY"].fillna(2, inplace=True)
test["FLAG_OWN_REALTY"] = test["FLAG_OWN_REALTY"].map({"Y": 1, "N": 0, 2: 2})
# AMT_ANNUITYの欠損値を中央値で補完
train["AMT_ANNUITY"].fillna(train["AMT_ANNUITY"].median(), inplace=True)
test["AMT_ANNUITY"].fillna(train["AMT_ANNUITY"].median(), inplace=True)
# AMT_GOODS_PRICEの欠損値を中央値で補完
train["AMT_GOODS_PRICE"].fillna(train["AMT_GOODS_PRICE"].median(), inplace=True)
test["AMT_GOODS_PRICE"].fillna(train["AMT_GOODS_PRICE"].median(), inplace=True)
# NAME_TYPE_SUITEの欠損値をラベルエンコーディングで補完
train["NAME_TYPE_SUITE"].fillna("Unknown", inplace=True)
train["NAME_TYPE_SUITE"] = train["NAME_TYPE_SUITE"].map(train["NAME_TYPE_SUITE"].value_counts().rank(ascending=False, method='first'))
test["NAME_TYPE_SUITE"].fillna("Unknown", inplace=True)
test["NAME_TYPE_SUITE"] = test["NAME_TYPE_SUITE"].map(test["NAME_TYPE_SUITE"].value_counts().rank(ascending=False, method='first'))
# OWN_CAR_AGEの欠損値を-9999で補完
# train["OWN_CAR_AGE"].fillna(-9999, inplace=True)
# test["OWN_CAR_AGE"].fillna(-9999, inplace=True)
# OCCUPATION_TYPEの欠損値をラベルエンコーディングで補完
train["OCCUPATION_TYPE"].fillna("Unknown", inplace=True)
train["OCCUPATION_TYPE"] = train["OCCUPATION_TYPE"].map(train["OCCUPATION_TYPE"].value_counts().rank(ascending=False, method='first'))
test["OCCUPATION_TYPE"].fillna("Unknown", inplace=True)
test["OCCUPATION_TYPE"] = test["OCCUPATION_TYPE"].map(test["OCCUPATION_TYPE"].value_counts().rank(ascending=False, method='first'))
# CNT_FAM_MEMBERSの欠損値を中央値で補完
train["CNT_FAM_MEMBERS"].fillna(train["CNT_FAM_MEMBERS"].median(), inplace=True)
test["CNT_FAM_MEMBERS"].fillna(train["CNT_FAM_MEMBERS"].median(), inplace=True)
# EXT_SOURCE_1の欠損値を-9999で補完
train['EXT_SOURCE_1'] = train['EXT_SOURCE_1'].fillna(-9999)
test['EXT_SOURCE_1'] = test['EXT_SOURCE_1'].fillna(-9999)
# EXT_SOURCE_2の欠損値を平均値で補完
train["EXT_SOURCE_2"].fillna(train["EXT_SOURCE_2"].mean(), inplace=True)
test["EXT_SOURCE_2"].fillna(train["EXT_SOURCE_2"].mean(), inplace=True)
# EXT_SOURCE_3の欠損値を-9999で補完
train['EXT_SOURCE_3'] = train['EXT_SOURCE_2'].fillna(-9999)
test['EXT_SOURCE_3'] = test['EXT_SOURCE_2'].fillna(-9999)


In [6]:
# カテゴリカルデータを数値に変換
# NAME_CONTRACT_TYPEのラベルエンコーディング
train["NAME_CONTRACT_TYPE"] = train["NAME_CONTRACT_TYPE"].map({"Cash loans": 0, "Revolving loans": 1})
test["NAME_CONTRACT_TYPE"] = test["NAME_CONTRACT_TYPE"].map({"Cash loans": 0, "Revolving loans": 1})
# CODE_GENDERのラベルエンコーディング(男性=0, 女性=1, XNA=0)
train["CODE_GENDER"] = train["CODE_GENDER"].map({"M": 0, "F": 1, "XNA": 0})
test["CODE_GENDER"] = test["CODE_GENDER"].map({"M": 0, "F": 1, "XNA": 0})
# NAME_INCOME_TYPEのラベルカウントエンコーディング
train["NAME_INCOME_TYPE"] = train["NAME_INCOME_TYPE"].map(train["NAME_INCOME_TYPE"].value_counts().rank(ascending=False, method='first'))
test["NAME_INCOME_TYPE"] = test["NAME_INCOME_TYPE"].map(test["NAME_INCOME_TYPE"].value_counts().rank(ascending=False, method='first'))
# NAME_EDUCATION_TYPEのラベルカウントエンコーディング
train["NAME_EDUCATION_TYPE"] = train["NAME_EDUCATION_TYPE"].map(train["NAME_EDUCATION_TYPE"].value_counts().rank(ascending=False, method='first'))
test["NAME_EDUCATION_TYPE"] = test["NAME_EDUCATION_TYPE"].map(test["NAME_EDUCATION_TYPE"].value_counts().rank(ascending=False, method='first'))
# NAME_FAMILY_STATUSのラベルカウントエンコーディング
train["NAME_FAMILY_STATUS"] = train["NAME_FAMILY_STATUS"].map(train["NAME_FAMILY_STATUS"].value_counts().rank(ascending=False, method='first'))
test["NAME_FAMILY_STATUS"] = test["NAME_FAMILY_STATUS"].map(test["NAME_FAMILY_STATUS"].value_counts().rank(ascending=False, method='first'))
# NAME_HOUSING_TYPEのラベルカウントエンコーディング
train["NAME_HOUSING_TYPE"] = train["NAME_HOUSING_TYPE"].map(train["NAME_HOUSING_TYPE"].value_counts().rank(ascending=False, method='first'))
test["NAME_HOUSING_TYPE"] = test["NAME_HOUSING_TYPE"].map(test["NAME_HOUSING_TYPE"].value_counts().rank(ascending=False, method='first'))
# ORGANIZATIONのラベルカウントエンコーディング
train["ORGANIZATION_TYPE"] = train["ORGANIZATION_TYPE"].map(train["ORGANIZATION_TYPE"].value_counts().rank(ascending=False, method='first'))
test["ORGANIZATION_TYPE"] = test["ORGANIZATION_TYPE"].map(test["ORGANIZATION_TYPE"].value_counts().rank(ascending=False, method='first'))

In [7]:
# 特徴量の作成
# 参考 https://www.kaggle.com/competitions/home-credit-default-risk/discussion/64821
# AMT_CREDITとAMT_ANNUITYの比率
train["CREDIT_TO_ANNUITY_RATIO"] = train["AMT_CREDIT"] / train["AMT_ANNUITY"]
test["CREDIT_TO_ANNUITY_RATIO"] = test["AMT_CREDIT"] / test["AMT_ANNUITY"]
# AMT_CREDITとAMT_GOODS_PRICEの比率
train["CREDIT_TO_GOODS_RATIO"] = train["AMT_CREDIT"] / train["AMT_GOODS_PRICE"]
test["CREDIT_TO_GOODS_RATIO"] = test["AMT_CREDIT"] / test["AMT_GOODS_PRICE"]
# AMT_CREDITとAMT_ANNUITYの比率
train["AMT_CREDIT_TO_AMT_ANNUITY"] = train["AMT_CREDIT"] / train["AMT_ANNUITY"]
test["AMT_CREDIT_TO_AMT_ANNUITY"] = test["AMT_CREDIT"] / test["AMT_ANNUITY"]
# DAYS_BIRTH/ -365
train["YEARS_BIRTH"] = train["DAYS_BIRTH"] / -365
test["YEARS_BIRTH"] = test["DAYS_BIRTH"] / -365

In [8]:
print(train.shape)
print(test.shape)

(171202, 55)
(61500, 54)


## 3. 機械学習モデルの作成
ここでは、機械学習モデルの作成を行います。

In [9]:
# 目的変数と説明変数に分割
X = train.drop("TARGET", axis=1).values
y = train["TARGET"].values
X_test = test.values
# 訓練データと評価データに分割
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, stratify=y, random_state=RANDOM_STATE)

In [10]:
lgb_n_iter = 200
xgb_n_iter = 30
lr_n_iter =  10
time = datetime.datetime.now().strftime('%Y%m%d%H%M')

In [11]:
def lgb_objective(args):
    lgb = LGBMClassifier(
        num_leaves = args['num_leaves'],
        max_depth = args['max_depth'],
        n_estimators = args['n_estimators'],
        learning_rate = args['learning_rate'],
        # min_child_samples = args['min_child_samples'],
        reg_lambda = args['reg_lamb'],
        bagging_freq = args['bagging_freq'],
        bagging_fraction = args['bagging_fraction'],
        feature_fraction = args['feature_fraction'],
        min_data_in_leaf = args['min_data_in_leaf'],
        random_state = RANDOM_STATE,
        objective = my_params.lgb['objective'],
        metric = my_params.lgb['metric'],
        verbosity = my_params.lgb['verbosity'],
        boosting_type = my_params.lgb['boosting_type'],
        early_stopping_round = 50
        )
    lgb.fit(X_train, y_train,
            eval_set = [(X_valid, y_valid)]
        )
    
    lgb_valid_pred = lgb.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, lgb_valid_pred)
    return -1.0 * auc   

In [13]:
trials = Trials()
lgb_best = fmin(
    lgb_objective,
    space = my_params.lgb,
    algo=tpe.suggest,
    max_evals=lgb_n_iter,
    trials=trials,
    # 試行の過程を出力
    verbose=-1,
    early_stop_fn=None,
    show_progressbar = True,
)
del trials

  0%|          | 0/200 [00:00<?, ?trial/s, best loss=?]

 40%|████      | 80/200 [11:29<20:36, 10.30s/trial, best loss: -0.7446292552072789]

In [17]:
print(lgb_best)

{'bagging_fraction': 0.976701282577164, 'bagging_freq': 5, 'feature_fraction': 0.3801089786656347, 'learning_rate': 0.12051544553963187, 'max_depth': 2, 'min_data_in_leaf': 26, 'n_estimators': 1407, 'num_leaves': 63, 'reg_lamb': 24.484507632387334, 'sub_sample': 0.5109915033514361}


In [None]:
# XGBoostのモデルを作成
def xgb_objective(args):
    xgb = XGBClassifier(
        n_estimators=args["n_estimators"],
        max_depth=args["max_depth"],
        learning_rate=args["learning_rate"],
        subsample=args["subsample"],
        colsample_bytree=args["colsample_bytree"],
        gamma=args["gamma"],
        min_child_weight=args["min_child_weight"],
        reg_alpha=args["reg_alpha"],
        
        verbose=0,
        random_state=RANDOM_STATE,
    )
    xgb.fit(X_train, y_train,
        eval_set = [(X_valid, y_valid),],
        # varbose=False,
    )
    xgb_valid_pred = xgb.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, xgb_valid_pred)    
    return -1.0 * auc 

trials = Trials()
xgb_best = fmin(
    xgb_objective,
    space = my_params.xgb,
    algo=tpe.suggest,
    max_evals=xgb_n_iter,
    trials=trials,
    verbose=-1, # 試行の過程を出力
    # early_stop_fn=no_progress_loss(100),
    show_progressbar = True,
    # trials_save_file=f"{output_path}_trials_save_file_{time}.pkl"
)
del trials

In [19]:
lgb = LGBMClassifier(
        learning_rate = lgb_best['learning_rate'],
        objective = my_params.lgb['objective'],
        max_depth = lgb_best['max_depth'],
        # min_child_samples = best['min_child_samples'],
        n_estimators = lgb_best['n_estimators'],
        num_leaves = lgb_best['num_leaves'],
        reg_lambda= lgb_best['reg_lamb'],
        bagging_freq = lgb_best['bagging_freq'],
        bagging_fraction = lgb_best['bagging_fraction'],
        feature_fraction = lgb_best['feature_fraction'],
        min_data_in_leaf = lgb_best['min_data_in_leaf'],
        verbose = -1,
        random_state = RANDOM_STATE,)
lgb.fit(X, y)

lgb_train_pred = lgb.predict_proba(X)[:, 1]

# XGBoost
# xgb = XGBClassifier(
#         learning_rate = xgb_best['learning_rate'],
#         n_estimators = xgb_best['n_estimators'],
#         max_depth = xgb_best['max_depth'],
#         subsample = xgb_best['subsample'],
#         colsample_bytree = xgb_best['colsample_bytree'],
#         gamma = xgb_best['gamma'],
#         min_child_weight = xgb_best['min_child_weight'],
#         verbose = 0,
#         random_state = RANDOM_STATE,)
# xgb.fit(X, y)

# xgb_train_pred = xgb.predict_proba(X)[:, 1]

predicts = [lgb_best]
# predicts = [xgb_best, lgb_best]

In [20]:
for i in predicts:
    utils.save_score_to_exel(utils.exel_path, i, trials.best_trial['result']['loss'] * -1, time)

NameError: name 'trials' is not defined

## 4. 予測結果の作成
最後にテストデータに対して予測を行い、提出用のcsvファイルを作成します。

In [21]:
lgbm.plot_importance(lgb, index=X_train.columns_,figsize=(10, 10))

AttributeError: 'numpy.ndarray' object has no attribute 'columns_'

In [23]:
# テストデータに対する予測値の作成
# pred = lgb.predict_proba(X_test, num_iteration=lgb.best_iteration)[:, 1]
lgb_pred = lgb.predict_proba(X_test)[:, 1]
# xgb_pred = xgb.predict_proba(X_test)[:, 1]
# lr_pred = lr.predict_proba(X_test_std)[:, 1]                    
# アンサンブル
# pred = lgb_pred * 0.5 + xgb_pred * 0.5
pred = lgb_pred

In [24]:
# 予測結果を提出用のフォーマットに格納
sample_sub['TARGET'] = pred

In [25]:
# 提出用のcsvファイルを作成
# formatを指定して現在時刻を取得（yyyyMMddhhmm）

sample_sub.to_csv(f'{OUTPUT_PATH}{time}.csv',index=False)
print(f"output: {OUTPUT_PATH}{time}.csv")

output: C:/Users/gwsgs/workSpace/GCIcomp2/02.（公開）コンペ2/output/202407032139.csv


以上で、Home Credit Default Riskコンペのチュートリアルは終了です。今回は、50種類ある特徴量のうち5種類しか使用していないので、まだまだ改善の余地があります。この後は、このnotebookやこれまでの教材を参考にして、さらなるスコアの向上を目指してください！