### 令和6年6月25日(火)
DAYS_EMPLOYED修正

#### 1. 読み込み

In [None]:
# Library imports
import sys
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors , KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
# LGBMClassifier
import lightgbm as lgbm
from lightgbm import LGBMClassifier
from hyperopt import hp, fmin, tpe, Trials, space_eval

# The project helper module lives one directory up, so extend the path before importing it
sys.path.append('../')
from my_utils import MyUtils

warnings.filterwarnings('ignore')

# Paths and the shared project helper.
INPUT_DIR = "../input/"
# NOTE(review): absolute, machine-specific path — the notebook will not run on another
# machine as-is; consider a path relative to the notebook instead.
output_path = "C:/Users/gwsgs/workSpace/GCIcomp2/02.（公開）コンペ2/output/"
exel_path = 'score_record.xlsx' 
utils = MyUtils(input_path=INPUT_DIR, output_path=output_path, exel_path=exel_path)  
# `params` is used later both for fixed LightGBM settings (params['random_state'], ...)
# and as the hyperopt search space passed to fmin.
params = utils.params
# NOTE(review): assigned without call parentheses. If get_output_path is a regular method
# rather than a property, output_path becomes a bound method and the f-string that builds
# the submission path will not contain a directory — confirm against my_utils.
output_path = utils.get_output_path

In [None]:
# Load the data through the project helper.
# X_train / y_train returned here are reassigned later by utils.split_data.
train, test, X_train, y_train, sample_sub = utils.load_data()

#### 2. 前処理と特徴量作成

In [None]:
# Feature creation
# TODO: the planned [female_marride] feature (CODE_GENDER == 1 and a married
#       NAME_FAMILY_STATUS -> 1) is described here but not implemented yet.
# is_low_skill: 1 when OCCUPATION_TYPE is "Low-skill Laborers", otherwise 0
# (missing values compare unequal and therefore become 0). This is computed from the
# raw strings, before utils.label_count processes OCCUPATION_TYPE below.
train["is_low_skill"] = train["OCCUPATION_TYPE"].eq("Low-skill Laborers").astype("int64")
test["is_low_skill"] = test["OCCUPATION_TYPE"].eq("Low-skill Laborers").astype("int64")

# Label-count encoding of the columns that contain missing values.
# OCCUPATION_TYPE used to be listed twice; each column is now passed exactly once so
# it cannot be encoded a second time on top of its already-encoded values.
missing_embeded_festure = ['FLAG_OWN_CAR','FLAG_OWN_REALTY','NAME_TYPE_SUITE','OCCUPATION_TYPE']
train, test = utils.label_count(train, test, missing_embeded_festure)
# Remaining categorical columns converted to numeric with the same helper.
to_numerical = ['NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS','NAME_HOUSING_TYPE','ORGANIZATION_TYPE']
train, test = utils.label_count(train, test, to_numerical)

In [None]:
# Impute missing CNT_FAM_MEMBERS with the training-set median (also used for test,
# so no statistic is taken from the test data).
# Assign the result back instead of calling fillna(inplace=True) on a column selection:
# that chained form is deprecated and has no effect under pandas Copy-on-Write, and the
# warning would be hidden here because warnings are globally ignored.
cnt_fam_members_median = train["CNT_FAM_MEMBERS"].median()
train["CNT_FAM_MEMBERS"] = train["CNT_FAM_MEMBERS"].fillna(cnt_fam_members_median)
test["CNT_FAM_MEMBERS"] = test["CNT_FAM_MEMBERS"].fillna(cnt_fam_members_median)

In [None]:
# Feature correction
# Replace the value 365243 in DAYS_EMPLOYED with NaN.
# The replacement must be applied to both frames: previously only train was fixed,
# so test kept the sentinel and every DAYS_EMPLOYED-derived feature had a different
# distribution at prediction time than at training time.
train["DAYS_EMPLOYED"] = train["DAYS_EMPLOYED"].replace(365243, np.nan)
test["DAYS_EMPLOYED"] = test["DAYS_EMPLOYED"].replace(365243, np.nan)

In [None]:
# Convert categorical columns to numbers with explicit value maps.
# Values that are not keys of a map become NaN (pandas Series.map semantics).
_value_maps = {
    # NAME_CONTRACT_TYPE label encoding
    "NAME_CONTRACT_TYPE": {"Cash loans": 0, "Revolving loans": 1},
    # CODE_GENDER label encoding (male = 0, female = 1, XNA = 0)
    "CODE_GENDER": {"M": 0, "F": 1, "XNA": 0},
}
for _column, _mapping in _value_maps.items():
    train[_column] = train[_column].map(_mapping)
    test[_column] = test[_column].map(_mapping)

In [None]:
# Feature creation: the same derived columns are added to train and test, in the
# same order, so both frames keep an identical column layout.
for _frame in (train, test):
    # DAYS_LAST_PHONE_CHANGE converted to years
    _frame["YEARS_PHONR_CHANGE"] = _frame["DAYS_LAST_PHONE_CHANGE"] / -365
    # DAYS_EMPLOYED converted to years. Divides by -365 like every other YEARS_*
    # feature in this cell (it previously divided by +365, giving the opposite sign).
    _frame["YEARS_EMPLOYED"] = _frame["DAYS_EMPLOYED"] / -365
    # DAYS_REGISTRATION converted to years
    _frame["YEARS_REGISTRATION"] = _frame["DAYS_REGISTRATION"] / -365
    # DAYS_ID_PUBLISH converted to years
    _frame["YEARS_ID_PUBLISH"] = _frame["DAYS_ID_PUBLISH"] / -365
    # 1 if AMT_GOODS_PRICE is missing, otherwise 0
    _frame["AMT_GOODS_PRICE_ISNAN"] = _frame["AMT_GOODS_PRICE"].isnull().astype(int)
    # Sum of the two encoded ownership flags.
    # NOTE: the original plan was "car and realty = 4, realty only = 2, car only = 1,
    # otherwise 0"; the code does not implement that scheme, it simply adds the flags.
    _frame["OWN_CAR_REALTY"] = _frame["FLAG_OWN_CAR"] + _frame["FLAG_OWN_REALTY"]

    # Reference: https://www.kaggle.com/competitions/home-credit-default-risk/discussion/64821
    # Ratio of AMT_CREDIT to AMT_ANNUITY
    _frame["CREDIT_TO_ANNUITY_RATIO"] = _frame["AMT_CREDIT"] / _frame["AMT_ANNUITY"]
    # Ratio of AMT_CREDIT to AMT_GOODS_PRICE
    _frame["CREDIT_TO_GOODS_RATIO"] = _frame["AMT_CREDIT"] / _frame["AMT_GOODS_PRICE"]
    # Ratio of AMT_CREDIT to AMT_ANNUITY.
    # NOTE(review): identical to CREDIT_TO_ANNUITY_RATIO above; kept so the feature
    # set stays unchanged, but one of the two columns is redundant.
    _frame["AMT_CREDIT_TO_AMT_ANNUITY"] = _frame["AMT_CREDIT"] / _frame["AMT_ANNUITY"]
    # DAYS_BIRTH / -365
    _frame["YEARS_BIRTH"] = _frame["DAYS_BIRTH"] / -365

In [None]:
# DAYS_* columns handed to the project binning helpers.
days_feature = ["DAYS_BIRTH", "DAYS_EMPLOYED", "DAYS_REGISTRATION", "DAYS_ID_PUBLISH"]

# Year-level binning (utils.from_days_to_year_bin) is currently disabled.
# Apply month-level binning first, then week-level binning, to train and then test.
for _to_bins in (utils.from_days_to_month_bin, utils.from_days_to_week_bin):
    train = _to_bins(train, days_feature)
    test = _to_bins(test, days_feature)


In [None]:
# Split into train / validation sets with the project helper.
# This rebinds X_train and y_train, replacing the values returned by utils.load_data().
X_train, X_valid, y_train, y_valid, X_test, X, y = utils.split_data(train, test)

#### 3. 観察

In [None]:
# Report the shape of every split, one "name:shape" line each.
_splits = [
    ("X_train", X_train),
    ("X_valid", X_valid),
    ("X_test", X_test),
    ("X", X),
    ("y_train", y_train),
    ("y_valid", y_valid),
    ("y", y),
]
for _label, _split in _splits:
    print(f"{_label}:{_split.shape}")

In [None]:
# Disabled: optional Sweetviz comparison report.
# import sweetviz as sv

# # Build the Sweetviz report (compare train and test data, with the target specified)
# report = sv.compare([train, "Train Data"], [test, "Test Data"], target_feat='TARGET')

# # Save the report as an HTML file
# report.show_html(open_browser=True)

In [None]:
# Scatter of EXT_SOURCE_1 against the weekly-binned DAYS_BIRTH.
# `size=` in seaborn is a semantic mapping (it groups points and adds a legend entry);
# the marker size itself is controlled by matplotlib's `s`, forwarded via keyword args.
ax = sns.scatterplot(data=train, x='EXT_SOURCE_1', y='DAYS_BIRTH_bin_week', s=2, linewidth=0)
ax.set(title='EXT_SOURCE_1 vs DAYS_BIRTH_bin_week', xlabel='EXT_SOURCE_1', ylabel='DAYS_BIRTH_bin_week');

#### 4. 予測

In [None]:
# Produce predictions with several models via the project helper.
# NOTE(review): hasNan=False is passed although DAYS_EMPLOYED was given NaN values
# earlier in this notebook — confirm what the flag controls in my_utils.
utils.multi_model_predict(X_train, y_train, X_valid, y_valid, hasNan=False)

In [None]:
def objective(args):
    """Hyperopt objective: fit LightGBM with sampled settings and score it.

    Args:
        args: mapping of sampled hyperparameters; the keys read are
            'num_leaves', 'max_depth', 'n_estimators', 'learning_rate',
            'reg_lamb', 'bagging_freq', 'bagging_fraction',
            'feature_fraction' and 'min_data_in_leaf'.

    Returns:
        The negated ROC AUC on (X_valid, y_valid), so that minimising this
        value maximises the validation AUC.
    """
    # Hyperparameters taken from the sampled point ('reg_lamb' feeds reg_lambda;
    # min_child_samples is intentionally not tuned).
    searched = {
        'num_leaves': args['num_leaves'],
        'max_depth': args['max_depth'],
        'n_estimators': args['n_estimators'],
        'learning_rate': args['learning_rate'],
        'reg_lambda': args['reg_lamb'],
        'bagging_freq': args['bagging_freq'],
        'bagging_fraction': args['bagging_fraction'],
        'feature_fraction': args['feature_fraction'],
        'min_data_in_leaf': args['min_data_in_leaf'],
    }
    # Settings read from the notebook-level `params` object.
    fixed = {
        key: params[key]
        for key in ('random_state', 'objective', 'metric', 'verbosity', 'boosting_type')
    }

    model = LGBMClassifier(**searched, **fixed, early_stopping_round=50)
    # The validation split doubles as the early-stopping evaluation set.
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

    valid_proba = model.predict_proba(X_valid)[:, 1]
    return -roc_auc_score(y_valid, valid_proba)

In [16]:
import datetime

# Number of hyperopt evaluations.
n_iter = 50
# Timestamp (yyyyMMddHHmm) of this run; later cells reuse it for the score record
# and for the submission file name.
time = f"{datetime.datetime.now():%Y%m%d%H%M}"
# Collects the result of every trial.
trials = Trials()

# NOTE(review): verbose=-1 was annotated as "print the trial progress"; hyperopt
# documents `verbose` as a boolean, so confirm that -1 is intended.
# Saving trials to a file (trials_save_file under output_path) is currently disabled.
best = fmin(
    fn=objective,
    space=params,
    algo=tpe.suggest,
    max_evals=n_iter,
    trials=trials,
    verbose=-1,
    early_stop_fn=None,
    show_progressbar=True,
)

100%|██████████| 50/50 [06:31<00:00,  7.83s/trial, best loss: -0.7614571527982223]


In [None]:
# Refit LightGBM with the best hyperparameters found by the search.
# fmin returns the raw sampled values keyed by label (for hp.choice this is the index
# of the chosen option, not the option itself), so resolve them against the search
# space before handing them to the model.
best_params = space_eval(params, best)

lgb = LGBMClassifier(
        learning_rate = best_params['learning_rate'],
        max_depth = best_params['max_depth'],
        # min_child_samples is intentionally not tuned
        n_estimators = best_params['n_estimators'],
        num_leaves = best_params['num_leaves'],
        reg_lambda = best_params['reg_lamb'],
        bagging_freq = best_params['bagging_freq'],
        bagging_fraction = best_params['bagging_fraction'],
        feature_fraction = best_params['feature_fraction'],
        min_data_in_leaf = best_params['min_data_in_leaf'],
        # Fixed settings mirror the ones used inside objective(), so the refit model
        # matches the configuration that was actually scored during the search.
        objective = params['objective'],
        metric = params['metric'],
        boosting_type = params['boosting_type'],
        random_state = params['random_state'],
        verbose = -1,)
# eval_set only records the validation metric per iteration (no early stopping is
# configured here), which is what lgbm.plot_metric needs later.
lgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

lgb_train_pred = lgb.predict_proba(X_train)[:, 1]
lgb_valid_pred = lgb.predict_proba(X_valid)[:, 1]
print(best_params)
print(f"Train Score: {roc_auc_score(y_train, lgb_train_pred)}")
print(f"Valid Score: {roc_auc_score(y_valid, lgb_valid_pred)}")

In [None]:
# Record the run in the score sheet via the project helper.
# The best trial's loss is the negated AUC returned by objective(), so it is
# multiplied by -1 to store the AUC itself; `time` is the run timestamp.
# NOTE(review): `best` holds fmin's raw output — if the search space uses hp.choice
# these are option indices rather than the actual values; confirm what should be logged.
utils.save_score_to_exel(utils.exel_path, best, trials.best_trial['result']['loss'] * -1, time)

In [None]:
# Map each positional index to the corresponding column name of `test`.
dic = dict(enumerate(test.columns))
dic

In [None]:
# Plot the feature importances of the fitted model (tall figure so every feature label fits).
lgbm.plot_importance(lgb,figsize=(5, 40))

In [None]:
# Plot the evaluation metric over boosting iterations.
# NOTE(review): this relies on evaluation results recorded during fit, i.e. the model
# must have been fit with an eval_set — confirm, otherwise there is nothing to plot.
lgbm.plot_metric(lgb)

#### 5. 予測結果の作成
最後にテストデータに対して予測を行い、提出用のcsvファイルを作成します。

In [None]:
# Predict positive-class probabilities for the test data.
# Alternative kept for reference (prediction limited to the best iteration):
# pred = lgb.predict_proba(X_test, num_iteration=lgb.best_iteration)[:, 1]
pred = lgb.predict_proba(X_test)[:, 1]

In [None]:
# Store the predictions in the submission frame and display it.
sample_sub['TARGET'] = pred
sample_sub

In [None]:
# Write the submission CSV.
# The file name reuses `time`, the yyyyMMddHHmm timestamp captured when the
# hyperparameter search started (it is not re-read here).
# NOTE(review): output_path was rebound to utils.get_output_path earlier — confirm it
# evaluates to a directory string when interpolated into this path.

sample_sub.to_csv(f'{output_path}/{time}.csv',index=False)

以上で、Home Credit Default Riskコンペのチュートリアルは終了です。今回は、50種類ある特徴量のうち5種類しか使用していないので、まだまだ改善の余地があります。この後は、このnotebookやこれまでの教材を参考にして、さらなるスコアの向上を目指してください！