<h1>purposeの欠損値を'purpose_non'で埋める</h1>

# 準備

## Google DriveのマウントとSEED値とPathの設定

In [1]:
# Seed handed to LightGBM through the 'seed' entry of the training parameters.
SEED = 42
# Seed used as random_state for the StratifiedKFold split.
K_FOLD_SEED = 225
# Experiment name; used as the prefix of the prediction CSV file names.
NOTEBOOK = "欠損値なし2"
# Directory the input CSVs are read from. File names are appended by plain string
# concatenation, so the trailing slash is required.
INPUT_PATH = "/content/drive/MyDrive/input/mufg2024/"
# NOTE(review): not referenced by any cell visible in this notebook — confirm whether
# the prediction files were meant to be written here.
OUTPUT_PATH = "/content/drive/MyDrive/model_save/"

# Mount Google Drive at /content/drive so the paths above resolve (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## ライブラリの読み込み

In [2]:
!pip install category_encoders

import pandas as pd
import numpy as np
import seaborn as sns
import category_encoders as ce
import itertools
from sklearn.preprocessing import LabelEncoder
import warnings
import os


warnings.simplefilter('ignore')  # do not display warnings (installs a global 'ignore' filter)

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.3


# 前処理

## データの読み込み

In [3]:
# Read the train and test tables (in that order) from INPUT_PATH, using the first
# CSV column as the index.
train, test = (
    pd.read_csv(INPUT_PATH + csv_name, index_col=0)
    for csv_name in ("completed_train.csv", "completed_test.csv")
)

## 前処理・特徴量生成

### 関数

In [4]:
# Feature generation
def add_new_feature(dataset):
    """Add credit-line tenure features to ``dataset`` in place.

    New columns, in insertion order:
      * ``days.with.cr.line_6month`` / ``_1year`` / ``_2year``: floor division of
        ``days.with.cr.line`` by 182, 365 and 730.
      * ``int.rate_divided_by_days.with.cr.line``: ``int.rate`` divided by
        ``days.with.cr.line`` (no guard against a zero divisor).
      * ``delinq.2yrs_divided_by_days.with.cr.line_6month``: ``delinq.2yrs`` divided
        by the 6-month bucket, forced to 0 where that bucket is 0.

    Returns None; the frame passed in is modified.
    """
    days = dataset['days.with.cr.line']

    # Time-based buckets of the credit-line age.
    tenure_buckets = (
        ('days.with.cr.line_6month', 182),
        ('days.with.cr.line_1year', 365),
        ('days.with.cr.line_2year', 730),
    )
    for bucket_col, bucket_days in tenure_buckets:
        dataset[bucket_col] = days // bucket_days

    dataset['int.rate_divided_by_days.with.cr.line'] = dataset['int.rate'] / days

    half_years = dataset['days.with.cr.line_6month']
    has_half_year = half_years != 0
    # np.where evaluates the quotient for every row; rows with a zero bucket are
    # then replaced by 0.
    dataset['delinq.2yrs_divided_by_days.with.cr.line_6month'] = np.where(
        has_half_year, dataset['delinq.2yrs'] / half_years, 0
    )



# Describe how another feature behaves within each group of a grouping feature
def aggregated_features(cat_col, agg_col, train, test):
    """Add per-group statistics of ``agg_col`` (grouped by ``cat_col``) in place.

    The statistics are computed over ``train`` and ``test`` concatenated, then
    mapped back onto both frames through their ``cat_col`` values.

    Columns added to both frames, in this order:
    ``{cat_col}_{agg_col}_max``, ``_min``, ``_mean``, ``_std``, ``_median`` and
    ``_max_min`` (max minus min).

    Parameters
    ----------
    cat_col : str
        Column whose values define the groups.
    agg_col : str
        Column to summarise within each group.
    train, test : pandas.DataFrame
        Frames that both contain ``cat_col`` and ``agg_col``; modified in place.

    Returns
    -------
    None
        Nothing is added when ``cat_col == agg_col``.
    """
    if cat_col == agg_col:
        return

    agg_types = ['max', 'min', 'mean', 'std', 'median']

    # The combined frame and the groupby do not depend on the statistic, so they
    # are built once here rather than once per statistic.
    combined = pd.concat([train[[cat_col, agg_col]], test[[cat_col, agg_col]]])
    stats = combined.groupby(cat_col)[agg_col].agg(agg_types)

    # Range of the aggregated column inside each group.
    stats['max_min'] = stats['max'] - stats['min']

    for agg_type in agg_types + ['max_min']:
        new_col_name = f"{cat_col}_{agg_col}_{agg_type}"
        mapping = stats[agg_type].to_dict()

        # Create the new feature on both train and test.
        train[new_col_name] = train[cat_col].map(mapping)
        test[new_col_name] = test[cat_col].map(mapping)



# Target Encoding
def _target_encoding(train, test, column, target):
    """Target-encode one column with ``category_encoders.TargetEncoder``.

    The encoder is fitted on ``train[column]`` against ``train[target]`` and the
    same fitted encoder is then applied to ``test[column]``.

    NOTE(review): the fit uses every row of ``train`` that is passed in; if the
    caller cross-validates on that frame afterwards, the encoding has already seen
    the validation targets — confirm this is acceptable.

    Returns a ``(train_encoded, test_encoded)`` pair as produced by the encoder.
    """
    encoder = ce.TargetEncoder(cols=[column])
    encoded_train = encoder.fit_transform(train[column], train[target])
    return encoded_train, encoder.transform(test[column])
def target_encoding(target_encoding_columns, train, test):
    """Replace each listed column by its target encoding, in place.

    The target column is fixed to ``"not.fully.paid"``. For every column the
    original values are dropped and the encoded values are re-inserted under the
    same name, so the column ends up last in both frames. Returns None.
    """
    target_column = "not.fully.paid"
    for column in target_encoding_columns:
        # Run the target encoding for this column.
        encoded_train, encoded_test = _target_encoding(train, test, column, target_column)

        # Swap raw values for encoded ones: train first, then test.
        for frame, encoded in ((train, encoded_train), (test, encoded_test)):
            frame.drop(columns=column, inplace=True)
            frame[column] = encoded

### 処理

In [5]:
# Drop rows with missing values in key columns and remove int.rate outliers.
# The explicit .copy() makes `train` an independent frame: the helpers below add and
# drop columns on it in place, which on a boolean-filtered slice is a chained
# assignment (pandas warns about it, and warnings are silenced in this notebook).
train = train.dropna(subset=['revol.util', 'revol.bal', 'installment'])
train = train[train['int.rate'] < 0.5].copy()

# Feature generation (adds columns in place)
add_new_feature(train)
add_new_feature(test)

# Aggregated features: statistics of each numeric column within each group column
time_features = [
    'days.with.cr.line_6month',
    'days.with.cr.line_1year',
    'days.with.cr.line_2year',
]
category_list = ["purpose", "delinq.2yrs", "inq.last.6mths", "pub.rec"] + time_features
non_category_list = ["int.rate", "fico", "inq.last.6mths", "installment"]
original_list = train.columns.tolist()
for cat_col in category_list:
    for agg_col in non_category_list:
        aggregated_features(cat_col, agg_col, train, test)
# NOTE(review): del_list and ng_list are not read by any cell visible in this
# notebook; kept so that any cell relying on them still works.
del_list = [col for col in test.columns if col not in original_list]
ng_list = ['inq.last.6mths', 'delinq.2yrs', 'pub.rec']

# Target encoding (replaces the column in place and moves it to the end)
target_encoding_columns = ["purpose"]
target_encoding(target_encoding_columns, train, test)

# Data after preprocessing: features as float, target kept separately
df_train = train.drop(columns=['not.fully.paid'])
df_train = df_train.astype(float)
df_test = test.astype(float)
y = train['not.fully.paid']

# Sanity check of the resulting shapes
print(df_train.shape)
print(y.shape)
print(df_test.shape)

(40761, 180)
(40761,)
(40786, 180)


# 学習

In [6]:
# Columns fed to the model.
selected_features = [
    'int.rate',
    'fico',
    'inq.last.6mths',
    'delinq.2yrs',
    'pub.rec',
    'installment',
    'dti',
    'days.with.cr.line_1year',
    'revol.util',
    'days.with.cr.line',
    'annual.inc',
    'purpose',
    'revol.bal',
    'purpose_installment_max',
    'inq.last.6mths_installment_max',
    'days.with.cr.line_6month_installment_mean',
    'days.with.cr.line_6month_fico_median',
    'inq.last.6mths_int.rate_min',
]
df_train_selected = df_train[selected_features]
df_test_selected = df_test[selected_features]

# Convert the target to a plain array once; the isinstance guard leaves it untouched
# when this cell is run again and `y` is no longer a Series.
if isinstance(y, pd.Series):
    y = y.reset_index(drop=True).values

# Split into explanatory variables and target (independent copies)
X = df_train_selected.copy()
y = y.copy()
X_test = df_test_selected.copy()

In [7]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np
from sklearn.preprocessing import StandardScaler


# Hyperparameters passed to LGBMClassifier as keyword arguments.
# NOTE(review): LightGBM's parameter docs list 'colsample_bytree' as an alias of
# 'feature_fraction' and 'subsample' as an alias of 'bagging_fraction'. Both spellings
# of each pair are set below with different values — confirm which one is applied.
# 'n_estimators' is only an upper bound here: each fit below uses early stopping.
params = {
    'learning_rate': 0.020885306641593653,
    'num_leaves': 32,
    'colsample_bytree': 0.6031758618507362,
    'subsample': 0.9445306340392152,
    'max_depth': 3,
    'reg_alpha': 3.5882262964892297,
    'reg_lambda': 0.592022145325183,
    'min_child_weight': 0.0834484551537235,
    'feature_fraction': 0.4730667204518139,
    'bagging_fraction': 0.9423011633687252,
    'bagging_freq': 3,
    'objective': 'binary',
    'metric': 'auc',
    'extra_tree': True,
    'boosting_type': 'gbdt',
    'n_estimators': 20000,
    'seed': SEED,
    'verbosity': -1
}


# Per-fold AUC scores
auc_scores = []
# Out-of-fold predictions for the training rows (filled fold by fold) and
# fold-averaged predictions for the test rows.
y_train_pred = np.zeros(X.shape[0])
y_test_pred = np.zeros(X_test.shape[0])
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=K_FOLD_SEED)

# Train one LightGBM model per fold
for i, (tr_idx, va_idx) in enumerate(kf.split(X, y)):
    # X is indexed positionally; y is indexed directly with the fold index arrays.
    X_train, X_valid = X.iloc[tr_idx], X.iloc[va_idx]
    y_train, y_valid = y[tr_idx], y[va_idx]
    model = LGBMClassifier(**params)
    # Early stopping after 100 rounds without improvement on the validation fold;
    # per-iteration logging is disabled.
    # NOTE(review): the same validation fold drives early stopping and the reported
    # AUC, so the fold scores may be slightly optimistic — confirm this is intended.
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False), lgb.log_evaluation(0)])

    # Positive-class probability at the best iteration found by early stopping.
    val_pred = model.predict_proba(X_valid, num_iteration=model.best_iteration_)[:, 1]
    auc = roc_auc_score(y_valid, val_pred)
    auc_scores.append(auc)
    print(f"Fold {i+1} のAUCスコア: {auc}")

    # Each fold contributes 1/n_splits of the final test prediction.
    y_test_pred += model.predict_proba(X_test, num_iteration=model.best_iteration_)[:, 1] / kf.n_splits
    y_train_pred[va_idx] = val_pred

# Mean of the per-fold AUC scores
cv_score = np.mean(auc_scores)
print(f'Cross-Validation AUC Score: {cv_score}')

Fold 1 のAUCスコア: 0.7994078803659017
Fold 2 のAUCスコア: 0.7972030480171112
Fold 3 のAUCスコア: 0.795155038850255
Fold 4 のAUCスコア: 0.7976460628922235
Fold 5 のAUCスコア: 0.7954975855915257
Cross-Validation AUC Score: 0.7969819231434034


## X_trainの予測値とX_testの予測値をcsv形式で保存する

In [8]:
# Output file names are built from the notebook name only, so both files are written
# relative to the current working directory.
# NOTE(review): OUTPUT_PATH is defined in the setup cell but is not used here —
# confirm the intended destination.
train_path = NOTEBOOK + "_train.csv"
test_path = NOTEBOOK + "_test.csv"

# Write the test predictions as CSV: load the header-less sample submission,
# overwrite its second column with the fold-averaged predictions, save without header.
submit = pd.read_csv(INPUT_PATH + "sample_submission.csv", header=None)
submit[1] = y_test_pred
submit.to_csv(test_path, header=None, index=False)

# Write the out-of-fold training predictions as CSV.
# NOTE(review): index=True writes the new frame's default 0..n-1 index, not the index
# of `train` (from which rows were removed during preprocessing) — confirm that
# consumers of this file expect positional ids.
df_train_pred = pd.DataFrame({'prediction': y_train_pred})
df_train_pred.to_csv(train_path, header=None, index=True)