<h1>purposeの欠損値を'purpose_non'で埋める</h1>

# 準備

## Google DriveのマウントとSEED値とPathの設定

In [8]:
# General-purpose random seed (not referenced by the cells shown in this notebook).
SEED = 42
# random_state handed to StratifiedKFold when the cross-validation folds are built.
K_FOLD_SEED = 225
# Notebook tag; used as the filename prefix for the model pickles and prediction CSVs.
NOTEBOOK = "欠損値あり7"
# Directory that train.csv, test.csv and sample_submission.csv are read from.
INPUT_PATH = "/content/drive/MyDrive/input/mufg2024/"
# Presumably the directory where trained models are stored — TODO confirm against the training notebook.
OUTPUT_PATH = "/content/drive/MyDrive/model_save/"

# Mount Google Drive so that the /content/drive/... paths above resolve (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## ライブラリの読み込み

In [9]:
!pip install category_encoders

import pandas as pd
import numpy as np
import seaborn as sns
import category_encoders as ce
import itertools
from sklearn.preprocessing import LabelEncoder
import warnings
import os


warnings.simplefilter('ignore')  # Silence all warnings for the rest of the session



# 前処理

## データの読み込み

In [10]:
# Load the train and test CSVs from INPUT_PATH; the first CSV column becomes the DataFrame index.
train = pd.read_csv(INPUT_PATH + "train.csv", index_col=0)
test = pd.read_csv(INPUT_PATH + "test.csv", index_col=0)

## 前処理・特徴量生成

### 関数

In [11]:
# Feature generation
def add_new_feature(dataset):
    """Append log-scaled and time-bucketed columns to ``dataset`` in place.

    Adds ``log1p`` versions of 'annual.inc' and 'revol.bal', and buckets
    'days.with.cr.line' by floor division into 182-, 365- and 730-day bins.

    Args:
        dataset: DataFrame containing 'annual.inc', 'revol.bal' and
            'days.with.cr.line'. It is modified in place.

    Returns:
        None.
    """
    # New log column name -> source column it is derived from.
    log_targets = {
        'annual.inc.log': 'annual.inc',
        'revol.bal.log': 'revol.bal',
    }
    for new_name, source_name in log_targets.items():
        dataset[new_name] = np.log1p(dataset[source_name])

    # New bucket column name -> bucket width in days.
    bucket_widths = {
        'days.with.cr.line_6month': 182,
        'days.with.cr.line_1year': 365,
        'days.with.cr.line_2year': 730,
    }
    credit_days = dataset['days.with.cr.line']
    for new_name, width in bucket_widths.items():
        dataset[new_name] = credit_days // width


# Per-category statistics of another feature, added as new columns
def aggregated_features(cat_col, agg_col, train, test):
    """Add group-wise statistics of ``agg_col`` per ``cat_col`` value to both frames.

    The ``cat_col`` / ``agg_col`` columns of ``train`` and ``test`` are pooled,
    grouped by ``cat_col``, and the max, min, mean, std, median, range
    (max - min) and 25% / 75% quantiles of ``agg_col`` are computed per group.
    Each statistic is mapped back onto ``train`` and ``test`` in place as a new
    column named ``{cat_col}_{agg_col}_{stat}``. Column creation order is
    max, min, mean, std, median, max_min, quantile_25, quantile_75.

    Note that the statistics are computed over train AND test rows together.

    Args:
        cat_col: Name of the column whose values define the groups.
        agg_col: Name of the column that is aggregated within each group.
        train: Training DataFrame; modified in place.
        test: Test DataFrame; modified in place.

    Returns:
        None. Nothing is added when ``cat_col == agg_col``.
    """
    if cat_col == agg_col:
        return
    agg_types = ['max', 'min', 'mean', 'std', 'median']
    quantiles = [0.25, 0.75]

    # Pool and group once: the pooled frame only depends on the two input
    # columns, so it does not need to be rebuilt for every statistic.
    combined = pd.concat([train[[cat_col, agg_col]], test[[cat_col, agg_col]]])
    grouped = combined.groupby(cat_col)[agg_col]

    # One pass for all basic statistics; one result column per agg type.
    stats = grouped.agg(agg_types)
    for agg_type in agg_types:
        new_col_name = f"{cat_col}_{agg_col}_{agg_type}"
        mapping = stats[agg_type].to_dict()
        train[new_col_name] = train[cat_col].map(mapping)
        test[new_col_name] = test[cat_col].map(mapping)

    # Range (max - min) per group, reusing the statistics computed above.
    max_min_col_name = f"{cat_col}_{agg_col}_max_min"
    max_min = (stats['max'] - stats['min']).to_dict()
    train[max_min_col_name] = train[cat_col].map(max_min)
    test[max_min_col_name] = test[cat_col].map(max_min)

    # Quantiles per group.
    for q in quantiles:
        quantile_col_name = f"{cat_col}_{agg_col}_quantile_{int(q*100)}"
        quantile_dict = grouped.quantile(q).to_dict()
        train[quantile_col_name] = train[cat_col].map(quantile_dict)
        test[quantile_col_name] = test[cat_col].map(quantile_dict)



# Target Encoding
def _target_encoding(train, test, column, target):
    """Target-encode a single column using an encoder fitted on ``train`` only.

    Args:
        train: Training DataFrame containing ``column`` and ``target``.
        test: Test DataFrame containing ``column``.
        column: Name of the column to encode.
        target: Name of the target column in ``train``.

    Returns:
        Tuple ``(encoded_train, encoded_test)`` as returned by the encoder's
        ``fit_transform`` and ``transform`` calls respectively.
    """
    encoder = ce.TargetEncoder(cols=[column])
    encoded_train = encoder.fit_transform(train[column], train[target])
    encoded_test = encoder.transform(test[column])
    return encoded_train, encoded_test
def target_encoding(target_encoding_columns, train, test):
    """Replace each listed column in ``train`` and ``test`` with its target encoding.

    The target column is fixed to "not.fully.paid". Each encoded column is
    dropped and re-added under the same name, so it ends up as the last column
    of its frame. Both frames are modified in place.

    Args:
        target_encoding_columns: Iterable of column names to encode.
        train: Training DataFrame (must contain "not.fully.paid").
        test: Test DataFrame.

    Returns:
        None.
    """
    target_column = "not.fully.paid"
    for column in target_encoding_columns:
        # Run the target encoding for this column
        encoded_train, encoded_test = _target_encoding(train, test, column, target_column)
        # Swap the raw column for the encoded one, train first and then test.
        for frame, encoded in ((train, encoded_train), (test, encoded_test)):
            frame.drop(column, axis=1, inplace=True)
            frame[column] = encoded


# Generate product / ratio features for every eligible pair of columns
def make_new_features(df, del_list, ng_list, time_list):
    """Add pairwise product and ratio columns to ``df`` in place.

    Every unordered pair (including a column with itself) of columns not in
    ``del_list`` is considered, in column order. A pair is skipped when both
    members are in ``ng_list`` or both are in ``time_list``. For a column paired
    with itself only ``{f}_times_{f}`` is added; otherwise both
    ``{f1}_times_{f2}`` and ``{f1}_divided_by_{f2}`` are added, with the ratio
    set to 0 wherever ``f2`` equals 0.

    Args:
        df: DataFrame to extend; modified in place.
        del_list: Column names excluded from pairing.
        ng_list: Column names that must not be paired with each other.
        time_list: Time-derived column names that must not be paired with each other.

    Returns:
        None.
    """
    # Snapshot the candidates first so that newly added columns are never paired.
    candidates = [name for name in df.columns if name not in del_list]
    for position, left in enumerate(candidates):
        for right in candidates[position:]:
            both_ng = left in ng_list and right in ng_list
            both_time = left in time_list and right in time_list
            if both_ng or both_time:
                continue
            df[f'{left}_times_{right}'] = df[left] * df[right]
            if left != right:
                df[f'{left}_divided_by_{right}'] = np.where(df[right] != 0, df[left] / df[right], 0)


### 処理

In [12]:
# NOTE: this cell mutates `train` / `test` in place (for example, 'purpose' is
# replaced by its target encoding), so re-run the data loading cell before
# re-running this one.

# Drop rows with missing values in key columns and remove interest-rate outliers
# (rows whose int.rate is missing fail the comparison and are dropped as well).
# .copy() makes the filtered frame own its data, so the in-place feature
# generation below writes to `train` itself rather than to a slice of the
# unfiltered frame (which pandas reports as SettingWithCopyWarning).
train = train.dropna(subset=['revol.util', 'revol.bal', 'installment'])
train = train[train['int.rate'] < 0.5].copy()

# Feature generation
add_new_feature(train)
add_new_feature(test)

# For each categorical-like column, add min / max / etc. of other features
# within each category as new features.
time_features = [
    'days.with.cr.line_6month',
    'days.with.cr.line_1year',
    'days.with.cr.line_2year',
]
category_list = ["purpose", "delinq.2yrs", "inq.last.6mths", "pub.rec"] + time_features
non_category_list = ["int.rate", "fico", "inq.last.6mths", "installment"]
# Columns present before aggregation; anything added afterwards is an aggregate.
original_list = train.columns.tolist()
for cat_col in category_list:
    for agg_col in non_category_list:
        aggregated_features(cat_col, agg_col, train, test)
# Aggregate columns are excluded from the pairwise arithmetic features below.
del_list = [col for col in test.columns if col not in original_list]
# Columns that must not be combined with each other in the arithmetic features.
ng_list = ['inq.last.6mths', 'delinq.2yrs', 'pub.rec']

# Target Encoding
target_encoding_columns = ["purpose"]
target_encoding(target_encoding_columns, train, test)

# Data after preprocessing
df_train = train.drop(columns=['not.fully.paid'])
df_train = df_train.astype(float)
df_test = test.astype(float)
y = train['not.fully.paid']

# Feature generation (pairwise arithmetic)
make_new_features(df_train, del_list, ng_list, time_features)
make_new_features(df_test, del_list, ng_list, time_features)

# Sanity-check the resulting shapes
print(df_train.shape)
print(y.shape)
print(df_test.shape)

(40761, 504)
(40761,)
(40786, 504)


# 学習（保存済みモデルの読み込みと評価）

In [13]:
# Features fed to the models.
selected_features = [
    'int.rate',
    'fico',
    'inq.last.6mths',
    'delinq.2yrs',
    'pub.rec',
    'revol.bal',
    'days.with.cr.line_1year',
    'dti',
    'purpose',
    'revol.util',
    'annual.inc',
    'installment',
    'inq.last.6mths_fico_quantile_75',
    'delinq.2yrs_fico_min',
    'inq.last.6mths_installment_median',
    'days.with.cr.line_1year_fico_quantile_25',
    'purpose_installment_min',
]
print(len(selected_features))
print(selected_features)

# Restrict both datasets to the selected features.
df_train_selected = df_train[selected_features]
df_test_selected = df_test[selected_features]

# Turn the target into a positional NumPy array; if it is no longer a Series
# (e.g. this cell was already run) it is left as it is.
if isinstance(y, pd.Series):
    y = y.reset_index(drop=True).values

# Split into explanatory variables and target, working on copies.
X = df_train_selected.copy()
y = y.copy()
X_test = df_test_selected.copy()

17
['int.rate', 'fico', 'inq.last.6mths', 'delinq.2yrs', 'pub.rec', 'revol.bal', 'days.with.cr.line_1year', 'dti', 'purpose', 'revol.util', 'annual.inc', 'installment', 'inq.last.6mths_fico_quantile_75', 'delinq.2yrs_fico_min', 'inq.last.6mths_installment_median', 'days.with.cr.line_1year_fico_quantile_25', 'purpose_installment_min']


In [14]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np
from sklearn.preprocessing import StandardScaler
import pickle

# Load a saved fold model
def load_model(idx):
    """Unpickle the model saved for fold ``idx``.

    The file is named ``{NOTEBOOK}_{idx}.pkl``. It is looked up relative to the
    current working directory first (the original behaviour) and, if it is not
    there, inside ``OUTPUT_PATH``, so that loading does not depend on the
    kernel's working directory having been changed beforehand.

    Args:
        idx: Fold index used in the pickle file name.

    Returns:
        The unpickled model object.

    Raises:
        FileNotFoundError: If the pickle exists in neither location.
    """
    file_name = NOTEBOOK + f"_{idx}.pkl"
    candidates = [file_name, os.path.join(OUTPUT_PATH, file_name)]
    for load_path in candidates:
        if os.path.exists(load_path):
            break
    else:
        raise FileNotFoundError(f"Model file not found; tried: {candidates}")

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load pickles produced by your own training run.
    with open(load_path, 'rb') as file:
        model = pickle.load(file)
    return model

# Per-fold AUC scores, plus buffers for out-of-fold and test predictions
auc_scores = []
y_train_pred = np.zeros(X.shape[0])
y_test_pred = np.zeros(X_test.shape[0])
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=K_FOLD_SEED)

# Evaluate the saved model of each fold. No model is fitted in this loop: the
# model for fold i is loaded with load_model(i) and only used for prediction.
# NOTE(review): the AUCs are only true out-of-fold scores if this split
# reproduces the folds used when the models were trained — TODO confirm.
for i, (tr_idx, va_idx) in enumerate(kf.split(X, y)):
    # X_train / y_train are assigned here but not used further in this loop.
    X_train, X_valid = X.iloc[tr_idx], X.iloc[va_idx]
    y_train, y_valid = y[tr_idx], y[va_idx]
    model = load_model(i)
    # Probability of the positive class on the validation rows of this fold.
    val_pred = model.predict_proba(X_valid, num_iteration=model.best_iteration_)[:, 1]
    auc = roc_auc_score(y_valid, val_pred)
    auc_scores.append(auc)
    print(f"Fold {i+1} のAUCスコア: {auc}")

    # Test prediction is the average over the fold models.
    y_test_pred += model.predict_proba(X_test, num_iteration=model.best_iteration_)[:, 1] / kf.n_splits
    # Store this fold's validation predictions at their row positions.
    y_train_pred[va_idx] = val_pred

# Mean of the per-fold AUC scores
cv_score = np.mean(auc_scores)
print(f'Cross-Validation AUC Score: {cv_score}')

Fold 1 のAUCスコア: 0.8006954981853267
Fold 2 のAUCスコア: 0.7969880829933225
Fold 3 のAUCスコア: 0.7939791568043674
Fold 4 のAUCスコア: 0.7991331769073355
Fold 5 のAUCスコア: 0.7958535654392084
Cross-Validation AUC Score: 0.7973298960659122


# X_trainの予測値とX_testの予測値をcsv形式で保存する

In [15]:
# Output file names; no directory is given, so they are written relative to
# the current working directory.
train_path = NOTEBOOK + "_train.csv"
test_path = NOTEBOOK + "_test.csv"

# Write the test predictions as CSV: sample_submission.csv is read without a
# header row and its column 1 is overwritten with the averaged predictions.
submit = pd.read_csv(INPUT_PATH + "sample_submission.csv", header=None)
submit[1] = y_test_pred
submit.to_csv(test_path, header=None, index=False)

# Write the train-side (validation) predictions as CSV. The index written here
# is the positional row number of the prediction array, not the train.csv index.
df_train_pred = pd.DataFrame({'prediction': y_train_pred})
df_train_pred.to_csv(train_path, header=None, index=True)