<h1>purposeの欠損値をLightGBMにより補完する</h1>

# 前処理

## Google DriveのマウントとSEED値とPathの設定

In [1]:
from google.colab import drive

# Mount Google Drive so the input directory below is readable from this runtime.
drive.mount('/content/drive')

# Run-wide settings, collected in one place.
SEED = 3407                  # seed passed to the LightGBM parameters
notebook_name = "notebook2"  # prefix of the prediction CSV files written at the end
# The trailing slash is required: file names are appended with plain `+`.
INPUT_PATH = "/content/drive/MyDrive/input/mufg2024/"

Mounted at /content/drive


## ライブラリの読み込み

In [2]:
import pandas as pd
import numpy as np
import pickle
import warnings
import os
warnings.simplefilter('ignore')  # Suppress every warning for the rest of the session (this also hides pandas/LightGBM diagnostics)

## データの読み込み

In [3]:
# Read both splits from INPUT_PATH; the first CSV column becomes the row index.
train, test = (
    pd.read_csv(INPUT_PATH + csv_name, index_col=0)
    for csv_name in ("train.csv", "test.csv")
)

In [None]:
# NOTE(review): this cell repeats the import and Drive mount already done in the
# first code cell, and it sits after the data has been read from Drive. It looks
# redundant and is probably safe to delete — confirm before removing.
from google.colab import drive
drive.mount('/content/drive')

# LightGBMによるpurposeの欠損値補完

In [4]:
# Preprocessing: separate the rows whose `purpose` is known (used to fit the
# imputation model) from the rows whose `purpose` is missing (to be predicted).

# Integer class id for each purpose label; these ids are the classifier targets.
category_mapping = {
    'major_purchase': 0,
    'credit_card': 1,
    'debt_consolidation': 2,
    'all_other': 3,
    'small_business': 4,
    'home_improvement': 5,
    'educational': 6
}

# --- train ---
# Rows with a known purpose. `.copy()` makes train_df an independent frame, so
# the column assignment below is not a chained assignment on a slice of `train`
# (the pattern that triggers pandas' SettingWithCopyWarning).
train_df = train.dropna(subset=['purpose']).copy()

# Fail fast on labels that category_mapping does not cover: `.map` would
# otherwise turn them into NaN targets without raising any error.
unknown_purposes = sorted(set(train_df['purpose']) - set(category_mapping), key=str)
if unknown_purposes:
    raise ValueError(
        f"purpose values missing from category_mapping: {unknown_purposes}"
    )
train_df['purpose'] = train_df['purpose'].map(category_mapping)

# Features exclude both the imputation target and the `not.fully.paid` column.
X_train = train_df.drop(columns=['purpose', 'not.fully.paid'])
y_train = train_df['purpose']

# Rows with a missing purpose: these are the rows to impute.
predict_df_train = train[train['purpose'].isnull()]
train_predict = predict_df_train.drop(columns=['purpose', 'not.fully.paid'])

# --- test ---
# Rows with a missing purpose. Only `purpose` is dropped here; presumably
# test.csv has no `not.fully.paid` column — confirm.
predict_df_test = test[test['purpose'].isnull()]
test_predict = predict_df_test.drop(columns=['purpose'])

In [5]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import log_loss

# LightGBM settings for the multiclass `purpose` classifier.
# NOTE(review): the numeric values look like the output of a hyperparameter
# search — confirm their provenance before changing them.
params = dict(
    objective='multiclass',
    metric='multi_logloss',
    boosting='gbdt',
    num_class=7,  # one class per entry in category_mapping
    n_estimators=1099,
    learning_rate=0.028887449069387925,
    lambda_l1=9.478586201360532e-06,
    lambda_l2=1.6363144983335496,
    num_leaves=4,
    feature_fraction=0.5339848265686352,
    bagging_fraction=0.6677098113435436,
    bagging_freq=3,
    min_child_samples=63,
    seed=SEED,
    verbosity=-1,
)

# Fit on the rows whose purpose is known.
model = LGBMClassifier(**params)
model.fit(X_train, y_train)

# modelと予測結果のcsvファイルの保存

In [6]:
# Class probabilities for the rows whose purpose is missing.
train_pred = model.predict_proba(train_predict)
test_pred = model.predict_proba(test_predict)

# Wrap the probability arrays in DataFrames; columns are labelled by position
# (0 .. number of probability columns - 1).
train_pred_df = pd.DataFrame(train_pred, columns=list(range(train_pred.shape[1])))
test_pred_df = pd.DataFrame(test_pred, columns=list(range(test_pred.shape[1])))

# Write one CSV per split into the current working directory. The files carry
# no index column, so rows appear in the order of train_predict / test_predict.
train_csv_path = f"{notebook_name}_train.csv"
test_csv_path = f"{notebook_name}_test.csv"
for frame, csv_path in (
    (train_pred_df, train_csv_path),
    (test_pred_df, test_csv_path),
):
    frame.to_csv(csv_path, index=False)