<h1>purposeの欠損値をLightGBMにより補完する</h1>

# 前処理

## Google DriveのマウントとPathの設定

In [None]:
from google.colab import drive

# Mount Google Drive so that the files under INPUT_PATH become reachable.
drive.mount('/content/drive')

# Directory (on the mounted Drive) that the input CSV files are read from.
INPUT_PATH = "/content/drive/MyDrive/input/mufg2024/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## ライブラリの読み込み

In [None]:
import pandas as pd
import numpy as np
import pickle
import warnings
import os
warnings.simplefilter('ignore')  # do not display warnings

## データの読み込み

In [None]:
# Load both splits from INPUT_PATH; the first CSV column is used as the index.
train, test = (
    pd.read_csv(INPUT_PATH + csv_name, index_col=0)
    for csv_name in ("train.csv", "test.csv")
)

# アンサンブル（加重平均）によるpurposeの欠損値補完

In [None]:
# Preprocessing: encode `purpose` as integer codes and flag rows where it was missing.
# The codes follow the order of the names below (0 .. 6).
category_mapping = {
    name: code
    for code, name in enumerate([
        'major_purchase',
        'credit_card',
        'debt_consolidation',
        'all_other',
        'small_business',
        'home_improvement',
        'educational',
    ])
}

for frame in (train, test):
    # The missing-value flag has to be taken before the mapping below:
    # map() also yields NaN for any category that is not a key of the mapping.
    frame['purpose_non'] = frame['purpose'].isna().astype(int)
    frame['purpose'] = frame['purpose'].map(category_mapping)

In [None]:
# CSV files to read: notebook1 .. notebook5, one list per data split
csv_files_train = [f"notebook{i}_train.csv" for i in range(1, 6)]
csv_files_test = [f"notebook{i}_test.csv" for i in range(1, 6)]

# Read the CSV files and add them up, each multiplied by its weight
def combine_csv_files(csv_files, weights=None):
    """Return the weighted element-wise sum of the tables stored in ``csv_files``.

    Every file is read with ``pd.read_csv``, multiplied by its weight and added
    to a running total.

    Args:
        csv_files: Paths of the CSV files to combine, in the same order as
            ``weights``.
        weights: One numeric weight per file. Defaults to ``(3, 2, 1, 2, 5)``,
            the weights that were previously hard-coded in this function.

    Returns:
        A DataFrame holding the weighted sum of all files.

    Raises:
        ValueError: If ``csv_files`` is empty, or if the number of files differs
            from the number of weights. (A plain ``zip`` would silently ignore
            the surplus files or weights.)
    """
    if weights is None:
        weights = (3, 2, 1, 2, 5)
    csv_files = list(csv_files)
    weights = list(weights)

    if not csv_files:
        raise ValueError("csv_files must contain at least one file")
    if len(csv_files) != len(weights):
        raise ValueError(
            f"got {len(csv_files)} CSV files but {len(weights)} weights; "
            "pass exactly one weight per file"
        )

    combined_df = None
    for file, weight in zip(csv_files, weights):
        weighted = pd.read_csv(file) * weight
        if combined_df is None:
            combined_df = weighted
        else:
            combined_df += weighted
    return combined_df


def _fill_missing_purpose(purpose, predicted_labels, missing_index):
    """Fill the NaN entries of ``purpose`` with ``predicted_labels`` and cast to int.

    Labels are matched to ``missing_index`` by position: the i-th label fills the
    i-th missing row.
    NOTE(review): this assumes the rows of the combined CSVs are stored in the
    same order as the missing rows of the frame - confirm against the notebooks
    that produced those files.

    Args:
        purpose: The ``purpose`` column, with NaN where a value must be filled.
        predicted_labels: Result of ``idxmax(axis=1)`` on the combined scores,
            i.e. the label of the highest-scoring column for each row.
        missing_index: Index labels of the rows of ``purpose`` that are NaN.

    Returns:
        The filled column with integer dtype.

    Raises:
        ValueError: If the number of labels differs from the number of missing
            rows, or if a label cannot be converted to an integer.
    """
    if len(predicted_labels) != len(missing_index):
        raise ValueError(
            f"{len(predicted_labels)} predicted labels for "
            f"{len(missing_index)} missing rows; the counts must match"
        )
    # idxmax yields column labels, which are strings when the columns come from
    # a CSV header. Converting them to int before filling keeps the column
    # numeric instead of upcasting it to object.
    fill_values = pd.Series(predicted_labels.values, index=missing_index).astype(int)
    return purpose.fillna(fill_values).astype(int)


# Impute the train data
summed_df_train = combine_csv_files(csv_files_train)
max_columns_train = summed_df_train.idxmax(axis=1)
missing_indices_train = train.index[train['purpose'].isna()]
train['purpose'] = _fill_missing_purpose(train['purpose'], max_columns_train, missing_indices_train)

# Impute the test data
summed_df_test = combine_csv_files(csv_files_test)
max_columns_test = summed_df_test.idxmax(axis=1)
missing_indices_test = test.index[test['purpose'].isna()]
test['purpose'] = _fill_missing_purpose(test['purpose'], max_columns_test, missing_indices_test)

# csvファイルの保存

In [None]:
# Write both frames to CSV, keeping the index as the first column.
for frame, out_name in ((train, 'completed_train.csv'), (test, 'completed_test.csv')):
    frame.to_csv(out_name, index=True)