# 準備

## Google DriveのマウントとPathの設定

In [1]:
# Mount Google Drive first; both paths below live under the mount point.
from google.colab import drive
drive.mount('/content/drive')

# Directory holding train.csv, test.csv and sample_submission.csv.
INPUT_PATH = "/content/drive/MyDrive/input/mufg2024/"
# Presumably where trained models are saved; not referenced by the cells shown here.
OUTPUT_PATH = "/content/drive/MyDrive/model_save/"

Mounted at /content/drive


## ライブラリの読み込み

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
import os
from sklearn.metrics import roc_auc_score


warnings.simplefilter('ignore')  # Keep cell output clean; note this silences every warning category

# データの読み込み

In [3]:
# Load the training data, drop rows with a missing value in any of the listed
# columns, then keep only rows whose int.rate is below 0.5.
train = (
    pd.read_csv(INPUT_PATH + "train.csv", index_col=0)
    .dropna(subset=['revol.util', 'revol.bal', 'installment'])
    .loc[lambda frame: frame['int.rate'] < 0.5]
)
# Labels used as y_true for the AUC computations, kept as a one-column DataFrame.
y = train[['not.fully.paid']]
test = pd.read_csv(INPUT_PATH + "test.csv", index_col=0)

# Derive both prediction-file lists from a single list of model names so the
# train and test frames always get their columns in the same order (the two
# hand-written lists previously disagreed on the order of the last two models).
model_names = ['欠損値なし3', '欠損値あり7', '欠損値なし2', '欠損値あり2', '欠損値あり4', '欠損値なし4']
train_list = [f'{name}_train.csv' for name in model_names]
test_list = [f'{name}_test.csv' for name in model_names]

## すべてのファイルの予測値をデータフレームに変える

In [4]:
def load_and_concat_files(file_list):
    """Read the second column of each headerless CSV and join them side by side.

    Args:
        file_list: Iterable of CSV paths. The text before the first underscore
            in each file's base name becomes that file's column label.

    Returns:
        DataFrame with one column per file, in ``file_list`` order, aligned on
        the default row index produced by ``pd.read_csv``.
    """
    columns = []
    for file in file_list:
        # Take the label from the base name only, so an underscore in a
        # directory name (e.g. ".../model_save/") cannot truncate or collide labels.
        feature_name = os.path.basename(file).split('_')[0]

        predictions = pd.read_csv(file, header=None)
        # Position 1 is the second column of the file.
        columns.append(predictions.iloc[:, 1].rename(feature_name))

    return pd.concat(columns, axis=1)

# Read the train-side and test-side prediction files into one column per model.
train_df, test_df = (
    load_and_concat_files(files) for files in (train_list, test_list)
)

# アンサンブル

## それぞれのaucスコアの算出

In [5]:
# AUC of every prediction column in train_df against the training labels.
auc_scores = {
    column: roc_auc_score(y, train_df[column]) for column in train_df.columns
}
for feature, auc in auc_scores.items():
    print(f"Feature: {feature}, AUC: {auc}")

Feature: 欠損値なし3, AUC: 0.7970528351153632
Feature: 欠損値あり7, AUC: 0.797237481180898
Feature: 欠損値なし2, AUC: 0.7969078775784433
Feature: 欠損値あり2, AUC: 0.7967936912700211
Feature: 欠損値あり4, AUC: 0.7965918742607592
Feature: 欠損値なし4, AUC: 0.7963155454507777


## 加える重みの定義

In [6]:
# Weight set 1: raw per-model weights, rescaled below so they sum to 1.
initial_feature_weights = {
    '欠損値なし3': 0.289,
    '欠損値あり7': 0.3,
    '欠損値なし2': 0.0,
    '欠損値あり2': 0.134,
    '欠損値あり4': 0.038,
    '欠損値なし4': 0.113,
}
total_weight = sum(initial_feature_weights.values())
feature_weights_first = {
    name: raw / total_weight
    for name, raw in initial_feature_weights.items()
}


# Weight set 2: same procedure with a second set of raw weights.
initial_feature_weights = {
    '欠損値なし3': 0.23,
    '欠損値あり7': 0.26,
    '欠損値なし2': 0.19,
    '欠損値あり2': 0.18,
    '欠損値あり4': 0.15,
    '欠損値なし4': 0.125,
}
total_weight = sum(initial_feature_weights.values())
feature_weights_second = {
    name: raw / total_weight
    for name, raw in initial_feature_weights.items()
}

## アンサンブルの処理

In [7]:
def load_and_weighted_avg(df, weights):
    """Return the weighted mean of the columns of ``df`` named in ``weights``.

    Args:
        df: DataFrame holding one column per key of ``weights``.
        weights: Mapping of column name to weight. The weights need not sum
            to 1 because the result is divided by their total.

    Returns:
        The row-wise weighted average of the selected columns.
    """
    weighted_total = 0
    for column, weight in weights.items():
        weighted_total = weighted_total + df[column] * weight
    return weighted_total / sum(weights.values())

# Add both blends to each frame: first weight set, then second.
for frame in (train_df, test_df):
    for blend_name, blend_weights in (
        ('average_first', feature_weights_first),
        ('average_second', feature_weights_second),
    ):
        frame[blend_name] = load_and_weighted_avg(frame, blend_weights)

# AUC of each blend on the training predictions
for blend_name in ('average_first', 'average_second'):
    print(roc_auc_score(y, train_df[blend_name]))

0.7981302113510426
0.7980449828998151


# int.rateが外れ値（0.5以上）のテスト行の予測値を0にする

In [8]:
# Renumber the rows of both frames from 0 so they can be matched by label.
test_df = test_df.reset_index(drop=True)
test = test[['int.rate']].reset_index(drop=True)

# Rows whose int.rate is 0.5 or more get a prediction of 0 in both blends.
indices_to_zero = test.index[(test['int.rate'] >= 0.5).to_numpy()]
for blend_name in ('average_first', 'average_second'):
    test_df.loc[indices_to_zero, blend_name] = 0

# 提出ファイルの作成

In [9]:
# The significant digits appear to change once the values go through CSV, so to
# reproduce the submitted numbers each blend is written out as a CSV file and
# read back in before averaging.
submit = pd.read_csv(INPUT_PATH + "sample_submission.csv", header=None)
for blend_name in ('average_first', 'average_second'):
    submit[1] = test_df[blend_name].values
    submit.to_csv(blend_name, header=None, index=False)

# Final step: the submission is the mean of the first and second blends.
# Position 1 is the second column of each file written above.
first = pd.read_csv('average_first', header=None).iloc[:, 1]
second = pd.read_csv('average_second', header=None).iloc[:, 1]
combined_col = (first + second) / 2
submit[1] = combined_col.values
submit.to_csv("last_submissionfile.csv", header=None, index=False)