<a href="https://colab.research.google.com/github/hassaku12/manabiDX2025/blob/main/%E6%BC%94%E7%BF%9203%EF%BC%88%E8%A9%A6%E3%81%99%E7%94%A8%EF%BC%89.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## データの読み取り

In [None]:
# First, mount Google Drive at /content/drive so the notebook can access it
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install japanize-matplotlib (needed only when the runtime does not already provide it)
!pip install japanize-matplotlib

Collecting japanize-matplotlib
  Downloading japanize-matplotlib-1.1.3.tar.gz (4.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/4.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/4.1 MB[0m [31m30.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m4.1/4.1 MB[0m [31m81.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/4.1 MB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: japanize-matplotlib
  Building wheel for japanize-matplotlib (setup.py) ... [?25l[?25hdone
  Created wheel for japanize-matplotlib: filename=japanize_matplotlib-1.1.3-py3-none-any.whl size=4120257 sha256=f507c6abc2039c2703f1c7197195b96b7aadfb9a060b3cb0006bc9fd38bcff92
  Stored in directory: /root/.cache/pip/wheels

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import japanize_matplotlib
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier # この行を追加
from sklearn.linear_model import LogisticRegression # この行を追加
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score # この行を追加
from sklearn.preprocessing import StandardScaler # この行を追加
from sklearn.model_selection import cross_val_score # この行を追加
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Google Drive folder from which the input CSV files are read
base_dir = '/content/drive/MyDrive/Colab Notebooks/マナビDX'

In [None]:
# Training data
train_df = pd.read_csv(base_dir + '/train.csv')

# Evaluation (test) data
test_df = pd.read_csv(base_dir + '/test.csv')

# Sample of the submission file format (read with header=None, so columns are labelled 0, 1, ...)
submission_df = pd.read_csv(base_dir + '/sample_submit.csv', header=None)

# Customer attribute / investment-experience data
assessment_df = pd.read_csv(base_dir + '/適合性判定シート一覧表.csv')

## データ前処理と特徴量エンジニアリング（すべてを関数で統一）

In [None]:
# assessment_df can contain several rows per customer ID; keep only the row with the
# most recent trade date for each customer.
assessment_df['取引日'] = pd.to_datetime(assessment_df['取引日'])
assessment_df_latest = (
    assessment_df
    .sort_values(by=['顧客ID', '取引日'])
    .drop_duplicates(subset='顧客ID', keep='last')
)
# Only the customer ID, age and stock-investment-experience columns are carried forward.
assessment_df_selected = assessment_df_latest[['顧客ID', '顧客年齢', '投資経験（株式）']]

In [None]:
# Attach the customer attributes to both train_df and test_df (left join on customer ID,
# so rows without a matching assessment record are kept with missing attributes).
train_df = train_df.merge(assessment_df_selected, how='left', on='顧客ID')
test_df = test_df.merge(assessment_df_selected, how='left', on='顧客ID')

In [None]:
# Fill missing values.
# The age fill value is computed from the training data only and reused for the test
# data, so both sets are imputed with the same statistic (previously the test set was
# filled with its own mean, making the preprocessing differ between train and test).
age_fill_value = train_df['顧客年齢'].mean()
train_df['顧客年齢'] = train_df['顧客年齢'].fillna(age_fill_value)
test_df['顧客年齢'] = test_df['顧客年齢'].fillna(age_fill_value)

# Missing stock-investment experience is filled with 0.
train_df['投資経験（株式）'] = train_df['投資経験（株式）'].fillna(0)
test_df['投資経験（株式）'] = test_df['投資経験（株式）'].fillna(0)

In [None]:
# Split the '基準年月' string on '-' and store the first two parts as integer
# 'year' and 'month' columns, for the training frame first and then the test frame.
for frame in (train_df, test_df):
    period_parts = frame['基準年月'].str.split('-')
    frame['year'] = period_parts.str[0].astype(int)
    frame['month'] = period_parts.str[1].astype(int)

In [None]:
def create_trend_features(df):
    """Add per-customer time-series trend features to a copy of ``df``.

    For every customer ID the first value, last value and standard deviation of
    '時価価額', '資産規模' and '評価損益' are computed, together with the row count
    and the number of distinct '基準年月' values. Change-rate and volatility ratios
    are derived from those statistics and everything is joined back onto each row.

    Rows are ordered by '基準年月' inside each customer before aggregating, so that
    'first' and 'last' mean the earliest and latest period regardless of the row
    order of the input. Periods that cannot be parsed as dates are placed last.

    Args:
        df: DataFrame with columns '顧客ID', '基準年月', '時価価額', '資産規模'
            and '評価損益'.

    Returns:
        A new DataFrame with the original rows plus the trend columns.
    """
    df = df.copy()

    # Sort chronologically within each customer; groupby keeps the row order inside
    # every group, so 'first'/'last' below follow time instead of file order.
    period_key = pd.to_datetime(df['基準年月'], errors='coerce')
    ordered = (
        df.assign(_period_key=period_key)
        .sort_values(['顧客ID', '_period_key'], na_position='last')
    )

    # Per-customer statistics; std is NaN for single-record customers and becomes 0.
    customer_trends = ordered.groupby('顧客ID').agg({
        '時価価額': ['first', 'last', 'std'],
        '資産規模': ['first', 'last', 'std'],
        '評価損益': ['first', 'last', 'std'],
        '基準年月': ['count', 'nunique']
    }).fillna(0)

    # Flatten the two-level column index into '<column>_<statistic>' names.
    customer_trends.columns = ['_'.join(col) for col in customer_trends.columns]

    # Relative change from first to last value; the +1 in the denominator avoids
    # division by zero when the first value is 0.
    customer_trends['時価価額_変化率'] = (
        (customer_trends['時価価額_last'] - customer_trends['時価価額_first']) /
        (customer_trends['時価価額_first'] + 1)
    )
    customer_trends['資産規模_変化率'] = (
        (customer_trends['資産規模_last'] - customer_trends['資産規模_first']) /
        (customer_trends['資産規模_first'] + 1)
    )
    customer_trends['損益_変化率'] = (
        (customer_trends['評価損益_last'] - customer_trends['評価損益_first']) /
        (abs(customer_trends['評価損益_first']) + 1)
    )

    # Volatility relative to the latest level.
    customer_trends['時価価額_変動係数'] = (
        customer_trends['時価価額_std'] / (customer_trends['時価価額_last'] + 1)
    )
    customer_trends['資産_安定性'] = (
        customer_trends['資産規模_std'] / (customer_trends['資産規模_last'] + 1)
    )

    # Join the customer-level statistics back onto every row of the original frame.
    df = df.merge(customer_trends, left_on='顧客ID', right_index=True, how='left')

    # Rows whose customer ID had no statistics get 0 for the derived ratios.
    trend_cols = [col for col in df.columns if any(x in col for x in ['変化率', '変動係数', '安定性'])]
    for col in trend_cols:
        df[col] = df[col].fillna(0)

    return df

In [None]:
def create_trading_behavior_features(df, base_dir, cutoff_date='2021-11-30', verbose=False):
    """Add customer behaviour features built from the trade-execution CSV.

    Two groups of features are added to a copy of ``df``:

    * Holdings-based customer statistics computed from ``df`` itself
      ('顧客平均時価価額', '顧客記録数', '顧客資産変動', '相対時価価額'). These are
      computed for every customer in ``df``, including customers that have no
      trades in the execution file.
    * Trading-behaviour statistics aggregated per customer from
      ``base_dir + '/約定データ一覧表.csv'``, restricted to trades dated on or before
      ``cutoff_date``. Customers without such trades get 0 for numeric columns and
      'その他' for '投資家タイプ'.

    The function is best-effort: if anything fails (for example the CSV cannot be
    read) the error is printed and the expected feature columns that are still
    missing are added with neutral values.

    Args:
        df: DataFrame with at least '顧客ID' and '時価価額'.
        base_dir: Directory that contains '約定データ一覧表.csv'.
        cutoff_date: Last trade date (inclusive) that is used for the features.
        verbose: When True, print the column lists and shapes before/after the merge.

    Returns:
        A new DataFrame with the feature columns added; the input is not modified.
    """
    # Work on a copy so the fallback path below cannot modify the caller's frame.
    df = df.copy()
    customer_level_cols = ['顧客平均時価価額', '顧客記録数', '顧客資産変動', '相対時価価額']

    try:
        # --- Holdings-based statistics, mapped onto every row by customer ID ---
        market_value_by_customer = df.groupby('顧客ID')['時価価額']
        df['顧客平均時価価額'] = df['顧客ID'].map(market_value_by_customer.mean())
        df['顧客記録数'] = df['顧客ID'].map(df.groupby('顧客ID').size())
        df['顧客資産変動'] = df['顧客ID'].map(market_value_by_customer.std())
        # Customer average relative to the overall average market value.
        overall_avg_market_value = df['時価価額'].mean()
        df['相対時価価額'] = df['顧客平均時価価額'] / (overall_avg_market_value + 1)
        df[customer_level_cols] = df[customer_level_cols].fillna(0)

        # --- Trading statistics from the execution file ---
        trading_df = pd.read_csv(base_dir + '/約定データ一覧表.csv')
        trading_df['取引日'] = pd.to_datetime(trading_df['取引日'])

        # Only trades up to the cutoff date are used.
        past_trading = trading_df[trading_df['取引日'] <= pd.Timestamp(cutoff_date)]
        trades_by_customer = past_trading.groupby('顧客ID')

        customer_behavior = trades_by_customer.agg({
            '取得価額': ['count', 'mean', 'std', 'sum'],
            '売却損益': ['sum', 'mean', 'std', 'count'],
            '償還損益': ['sum', 'count'],
            'オンライン取引フラグ': ['mean', 'sum'],
            'ゴール設定実施': ['mean', 'sum'],
            'ロスカット設定実施': ['mean', 'sum'],
        }).fillna(0)
        customer_behavior.columns = [f'取引_{source}_{stat}'
                                     for source, stat in customer_behavior.columns]

        # Trade-date statistics are computed directly, which avoids depending on the
        # auto-generated '<lambda_N>' column names of a multi-lambda aggregation.
        trade_dates = trades_by_customer['取引日']
        customer_behavior['取引期間_日数'] = (trade_dates.max() - trade_dates.min()).dt.days
        customer_behavior['取引日数'] = trade_dates.nunique()
        customer_behavior['総取引回数'] = trade_dates.count()

        # --- Derived features ---
        customer_behavior['過去取引回数'] = customer_behavior['総取引回数']
        # Cumulative realised profit/loss: sale plus redemption.
        customer_behavior['過去累積損益'] = (
            customer_behavior['取引_売却損益_sum'] + customer_behavior['取引_償還損益_sum']
        )
        customer_behavior['投資成功体験'] = (customer_behavior['過去累積損益'] > 0).astype(int)

        customer_behavior['平均利益率'] = (
            customer_behavior['取引_売却損益_sum'] /
            (customer_behavior['取引_取得価額_sum'] + 1)
        )
        # Trades per year over the customer's active trading span.
        customer_behavior['取引頻度'] = (
            customer_behavior['総取引回数'] /
            (customer_behavior['取引期間_日数'] + 1) * 365
        )
        customer_behavior['数字活用度'] = customer_behavior['取引_オンライン取引フラグ_mean']
        customer_behavior['リスク管理度'] = (
            customer_behavior['取引_ゴール設定実施_mean'] +
            customer_behavior['取引_ロスカット設定実施_mean']
        ) / 2

        # --- Investor type; later assignments override earlier ones ---
        customer_behavior['投資家タイプ'] = 'その他'

        # Active: trade count above the 70th percentile and positive cumulative P/L.
        active_mask = (
            (customer_behavior['総取引回数'] > customer_behavior['総取引回数'].quantile(0.7)) &
            (customer_behavior['投資成功体験'] > 0)
        )
        customer_behavior.loc[active_mask, '投資家タイプ'] = 'アクティブ'

        # Careful: uses goal / loss-cut settings in more than half of the trades on average.
        careful_mask = customer_behavior['リスク管理度'] > 0.5
        customer_behavior.loc[careful_mask, '投資家タイプ'] = '慎重'

        # Digital: more than half of the trades are online.
        digital_mask = customer_behavior['数字活用度'] > 0.5
        customer_behavior.loc[digital_mask, '投資家タイプ'] = 'デジタル'

        if verbose:
            print("Debug: customer_behavior columns before merge:", customer_behavior.columns.tolist())
            print("Debug: customer_behavior shape before merge:", customer_behavior.shape)
            print("Debug: df columns before merge:", df.columns.tolist())
            print("Debug: df shape before merge:", df.shape)

        # Drop stale copies of the behaviour columns so a re-run on an already
        # processed frame does not create '_x' / '_y' suffixed duplicates.
        behavior_cols = list(customer_behavior.columns)
        df = df.drop(columns=[col for col in behavior_cols if col in df.columns])
        df = df.merge(customer_behavior, left_on='顧客ID', right_index=True, how='left')

        if verbose:
            print("Debug: df columns after merge:", df.columns.tolist())
            print("Debug: df shape after merge:", df.shape)

        # Customers without trades before the cutoff get neutral values.
        for col in behavior_cols:
            if col != '投資家タイプ':
                df[col] = df[col].fillna(0)
            else:
                df[col] = df[col].fillna('その他')

        print("📈 取引行動特徴量を追加しました")
        return df

    except Exception as e:
        # Best-effort fallback: report the problem and make sure every feature the
        # downstream pipeline expects exists, using neutral values where needed.
        print(f"⚠️ 取引データ読み込みエラーまたは特徴量作成エラー: {e}")
        dummy_cols = ['過去取引回数', '過去累積損益', '投資成功体験', '顧客平均時価価額',
                      '顧客記録数', '顧客資産変動', '相対時価価額', '平均利益率',
                      '取引頻度', '数字活用度', 'リスク管理度', '投資家タイプ']
        for col in dummy_cols:
            if col not in df.columns:
                if col == '投資家タイプ':
                    df[col] = 'その他'
                else:
                    df[col] = 0
        return df

In [None]:
# [Improvement 3] Stronger address / regional features
def create_regional_features(df):
    """Add statistics aggregated per '住所コード' (address code) to a copy of ``df``.

    Target-free statistics (market value, asset size, customer age, investment
    policy, valuation P/L) are always computed, so frames without a label column
    (e.g. test data) receive regional features as well. Statistics of the label
    'y' and the features derived from them ('地域_y_*', '地域_取引活発度',
    '地域_取引活発度_順位') are added only when ``df`` contains 'y'.

    NOTE(review): the 'y'-based columns are computed from the same rows they are
    joined onto and do not exist for unlabeled frames — confirm that downstream
    code handles their absence as intended.

    Args:
        df: DataFrame with '住所コード', '時価価額', '資産規模', '顧客年齢',
            '投資方針', '評価損益' and optionally 'y'.

    Returns:
        A new DataFrame with the '地域_*' columns added.
    """
    df = df.copy()
    has_target = 'y' in df.columns

    # Aggregation spec; the label statistics come first when a label is available.
    agg_spec = {}
    if has_target:
        agg_spec['y'] = ['mean', 'sum', 'count']
    agg_spec.update({
        '時価価額': ['mean', 'median', 'std'],
        '資産規模': ['mean', 'median'],
        '顧客年齢': ['mean', 'std'],
        '投資方針': ['mean'],
        '評価損益': ['mean', 'std'],
    })

    # std is NaN for regions with a single record; those become 0.
    regional_stats = df.groupby('住所コード').agg(agg_spec).fillna(0)
    regional_stats.columns = [f'地域_{source}_{stat}' for source, stat in regional_stats.columns]

    # Derived regional characteristics.
    if has_target:
        regional_stats['地域_取引活発度'] = regional_stats['地域_y_mean']
    regional_stats['地域_富裕度'] = regional_stats['地域_時価価額_mean']
    regional_stats['地域_安定度'] = 1 / (regional_stats['地域_時価価額_std'] + 1)
    regional_stats['地域_成熟度'] = regional_stats['地域_顧客年齢_mean']

    # Percentile rank of each region.
    if has_target:
        regional_stats['地域_取引活発度_順位'] = regional_stats['地域_取引活発度'].rank(pct=True)
    regional_stats['地域_富裕度_順位'] = regional_stats['地域_富裕度'].rank(pct=True)

    # Drop stale copies so re-running on a processed frame does not create
    # '_x' / '_y' suffixed duplicates, then join the statistics onto every row.
    df = df.drop(columns=[col for col in regional_stats.columns if col in df.columns])
    df = df.merge(regional_stats, left_on='住所コード', right_index=True, how='left')

    # Rows without regional statistics get the column median.
    regional_cols = [col for col in df.columns if col.startswith('地域_')]
    for col in regional_cols:
        df[col] = df[col].fillna(df[col].median())

    return df

In [None]:
def create_advanced_features_v2(df, base_dir=None):
    """Build the engineered feature set on a copy of ``df``.

    Adds basic valuation ratios, calendar features, an age bucket, percentile
    ranks, interaction terms, the trend / trading-behaviour / regional features
    produced by the helper functions, and finally a set of binary flags.

    Args:
        df: Input frame; must already contain 'month', '顧客年齢', '投資方針',
            '取得価額', '時価価額', '評価損益' and '投資経験（株式）'.
        base_dir: Directory passed to ``create_trading_behavior_features``; the
            trading-behaviour step is skipped when this is falsy.

    Returns:
        A new DataFrame with the feature columns added.
    """
    df = df.copy()

    # --- Basic valuation features ---
    cost = df['取得価額']
    value = df['時価価額']
    has_cost = cost != 0
    df['資産規模'] = cost + value
    # Ratios fall back to a neutral value when the acquisition cost is 0.
    df['含み損益率'] = np.where(has_cost, (value - cost) / cost, 0)
    df['評価倍率'] = np.where(has_cost, value / cost, 1)

    # --- Calendar and age-bucket features ---
    month = df['month']
    df['quarter'] = ((month - 1) // 3) + 1
    df['is_quarter_end'] = month.isin([3, 6, 9, 12]).astype(int)
    df['age_group'] = pd.cut(df['顧客年齢'], bins=[0, 50, 100], labels=['young', 'senior'])

    # --- Percentile ranks over the whole frame ---
    for source in ('時価価額', '資産規模', '評価損益'):
        df[f'{source}_順位'] = df[source].rank(pct=True)

    # --- Interaction terms ---
    df['年齢×資産規模'] = df['顧客年齢'] * df['資産規模']
    df['投資方針×時価価額'] = df['投資方針'] * df['時価価額']

    # --- Features produced by the helper functions ---
    df = create_trend_features(df)
    print("📊 トレンド特徴量を追加")

    if base_dir:
        df = create_trading_behavior_features(df, base_dir)

    df = create_regional_features(df)
    print("🗺️ 地域特徴量を追加")

    # --- Binary flags ---
    asset_threshold = df['資産規模'].quantile(0.8)
    df['高資産フラグ'] = df['資産規模'].gt(asset_threshold).astype(int)
    df['含み損フラグ'] = df['評価損益'].lt(0).astype(int)
    df['シニアフラグ'] = df['顧客年齢'].ge(65).astype(int)
    df['株式経験あり'] = df['投資経験（株式）'].gt(0).astype(int)

    return df

## 複数モデルのアンサンブル

In [None]:
class OptimizedEnsemblePredictor:
    """Weighted ensemble of a random forest ('rf') and a gradient boosting ('gb') classifier."""

    def __init__(self):
        """Create the two unfitted estimators with fixed hyper-parameters."""
        random_forest = RandomForestClassifier(
            n_estimators=120,
            max_depth=7,
            min_samples_leaf=15,
            min_samples_split=30,
            max_features=0.6,
            random_state=42,
            n_jobs=-1,
        )
        gradient_boosting = GradientBoostingClassifier(
            n_estimators=80,
            max_depth=4,
            learning_rate=0.1,
            subsample=0.7,
            min_samples_leaf=20,
            random_state=42,
        )
        self.models = {'rf': random_forest, 'gb': gradient_boosting}

    def fit(self, X, y, term_id):
        """Fit both estimators on (X, y) and print a cross-validation report.

        For each estimator the 5-fold ROC-AUC is compared with the AUC on the
        training data itself; the difference is reported as the overfitting gap.
        ``term_id`` is only used to label the printed lines.
        """
        print(f"\n学習データ形状: {X.shape}")
        print(f"陽性率: {y.mean():.4f}")

        # Fit every estimator on the full training data.
        for estimator in self.models.values():
            estimator.fit(X, y)

        print("\n=== 交差検証結果 ===")
        for label, estimator in self.models.items():
            fold_aucs = cross_val_score(estimator, X, y, cv=5, scoring='roc_auc')
            mean_cv_auc = fold_aucs.mean()
            print(f"{label} - Term {term_id} CV AUC: {mean_cv_auc:.4f} (+/- {fold_aucs.std() * 2:.4f})")

            # AUC on the training data, used as an overfitting check.
            in_sample_auc = roc_auc_score(y, estimator.predict_proba(X)[:, 1])
            gap = in_sample_auc - mean_cv_auc
            print(f"  Train AUC: {in_sample_auc:.4f}")
            print(f"  過学習ギャップ: {gap:.4f}")

            if gap > 0.15:
                verdict = "  ⚠️ 過学習が検出されました"
            elif gap > 0.10:
                verdict = "  📊 軽度の過学習"
            else:
                verdict = "  ✅ 過学習は抑制されています"
            print(verdict)

    def predict_proba(self, X, term_id):
        """Return the blended positive-class probability: 0.6 * rf + 0.4 * gb.

        ``term_id`` is accepted for interface symmetry with ``fit`` and is not used.
        """
        positive_scores = {
            label: self.models[label].predict_proba(X)[:, 1]
            for label in ('rf', 'gb')
        }
        return 0.6 * positive_scores['rf'] + 0.4 * positive_scores['gb']

## モデル評価の強化

In [None]:
## Model evaluation function
def evaluate_model_performance(y_true, y_pred_proba, threshold=0.23):
    """Compute and print AUC, precision, recall and F1 for predicted probabilities.

    AUC is computed on the probabilities; the other metrics are computed on the
    labels obtained by thresholding them (``y_pred_proba >= threshold``).
    Each printed line also shows the target value set for the PoC.

    Returns:
        dict with the keys 'AUC', 'Precision', 'Recall' and 'F1'.
    """
    predicted_labels = (y_pred_proba >= threshold).astype(int)

    metrics = {'AUC': roc_auc_score(y_true, y_pred_proba)}
    label_scorers = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )
    for metric_name, scorer in label_scorers:
        # zero_division=0 yields 0 instead of a warning when a denominator is empty.
        metrics[metric_name] = scorer(y_true, predicted_labels, zero_division=0)

    auc_value = metrics['AUC']
    precision_value = metrics['Precision']
    recall_value = metrics['Recall']
    f1_value = metrics['F1']
    print(f"AUC: {auc_value:.4f} (目標: 0.7以上)")
    print(f"Precision: {precision_value:.4f} (目標: 0.4以上)")
    print(f"Recall: {recall_value:.4f} (目標: 0.6以上)")
    print(f"F1 Score: {f1_value:.4f} (目標: 0.5以上)")

    return metrics

## メイン処理（改善版）

In [None]:
def main_prediction_pipeline(train_df, test_df, n_terms=6, threshold=0.23):
    """Train one ensemble per term and predict the matching test rows.

    For each term ``i`` in ``1..n_terms`` the rows flagged by ``train_term_{i}``
    are used to fit an ``OptimizedEnsemblePredictor``; the rows flagged by
    ``test_term_{i}`` are then scored with it. Categorical columns are one-hot
    encoded and the test matrix is aligned to the training columns.

    Args:
        train_df: Feature-engineered training frame containing 'y' and the
            ``train_term_*`` flags.
        test_df: Feature-engineered test frame containing 'ID' and the
            ``test_term_*`` flags.
        n_terms: Number of terms to process.
        threshold: Probability threshold used for the printed precision/recall/F1.

    Returns:
        (y_pred_df, ensemble_models): a frame with the columns 'ID' and 'predict'
        for all scored test rows, and a dict mapping term number to fitted ensemble.

    Note:
        The AUC values printed in the summary are computed on the training data.
    """
    feature_columns = [
        # Basic features
        '取得価額', '時価価額', '評価損益', '投資方針',
        '顧客年齢', '投資経験（株式）', 'year', 'month',
        '資産規模', '含み損益率', 'quarter', 'is_quarter_end', 'age_group',
        '時価価額_順位', '資産規模_順位',
        '年齢×資産規模',
        '高資産フラグ', '含み損フラグ', 'シニアフラグ', '株式経験あり',

        # Trading-behaviour / customer-level features
        '過去取引回数', '過去累積損益', '投資成功体験',
        '顧客平均時価価額', '顧客記録数', '顧客資産変動', '相対時価価額',

        # Trend features
        '時価価額_first', '時価価額_last', '時価価額_std', '資産規模_first', '資産規模_last', '資産規模_std',
        '評価損益_first', '評価損益_last', '評価損益_std', '基準年月_count', '基準年月_nunique',
        '時価価額_変化率', '資産規模_変化率', '損益_変化率', '時価価額_変動係数', '資産_安定性',

        # Regional features
        '地域_y_mean', '地域_y_sum', '地域_y_count', '地域_時価価額_mean', '地域_時価価額_median', '地域_時価価額_std',
        '地域_資産規模_mean', '地域_資産規模_median', '地域_顧客年齢_mean', '地域_顧客年齢_std', '地域_投資方針_mean',
        '地域_評価損益_mean', '地域_評価損益_std', '地域_取引活発度', '地域_富裕度', '地域_安定度', '地域_成熟度',
        '地域_取引活発度_順位', '地域_富裕度_順位'
    ]

    print(f"使用特徴量数: {len(feature_columns)}")

    categorical_cols = ['投資方針', 'age_group']
    ensemble_models = {}
    term_result_frames = []  # collected per term and concatenated once at the end
    all_auc_scores = []

    for i in range(1, n_terms + 1):
        print(f"\n{'='*60}")
        print(f"Term {i} の処理")
        print(f"{'='*60}")

        # --- Training data for this term ---
        train_term = train_df[train_df[f'train_term_{i}'] == 1]
        # Use only the listed features that actually exist in the frame.
        valid_feature_columns = [col for col in feature_columns if col in train_term.columns]
        X_train = train_term[valid_feature_columns]
        y_train = train_term['y']

        print(f"Columns in X_train before encoding: {X_train.columns.tolist()}")

        # One-hot encode the categorical columns that are present.
        categorical_cols_to_encode = [col for col in categorical_cols if col in X_train.columns]
        X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols_to_encode,
                                         drop_first=True, dtype=int)

        print(f"エンコーディング後特徴量数: {X_train_encoded.shape[1]}")
        print(f"学習データ数: {len(y_train)}, 陽性率: {y_train.mean():.4f}")

        # --- Fit the ensemble ---
        ensemble = OptimizedEnsemblePredictor()
        ensemble.fit(X_train_encoded, y_train, i)
        ensemble_models[i] = ensemble

        # --- Evaluation on the training data ---
        train_pred = ensemble.predict_proba(X_train_encoded, i)
        print("\n=== 学習データ評価 ===")
        metrics = evaluate_model_performance(y_train, train_pred, threshold=threshold)
        all_auc_scores.append(metrics['AUC'])

        # --- Test data for this term ---
        test_term = test_df[test_df[f'test_term_{i}'] == 1]
        valid_feature_columns_test = [col for col in feature_columns if col in test_term.columns]
        X_test = test_term[valid_feature_columns_test]
        # Encode the same categorical columns as for training, skipping any that the
        # test frame does not have (get_dummies raises KeyError for unknown columns).
        test_cols_to_encode = [col for col in categorical_cols_to_encode if col in X_test.columns]
        X_test_encoded = pd.get_dummies(X_test, columns=test_cols_to_encode,
                                        drop_first=True, dtype=int)

        # Align to the training matrix: add missing columns as 0, drop extra
        # columns and enforce the same column order.
        X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

        test_pred = ensemble.predict_proba(X_test_encoded, i)

        term_result_frames.append(pd.DataFrame({
            'ID': test_term['ID'],
            'predict': test_pred
        }))

    y_pred_df = pd.concat(term_result_frames) if term_result_frames else pd.DataFrame()

    # --- Summary over all terms ---
    print(f"\n{'='*60}")
    print("過学習修正後のAUCサマリー")
    print(f"{'='*60}")
    for i, auc in enumerate(all_auc_scores, 1):
        print(f"Term {i}: {auc:.4f}")

    avg_auc = np.mean(all_auc_scores)
    print(f"\n平均AUC: {avg_auc:.4f}")

    if avg_auc >= 0.65:
        print("🎉 過学習修正成功！AUC 0.65達成")
    elif avg_auc >= 0.62:
        print("📈 過学習が改善されました")
    else:
        print(f"📊 現在のAUC: {avg_auc:.4f} (過学習ギャップを確認)")

    return y_pred_df, ensemble_models

In [None]:
# Run the main pipeline
if __name__ == "__main__":
    print("シンプル予測パイプライン開始")

    # Apply advanced feature engineering.
    # NOTE: this overwrites train_df / test_df, so re-running this cell without
    # reloading the data applies the feature engineering a second time.
    train_df = create_advanced_features_v2(train_df, base_dir)
    test_df = create_advanced_features_v2(test_df, base_dir)

    print("\n--- Debug: Columns after create_advanced_features_v2 ---")
    print("Train_df columns:", train_df.columns.tolist())
    print("Test_df columns:", test_df.columns.tolist())
    print("--- End Debug ---")

    # Run the per-term training and prediction.
    y_pred_df, models = main_prediction_pipeline(train_df, test_df)

    # Feature-importance sample matrix (Term 1).
    train_term_1 = train_df[train_df['train_term_1'] == 1]

    feature_columns = [
        # Basic features
        '取得価額', '時価価額', '評価損益', '投資方針',
        '顧客年齢', '投資経験（株式）', 'year', 'month',
        '資産規模', '含み損益率', 'quarter', 'is_quarter_end', 'age_group',
        '時価価額_順位', '資産規模_順位',
        '年齢×資産規模',
        '高資産フラグ', '含み損フラグ', 'シニアフラグ', '株式経験あり',

        # Seven customer-level features
        '過去取引回数', '過去累積損益', '投資成功体験',
        '顧客平均時価価額', '顧客記録数', '顧客資産変動', '相対時価価額'
    ]

    # Keep only the features that exist in the Term 1 frame and report the rest.
    missing_in_train_term_1 = [col for col in feature_columns if col not in train_term_1.columns]
    if missing_in_train_term_1:
        print(f"Warning: Missing columns in train_term_1 for feature importance analysis: {missing_in_train_term_1}")
    feature_columns_for_importance = [col for col in feature_columns if col in train_term_1.columns]

    # Encode only the categorical columns that are actually present; get_dummies
    # raises KeyError when asked to encode a missing column.
    dummy_columns = [col for col in ['投資方針', 'age_group'] if col in feature_columns_for_importance]
    X_sample = pd.get_dummies(train_term_1[feature_columns_for_importance],
                              columns=dummy_columns, drop_first=True, dtype=int)

    # Run whichever importance analyzer is available in the notebook namespace.
    # The previous check only looked for `analyze_feature_importance`, which this
    # notebook never defines, so it printed an error on every run.
    legacy_analyzer = globals().get('analyze_feature_importance')
    detailed_analyzer = globals().get('analyze_feature_importance_detailed')
    if callable(legacy_analyzer):
        importance_df = legacy_analyzer(models, X_sample.columns)
    elif callable(detailed_analyzer):
        importance_df = detailed_analyzer(models)
    else:
        print("Note: no feature-importance analyzer is defined yet; "
              "define analyze_feature_importance_detailed and call it with `models`.")

シンプル予測パイプライン開始
📊 トレンド特徴量を追加
Debug: customer_behavior columns before merge: ['取引_取得価額_count', '取引_取得価額_mean', '取引_取得価額_std', '取引_取得価額_sum', '取引_売却損益_sum', '取引_売却損益_mean', '取引_売却損益_std', '取引_売却損益_count', '取引_償還損益_sum', '取引_償還損益_count', '取引_オンライン取引フラグ_mean', '取引_オンライン取引フラグ_sum', '取引_ゴール設定実施_mean', '取引_ゴール設定実施_sum', '取引_ロスカット設定実施_mean', '取引_ロスカット設定実施_sum', '取引期間_日数', '取引日数', '総取引回数', '過去取引回数', '過去累積損益', '投資成功体験', '顧客平均時価価額', '顧客記録数', '顧客資産変動', '相対時価価額', '平均利益率', '取引頻度', '数字活用度', 'リスク管理度', '投資家タイプ']
Debug: customer_behavior shape before merge: (872, 31)
Debug: df columns before merge: ['ID', '顧客ID', '住所コード', '顧客氏名', '基準年月', '取得価額', '時価価額', '評価損益', '投資方針', 'train_term_1', 'train_term_2', 'train_term_3', 'train_term_4', 'train_term_5', 'train_term_6', '翌月_購入', '翌月_売却', '翌々月_購入', '翌々月_売却', 'y', '顧客年齢', '投資経験（株式）', 'year', 'month', '資産規模', '含み損益率', '評価倍率', 'quarter', 'is_quarter_end', 'age_group', '時価価額_順位', '資産規模_順位', '評価損益_順位', '年齢×資産規模', '投資方針×時価価額', '時価価額_first', '時価価額_last', '時価価額_std', '資

In [None]:
# 現在のモデルパフォーマンス詳細分析

def detailed_performance_analysis(train_df, models):
    """Print data-quality diagnostics for each of the six training terms.

    For every term (rows where ``train_term_{n} == 1``) the function reports the
    row count, the positive rate, the class-imbalance ratio, columns with more
    than 5% missing values and columns with more than 10% IQR outliers. A summary
    table is displayed and terms with detected issues are listed.

    Args:
        train_df: Training frame containing 'y' and ``train_term_1`` .. ``train_term_6``.
        models: Accepted for call compatibility; not used by this function.

    Returns:
        DataFrame with one row per term and the columns 'term', 'data_count',
        'positive_rate', 'imbalance_ratio', 'missing_issues', 'outlier_issues'.
    """
    results_summary = []

    for term_id in range(1, 7):
        print(f"\n{'='*50}")
        print(f"Term {term_id} 詳細分析")
        print(f"{'='*50}")

        train_term = train_df[train_df[f'train_term_{term_id}'] == 1]
        positive_count = train_term['y'].sum()
        negative_count = (train_term['y'] == 0).sum()

        # Basic statistics
        print(f"データ数: {len(train_term)}")
        print(f"陽性率: {train_term['y'].mean():.4f}")
        print(f"陽性数: {positive_count}")
        print(f"陰性数: {negative_count}")

        # Class imbalance (negatives per positive); infinite when there are no positives.
        if positive_count > 0:
            imbalance_ratio = negative_count / positive_count
        else:
            imbalance_ratio = float('inf')
        print(f"クラス不均衡比: {imbalance_ratio:.2f}:1")

        # Missing-value check on the core feature columns
        feature_columns = ['取得価額', '時価価額', '評価損益', '投資方針',
                          '顧客年齢', '投資経験（株式）', '資産規模', '含み損益率']

        missing_info = {}
        for col in feature_columns:
            if col in train_term.columns:
                missing_pct = train_term[col].isnull().mean() * 100
                missing_info[col] = missing_pct
                if missing_pct > 5:
                    print(f"⚠️ {col}: {missing_pct:.1f}% 欠損")

        # Outlier check with the 1.5 * IQR rule
        outlier_info = {}
        for col in ['取得価額', '時価価額', '評価損益']:
            if col in train_term.columns:
                Q1 = train_term[col].quantile(0.25)
                Q3 = train_term[col].quantile(0.75)
                IQR = Q3 - Q1
                outliers = ((train_term[col] < Q1 - 1.5*IQR) |
                           (train_term[col] > Q3 + 1.5*IQR)).sum()
                outlier_pct = outliers / len(train_term) * 100
                outlier_info[col] = outlier_pct
                if outlier_pct > 10:
                    print(f"📊 {col}: {outlier_pct:.1f}% 外れ値")

        results_summary.append({
            'term': term_id,
            'data_count': len(train_term),
            'positive_rate': train_term['y'].mean(),
            'imbalance_ratio': imbalance_ratio,
            'missing_issues': sum(1 for pct in missing_info.values() if pct > 5),
            'outlier_issues': sum(1 for pct in outlier_info.values() if pct > 10)
        })

    # Summary table
    print(f"\n{'='*60}")
    print("全Term サマリー")
    print(f"{'='*60}")

    summary_df = pd.DataFrame(results_summary)
    display(summary_df)

    # Identify problematic terms. The plain record dicts are iterated instead of
    # summary_df.iterrows(), which upcasts the integer term id to float and made
    # the report read "Term 1.0".
    problem_terms = []
    for row in results_summary:
        issues = []
        if row['positive_rate'] < 0.05:
            issues.append("極度の不均衡")
        if row['imbalance_ratio'] > 50:
            issues.append("重度の不均衡")
        if row['missing_issues'] > 0:
            issues.append("欠損値問題")
        if row['outlier_issues'] > 0:
            issues.append("外れ値問題")

        if issues:
            problem_terms.append(f"Term {row['term']}: {', '.join(issues)}")

    if problem_terms:
        print("\n🚨 要注意Term:")
        for problem in problem_terms:
            print(f"  {problem}")
    else:
        print("\n✅ 全Termでデータ品質は良好")

    return summary_df

# 特徴量重要度の詳細分析
def analyze_feature_importance_detailed(models):
    """Report feature importances of the Term 1 ensemble.

    The importances of the 'rf' and 'gb' estimators stored under ``models[1]`` are
    averaged per feature. The top 15 features and the features whose average
    importance is below 0.01 (removal candidates, at most 10 shown) are printed.

    Args:
        models: Mapping from term number to an object whose ``models`` dict holds
            fitted 'rf' and 'gb' estimators.

    Returns:
        DataFrame with the columns 'feature', 'rf_importance', 'gb_importance' and
        'avg_importance', sorted by 'avg_importance' descending; ``None`` when
        ``models`` has no entry for term 1.
    """
    print(f"\n{'='*60}")
    print("特徴量重要度分析")
    print(f"{'='*60}")

    # Term 1 is used as the representative sample.
    if 1 not in models:
        print("モデルが見つかりません")
        return None

    rf_model = models[1].models['rf']
    gb_model = models[1].models['gb']

    importance_df = pd.DataFrame({
        # Feature names are taken from the fitted model.
        'feature': rf_model.feature_names_in_,
        'rf_importance': rf_model.feature_importances_,
        'gb_importance': gb_model.feature_importances_
    })
    importance_df['avg_importance'] = (importance_df['rf_importance'] +
                                       importance_df['gb_importance']) / 2
    importance_df = importance_df.sort_values('avg_importance', ascending=False)

    # The tables are printed: passing the to_string() result to display() shows the
    # escaped repr of the string (one long line containing '\n') instead of a table.
    print("Top 15 重要特徴量:")
    print(importance_df.head(15)[['feature', 'avg_importance']].to_string(index=False))

    # Low-importance features (removal candidates)
    low_importance = importance_df[importance_df['avg_importance'] < 0.01]
    print(f"\n削除候補特徴量（重要度 < 0.01）: {len(low_importance)}個")
    if len(low_importance) > 0:
        print(low_importance[['feature', 'avg_importance']].head(10).to_string(index=False))

    return importance_df

# 実行例（実際のデータで実行してください）
# summary_df = detailed_performance_analysis(train_df, models)
# importance_df = analyze_feature_importance_detailed(models)

In [None]:
# Run the detailed analysis of the current models
summary_df = detailed_performance_analysis(train_df, models)

# Analyze the feature importances
importance_df = analyze_feature_importance_detailed(models)


Term 1 詳細分析
データ数: 80000
陽性率: 0.1972
陽性数: 15778
陰性数: 64222
クラス不均衡比: 4.07:1
📊 評価損益: 16.7% 外れ値

Term 2 詳細分析
データ数: 82000
陽性率: 0.1976
陽性数: 16204
陰性数: 65796
クラス不均衡比: 4.06:1
📊 評価損益: 16.6% 外れ値

Term 3 詳細分析
データ数: 84000
陽性率: 0.1980
陽性数: 16635
陰性数: 67365
クラス不均衡比: 4.05:1
📊 評価損益: 16.4% 外れ値

Term 4 詳細分析
データ数: 86000
陽性率: 0.1985
陽性数: 17067
陰性数: 68933
クラス不均衡比: 4.04:1
📊 評価損益: 16.3% 外れ値

Term 5 詳細分析
データ数: 88000
陽性率: 0.1989
陽性数: 17500
陰性数: 70500
クラス不均衡比: 4.03:1
📊 評価損益: 16.1% 外れ値

Term 6 詳細分析
データ数: 90000
陽性率: 0.1989
陽性数: 17899
陰性数: 72101
クラス不均衡比: 4.03:1
📊 評価損益: 16.0% 外れ値

全Term サマリー


Unnamed: 0,term,data_count,positive_rate,imbalance_ratio,missing_issues,outlier_issues
0,1,80000,0.197225,4.070351,0,1
1,2,82000,0.19761,4.060479,0,1
2,3,84000,0.198036,4.049594,0,1
3,4,86000,0.198453,4.038964,0,1
4,5,88000,0.198864,4.028571,0,1
5,6,90000,0.198878,4.028214,0,1



🚨 要注意Term:
  Term 1.0: 外れ値問題
  Term 2.0: 外れ値問題
  Term 3.0: 外れ値問題
  Term 4.0: 外れ値問題
  Term 5.0: 外れ値問題
  Term 6.0: 外れ値問題

特徴量重要度分析
Top 15 重要特徴量:


' feature  avg_importance\n    時価価額        0.316366\n 時価価額_順位        0.254873\n 年齢×資産規模        0.096884\n  過去取引回数        0.076896\n    資産規模        0.028906\n    取得価額        0.025357\n 資産規模_順位        0.022657\n   含み損益率        0.017532\n    評価損益        0.013338\n    year        0.010733\n評価損益_std        0.010666\n  損益_変化率        0.009611\n時価価額_変化率        0.006958\n資産規模_変化率        0.006803\n  資産_安定性        0.006494'


削除候補特徴量（重要度 < 0.01）: 52個


'  feature  avg_importance\n   損益_変化率        0.009611\n 時価価額_変化率        0.006958\n 資産規模_変化率        0.006803\n   資産_安定性        0.006494\n   過去累積損益        0.006301\n時価価額_変動係数        0.005800\n資産規模_last        0.005652\n   相対時価価額        0.005543\n時価価額_last        0.005154\n     顧客年齢        0.004657'

In [None]:
# Create the submission file.
# Predictions are matched to the submission rows by ID instead of by DataFrame index:
# index alignment silently produces NaN (or misplaced values) whenever the index of
# y_pred_df does not line up with the row order of the sample submission.
# NOTE(review): assumes column 0 of sample_submit.csv holds the test IDs — confirm.
predictions_by_id = y_pred_df.groupby('ID')['predict'].mean()  # averages if an ID was scored in several terms
submission_df[1] = submission_df[0].map(predictions_by_id)

missing_prediction_count = int(submission_df[1].isna().sum())
if missing_prediction_count:
    raise ValueError(
        f"{missing_prediction_count} submission rows have no prediction; "
        "check that column 0 of sample_submit.csv contains the test IDs and that "
        "every test row belongs to a test term"
    )

submission_df.to_csv('improved_submission.csv', index=False, header=False)

print("\n提出用ファイル 'improved_submission.csv' が作成されました。")
print("このファイルをコンペティションサイトに提出してください。")


提出用ファイル 'improved_submission.csv' が作成されました。
このファイルをコンペティションサイトに提出してください。
