# Allstate Claims Severity - Final Master Edition

## 1. Import Libraries / ライブラリのインポート
Import the libraries used throughout the notebook (pandas, NumPy, Matplotlib, Seaborn) and enable Japanese font support for plots.
必要なライブラリを読み込みます。日本語フォント対応もここで行います。

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Install and import Japanese font support
# 日本語フォント対応ライブラリのインストールとインポート
!pip install japanize-matplotlib
import japanize_matplotlib

%matplotlib inline
pd.set_option('display.max_columns', None)

## 2. Load Data / データの読み込み
Load the CSV files. The data directory is detected automatically so the notebook runs both on Kaggle and in a local environment.
CSVファイルを読み込みます。Kaggle環境とローカル環境の両方に対応できるようにパスを自動判定します。

In [None]:
# Resolve the data directory: use the Kaggle input mount when its train.csv
# exists, otherwise fall back to the relative '../input/' folder.
on_kaggle = os.path.exists('/kaggle/input/allstate-claims-severity/train.csv')
base_path = '/kaggle/input/allstate-claims-severity/' if on_kaggle else '../input/'
print('Running on Kaggle (Kaggle環境で実行中)' if on_kaggle else 'Running Locally (ローカル環境で実行中)')

# Read the three competition files from the resolved directory.
train, test, sample_submission = (
    pd.read_csv(base_path + file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Report the size of the training and test frames.
for label, frame in (('Train shape (学習データ)', train), ('Test shape (テストデータ)', test)):
    print(f'{label}: {frame.shape}')

## 9. Data Preprocessing / データ前処理
1. Log-transform `loss` with `log1p` (to reduce the skew of its distribution).
2. Combine train and test data (for consistent encoding).
3. Convert categorical variables to numbers (One-Hot Encoding).

1. 目的変数 `loss` を対数変換します（分布を整えるため）。
2. 学習データとテストデータを結合します（カテゴリ変数の変換を統一するため）。
3. カテゴリ変数を数字（0/1）に変換します（One-Hot Encoding）。

In [None]:
# Model the target on a log scale: log1p(loss) is stored next to the raw loss.
train['log_loss'] = np.log1p(train['loss'])

# Continuous columns are the ones whose name contains 'cont'.
cont_features = [col for col in train.columns if 'cont' in col]

# Remove the identifier and both target columns so that train and test
# share the same feature layout before they are stacked.
train_X = train.drop(['id', 'loss', 'log_loss'], axis=1)
test_X = test.drop(['id'], axis=1)

# Tag every row with its origin so the stacked frame can be split again later.
train_X['is_train'] = 1
test_X['is_train'] = 0

# Stack train on top of test so both receive exactly the same dummy columns.
all_data = pd.concat([train_X, test_X], axis=0)
print(f'Combined data shape (結合後のサイズ): {all_data.shape}')

# One-hot encode the categorical (text) columns.
# The dtype is pinned to uint8 because pandas >= 2.0 produces bool dummies by
# default; a frame that mixes bool and float columns converts to an
# object-dtype NumPy array, which Keras cannot turn into a tensor.
print('Processing One-Hot Encoding... (変換中...)')
all_data = pd.get_dummies(all_data, dtype=np.uint8)
print(f'Shape after encoding (変換後のサイズ): {all_data.shape}')

## 10. Split Data for General Models / 全体モデル用のデータ分割


In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Undo the train/test stacking with the 'is_train' flag, then discard the flag.
from_train = all_data['is_train'] == 1
from_test = all_data['is_train'] == 0
X_train_all = all_data.loc[from_train].drop(columns=['is_train'])
X_test_final = all_data.loc[from_test].drop(columns=['is_train'])

# The models are fit on the log-scale target.
y_train_all = train['log_loss']

# Hold out 20% of the training rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_all,
    y_train_all,
    test_size=0.2,
    random_state=42,
)

# Every column that is not a continuous feature is treated as categorical;
# the list is needed later for the categorical-only specialist model.
all_columns = X_train.columns
cont_lookup = set(cont_features)
cat_col_names = [name for name in all_columns if name not in cont_lookup]
print(f'Continuous Features: {len(cont_features)}, Categorical Features: {len(cat_col_names)}')

--- 
# Model 1: XGBoost (All Features) / 全データXGBoost


In [None]:
# Hyperparameters of the all-feature XGBoost regressor (histogram tree method
# on CUDA, early stopping after 50 rounds without improvement).
xgb_all_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
    device='cuda',
    early_stopping_rounds=50,
)
model_xgb = xgb.XGBRegressor(**xgb_all_params)

# Fit on the training split; the validation split drives early stopping.
print('Training XGBoost (All Features)...')
model_xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=False,
)

# Predictions are produced on the log scale and mapped back with expm1,
# for both the test set and the validation split.
pred_xgb_log = model_xgb.predict(X_test_final)
val_pred_xgb_log = model_xgb.predict(X_val)
pred_xgb = np.expm1(pred_xgb_log)
val_pred_xgb = np.expm1(val_pred_xgb_log)
print('XGBoost Finished')

--- 
# Model 2: LightGBM (All Features) / 全データLightGBM


In [None]:
import lightgbm as lgb

# All-feature LightGBM regressor. n_estimators is an upper bound on the number
# of trees; the early-stopping callback below decides how many are kept.
model_lgb = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1,
    device='gpu'
)

print('Training LightGBM (All Features)...')
# Stop once the validation metric has not improved for 50 rounds, matching the
# early stopping used by the XGBoost models. Without the callback the
# validation set is only monitored and all 1000 trees are always fit.
model_lgb.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
    callbacks=[lgb.early_stopping(stopping_rounds=50)]
)

# Predictions are produced on the log scale and mapped back with expm1,
# for both the test set and the validation split.
pred_lgb_log = model_lgb.predict(X_test_final)
pred_lgb = np.expm1(pred_lgb_log)
val_pred_lgb_log = model_lgb.predict(X_val)
val_pred_lgb = np.expm1(val_pred_lgb_log)
print('LightGBM Finished')

--- 
# Model 3: Neural Network (All Features) / 全データニューラルネット


In [None]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Build the neural-network inputs.
# Keras needs one homogeneous float matrix, so every column is cast to float32.
# This also covers bool dummy columns, which would otherwise make the frame
# convert to an object array. astype returns new frames, so the tree-model
# inputs (X_train / X_val / X_test_final) are left untouched.
X_train_nn = X_train.astype(np.float32)
X_val_nn = X_val.astype(np.float32)
X_test_nn = X_test_final.astype(np.float32)

# Standardise the continuous columns only. The scaler is fit on the training
# split and then applied unchanged to the validation and test data.
scaler = StandardScaler()
X_train_nn[cont_features] = scaler.fit_transform(X_train_nn[cont_features])
X_val_nn[cont_features] = scaler.transform(X_val_nn[cont_features])
X_test_nn[cont_features] = scaler.transform(X_test_nn[cont_features])

def create_model(input_dim):
    """Build and compile the fully connected regression network.

    The network has three hidden ReLU layers (256, 128 and 64 units). The
    first two are each followed by batch normalisation and 30% dropout, and
    a single linear unit produces the output.

    Args:
        input_dim: Number of input features (columns) fed to the network.

    Returns:
        A Keras ``Sequential`` model compiled with the Adam optimizer
        (learning rate 0.001) and mean absolute error as the loss.
    """
    # Imported here so the function does not rely on an import it cannot see;
    # the surrounding cell imports only Dense, Dropout and BatchNormalization.
    from tensorflow.keras.layers import Input

    model = Sequential([
        # An explicit Input layer replaces the deprecated `input_dim` keyword
        # on the first Dense layer.
        Input(shape=(input_dim,)),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_absolute_error')
    return model

print('Training Neural Network (All Features)...')
n_features_nn = X_train_nn.shape[1]
model_nn = create_model(n_features_nn)

# Stop when the validation loss has not improved for 10 epochs and roll the
# weights back to the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Training settings for the all-feature network.
nn_fit_options = dict(
    epochs=50,
    batch_size=256,
    callbacks=[early_stopping],
    verbose=0,
)
model_nn.fit(
    X_train_nn,
    y_train,
    validation_data=(X_val_nn, y_val),
    **nn_fit_options,
)

# Predictions are produced on the log scale as 1-D arrays and mapped back
# with expm1, for both the test set and the validation split.
pred_nn_log = model_nn.predict(X_test_nn).flatten()
pred_nn = np.expm1(pred_nn_log)
val_pred_nn_log = model_nn.predict(X_val_nn).flatten()
val_pred_nn = np.expm1(val_pred_nn_log)
print('Neural Network Finished')

--- 
# Expert Models (Diversity Strategy) / 専門家モデルの追加
To add diversity to the ensemble, we train "specialist" models that each see only one part of the feature set.
視点の多様性を確保するため、特徴量の一部だけを見る「専門家」モデルを追加します。

1. **Model 4: NN (Continuous Only)** -> Trained only on the continuous (`cont*`) columns.
2. **Model 5: XGB (Categorical Only)** -> Trained only on the one-hot encoded categorical (`cat*`) columns.

1. **モデル4: 数値専門ニューラルネット** -> cont列しか見せません。
2. **モデル5: カテゴリ専門XGBoost** -> cat列しか見せません。

In [None]:
### Model 4: Continuous Only Neural Network ###
# The continuous columns are sliced out of the NN frames, where they have
# already been standardised.

X_train_cont = X_train_nn.loc[:, cont_features]
X_val_cont = X_val_nn.loc[:, cont_features]
X_test_cont = X_test_nn.loc[:, cont_features]

print('Training Expert Model 4: NN (Continuous Only)...')
# Same architecture as the all-feature network, sized for the continuous columns.
n_features_cont = X_train_cont.shape[1]
model_nn_cont = create_model(n_features_cont)

# Training settings mirror the all-feature network, including the same
# early-stopping callback object.
cont_fit_options = dict(
    epochs=50,
    batch_size=256,
    callbacks=[early_stopping],
    verbose=0,
)
model_nn_cont.fit(
    X_train_cont,
    y_train,
    validation_data=(X_val_cont, y_val),
    **cont_fit_options,
)

# Predictions are produced on the log scale as 1-D arrays and mapped back
# with expm1, for both the test set and the validation split.
pred_nn_cont_log = model_nn_cont.predict(X_test_cont).flatten()
pred_nn_cont = np.expm1(pred_nn_cont_log)
val_pred_nn_cont_log = model_nn_cont.predict(X_val_cont).flatten()
val_pred_nn_cont = np.expm1(val_pred_nn_cont_log)
print('Expert Model 4 Finished')

In [None]:
### Model 5: Categorical Only XGBoost ###
# This model is restricted to the columns listed in 'cat_col_names'.

X_train_cat = X_train.loc[:, cat_col_names]
X_val_cat = X_val.loc[:, cat_col_names]
X_test_cat = X_test_final.loc[:, cat_col_names]

print('Training Expert Model 5: XGBoost (Categorical Only)...')
# Same hyperparameters as the all-feature XGBoost model.
xgb_cat_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
    device='cuda',
    early_stopping_rounds=50,
)
model_xgb_cat = xgb.XGBRegressor(**xgb_cat_params)

# The validation split drives early stopping.
model_xgb_cat.fit(
    X_train_cat,
    y_train,
    eval_set=[(X_val_cat, y_val)],
    verbose=False,
)

# Predictions are produced on the log scale and mapped back with expm1,
# for both the test set and the validation split.
pred_xgb_cat_log = model_xgb_cat.predict(X_test_cat)
val_pred_xgb_cat_log = model_xgb_cat.predict(X_val_cat)
pred_xgb_cat = np.expm1(pred_xgb_cat_log)
val_pred_xgb_cat = np.expm1(val_pred_xgb_cat_log)
print('Expert Model 5 Finished')

## 13. Final 5-Model Optimization / 5モデルの最適化
We now blend five models, choosing the weights that minimize MAE on the validation set:
1. XGB (All)
2. LGB (All)
3. NN (All)
4. NN (Continuous Specialist)
5. XGB (Categorical Specialist)

In [None]:
from scipy.optimize import minimize
from sklearn.metrics import mean_absolute_error

# Validation target mapped back from the log scale with expm1.
y_true = np.expm1(y_val)

# Validation predictions, in the order:
# XGB (all), LGB (all), NN (all), NN (continuous only), XGB (categorical only).
V1, V2, V3, V4, V5 = (
    val_pred_xgb,
    val_pred_lgb,
    val_pred_nn,
    val_pred_nn_cont,
    val_pred_xgb_cat,
)

# Test predictions, in the same model order.
T1, T2, T3, T4, T5 = (
    pred_xgb,
    pred_lgb,
    pred_nn,
    pred_nn_cont,
    pred_xgb_cat,
)

def loss_func(weights):
    """Return the validation MAE of the weighted blend of the five models.

    Args:
        weights: Indexable of five blend weights, applied in order to the
            validation predictions V1..V5.

    Returns:
        Mean absolute error between ``y_true`` and the weighted sum.
    """
    # Accumulate the weighted predictions one model at a time.
    blended = weights[0] * V1
    blended = blended + weights[1] * V2
    blended = blended + weights[2] * V3
    blended = blended + weights[3] * V4
    blended = blended + weights[4] * V5
    return mean_absolute_error(y_true, blended)

# Start from equal weights; each weight is bounded to [0, 1] and the equality
# constraint forces the five weights to sum to 1.
init_weights = [0.2] * 5
bounds = [(0, 1)] * 5
constraints = {'type': 'eq', 'fun': lambda w: 1 - sum(w)}

print('Running 5-Model Optimization...')
res = minimize(
    loss_func,
    init_weights,
    method='SLSQP',
    bounds=bounds,
    constraints=constraints,
)

# Report the weight found for each model and the resulting validation MAE.
print('Optimal Weights (最適な重み):')
model_labels = (
    '1. XGB (All)    ',
    '2. LGB (All)    ',
    '3. NN (All)     ',
    '4. NN (ContOnly)',
    '5. XGB (CatOnly)',
)
for label, weight in zip(model_labels, res.x):
    print(f'{label}: {weight:.4f}')
print(f'Best Validation MAE: {res.fun:.4f}')

# Blend the test predictions with the optimised weights, one model at a time.
pred_ensemble_final = res.x[0] * T1
for weight, test_pred in zip(res.x[1:], (T2, T3, T4, T5)):
    pred_ensemble_final = pred_ensemble_final + weight * test_pred

# Write the submission file: test ids paired with the blended loss.
submission_final = pd.DataFrame({'id': test['id'], 'loss': pred_ensemble_final})
submission_final.to_csv('submission_expert_5models.csv', index=False)
print('submission_expert_5models.csv created!')