In [22]:
import os

# Get the directory containing the current notebook
notebook_dir = os.path.dirname(os.path.abspath("__file__"))

# Change working directory to notebook folder
os.chdir(notebook_dir)

# Verify
print("Current working directory:", os.getcwd())

Current working directory: /Users/maxosone/Desktop/デスクトップ/VS Code/Python


In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import TargetEncoder  # sklearn's TargetEncoder
from xgboost import XGBClassifier
import lightgbm as lgb
import optuna
from sklearn.metrics import cohen_kappa_score

# -----------------------------
# Load cleaned data
# -----------------------------
train = pd.read_csv('/Users/maxosone/Downloads/train_clean.csv')
holdout = pd.read_csv('/Users/maxosone/Downloads/holdout_cleaned.csv')

y_train = train['damage_grade']
X_train = train.drop(columns=['damage_grade'])

y_holdout = holdout['damage_grade']
X_holdout = holdout.drop(columns=['damage_grade'])

# Features to target‐encode
geo_target = ['geo__geo_level_2_id', 'geo__geo_level_3_id']

In [31]:
def objective(trial):
    # Hyperparameters for LightGBM
    params = {
        'objective': 'multiclass',
        'num_class': 3,
        'metric': 'multi_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 8, 128),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'n_estimators': 200  # num_boost_roundに相当
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []

    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx].copy(), X_train.iloc[val_idx].copy()
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        te = TargetEncoder(cv=5, shuffle=True, random_state=42, target_type="multiclass")
        X_tr_enc = te.fit_transform(X_tr[geo_target], y_tr)
        X_val_enc = te.transform(X_val[geo_target])
        X_tr_enc = pd.DataFrame(X_tr_enc, index=X_tr.index, columns=te.get_feature_names_out(geo_target))
        X_val_enc = pd.DataFrame(X_val_enc, index=X_val.index, columns=te.get_feature_names_out(geo_target))

        X_tr_full = pd.concat([X_tr.drop(columns=geo_target), X_tr_enc], axis=1)
        X_val_full = pd.concat([X_val.drop(columns=geo_target), X_val_enc], axis=1)

        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_tr_full, y_tr,
            eval_set=[(X_val_full, y_val)],
            eval_metric='multi_logloss',
            callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)]
        )

        y_pred = model.predict(X_val_full)
        acc = cohen_kappa_score(y_pred, y_val, weights = "quadratic")
        cv_scores.append(acc)

    return np.mean(cv_scores)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30, show_progress_bar=True)

print('Best trial:', study.best_trial.value)
print('Best params:', study.best_trial.params)

[I 2025-11-23 15:32:17,789] A new study created in memory with name: no-name-df0c307b-0147-45cf-9cff-bddc2f3326d1


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-11-23 15:33:16,029] Trial 0 finished with value: 0.5948502528627386 and parameters: {'learning_rate': 0.013567552816106715, 'num_leaves': 50, 'feature_fraction': 0.9884699186402135, 'bagging_fraction': 0.622445533739382, 'bagging_freq': 10, 'min_data_in_leaf': 100}. Best is trial 0 with value: 0.5948502528627386.
[I 2025-11-23 15:33:51,637] Trial 1 finished with value: 0.600502041204356 and parameters: {'learning_rate': 0.026250471756295447, 'num_leaves': 19, 'feature_fraction': 0.91605383974835, 'bagging_fraction': 0.7524873762958857, 'bagging_freq': 5, 'min_data_in_leaf': 27}. Best is trial 1 with value: 0.600502041204356.
[I 2025-11-23 15:34:51,028] Trial 2 finished with value: 0.6067756902282122 and parameters: {'learning_rate': 0.03820797515427053, 'num_leaves': 40, 'feature_fraction': 0.9439492623845227, 'bagging_fraction': 0.9640996599556884, 'bagging_freq': 7, 'min_data_in_leaf': 96}. Best is trial 2 with value: 0.6067756902282122.
[I 2025-11-23 15:36:08,087] Trial 3 fi

In [32]:
te_final = TargetEncoder(cv=5, shuffle=True, random_state=42)
X_train_enc = te_final.fit_transform(X_train[geo_target], y_train)
X_holdout_enc = te_final.transform(X_holdout[geo_target])

X_train_enc = pd.DataFrame(X_train_enc, index=X_train.index, columns=te_final.get_feature_names_out(geo_target))
X_holdout_enc = pd.DataFrame(X_holdout_enc, index=X_holdout.index, columns=te_final.get_feature_names_out(geo_target))

X_train_full = X_train.copy()
X_holdout_full = X_holdout.copy()
X_train_full.drop(columns=geo_target, inplace=True)
X_holdout_full.drop(columns=geo_target, inplace=True)

X_train_full = pd.concat([X_train_full, X_train_enc], axis=1)
X_holdout_full = pd.concat([X_holdout_full, X_holdout_enc], axis=1)

final_model = lgb.LGBMClassifier(**study.best_params, eval_metric='multi_logloss', random_state=42, n_jobs=1)
final_model.fit(X_train_full, y_train)

# -----------------------------
# Evaluate on holdout
# -----------------------------
y_pred = final_model.predict(X_holdout_full)
holdout_acc = cohen_kappa_score(y_pred, y_holdout, weights = "quadratic")
print("Holdout QWK:", holdout_acc)

Holdout QWK: 0.6165381154268332


In [None]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
import optuna
import numpy as np
import pandas as pd

def objective(trial):
    params = {
        'iterations': 200,
        'loss_function': 'MultiClass',
        'eval_metric': 'MultiClass',
        'random_seed': 42,
        'verbose': 0
    }
    
    params['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
    params['depth'] = trial.suggest_int('depth', 4, 10)
    params['l2_leaf_reg'] = trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True)
    params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0.0, 1.0)
    params['border_count'] = trial.suggest_int('border_count', 32, 256)
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []

    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx].copy(), X_train.iloc[val_idx].copy()
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        existing_geo = [c for c in geo_target if c in X_tr.columns]
        if len(existing_geo) == 0:
            X_tr_full, X_val_full = X_tr, X_val
        else:
            te = TargetEncoder(cv=5, shuffle=True, random_state=42, target_type="multiclass")
            X_tr_enc = te.fit_transform(X_tr[existing_geo], y_tr)
            X_val_enc = te.transform(X_val[existing_geo])

            X_tr_enc = pd.DataFrame(X_tr_enc, index=X_tr.index, columns=te.get_feature_names_out(existing_geo))
            X_val_enc = pd.DataFrame(X_val_enc, index=X_val.index, columns=te.get_feature_names_out(existing_geo))

            X_tr_full = pd.concat([X_tr.drop(columns=existing_geo), X_tr_enc], axis=1)
            X_val_full = pd.concat([X_val.drop(columns=existing_geo), X_val_enc], axis=1)

        train_pool = Pool(X_tr_full, y_tr)
        val_pool = Pool(X_val_full, y_val)

        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=10, verbose=False)

        y_pred = model.predict(X_val_full)
        acc = cohen_kappa_score(y_pred, y_val, weights = "quadratic")
        cv_scores.append(acc)

    return np.mean(cv_scores)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30, show_progress_bar=True)

print('Best trial:')
print(f'  Value (Accuracy): {study.best_trial.value:.5f}')
print('  Params:')
for key, value in study.best_trial.params.items():
    print(f'    {key}: {value}')
print('Best params:', study.best_trial.params)

[I 2025-11-22 19:57:26,294] A new study created in memory with name: no-name-d8767fb9-c2f9-4f47-9614-86d8e5eca634


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-11-22 19:59:36,531] Trial 0 finished with value: 0.5947543661219129 and parameters: {'learning_rate': 0.022480591105617977, 'depth': 9, 'l2_leaf_reg': 0.31431226631582976, 'bagging_temperature': 0.5018186592660068, 'border_count': 223}. Best is trial 0 with value: 0.5947543661219129.
[I 2025-11-22 20:00:25,796] Trial 1 finished with value: 0.5971536013566874 and parameters: {'learning_rate': 0.04392959414922933, 'depth': 6, 'l2_leaf_reg': 0.006421822818240997, 'bagging_temperature': 0.40395157498243195, 'border_count': 216}. Best is trial 1 with value: 0.5971536013566874.
[I 2025-11-22 20:01:11,435] Trial 2 finished with value: 0.6055408450495656 and parameters: {'learning_rate': 0.1724850548804507, 'depth': 6, 'l2_leaf_reg': 3.9099749513251387, 'bagging_temperature': 0.8342154965691257, 'border_count': 63}. Best is trial 2 with value: 0.6055408450495656.
[I 2025-11-22 20:02:16,265] Trial 3 finished with value: 0.6054497265656021 and parameters: {'learning_rate': 0.195620204569

In [None]:
te_final = TargetEncoder(cv=5, shuffle=True, random_state=42)
X_train_enc = te_final.fit_transform(X_train[geo_target], y_train)
X_holdout_enc = te_final.transform(X_holdout[geo_target])

X_train_enc = pd.DataFrame(X_train_enc, index=X_train.index, columns=te_final.get_feature_names_out(geo_target))
X_holdout_enc = pd.DataFrame(X_holdout_enc, index=X_holdout.index, columns=te_final.get_feature_names_out(geo_target))

X_train_full = X_train.copy()
X_holdout_full = X_holdout.copy()
X_train_full.drop(columns=geo_target, inplace=True)
X_holdout_full.drop(columns=geo_target, inplace=True)

X_train_full = pd.concat([X_train_full, X_train_enc], axis=1)
X_holdout_full = pd.concat([X_holdout_full, X_holdout_enc], axis=1)

final_model = CatBoostClassifier(**study.best_params,  loss_function='MultiClass', random_state=42)
final_model.fit(X_train_full, y_train)

y_pred = final_model.predict(X_holdout_full)
holdout_acc = cohen_kappa_score(y_pred, y_holdout, weights = "quadratic")
print("Holdout QWK:", holdout_acc)

0:	learn: 0.9312297	total: 79.5ms	remaining: 1m 19s
1:	learn: 0.8361982	total: 122ms	remaining: 1m 1s
2:	learn: 0.7723165	total: 176ms	remaining: 58.6s
3:	learn: 0.7268016	total: 232ms	remaining: 57.7s
4:	learn: 0.6959437	total: 287ms	remaining: 57.1s
5:	learn: 0.6724144	total: 340ms	remaining: 56.4s
6:	learn: 0.6549074	total: 402ms	remaining: 57.1s
7:	learn: 0.6410758	total: 460ms	remaining: 57s
8:	learn: 0.6304408	total: 517ms	remaining: 56.9s
9:	learn: 0.6222625	total: 573ms	remaining: 56.7s
10:	learn: 0.6159562	total: 614ms	remaining: 55.2s
11:	learn: 0.6112611	total: 658ms	remaining: 54.2s
12:	learn: 0.6069389	total: 701ms	remaining: 53.2s
13:	learn: 0.6031005	total: 744ms	remaining: 52.4s
14:	learn: 0.6004101	total: 785ms	remaining: 51.5s
15:	learn: 0.5986301	total: 823ms	remaining: 50.6s
16:	learn: 0.5969228	total: 859ms	remaining: 49.7s
17:	learn: 0.5954799	total: 897ms	remaining: 48.9s
18:	learn: 0.5941291	total: 938ms	remaining: 48.4s
19:	learn: 0.5932615	total: 977ms	remaini

In [28]:
print('Best params:', study.best_trial.params)

Best params: {'learning_rate': 0.23057059179496858, 'depth': 7, 'l2_leaf_reg': 0.12453715643078471, 'bagging_temperature': 0.8221747147870803, 'border_count': 194}


In [None]:
best_params = {'learning_rate': 0.08253087342734368,'objective': 'multiclass',
    'num_class': 3, 'num_leaves': 103, 'feature_fraction': 0.6482331618745639, 'bagging_fraction': 0.8866985645041257, 'bagging_freq': 7, 'min_data_in_leaf': 10}

final_model = lgb.LGBMClassifier(**best_params, eval_metric='multi_logloss', random_state=42, n_jobs=1)
final_model.fit(X_train_full, y_train)

y_pred = final_model.predict(X_holdout_full)
holdout_acc = accuracy_score(y_pred, y_holdout)
print("Holdout ACC:", holdout_acc)

Holdout ACC: 0.74885


In [None]:
best_params= {'learning_rate': 0.23057059179496858, 'depth': 7, 'l2_leaf_reg': 0.12453715643078471, 'bagging_temperature': 0.8221747147870803, 'border_count': 194}


final_model = CatBoostClassifier(**best_params,  loss_function='MultiClass', random_state=42)
final_model.fit(X_train_full, y_train)

# -----------------------------
# Evaluate on holdout
# -----------------------------
y_pred = final_model.predict(X_holdout_full)
holdout_acc = accuracy_score(y_pred, y_holdout)
print("Holdout ACC:", holdout_acc)

0:	learn: 0.9312297	total: 81.6ms	remaining: 1m 21s
1:	learn: 0.8361982	total: 119ms	remaining: 59.4s
2:	learn: 0.7723165	total: 163ms	remaining: 54s
3:	learn: 0.7268016	total: 203ms	remaining: 50.4s
4:	learn: 0.6959437	total: 244ms	remaining: 48.6s
5:	learn: 0.6724144	total: 286ms	remaining: 47.4s
6:	learn: 0.6549074	total: 336ms	remaining: 47.7s
7:	learn: 0.6410758	total: 385ms	remaining: 47.7s
8:	learn: 0.6304408	total: 429ms	remaining: 47.3s
9:	learn: 0.6222625	total: 474ms	remaining: 46.9s
10:	learn: 0.6159562	total: 519ms	remaining: 46.6s
11:	learn: 0.6112611	total: 570ms	remaining: 46.9s
12:	learn: 0.6069389	total: 613ms	remaining: 46.5s
13:	learn: 0.6031005	total: 657ms	remaining: 46.3s
14:	learn: 0.6004101	total: 700ms	remaining: 46s
15:	learn: 0.5986301	total: 747ms	remaining: 46s
16:	learn: 0.5969228	total: 786ms	remaining: 45.5s
17:	learn: 0.5954799	total: 826ms	remaining: 45.1s
18:	learn: 0.5941291	total: 866ms	remaining: 44.7s
19:	learn: 0.5932615	total: 909ms	remaining: 