# Import

In [1]:
import pandas as pd
import numpy as np
import joblib
import optuna

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.inspection import permutation_importance

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# 데이터 불러오기

In [2]:
# ✅ 데이터 로드
x_train = pd.read_csv('x_train.csv')
y_train = pd.read_csv('y_train.csv')
x_test = pd.read_csv('x_test.csv')

# ✅ 컬럼 정의
env_cols = ['in_temp', 'in_hum', 'in_co2', 'out_temp', 'out_hum', 'solar_rad', 'wind_speed', 'wind_direction', 'rain_sensor']
growth_cols = ['plant_height', 'crown_diameter', 'petiole_length', 'leaf_count', 'leaf_length', 'leaf_width', 'fruit_count', 'flower_count']
actuator_cols = ['fan', 'co2', 'heater', 'window1', 'window2', 'curtain1', 'curtain2', 'curtain3', 'side_curtain']

In [3]:
x_train['numbers of plant'].unique()

array([600, 500])

# 파생변수 생성

In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# ✅ wind_direction 전처리
for df in [x_train, x_test]:
    df['wind_direction_sin'] = np.sin(np.deg2rad(df['wind_direction']))
    df['wind_direction_cos'] = np.cos(np.deg2rad(df['wind_direction']))

# ✅ 컬럼 정의
env_cols = ['in_temp', 'in_hum', 'in_co2', 'out_temp', 'out_hum', 'solar_rad', 'wind_speed', 'wind_direction', 'rain_sensor']
growth_cols = ['plant_height', 'crown_diameter', 'petiole_length', 'leaf_count', 'leaf_length', 'leaf_width', 'fruit_count', 'flower_count']
actuator_cols = ['fan', 'co2', 'heater', 'window1', 'window2', 'curtain1', 'curtain2', 'curtain3', 'side_curtain']

# ✅ 전체 파생변수 생성 함수
def advanced_features_with_week_v2(df):
    feats = []
    for sample, group in df.groupby('Sample_Number'):
        d = {'Sample_Number': sample}
        d.update(extract_basic_stats(group))
        d.update(extract_wind_features(group))
        d.update(extract_temp_hum_diff(group))
        d.update(extract_growth_features(group))
        d.update(extract_time_based_features(group))
        d.update(extract_cumulative_features(group))
        feats.append(d)
    return pd.DataFrame(feats)

# ✅ 기울기 계산 함수
def calc_slope(series):
    y = series.values
    x = np.arange(len(y)).reshape(-1, 1)
    if len(y) < 2 or np.isnan(y).all():
        return np.nan
    try:
        lr = LinearRegression()
        lr.fit(x[~np.isnan(y)], y[~np.isnan(y)])
        return lr.coef_[0]
    except:
        return np.nan

# ✅ 기본 통계 및 시계열 기반 특징
def extract_basic_stats(group):
    result = {}
    for col in actuator_cols + env_cols:
        s = group[col]
        diff = s.diff()
        result.update({
            f'{col}_mean': s.mean(),
            f'{col}_std': s.std(),
            f'{col}_max': s.max(),
            f'{col}_min': s.min(),
            f'{col}_median': s.median(),
            f'{col}_slope': calc_slope(s),
            f'{col}_q80': s.quantile(0.8),
            f'{col}_q20': s.quantile(0.2),
            f'{col}_diffmean': diff.mean(),
            f'{col}_diffstd': diff.std(),
            f'{col}_increase_cnt': (diff > 0).sum(),
            f'{col}_decrease_cnt': (diff < 0).sum(),
            f'{col}_rolling3_mean': s.rolling(3, min_periods=1).mean().mean(),
            f'{col}_rolling6_std': s.rolling(6, min_periods=1).std().mean(),
            f'{col}_rolling3_max': s.rolling(3, min_periods=1).max().mean(),
            f'{col}_rolling3_min': s.rolling(3, min_periods=1).min().mean(),
            f'{col}_expanding_mean': s.expanding(min_periods=1).mean().iloc[-1],
        })
        if col in actuator_cols:
            result.update({
                f'{col}_on_ratio': (s == 1).mean(),
                f'{col}_on_off_switch_cnt': (diff != 0).sum(),
            })
    # 강우센서 특화
    rain_diff = group['rain_sensor'].diff()
    result.update({
        'rain_sensor_sum': group['rain_sensor'].sum(),
        'rain_sensor_mean': group['rain_sensor'].mean(),
        'rain_sensor_diffmean': rain_diff.mean(),
        'rain_sensor_diffstd': rain_diff.std()
    })
    return result

# ✅ 풍향 관련 파생변수
def extract_wind_features(group):
    return {
        'wind_direction_sin_mean': group['wind_direction_sin'].mean(),
        'wind_direction_cos_mean': group['wind_direction_cos'].mean(),
        'wind_direction_sin_std': group['wind_direction_sin'].std(),
        'wind_direction_cos_std': group['wind_direction_cos'].std()
    }

# ✅ 온습도 차이 관련
def extract_temp_hum_diff(group):
    in_out_temp = group['in_temp'] - group['out_temp']
    in_out_hum = group['in_hum'] - group['out_hum']
    return {
        'temp_diff_mean': in_out_temp.mean(),
        'temp_diff_max': in_out_temp.max(),
        'temp_diff_min': in_out_temp.min(),
        'temp_diff_std': in_out_temp.std(),
        'hum_diff_mean': in_out_hum.mean(),
        'hum_diff_max': in_out_hum.max(),
        'hum_diff_min': in_out_hum.min(),
        'hum_diff_std': in_out_hum.std()
    }

# ✅ 성장 지표 파생변수
def extract_growth_features(group):
    result = {}
    group = group.reset_index(drop=True)
    for col in growth_cols:
        s = group[col]
        result.update({
            f'{col}_first': s.iloc[0],
            f'{col}_last': s.iloc[-1],
            f'{col}_mean': s.mean(),
            f'{col}_median': s.median(),
            f'{col}_diff': s.iloc[-1] - s.iloc[0],
            f'{col}_slope': calc_slope(s),
            f'{col}_rolling3_mean': s.rolling(3, min_periods=1).mean().mean(),
            f'{col}_rolling3_std': s.rolling(3, min_periods=1).std().mean(),
        })
    return result

# ✅ 시간대별 온도 파생변수
def extract_time_based_features(group):
    try:
        group['hour'] = pd.to_datetime(group['time'], errors='coerce').dt.hour
        result = {}
        for hour in [0, 6, 12, 18]:
            hour_temp = group[group['hour'] == hour]['in_temp']
            result[f'in_temp_hour{hour}_mean'] = hour_temp.mean() if not hour_temp.empty else np.nan
            result[f'in_temp_hour{hour}_std'] = hour_temp.std() if not hour_temp.empty else np.nan
        return result
    except:
        return {}

# ✅ 누적 기반 파생변수
def extract_cumulative_features(group):
    result = {}
    for col in actuator_cols + env_cols:
        s = group[col]
        result[f'{col}_cumsum'] = s.cumsum().iloc[-1]
        result[f'{col}_cummean'] = s.expanding(min_periods=1).mean().iloc[-1]
    return result

In [5]:
# 함수 실행
train_feats = advanced_features_with_week_v2(x_train)
test_feats = advanced_features_with_week_v2(x_test)

  group['hour'] = pd.to_datetime(group['time'], errors='coerce').dt.hour
  group['hour'] = pd.to_datetime(group['time'], errors='coerce').dt.hour
  group['hour'] = pd.to_datetime(group['time'], errors='coerce').dt.hour
  group['hour'] = pd.to_datetime(group['time'], errors='coerce').dt.hour
  group['hour'] = pd.to_datetime(group['time'], errors='coerce').dt.hour
  group['hour'] = pd.to_datetime(group['time'], errors='coerce').dt.hour
  group['hour'] = pd.to_datetime(group['time'], errors='coerce').dt.hour
  group['hour'] = pd.to_datetime(group['time'], errors='coerce').dt.hour
  group['hour'] = pd.to_datetime(group['time'], errors='coerce').dt.hour
  group['hour'] = pd.to_datetime(group['time'], errors='coerce').dt.hour
  group['hour'] = pd.to_datetime(group['time'], errors='coerce').dt.hour
  group['hour'] = pd.to_datetime(group['time'], errors='coerce').dt.hour
  group['hour'] = pd.to_datetime(group['time'], errors='coerce').dt.hour
  group['hour'] = pd.to_datetime(group['time'], err

In [6]:
train_feats.shape

(154, 446)

# Feature Selection으로 중요도기준 상위20개 컬럼 선택

In [7]:
def prepare_features(train_feats: pd.DataFrame, test_feats: pd.DataFrame, y_train: pd.DataFrame, N: int = 20, target_keyword: str = 'co2') -> tuple:
    # ✅ 불필요한 변수 제거 (number + plant 포함된 컬럼)
    numplant_candidates = [col for col in train_feats.columns if 'number' in col.lower() and 'plant' in col.lower()]
    if len(numplant_candidates) > 0:
        train_feats = train_feats.drop(columns=numplant_candidates)
        test_feats = test_feats.drop(columns=[c for c in numplant_candidates if c in test_feats.columns])

    # ✅ 타겟 컬럼 추출
    y_train.columns = y_train.columns.str.strip()
    target_col = [col for col in y_train.columns if target_keyword in col.lower()][-1]

    # ✅ train에 타겟 병합
    train_feats = pd.merge(train_feats, y_train[['Sample_Number', target_col]], on='Sample_Number', how='left')

    # ✅ Feature selection
    X_all = train_feats.drop(columns=['Sample_Number', target_col])
    y_all = train_feats[target_col]
    model_fs = XGBRegressor(n_estimators=100, random_state=42)
    model_fs.fit(X_all, y_all)

    result = permutation_importance(
        model_fs, X_all, y_all,
        n_repeats=3, random_state=42,
        scoring='neg_root_mean_squared_error'
    )

    sorted_idx = result.importances_mean.argsort()[::-1]
    top_features = X_all.columns[sorted_idx[:N]].tolist()
    print(f"[Feature Selection] Top {N} features: {top_features}")

    train_data = train_feats[top_features]
    X_test = test_feats[top_features]
    target = train_feats[target_col]

    return train_data, X_test, target, top_features

In [8]:
train_data, X_test, target, top_features = prepare_features(train_feats, test_feats, y_train, N=20, target_keyword='co2')

[Feature Selection] Top 20 features: ['in_co2_q80', 'leaf_length_first', 'out_temp_mean', 'co2_mean', 'crown_diameter_slope', 'in_temp_diffstd', 'in_co2_q20', 'out_temp_q80', 'heater_rolling3_max', 'curtain1_std', 'side_curtain_diffstd', 'fan_mean', 'heater_rolling3_min', 'curtain2_mean', 'in_temp_hour18_mean', 'curtain3_mean', 'heater_rolling6_std', 'curtain3_slope', 'out_temp_q20', 'out_temp_rolling3_max']


# Optuna로 최적의 하이퍼파라미터 탐색

XGB, LGBM, RF, cat을 각각 100회씩 옵튜나로 학습

In [9]:
# # ✅ 결과 저장용 딕셔너리
# best_trials = {}

# # ✅ 공통 CV 설정
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# groups = pd.qcut(target, 10, labels=np.arange(10))

# # ✅ 모델별 objective 함수 정의
# def get_objective(model_name):
#     def objective(trial):
#         if model_name == "xgb":
#             params = {
#                 'n_estimators': trial.suggest_int('n_estimators', 100, 500),
#                 'max_depth': trial.suggest_int('max_depth', 2, 10),
#                 'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
#                 'subsample': trial.suggest_float('subsample', 0.6, 1.0),
#                 'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
#                 'random_state': 42,
#                 'n_jobs': -1,
#                 'verbosity': 0,
#             }
#             model_cls = XGBRegressor

#         elif model_name == "lgbm":
#             params = {
#                 'n_estimators': trial.suggest_int('n_estimators', 100, 500),
#                 'max_depth': trial.suggest_int('max_depth', 2, 10),
#                 'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
#                 'subsample': trial.suggest_float('subsample', 0.6, 1.0),
#                 'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
#                 'random_state': 42,
#                 'n_jobs': -1,
#                 'verbose': -1
#             }
#             model_cls = LGBMRegressor

#         elif model_name == "cat":
#             params = {
#                 'iterations': trial.suggest_int('iterations', 100, 500),
#                 'depth': trial.suggest_int('depth', 3, 10),
#                 'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
#                 'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
#                 'random_seed': 42,
#                 'verbose': 0
#             }
#             model_cls = CatBoostRegressor

#         else:  
#             params = {
#                 "n_estimators": trial.suggest_int("n_estimators", 100, 500),
#                 "max_depth": trial.suggest_int("max_depth", 3, 20),
#                 "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
#                 "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
#                 "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
#                 "random_state": 42,
#                 "n_jobs": -1,
#             }
#             model_cls = RandomForestRegressor

#         oof_pred = np.zeros(len(train_data))

#         for train_idx, val_idx in cv.split(train_data, groups):
#             X_train, y_train = train_data.iloc[train_idx], target.iloc[train_idx]
#             X_valid, y_valid = train_data.iloc[val_idx], target.iloc[val_idx]

#             model = model_cls(**params)
#             model.fit(X_train, y_train)
#             val_pred = model.predict(X_valid)
#             oof_pred[val_idx] = val_pred

#         rmse_score = np.sqrt(mean_squared_error(target, oof_pred))
#         return rmse_score
#     return objective

# # ✅ 모델별 튜닝 루프
# for model_name in ['xgb', 'lgbm', 'rf', 'cat']:
#     print(f"🔍 Start tuning for: {model_name}")
#     study = optuna.create_study(direction='minimize')
#     study.optimize(get_objective(model_name), n_trials=100)

#     best_trials[model_name] = {
#         'score': study.best_trial.value,
#         'params': study.best_trial.params
#     }

# # ✅ 결과 출력
# for model_name, result in best_trials.items():
#     print(f"\n✅ Best for {model_name.upper()}:")
#     print(f"RMSE: {result['score']:.4f}")
#     print("Params:")
#     for k, v in result['params'].items():
#         print(f"  {k}: {v}")

# 2. OOF예측

In [10]:
# ✅ 고정 파라미터
xgb_params = {
    'n_estimators': 359,
    'max_depth': 3,
    'learning_rate': 0.03457124302213052,
    'subsample': 0.7464586207674122,
    'colsample_bytree': 0.6430979570192329
}

lgb_params = {
    'n_estimators': 347,
    'max_depth': 7,
    'learning_rate': 0.05346663875437355,
    'subsample': 0.7199898600953255,
    'colsample_bytree': 0.6231337990398494,
    'verbose': -1
}

rf_params = {
    'n_estimators': 166,
    'max_depth': 16,
    'min_samples_split': 3,
    'min_samples_leaf': 1,
    'max_features': None,
    'n_jobs': -1,
    'random_state': 42
}

cat_params = {
    'iterations': 452,
    'depth': 6,
    'learning_rate': 0.07344315262684052,
    'l2_leaf_reg': 0.04226007463989213,
    'random_seed': 42,
    'verbose': 0
}

def run_oof_regression(train_data: pd.DataFrame, target: pd.Series,
                       xgb_params: dict, lgb_params: dict,
                       rf_params: dict, cat_params: dict,
                       n_splits: int = 5, save_model: bool = True) -> None:
    groups = pd.qcut(target, 10, labels=False)
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    oof_preds_xgb = np.zeros(len(train_data))
    oof_preds_lgb = np.zeros(len(train_data))
    oof_preds_rf = np.zeros(len(train_data))
    oof_preds_cat = np.zeros(len(train_data))

    for i, (tri, vai) in enumerate(cv.split(train_data, groups)):
        x_train, y_train = train_data.iloc[tri], target.iloc[tri]
        x_valid, y_valid = train_data.iloc[vai], target.iloc[vai]

        # XGB
        xgb = XGBRegressor(**xgb_params)
        xgb.fit(x_train, y_train)
        if save_model:
            joblib.dump(xgb, f"model-xgb-{i}.pkl")
        oof_preds_xgb[vai] = xgb.predict(x_valid)

        # LGB
        lgb = LGBMRegressor(**lgb_params)
        lgb.fit(x_train, y_train)
        if save_model:
            joblib.dump(lgb, f"model-lgb-{i}.pkl")
        oof_preds_lgb[vai] = lgb.predict(x_valid)

        # RF
        rf = RandomForestRegressor(**rf_params)
        rf.fit(x_train, y_train)
        if save_model:
            joblib.dump(rf, f"model-rf-{i}.pkl")
        oof_preds_rf[vai] = rf.predict(x_valid)

        # CAT
        cat = CatBoostRegressor(**cat_params)
        cat.fit(x_train, y_train)
        if save_model:
            joblib.dump(cat, f"model-cat-{i}.pkl")
        oof_preds_cat[vai] = cat.predict(x_valid)

    print(f"[XGB_OOF] RMSE: {np.sqrt(mean_squared_error(target, oof_preds_xgb)):.4f}, R2: {r2_score(target, oof_preds_xgb):.4f}")
    print(f"[LGB_OOF] RMSE: {np.sqrt(mean_squared_error(target, oof_preds_lgb)):.4f}, R2: {r2_score(target, oof_preds_lgb):.4f}")
    print(f"[RF_OOF ] RMSE: {np.sqrt(mean_squared_error(target, oof_preds_rf)):.4f}, R2: {r2_score(target, oof_preds_rf):.4f}")
    print(f"[CAT_OOF] RMSE: {np.sqrt(mean_squared_error(target, oof_preds_cat)):.4f}, R2: {r2_score(target, oof_preds_cat):.4f}")

In [11]:
# ✅ 실행 코드
run_oof_regression(
    train_data,
    target,
    xgb_params=xgb_params,
    lgb_params=lgb_params,
    rf_params=rf_params,
    cat_params=cat_params,
    n_splits=5,
    save_model=True
)

[XGB_OOF] RMSE: 1.5663, R2: 0.8955
[LGB_OOF] RMSE: 1.8933, R2: 0.8473
[RF_OOF ] RMSE: 1.7493, R2: 0.8696
[CAT_OOF] RMSE: 1.5392, R2: 0.8991
