In [5]:
import pandas as pd
import numpy as np


In [7]:
# Load the training data and take a first look at it.
TRAIN_CSV = "/Users/nelllio/Desktop/Machine_Learning/playground-series/train.csv"
df = pd.read_csv(TRAIN_CSV)
print(df.shape)  # (rows, columns)
df.head()
# `df` now holds the training frame; head() above previews its first 5 rows.

(750000, 9)


Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [9]:
# DataFrame.info() prints its report itself and returns None, so call it
# directly; binding and printing the return value only emitted a stray "None".
df.info()
# In the report, every column's Non-Null Count equals the number of entries,
# i.e. there are no missing values.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          750000 non-null  int64  
 1   Sex         750000 non-null  object 
 2   Age         750000 non-null  int64  
 3   Height      750000 non-null  float64
 4   Weight      750000 non-null  float64
 5   Duration    750000 non-null  float64
 6   Heart_Rate  750000 non-null  float64
 7   Body_Temp   750000 non-null  float64
 8   Calories    750000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 51.5+ MB
None


In [11]:
# ===============================
# 0. 라이브러리 (동일)
# ===============================
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import numpy as np, pandas as pd

# ===============================
# 1. 데이터 & FE (동일)
# ===============================

def create_golden_features(df):
    """Return a copy of ``df`` with the engineered feature columns added.

    The input frame is not modified. It must provide the columns ``Sex``,
    ``Age``, ``Height``, ``Weight``, ``Duration``, ``Heart_Rate`` and
    ``Body_Temp``.

    ``Sex`` is encoded as male -> 0, female -> 1. The encoding is skipped when
    the column is already numeric, so calling this function again on its own
    output (e.g. re-running a notebook cell that does ``df = f(df)``) keeps the
    codes instead of silently turning every value into NaN.

    Returns:
        pandas.DataFrame: the copied frame with seven extra columns and the
        encoded ``Sex`` column.
    """
    df = df.copy()
    df['Duration_x_HeartRate']   = df['Duration'] * df['Heart_Rate']
    df['Age_adjusted_HeartRate'] = df['Heart_Rate'] / (df['Age'] + 1)
    # Height is divided by 100 before squaring (presumably cm -> m; confirm units).
    df['BMI']                    = df['Weight'] / (df['Height'] / 100) ** 2
    df['Metabolic_Intensity']    = df['Heart_Rate'] * df['Body_Temp']
    # Despite the name, no step count is involved: Heart_Rate / (Duration / 60).
    df['Steps_per_min']          = df['Heart_Rate'] / (df['Duration'] / 60)
    df['Weight_to_Age']          = df['Weight'] / (df['Age'] + 1)
    df['HeartRate_to_BMI']       = df['Heart_Rate'] / (df['BMI'] + 1)
    # Only map string labels; an already-encoded (numeric) column is left as is.
    if not pd.api.types.is_numeric_dtype(df['Sex']):
        df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    return df

# Rebind `df` to the engineered frame so the derived columns are available below.
df = create_golden_features(df)

# Model inputs, in the column order handed to the estimators.
features = [
    'Duration',
    'Heart_Rate',
    'BMI',
    'Body_Temp',
    'Duration_x_HeartRate',
    'Metabolic_Intensity',
    'Age',
    'Height',
    'Weight',
    'Steps_per_min',
    'Age_adjusted_HeartRate',
    'Weight_to_Age',
    'HeartRate_to_BMI',
    'Sex',
]

# Feature matrix and regression target.
X = df[features]
y = df['Calories']

# ===============================
# 2. RMSLE 스코어러 + 5-fold CV
# ===============================
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error on original-scale values.

    Predictions are floored at 0.1 before the log transform. Both arguments
    are passed through ``log1p`` here, so they must be on the original
    (un-logged) scale.
    """
    floored_pred = np.maximum(y_pred, 0.1)  # keep the log transform stable
    log_true = np.log1p(y_true)
    log_pred = np.log1p(floored_pred)
    return np.sqrt(mean_squared_error(log_true, log_pred))

# Scorer wrapper around rmsle; greater_is_better=False makes it return the
# negated value (smaller error = better). rmsle applies log1p itself, so the
# targets scored with it are expected on the original scale.
rmsle_scorer = make_scorer(
    rmsle,
    greater_is_better=False,
)
# Shuffled 5-fold splitter with a fixed seed.
kf = KFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

# ===============================
# 3. Base learner – 미세 튜닝
#    (n_estimators ↑, learning_rate ↓)
# ===============================
# Gradient-boosted base learners for the stack (500 trees, learning rate 0.05 each).
xgb = XGBRegressor(
        n_estimators=500, learning_rate=0.05,
        max_depth=6, subsample=0.9, colsample_bytree=0.9,
        n_jobs=-1, random_state=42, verbosity=0)

# LightGBM only applies `subsample` (bagging_fraction) when `subsample_freq`
# (bagging_freq) is > 0; its default of 0 disables bagging altogether, so the
# 0.9 row subsampling was never in effect. subsample_freq=1 re-samples rows at
# every boosting iteration, which is what the setting intends.
lgb = LGBMRegressor(
        n_estimators=500, learning_rate=0.05,
        max_depth=-1, subsample=0.9, subsample_freq=1, colsample_bytree=0.9,
        n_jobs=-1, random_state=42)

cat = CatBoostRegressor(
        n_estimators=500, learning_rate=0.05,
        depth=8, l2_leaf_reg=3,
        verbose=0, random_state=42)

# ===============================
# 4. 스태킹 정의 (동일)
# ===============================
# (name, estimator) pairs whose predictions feed the final RidgeCV estimator.
base_learners = [
    ('xgb', xgb),
    ('lgb', lgb),
    ('cat', cat),
]
stack = StackingRegressor(estimators=base_learners, final_estimator=RidgeCV(), n_jobs=-1)

# ===============================
# 5-1. 교차검증 RMSLE 확인
# ===============================
# The model is trained on log1p(y), so y_true and the predictions handed to
# the scorer are already on the log1p scale. RMSE on that scale *is* RMSLE.
# The custom rmsle scorer must not be used here: it applies log1p a second
# time (and floors predictions at 0.1), which reports a double-log error
# that is not comparable with the hold-out RMSLE.
cv_scores = cross_val_score(stack, X, np.log1p(y),
                            cv=kf, scoring="neg_root_mean_squared_error",
                            n_jobs=-1)
# The scorer returns negated errors, hence the sign flip on the mean.
print(f"📊 5-fold CV RMSLE: {(-cv_scores.mean()):.4f} ± {cv_scores.std():.4f}")

# ===============================
# 5-2. 홀드아웃 학습/평가
# ===============================
# 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    random_state=42,
    test_size=0.2,
)
print("\n📚 모델 학습 중...")
# Fit on the log1p-transformed target ...
stack.fit(X_train, np.log1p(y_train))
# ... and map predictions back to the original scale before scoring.
val_log_pred = stack.predict(X_val)
y_pred = np.expm1(val_log_pred)
holdout_score = rmsle(y_val, y_pred)
print(f"🎯 Hold-out RMSLE: {holdout_score:.4f}")

# ===============================
# 6. Kaggle 제출용 코드 추가
# ===============================
print("\n🚀 Kaggle 제출 파일 생성 중...")

# Single place to change the data location; the paths built from it are the
# same ones this cell used before.
DATA_DIR = "/Users/nelllio/Desktop/Machine_Learning/playground-series"

# 6-1. Load the test data
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")

# 6-2. Apply the same feature engineering as for the training data
test_df = create_golden_features(test_df)
X_test = test_df[features]

# 6-3. Refit the final model on the full training data (log1p target)
print("📚 전체 훈련 데이터로 최종 모델 학습 중...")
stack.fit(X, np.log1p(y))

# 6-4. Predict the test data.
# expm1 of a negative log-scale prediction is negative. The hold-out metric
# floors predictions before scoring, so floor the submitted values at 0 as
# well to keep them consistent with what was validated.
print("🔮 테스트 데이터 예측 중...")
test_predictions = np.clip(np.expm1(stack.predict(X_test)), 0, None)

# 6-5. Build the submission frame
submission = pd.DataFrame({
    'id': test_df['id'],
    'Calories': test_predictions
})

# 6-6. Save the submission file
submission_path = f"{DATA_DIR}/submission.csv"
submission.to_csv(submission_path, index=False)

print(f"✅ Submission 파일이 생성되었습니다: {submission_path}")
print(f"📊 예측된 칼로리 범위: {test_predictions.min():.2f} ~ {test_predictions.max():.2f}")
print(f"📊 평균 예측 칼로리: {test_predictions.mean():.2f}")
print(f"📋 Submission 형태:")
print(submission.head())

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027790 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2141
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 14
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058849 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2143
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 14
[LightGBM] [Info] Start training from score 4.140724
[LightGBM] [Info] Start training from score 4.141876
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065664 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total 

In [None]:
# ===============================
# 0.05739 → 0.04대 돌파 최적화 파이프라인
# 기존 코드 기반 점진적 개선
# ===============================

from sklearn.ensemble import StackingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV, HuberRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import numpy as np, pandas as pd
import warnings
warnings.filterwarnings('ignore')

# ===============================
# 1. 확장된 골든 특성 엔지니어링
# ===============================

def create_enhanced_golden_features(df):
    """Return a copy of ``df`` with the original and the extended engineered features.

    The input frame is not modified. It must provide the columns ``Sex``,
    ``Age``, ``Height``, ``Weight``, ``Duration``, ``Heart_Rate`` and
    ``Body_Temp``. 28 derived columns are added (7 original + 21 new).

    ``Sex`` is encoded as male -> 0, female -> 1. The encoding is skipped when
    the column is already numeric, so applying the function to its own output
    keeps the codes instead of silently turning every value into NaN.

    Some columns are exact duplicates or constant rescalings of others
    (``HeartRate_x_BodyTemp`` == ``Metabolic_Intensity``; ``Exercise_load`` ==
    ``Triple_Intensity`` / 220). They are kept so the set of returned columns
    stays the same for existing callers.

    Returns:
        pandas.DataFrame: the copied frame with the derived columns and the
        encoded ``Sex`` column.
    """
    df = df.copy()

    # Original features (kept as they were)
    df['Duration_x_HeartRate']   = df['Duration'] * df['Heart_Rate']
    df['Age_adjusted_HeartRate'] = df['Heart_Rate'] / (df['Age'] + 1)
    df['BMI']                    = df['Weight'] / (df['Height'] / 100) ** 2
    df['Metabolic_Intensity']    = df['Heart_Rate'] * df['Body_Temp']
    # Despite the name, no step count is involved: Heart_Rate / (Duration / 60).
    df['Steps_per_min']          = df['Heart_Rate'] / (df['Duration'] / 60)
    df['Weight_to_Age']          = df['Weight'] / (df['Age'] + 1)
    df['HeartRate_to_BMI']       = df['Heart_Rate'] / (df['BMI'] + 1)

    # New features
    # 1. Interactions
    df['Triple_Intensity'] = df['Duration'] * df['Heart_Rate'] * df['Body_Temp']
    df['Duration_x_BodyTemp'] = df['Duration'] * df['Body_Temp']
    df['HeartRate_x_BodyTemp'] = df['Heart_Rate'] * df['Body_Temp']  # same values as Metabolic_Intensity

    # 2. Higher-order terms
    df['Duration_squared'] = df['Duration'] ** 2
    df['HeartRate_squared'] = df['Heart_Rate'] ** 2
    df['Duration_cubed'] = df['Duration'] ** 3

    # 3. Physiological indicators
    df['BSA'] = 0.007184 * (df['Weight'] ** 0.425) * (df['Height'] ** 0.725)  # body surface area
    # Labelled a male-basis BMR estimate by the original author; it is applied
    # to every row regardless of Sex.
    df['BMR_estimate'] = 88.362 + (13.397 * df['Weight']) + (4.799 * df['Height']) - (5.677 * df['Age'])
    df['BodyTemp_deviation'] = df['Body_Temp'] - 37.0  # offset from a 37.0 reference temperature

    # 4. Efficiency-style ratios
    df['Calorie_efficiency'] = df['Duration_x_HeartRate'] / (df['Weight'] + df['Age'])
    df['Exercise_intensity_per_BMI'] = df['Duration_x_HeartRate'] / df['BMI']
    df['Metabolic_rate'] = df['Metabolic_Intensity'] / df['Weight']

    # 5. Log transforms
    df['log_Duration'] = np.log1p(df['Duration'])
    df['log_HeartRate'] = np.log1p(df['Heart_Rate'])
    df['log_Duration_HeartRate'] = np.log1p(df['Duration_x_HeartRate'])

    # 6. Ratios relative to age
    df['HeartRate_intensity'] = df['Heart_Rate'] / (220 - df['Age'])  # share of (220 - Age)
    df['Duration_per_Age'] = df['Duration'] / df['Age']
    df['BodyTemp_per_Age'] = df['Body_Temp'] / df['Age']

    # 7. Composite indicators
    df['Fitness_score'] = (df['Duration'] * df['Heart_Rate']) / (df['Age'] + df['BMI'])
    df['Exercise_load'] = df['Duration'] * (df['Heart_Rate'] / 220) * df['Body_Temp']  # Triple_Intensity / 220
    df['Power_index'] = (df['Duration'] ** 1.5) * (df['Heart_Rate'] ** 0.8)

    # Sex encoding: only map string labels; an already-encoded (numeric)
    # column is left untouched.
    if not pd.api.types.is_numeric_dtype(df['Sex']):
        df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

    return df

# ===============================
# 2. 확장된 모델 앙상블
# ===============================

def create_enhanced_models():
    """Build the six base estimators used by the stacking ensemble.

    Returns:
        list[tuple[str, estimator]]: ``(name, estimator)`` pairs in the order
        xgb, lgb, cat, rf, extra, xgb_alt, ready to be passed as
        ``estimators=`` to ``StackingRegressor``.
    """

    # Core boosters (values in parentheses are the previous notebook settings)
    xgb = XGBRegressor(
        n_estimators=800,           # was 500
        learning_rate=0.03,         # was 0.05 (more conservative)
        max_depth=7,                # was 6
        subsample=0.85,             # was 0.9
        colsample_bytree=0.85,      # was 0.9
        reg_alpha=0.1,              # L1 regularisation added
        reg_lambda=0.1,             # L2 regularisation added
        n_jobs=-1, random_state=42, verbosity=0
    )

    lgb = LGBMRegressor(
        n_estimators=800,           # was 500
        learning_rate=0.03,         # was 0.05
        max_depth=8,                # was -1 (explicit depth)
        subsample=0.85,             # was 0.9
        # LightGBM ignores `subsample` while subsample_freq (bagging_freq) is
        # 0, its default; 1 re-samples rows every iteration so the 0.85 row
        # subsampling above actually takes effect.
        subsample_freq=1,
        colsample_bytree=0.85,      # was 0.9
        reg_alpha=0.1,
        reg_lambda=0.1,
        n_jobs=-1, random_state=42, verbosity=-1
    )

    cat = CatBoostRegressor(
        n_estimators=600,           # was 500
        learning_rate=0.04,         # was 0.05
        depth=9,                    # was 8
        l2_leaf_reg=5,              # was 3
        subsample=0.85,             # added
        verbose=0, random_state=42
    )

    # Additional models for diversity
    rf = RandomForestRegressor(
        n_estimators=400,
        max_depth=12,
        min_samples_split=3,
        min_samples_leaf=2,
        max_features='sqrt',
        n_jobs=-1, random_state=42
    )

    extra = ExtraTreesRegressor(
        n_estimators=300,
        max_depth=10,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='sqrt',
        n_jobs=-1, random_state=42
    )

    # Second XGBoost configuration (different settings and seed)
    xgb_alt = XGBRegressor(
        n_estimators=600,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_alpha=0.05,
        reg_lambda=0.05,
        n_jobs=-1, random_state=123, verbosity=0  # different seed
    )

    return [
        ('xgb', xgb),
        ('lgb', lgb),
        ('cat', cat),
        ('rf', rf),
        ('extra', extra),
        ('xgb_alt', xgb_alt)
    ]

# ===============================
# 3. 고급 메타 러너들
# ===============================

def create_meta_learners():
    """Return the candidate final estimators for the stack, keyed by short name.

    The dict holds, in insertion order: 'ridge', 'lasso', 'elastic', 'huber'.
    """
    ridge = RidgeCV(alphas=[0.1, 0.5, 1.0, 2.0, 5.0])
    lasso = LassoCV(alphas=[0.01, 0.05, 0.1, 0.5, 1.0], cv=3)
    elastic = ElasticNetCV(alphas=[0.1, 0.5, 1.0], l1_ratio=[0.1, 0.5, 0.9], cv=3)
    huber = HuberRegressor(epsilon=1.5, alpha=0.1)

    return {
        'ridge': ridge,
        'lasso': lasso,
        'elastic': elastic,
        'huber': huber,
    }

# ===============================
# 4. 특성 선택 최적화
# ===============================

def optimize_features(X, y, max_features=None):
    """Select the top-scoring columns of ``X`` with univariate ``f_regression``.

    Args:
        X: feature DataFrame (``X.columns`` is used to name the selection).
        y: target values passed to ``SelectKBest.fit_transform``.
        max_features: number of columns to keep. Defaults to ``min(30, n)``
            where ``n`` is the number of columns. Values larger than ``n`` are
            clamped to ``n`` so a too-large request keeps every column instead
            of being passed to ``SelectKBest`` as an out-of-range ``k``.

    Returns:
        tuple: ``(X_selected, selected_features)`` - the array returned by
        ``SelectKBest.fit_transform`` and the matching column labels.

    Side effects:
        Prints the number of selected columns and the ten highest scores.
    """
    n_available = X.shape[1]
    if max_features is None:
        max_features = 30  # default cap on the number of kept columns
    k = min(max_features, n_available)

    # Fit the univariate selector and reduce X to the k best columns.
    selector = SelectKBest(score_func=f_regression, k=k)
    X_selected = selector.fit_transform(X, y)

    # Boolean mask of kept columns, reused for both names and scores.
    support_mask = selector.get_support()
    selected_features = X.columns[support_mask]

    print(f"🎯 선택된 특성 수: {len(selected_features)}")
    print("🔝 Top 10 특성:")
    feature_scores = sorted(
        zip(selected_features, selector.scores_[support_mask]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for feat, score in feature_scores[:10]:
        print(f"   {feat}: {score:.2f}")

    return X_selected, selected_features

# ===============================
# 5. 다층 스태킹 구현
# ===============================

def create_multi_level_stacking(base_models, meta_candidates, X, y):
    """Pick the final estimator that gives the best cross-validated stack.

    For every entry of ``meta_candidates`` a ``StackingRegressor`` (inner
    ``cv=3``) is built over ``base_models`` and scored with shuffled 5-fold
    cross-validation on ``log1p(y)``, using RMSE on that log scale. Each
    candidate refits all base models in every fold.

    Args:
        base_models: list of ``(name, estimator)`` pairs.
        meta_candidates: dict mapping a name to a final estimator.
        X: feature matrix.
        y: target on the original scale (log1p is applied here).

    Returns:
        tuple: ``(best_stack, best_meta_name, best_score)``. ``best_stack`` is
        the StackingRegressor instance built for the winning candidate;
        ``(None, None, inf)`` when ``meta_candidates`` is empty.
    """

    def _rmse(y_true, y_pred):
        # Plain RMSE; inputs are already on the log1p scale.
        return np.sqrt(mean_squared_error(y_true, y_pred))

    print("🔍 최적 메타 러너 탐색 중...")
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
    neg_rmse_scorer = make_scorer(_rmse, greater_is_better=False)
    log_target = np.log1p(y)

    best_score, best_meta_name, best_stack = float('inf'), None, None

    for meta_name, meta_model in meta_candidates.items():
        candidate_stack = StackingRegressor(
            estimators=base_models,
            final_estimator=meta_model,
            cv=3,  # inner CV used to build the meta-features
            n_jobs=-1,
        )

        # Outer cross-validation of the whole stack.
        fold_scores = cross_val_score(
            candidate_stack,
            X,
            log_target,
            cv=outer_cv,
            scoring=neg_rmse_scorer,
            n_jobs=-1,
        )

        # The scorer returns negated errors; flip the sign for reporting.
        avg_score = -fold_scores.mean()
        print(f"   {meta_name}: {avg_score:.6f} ± {fold_scores.std():.6f}")

        if avg_score < best_score:
            best_score, best_meta_name, best_stack = avg_score, meta_name, candidate_stack

    print(f"🏆 최적 메타 러너: {best_meta_name} (Score: {best_score:.6f})")
    return best_stack, best_meta_name, best_score

# ===============================
# 6. 메인 파이프라인
# ===============================

def run_enhanced_pipeline(train_path, test_path, submission_path="enhanced_submission.csv"):
    """Run the full pipeline: features, selection, stacking, validation, submission.

    Steps, in order: load both CSVs, apply ``create_enhanced_golden_features``,
    keep 25 columns via ``optimize_features``, choose a final estimator with
    ``create_multi_level_stacking``, report a 20% hold-out RMSLE, refit on all
    training rows and write the submission CSV.

    Note: feature selection is fitted on the full training set before the
    cross-validation and the hold-out split, so both scores have seen the
    validation rows during selection.

    Args:
        train_path: CSV with the training rows (needs ``id`` and ``Calories``).
        test_path: CSV with the test rows (needs ``id``).
        submission_path: where the submission CSV is written. Defaults to the
            previously hard-coded ``"enhanced_submission.csv"``.

    Returns:
        tuple: ``(submission, holdout_rmsle, selected_features)``.
    """

    print("🚀 0.05739 → 0.04대 돌파 파이프라인 시작!")
    print("="*60)

    # Load data
    print("📊 데이터 로딩...")
    df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Extended feature engineering
    print("🔧 확장된 특성 엔지니어링...")
    df = create_enhanced_golden_features(df)
    test_df = create_enhanced_golden_features(test_df)

    # Every column except the identifier and the target is a candidate feature
    feature_cols = [col for col in df.columns if col not in ['id', 'Calories']]
    X = df[feature_cols]
    y = df['Calories']

    print(f"📋 전체 특성 수: {len(feature_cols)}")

    # Feature selection; re-wrap the returned array so column names survive
    X_optimized, selected_features = optimize_features(X, y, max_features=25)
    X_optimized = pd.DataFrame(X_optimized, columns=selected_features)
    X_test_optimized = test_df[selected_features]

    # Build models
    print("🤖 확장된 모델 앙상별 구성...")
    base_models = create_enhanced_models()
    meta_candidates = create_meta_learners()

    # Choose the final estimator by cross-validation
    print("🏗️ 다층 스태킹 최적화...")
    best_stack, best_meta, best_cv_score = create_multi_level_stacking(
        base_models, meta_candidates, X_optimized, y
    )

    # RMSLE on original-scale values (predictions floored at 0.1)
    def rmsle(y_true, y_pred):
        y_pred = np.maximum(y_pred, 0.1)
        return np.sqrt(mean_squared_error(np.log1p(y_true), np.log1p(y_pred)))

    # Hold-out validation
    print("\n📚 홀드아웃 검증...")
    X_train, X_val, y_train, y_val = train_test_split(
        X_optimized, y, test_size=0.2, random_state=42
    )

    # Train on log1p(target), map predictions back with expm1
    best_stack.fit(X_train, np.log1p(y_train))
    y_pred = np.expm1(best_stack.predict(X_val))
    holdout_rmsle = rmsle(y_val, y_pred)

    print(f"🎯 Hold-out RMSLE: {holdout_rmsle:.6f}")

    # Final fit on all training rows and test prediction
    print("\n🚀 최종 모델 훈련 및 예측...")
    best_stack.fit(X_optimized, np.log1p(y))
    # expm1 of a negative log-scale prediction is negative; floor at 0 so the
    # submitted values stay in the range the (floored) hold-out metric scored.
    test_predictions = np.clip(np.expm1(best_stack.predict(X_test_optimized)), 0, None)

    # Build and write the submission
    submission = pd.DataFrame({
        'id': test_df['id'],
        'Calories': test_predictions
    })

    submission.to_csv(submission_path, index=False)

    print(f"\n✅ 제출 파일 생성: {submission_path}")
    print(f"📊 예측 통계:")
    print(f"   평균: {test_predictions.mean():.2f}")
    print(f"   범위: {test_predictions.min():.2f} ~ {test_predictions.max():.2f}")
    print(f"   표준편차: {test_predictions.std():.2f}")

    print(f"\n🏆 예상 성능: RMSLE ~{holdout_rmsle:.6f}")

    # Progress message relative to the 0.050 / 0.055 thresholds
    if holdout_rmsle < 0.050:
        print("🔥 목표 달성! 0.04대 진입!")
    elif holdout_rmsle < 0.055:
        print("💪 매우 근접! 추가 튜닝으로 달성 가능!")
    else:
        print("📈 좋은 개선! 계속 최적화 진행!")

    return submission, holdout_rmsle, selected_features

# ===============================
# 7. 실행 코드
# ===============================

if __name__ == "__main__":
    # Input locations (adjust to the local environment)
    train_path = "/Users/nelllio/Desktop/Machine_Learning/playground-series/train.csv"
    test_path = "/Users/nelllio/Desktop/Machine_Learning/playground-series/test.csv"

    # Run the pipeline; it returns the submission frame, the hold-out RMSLE
    # and the selected feature labels.
    submission, final_score, features = run_enhanced_pipeline(train_path, test_path)

    # Compare against the previous score of 0.05739
    print("\n📋 최종 결과:")
    print("   기존 성과: 0.05739")
    print(f"   예상 성과: {final_score:.6f}")
    print(f"   개선도: {0.05739 - final_score:.6f}")

    # Relative improvement is only reported when the new score is lower
    if final_score < 0.05739:
        improvement = ((0.05739 - final_score) / 0.05739) * 100
        print(f"   🎉 {improvement:.2f}% 개선!")

"""
🎯 주요 개선사항:
1. 🔧 21개의 새로운 고급 특성 추가 (기존 특성 포함 총 35개)
2. 🤖 3개 → 6개 모델로 다양성 확대
3. 🏗️ 4가지 메타 러너 중 최적 선택
4. 🎯 특성 선택 최적화 (SelectKBest)
5. ⚙️ 하이퍼파라미터 정밀 튜닝
6. 📊 다층 스태킹 구현

🚀 예상 성능: 0.05739 → 0.045-0.055
💪 목표: 0.04대 돌파!
"""

🚀 0.05739 → 0.04대 돌파 파이프라인 시작!
📊 데이터 로딩...
🔧 확장된 특성 엔지니어링...
📋 전체 특성 수: 35
🎯 선택된 특성 수: 25
🔝 Top 10 특성:
   Exercise_load: 16303298.13
   Triple_Intensity: 16303298.13
   Duration_x_HeartRate: 15988970.37
   Power_index: 15634030.11
   Duration_squared: 10229560.24
   Exercise_intensity_per_BMI: 9624436.75
   Duration_x_BodyTemp: 9034282.78
   Duration: 8794708.88
   Duration_cubed: 5579898.15
   Metabolic_Intensity: 4418224.83
🤖 확장된 모델 앙상별 구성...
🏗️ 다층 스태킹 최적화...
🔍 최적 메타 러너 탐색 중...
