In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from xgboost import XGBRegressor
from category_encoders import TargetEncoder

# 1. Load data
# Read the three competition CSVs in one pass; the order of the paths matches
# the order of the names on the left-hand side.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/spotify-da-ml/train.csv',
        '/kaggle/input/spotify-da-ml/test.csv',
        '/kaggle/input/spotify-da-ml/sample_submission.csv',
    )
)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [None]:
# 2. Target encoding (track_genre) with smoothing
# Train rows are encoded out-of-fold: each row's genre value comes from an
# encoder that was fit WITHOUT that row. Fitting on all of train and then
# transforming the same rows would fold every row's own popularity into its
# feature (target leakage) and make the later CV / OOF scores look too good.
TE_SMOOTHING = 10
te_kf = KFold(n_splits=5, shuffle=True, random_state=42)
genre_te_oof = pd.Series(np.nan, index=train.index, dtype='float64')
for fit_idx, enc_idx in te_kf.split(train):
    fold_te = TargetEncoder(cols=['track_genre'], smoothing=TE_SMOOTHING)
    fold_te.fit(
        train[['track_genre']].iloc[fit_idx],
        train['popularity'].iloc[fit_idx],
    )
    # Positional assignment: enc_idx are row positions, not index labels.
    genre_te_oof.iloc[enc_idx] = (
        fold_te.transform(train[['track_genre']].iloc[enc_idx])['track_genre'].to_numpy()
    )

# The test set never contributed targets, so it is encoded with an encoder
# fit on the full training data.
te = TargetEncoder(cols=['track_genre'], smoothing=TE_SMOOTHING)
te.fit(train[['track_genre']], train['popularity'])
train['track_genre_te'] = genre_te_oof
test['track_genre_te'] = te.transform(test[['track_genre']])['track_genre'].to_numpy()

# The raw categorical column is no longer needed once it has been encoded.
train = train.drop(columns=['track_genre'])
test = test.drop(columns=['track_genre'])


In [None]:
# 3. Derived features
# The same three columns are added to both frames, in the same order.
for frame in (train, test):
    # log(1 + duration_ms)
    frame['log_duration'] = np.log1p(frame['duration_ms'])
    # Product of danceability and energy.
    frame['dance_energy'] = frame['danceability'] * frame['energy']
    # Loudness per unit tempo; 1e-5 is added to the denominator before dividing.
    frame['loudness_tempo'] = frame['loudness'] / (frame['tempo'] + 1e-5)

In [None]:
# 4. Interaction features (PolynomialFeatures)
audio_cols = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'log_duration',
]

# degree=2 + interaction_only=True + include_bias=False: the transformer is
# fit on train only and then applied to both frames.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly.fit(train[audio_cols])
poly_feats = poly.transform(train[audio_cols])
poly_cols = poly.get_feature_names_out(audio_cols)

train_poly = pd.DataFrame(data=poly_feats, index=train.index, columns=poly_cols)
test_poly = pd.DataFrame(
    data=poly.transform(test[audio_cols]),
    index=test.index,
    columns=poly_cols,
)

# Generated names that contain a space are the pairwise products; only the
# first 15 of them (in generation order) are kept.
pairwise_names = [name for name in poly_cols if ' ' in name]
interaction_cols = pairwise_names[:15]

# Append the selected interaction columns next to the existing features.
train = pd.concat([train, train_poly[interaction_cols]], axis=1)
test = pd.concat([test, test_poly[interaction_cols]], axis=1)


In [None]:
# 5. Clipping & scaling
raw_cols = audio_cols + ['dance_energy', 'loudness_tempo']

# Clip each raw column to the train 1st / 99th percentiles; the bounds learned
# on train are reused unchanged for test.
for col in raw_cols:
    low = train[col].quantile(0.01)
    high = train[col].quantile(0.99)
    train[col] = train[col].clip(lower=low, upper=high)
    test[col] = test[col].clip(lower=low, upper=high)

# scale_cols is the feature list used for modelling. Note that
# 'track_genre_te' is part of it but is NOT passed through the scaler below.
scale_cols = ['track_genre_te'] + raw_cols + interaction_cols

# Standardise the raw and interaction columns with statistics from train only.
standardised_cols = raw_cols + interaction_cols
scaler = StandardScaler()
scaler.fit(train[standardised_cols])
train[standardised_cols] = scaler.transform(train[standardised_cols])
test[standardised_cols] = scaler.transform(test[standardised_cols])


In [None]:
# 6. Prepare model inputs
# Train and test feature matrices share the same column list and order.
X, X_test = train[scale_cols], test[scale_cols]
# Regression target.
y = train['popularity']

In [None]:
# 7. XGBoost model & randomized hyper-parameter search
kf = KFold(n_splits=5, shuffle=True, random_state=42)
param_dist = {
    'n_estimators': [600,800,1000],
    'learning_rate': [0.01,0.02,0.03],
    'max_depth': [6,8,10],
    'min_child_weight': [1,2,4],
    'subsample': [0.6,0.75,0.9],
    'colsample_bytree': [0.6,0.75,0.9],
    'gamma': [0,0.1,0.2],
    'reg_alpha': [0,0.05,0.1],
    'reg_lambda': [1,2,3]
}
# Early stopping is deliberately NOT configured on the search estimator.
# RandomizedSearchCV forwards the same fit kwargs to every CV fit, so it cannot
# give each fold its own held-out eval_set. The only eval_set that could be
# passed is the full (X, y), which contains each fold's validation rows (they
# would leak into the stopping decision) and forces a full-data evaluation on
# every boosting round. The number of trees is tuned via 'n_estimators' instead.
xgb_base = XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',
    grow_policy='lossguide',
    max_bin=256,
    random_state=42,
    verbosity=0,
)
rs = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_dist,
    n_iter=20,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    verbose=1,
    random_state=42,
    n_jobs=1
)
rs.fit(X, y)
# The scorer is negated RMSE, so flip the sign for reporting.
print('Best CV RMSE:', -rs.best_score_)
print('Best Params:', rs.best_params_)

# The OOF loop in step 8 refits rs.best_estimator_ with a per-fold eval_set.
# Restore the 50-round early stopping here so it applies there, where the
# eval_set is a genuine hold-out for the rows being trained on.
rs.best_estimator_.set_params(early_stopping_rounds=50)


In [None]:
# 8. Out-of-fold validation and test prediction
n_train_rows, n_test_rows = X.shape[0], X_test.shape[0]
oof = np.zeros(n_train_rows)    # one out-of-fold prediction per training row
preds = np.zeros(n_test_rows)   # running average of the per-fold test predictions
best = rs.best_estimator_

for tr_idx, val_idx in kf.split(X, y):
    X_fit, y_fit = X.iloc[tr_idx], y.iloc[tr_idx]
    X_hold, y_hold = X.iloc[val_idx], y.iloc[val_idx]

    # Refit the same estimator object on this fold; the held-out part is
    # passed as the eval_set.
    best.fit(X_fit, y_fit, eval_set=[(X_hold, y_hold)], verbose=False)

    oof[val_idx] = best.predict(X_hold)
    # Each fold contributes 1 / n_splits of the final test prediction.
    preds += best.predict(X_test) / kf.n_splits

rmse_oof = np.sqrt(mean_squared_error(y, oof))
print(f'OOF CV RMSE: {rmse_oof:.4f}')

In [None]:
# 9. Submission file
out_path = 'xgb_final_160.csv'
# Overwrite the placeholder target column with the fold-averaged predictions.
submission['popularity'] = preds
submission.to_csv(out_path, index=False)
print('✅ xgb_final_160.csv 생성 완료!')