In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import catboost
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

all_df = pd.concat([train_df.drop('song_popularity', axis=1), test_df], axis=0)
categorical_features = ['key', 'audio_mode', 'time_signature']
numerical_features = [col for col in train_df.columns if col not in ['id', 'song_popularity'] + categorical_features]

for col in numerical_features:
    median_val = all_df[col].median()
    train_df[col].fillna(median_val, inplace=True)
    test_df[col].fillna(median_val, inplace=True)

for col in categorical_features:
    mode_val = all_df[col].mode()[0]
    train_df[col].fillna(mode_val, inplace=True)
    test_df[col].fillna(mode_val, inplace=True)

print('Missing values handled.')

Missing values handled.


In [3]:
def create_features(df):
    df['loudness_to_energy'] = df['loudness'] / (df['energy'] + 1e-6)
    df['dance_energy_product'] = df['danceability'] * df['energy']
    df['valence_acoustic_product'] = df['audio_valence'] * df['acousticness']
    df['speechiness_liveness_sum'] = df['speechiness'] + df['liveness']
    df['tempo_loudness_ratio'] = df['tempo'] / (df['loudness'] * -1 + 1e-6)
    return df

train_df = create_features(train_df)
test_df = create_features(test_df)

for col in categorical_features:
    train_df[col] = train_df[col].astype('category')
    test_df[col] = test_df[col].astype('category')

print('New features created.')
train_df.head()

New features created.


Unnamed: 0,id,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,song_popularity,loudness_to_energy,dance_energy_product,valence_acoustic_product,speechiness_liveness_sum,tempo_loudness_ratio
0,0,212990.0,0.642286,0.85652,0.707073,0.002001,10.0,0.135947,-5.619088,0,0.08257,158.386236,4,0.734642,0,-7.94696,0.605622,0.47185,0.218517,28.187174
1,1,186660.0,0.054866,0.733289,0.835545,0.000996,8.0,0.436428,-5.236965,1,0.127358,102.752988,3,0.711531,1,-6.267717,0.612696,0.039039,0.563787,19.620709
2,2,193213.0,0.140532,0.188387,0.783524,-0.002694,5.0,0.170499,-4.951759,0,0.052282,178.685791,3,0.425536,0,-6.319851,0.147606,0.059802,0.22278,36.08531
3,3,249893.0,0.48866,0.585234,0.552685,0.000608,0.0,0.094805,-7.893694,0,0.035618,128.71563,3,0.453597,0,-14.282423,0.32345,0.221655,0.130423,16.306131
4,4,165969.0,0.493017,0.608234,0.740982,0.002033,10.0,0.094891,-2.684095,0,0.050746,121.928157,4,0.741311,0,-3.622345,0.45069,0.365479,0.145637,45.426155


In [4]:
X = train_df.drop(['id', 'song_popularity'], axis=1)
y = train_df['song_popularity']
target_counts = train_df['song_popularity'].value_counts(normalize=True)
scale_pos_weight = target_counts[0] / target_counts[1]

def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'n_estimators': 1000,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'scale_pos_weight': scale_pos_weight,
        'random_state': 42
    }

    kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    oof_aucs = []
    for train_index, val_index in kf.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        model = lgb.LGBMClassifier(**params)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], 
                  callbacks=[lgb.early_stopping(50, verbose=False)])
        oof_probs = model.predict_proba(X_val)[:, 1]
        oof_aucs.append(roc_auc_score(y_val, oof_probs))

    return np.mean(oof_aucs)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30) # Increase n_trials for better results

print('Best trial found:')
best_params = study.best_trial.params
print(f"AUC: {study.best_value}")
print(best_params)

[I 2025-10-06 22:55:35,692] A new study created in memory with name: no-name-d6b04024-cbd1-427b-ab5a-8761801ed13e
[I 2025-10-06 22:55:36,328] Trial 0 finished with value: 0.559595703988061 and parameters: {'learning_rate': 0.04419458748622545, 'num_leaves': 66, 'max_depth': 10, 'colsample_bytree': 0.9598524186566759, 'subsample': 0.9429105933577766, 'reg_alpha': 0.2442632440125377, 'reg_lambda': 1.0794587407903637}. Best is trial 0 with value: 0.559595703988061.
[I 2025-10-06 22:55:36,950] Trial 1 finished with value: 0.5624067048614733 and parameters: {'learning_rate': 0.05530177185496359, 'num_leaves': 54, 'max_depth': 15, 'colsample_bytree': 0.7714006891170913, 'subsample': 0.7583743909860676, 'reg_alpha': 0.02534788014092775, 'reg_lambda': 8.989165236044203}. Best is trial 1 with value: 0.5624067048614733.
[I 2025-10-06 22:55:37,746] Trial 2 finished with value: 0.5625477665352518 and parameters: {'learning_rate': 0.012621589855269197, 'num_leaves': 81, 'max_depth': 8, 'colsample_b

Best trial found:
AUC: 0.5664302906784361
{'learning_rate': 0.023108819106936934, 'num_leaves': 20, 'max_depth': 5, 'colsample_bytree': 0.647220741116671, 'subsample': 0.8611236125736238, 'reg_alpha': 6.645908172096258, 'reg_lambda': 0.020046242327485465}


In [5]:
from sklearn.model_selection import StratifiedKFold
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import catboost

X_test = test_df.drop('id', axis=1)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

lgb_preds = np.zeros(len(test_df))
xgb_preds = np.zeros(len(test_df))
cat_preds = np.zeros(len(test_df))

# Add necessary params to best_params from Optuna
final_lgb_params = best_params.copy()
final_lgb_params.update({
    'objective': 'binary',
    'metric': 'auc',
    'verbosity': -1,
    'n_estimators': 2000,
    'random_state': 42,
    'scale_pos_weight': scale_pos_weight
})

for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
    print(f'--- Fold {fold+1} ---')
    X_train, X_val = X.iloc[train_index].copy(), X.iloc[val_index].copy()
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # categorical dtypes for all models
    for col in categorical_features:
        X_train[col] = X_train[col].astype('category')
        X_val[col] = X_val[col].astype('category')
        X_test[col] = X_test[col].astype('category')

    # LightGBM
    lgb_model = lgb.LGBMClassifier(**final_lgb_params)
    lgb_model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  callbacks=[lgb.early_stopping(100, verbose=False)])
    lgb_preds += lgb_model.predict_proba(X_test)[:, 1] / kf.n_splits

    # XGBoost
    xgb_model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        use_label_encoder=False,
        random_state=42,
        scale_pos_weight=scale_pos_weight,
        enable_categorical=True
    )
    xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    xgb_preds += xgb_model.predict_proba(X_test)[:, 1] / kf.n_splits

    # CatBoost
    X_train_cb = X_train.copy()
    X_val_cb = X_val.copy()
    X_test_cb = X_test.copy()
    for col in categorical_features:
        X_train_cb[col] = X_train_cb[col].astype(str)
        X_val_cb[col] = X_val_cb[col].astype(str)
        X_test_cb[col] = X_test_cb[col].astype(str)

    cat_model = catboost.CatBoostClassifier(
        verbose=0,
        random_state=42,
        scale_pos_weight=scale_pos_weight,
        eval_metric='AUC'
    )
    cat_model.fit(
        X_train_cb, y_train,
        cat_features=categorical_features,
        eval_set=(X_val_cb, y_val),
        early_stopping_rounds=100,
        use_best_model=True
    )
    cat_preds += cat_model.predict_proba(X_test_cb)[:, 1] / kf.n_splits




--- Fold 1 ---
--- Fold 2 ---
--- Fold 3 ---
--- Fold 4 ---
--- Fold 5 ---


In [6]:
# Weighted average of model predictions (probabilities)
ensemble_probs = (lgb_preds * 0.75) + (xgb_preds * 0.125) + (cat_preds * 0.125)

# Apply threshold to get final 0/1 predictions
final_preds = (ensemble_probs > 0.51).astype(int) # Matching no of 0s and 1s in train

submission_df = pd.DataFrame({'id': test_df['id'], 'song_popularity': final_preds})
submission_df.to_csv('submission_advanced.csv', index=False)

print('percentage of ones in final predictions:', submission_df['song_popularity'].mean())

percentage of ones in final predictions: 0.3667


In [8]:
# Number of ones in train

print('Number of ones in training set:', train_df['song_popularity'].sum())
print('Number of zeros in training set:', len(train_df) - train_df['song_popularity'].sum())

print('percentage of ones:', train_df['song_popularity'].mean())

Number of ones in training set: 10932
Number of zeros in training set: 19068
percentage of ones: 0.3644
