In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.base import clone
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import tensorflow as tf
from tensorflow.keras import layers, models

# get csvs
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
X = train.drop(columns=['Listening_Time_minutes'])
y = np.log1p(train['Listening_Time_minutes'])


X_test = test[X.columns]


categorical = X.select_dtypes(include='object').columns.tolist()
numerical = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]), numerical),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ]), categorical)
])

X_proc = np.asarray(preprocessor.fit_transform(X))
X_test_proc = np.asarray(preprocessor.transform(X_test))
y = np.asarray(y)

# CNN shaping
X_cnn = X_proc.reshape((X_proc.shape[0], X_proc.shape[1], 1))
X_test_cnn = X_test_proc.reshape((X_test_proc.shape[0], X_test_proc.shape[1], 1))

# === CNN model ===
cnn_model = models.Sequential([
    layers.Conv1D(64, 3, activation='relu', input_shape=(X_cnn.shape[1], 1)),
    layers.Conv1D(64, 3, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-04-15 14:00:20.594101: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2025-04-15 14:00:20.594150: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 18.00 GB
2025-04-15 14:00:20.594158: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 6.00 GB
2025-04-15 14:00:20.594185: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-04-15 14:00:20.594199: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [2]:
# cnn fit
cnn_model.compile(optimizer='adam', loss='mse')
cnn_model.fit(X_cnn, y, validation_split=0.2, epochs=50, batch_size=64, verbose=1)

cnn_preds = cnn_model.predict(X_test_cnn).flatten()
cnn_oof = cnn_model.predict(X_cnn).flatten()

# tree wrapper
class ModelWrapper:
    def __init__(self, model, name):
        self.model = model
        self.name = name
        self.oof = None
        self.models = []
        self.test_preds = None

    def fit_predict(self, X, y, X_test, folds=5):
        kf = KFold(n_splits=folds, shuffle=True, random_state=42)
        self.oof = np.zeros(X.shape[0])
        self.test_preds = np.zeros(X_test.shape[0])
        
        for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

            model = clone(self.model)
            model.fit(X_train, y_train)
            self.oof[val_idx] = model.predict(X_val)
            self.test_preds += model.predict(X_test) / folds
            self.models.append(model)

        rmse = np.sqrt(mean_squared_error(y, self.oof))
        print(f"{self.name} CV RMSE: {rmse:.5f}")
        return self

# hyperparameter tuning
lgb_model = ModelWrapper(
    lgb.LGBMRegressor(n_estimators=1200, learning_rate=0.01, num_leaves=64, max_depth=8,
                      subsample=0.8, colsample_bytree=0.8, random_state=42),
    "LightGBM"
)

xgb_model = ModelWrapper(
    xgb.XGBRegressor(n_estimators=1200, learning_rate=0.01, max_depth=6,
                     subsample=0.8, colsample_bytree=0.8, random_state=42),
    "XGBoost"
)

cat_model = ModelWrapper(
    cb.CatBoostRegressor(iterations=1200, learning_rate=0.01, depth=6,
                         verbose=0, random_seed=42),
    "CatBoost"
)

# train base models
lgb_model.fit_predict(X_proc, y, X_test_proc)
xgb_model.fit_predict(X_proc, y, X_test_proc)
cat_model.fit_predict(X_proc, y, X_test_proc)

# stack preds
stacked_X = np.vstack([
    lgb_model.oof,
    xgb_model.oof,
    cat_model.oof,
    cnn_oof
]).T

stacked_test = np.vstack([
    lgb_model.test_preds,
    xgb_model.test_preds,
    cat_model.test_preds,
    cnn_preds
]).T

# meta model
meta_model = Ridge(alpha=1.0)
meta_model.fit(stacked_X, y)
final_preds = meta_model.predict(stacked_test)
final_preds = np.expm1(final_preds)  # inverse log1p

# submission csv
submission = pd.DataFrame({
    'id': test['id'],
    'Listening_Time_minutes': final_preds
})
submission.to_csv('ensemble_submission.csv', index=False)
print("\n✅ Ensemble submission saved as 'ensemble_submission.csv'")


Epoch 1/25


2025-04-15 14:00:35.466323: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 5ms/step - loss: 0.4409 - val_loss: 0.1902
Epoch 2/25
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 5ms/step - loss: 0.1907 - val_loss: 0.1858
Epoch 3/25
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 5ms/step - loss: 0.1785 - val_loss: 0.1893
Epoch 4/25
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 5ms/step - loss: 0.1736 - val_loss: 0.1844
Epoch 5/25
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 5ms/step - loss: 0.1755 - val_loss: 0.1888
Epoch 6/25
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 5ms/step - loss: 0.1736 - val_loss: 0.1723
Epoch 7/25
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 5ms/step - loss: 0.1728 - val_loss: 0.1694
Epoch 8/25
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 5ms/step - loss: 0.1695 - val_loss: 0.1750
Epoch 9/25
[1m9375/9375[0

  final_preds = np.expm1(final_preds)  # inverse log1p



✅ Ensemble submission saved as 'ensemble_submission.csv'


In [3]:
print("Submission preview:")
print(submission.head())

print("\nSubmission shape:", submission.shape)
print("Any NaNs?", submission['Listening_Time_minutes'].isna().sum())
print("All test IDs matched?", all(submission['id'] == test['id']))
print("Any infinities?", np.isinf(submission['Listening_Time_minutes']).sum())

# Save (final correct version)
submission.to_csv('ensemble_submission.csv', index=False)


Submission preview:
       id  Listening_Time_minutes
0  750000               49.894409
1  750001               17.326102
2  750002               46.652108
3  750003               78.203118
4  750004               47.110390

Submission shape: (250000, 2)
Any NaNs? 0
All test IDs matched? True
Any infinities? 1


In [4]:
# Replace infinities with a high reasonable value or the max non-infinite prediction
finite_values = submission['Listening_Time_minutes'].replace([np.inf, -np.inf], np.nan).dropna()
max_valid = finite_values.max()

submission['Listening_Time_minutes'] = submission['Listening_Time_minutes'].replace([np.inf, -np.inf], max_valid)


In [7]:
submission.to_csv('ensemble_submission2.csv', index=False)
print("✅ Fixed and saved: 'ensemble_submission2.csv'")



✅ Fixed and saved: 'ensemble_submission2.csv'
