In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.svm import SVR


In [2]:

# ---------- 1. Load and Prepare Data ----------
def load_preprocess(train_path='Data/train.csv', test_path='Data/test.csv'):
    """Read the train/test CSVs, clean them together, and split them again.

    Both files are stacked into one frame so that imputation values and the
    categorical levels are computed once and shared by the two outputs.

    Parameters
    ----------
    train_path : str or path-like, default 'Data/train.csv'
        CSV holding the labelled rows (includes ``Listening_Time_minutes``).
    test_path : str or path-like, default 'Data/test.csv'
        CSV holding the rows to predict.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        Cleaned frames, each indexed 0..n-1. ``Episode_Title`` is replaced by
        a numeric ``Episode_Number``; ``test_df`` has no
        ``Listening_Time_minutes`` column.

    Notes
    -----
    The ``Episode_Length_minutes`` median used for imputation is taken over
    train and test rows combined.
    """
    categorical_cols = ["Podcast_Name", "Genre", "Publication_Day",
                        "Publication_Time", "Episode_Sentiment"]

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Marker column so the stacked frame can be split back apart below.
    train_df['is_train'] = 1
    test_df['is_train'] = 0
    # ignore_index gives every stacked row a unique label; without it the
    # train and test labels collide and label-aligned assignment is fragile.
    full_df = pd.concat([train_df, test_df], ignore_index=True)

    # Clean specific columns
    ads = full_df["Number_of_Ads"].fillna(0)
    # Any ad count outside {0, 1, 2, 3} is replaced by 1.
    full_df["Number_of_Ads"] = ads.where(ads.isin([0, 1, 2, 3]), 1)
    # expand=False returns a Series (first run of digits in the title)
    # rather than a one-column DataFrame.
    full_df["Episode_Number"] = (
        full_df["Episode_Title"].str.extract(r"(\d+)", expand=False).astype(float)
    )
    full_df["Guest_Popularity_percentage"] = full_df["Guest_Popularity_percentage"].fillna(0)
    length_median = full_df["Episode_Length_minutes"].median()
    full_df["Episode_Length_minutes"] = full_df["Episode_Length_minutes"].fillna(length_median)
    full_df = full_df.drop(columns=["Episode_Title"])

    # Categorical encoding: done on the stacked frame so both outputs share
    # the same category levels.
    full_df = full_df.astype({col: 'category' for col in categorical_cols})

    # Split again; reset_index restores the 0..n-1 labels each CSV had on load.
    train_df = (
        full_df[full_df['is_train'] == 1]
        .drop(columns=['is_train'])
        .reset_index(drop=True)
    )
    test_df = (
        full_df[full_df['is_train'] == 0]
        .drop(columns=['is_train', 'Listening_Time_minutes'])
        .reset_index(drop=True)
    )

    return train_df, test_df


In [3]:

# ---------- 2. Model and Transform Evaluation ----------
def evaluate_model(name, model, X, y, inverse_transform=None, y_true=None):
    """Fit ``model`` on ``(X, y)`` and report its RMSE on that same data.

    The score is an in-sample (training) error: the model predicts the very
    rows it was just fitted on, so it describes fit rather than
    generalization.

    Parameters
    ----------
    name : str
        Label used in the printed report line.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X : array-like
        Features, passed unchanged to ``fit`` and ``predict``.
    y : array-like
        Target the model is trained on; may already be transformed
        (e.g. ``np.log1p(target)``).
    inverse_transform : callable, optional
        Maps values from the scale of ``y`` back to the original target
        scale. When given, predictions are mapped back before scoring.
    y_true : array-like, optional
        Target on the original scale to score against. Only consulted when
        ``inverse_transform`` is given; defaults to ``inverse_transform(y)``.

    Returns
    -------
    float
        Root mean squared error, which is also printed.
    """
    model.fit(X, y)
    preds = model.predict(X)

    if inverse_transform is None:
        target = y
    else:
        preds = inverse_transform(preds)
        # Score on the original scale without depending on a notebook-level
        # variable: use the caller's y_true if supplied, otherwise undo the
        # transform on y itself.
        target = inverse_transform(y) if y_true is None else y_true

    rmse = np.sqrt(mean_squared_error(target, preds))
    print(f"{name} RMSE: {rmse:.4f}")
    return rmse



In [9]:
# ---------- 3. Main Execution ----------
train_df, test_df = load_preprocess()

# Features: every column except the target and the row identifier.
X = train_df.drop(["Listening_Time_minutes", "id"], axis=1)
y = train_df["Listening_Time_minutes"]
# Untransformed copy of the target, kept for scoring on the original scale.
y_true = y.copy()

# Base XGB: one estimator instance, refitted by every evaluate_model call.
base_model = xgb.XGBRegressor(
    n_estimators=10000,
    learning_rate=0.02,
    max_depth=6,
    min_child_weight=10,
    subsample=0.8,
    colsample_bytree=0.5,
    enable_categorical=True,  # X carries pandas `category` columns
    verbosity=0,
)

rmse_xgb = evaluate_model("XGBoost (original)", base_model, X, y)


XGBoost (original) RMSE: 12.0839


np.float64(12.08390804635549)

In [10]:
# Log transform: train on log1p(y); expm1 maps predictions back for scoring.
rmse_log = evaluate_model(
    name="Log Transform",
    model=base_model,
    X=X,
    y=np.log1p(y),
    inverse_transform=np.expm1,
)


Log Transform RMSE: 12.8522


np.float64(12.85217391363556)

In [11]:
# Sqrt transform: train on sqrt(y); squaring maps predictions back for scoring.
rmse_sqrt = evaluate_model(
    name="Sqrt Transform",
    model=base_model,
    X=X,
    y=np.sqrt(y),
    inverse_transform=lambda root: root**2,
)


Sqrt Transform RMSE: 12.2526


np.float64(12.25264517989383)

In [None]:
# Runs the same three evaluations as the cells above in one go; every call
# refits base_model from scratch and rebinds the corresponding rmse_* name.
rmse_xgb = evaluate_model(name="XGBoost (original)", model=base_model, X=X, y=y)
rmse_log = evaluate_model(
    name="Log Transform",
    model=base_model,
    X=X,
    y=np.log1p(y),
    inverse_transform=np.expm1,
)
rmse_sqrt = evaluate_model(
    name="Sqrt Transform",
    model=base_model,
    X=X,
    y=np.sqrt(y),
    inverse_transform=lambda root: root**2,
)


XGBoost (original) RMSE: 12.0839
Log Transform RMSE: 12.8522


In [12]:
# Tabulate the RMSE of each target treatment and write it to
# model_comparison.csv in the working directory.
results = {
    "Model": ["XGBoost", "Log", "Sqrt"],
    "RMSE": [rmse_xgb, rmse_log, rmse_sqrt],
}
pd.DataFrame(data=results).to_csv(path_or_buf="model_comparison.csv", index=False)

NameError: name 'rmse_xgb' is not defined

In [None]:
# Stacking Regressor
# SVR, RandomForestRegressor and the LinearRegression meta-learner (which also
# receives the input features because passthrough=True) require numeric,
# NaN-free input. X still holds pandas `category` columns and may contain NaN
# (e.g. Episode_Number when a title has no digits), so encode, impute and
# scale once in front of the whole stack.
# NOTE(review): assumes the categorical columns hold strings - confirm.
preprocess = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"),
     make_column_selector(dtype_include="category")),
    ("num", make_pipeline(SimpleImputer(strategy="median"), StandardScaler()),
     make_column_selector(dtype_include=np.number)),
])

estimators = [
    ('xgb', xgb.XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=4, enable_categorical=True, verbosity=0)),
    # NOTE(review): kernel SVR can be very slow on large training sets -
    # consider fitting it on a subsample; confirm dataset size.
    ('svr', SVR(kernel='rbf', C=1.0, epsilon=0.1)),
    ('rf', RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42))
]

# The pipeline exposes fit/predict, so evaluate_model can use it like any
# other estimator.
stacked_model = make_pipeline(
    preprocess,
    StackingRegressor(estimators=estimators, final_estimator=LinearRegression(), passthrough=True),
)
evaluate_model("Stacked Model", stacked_model, X, y)
