In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
   386.6/386.6 kB 6.9 MB/s eta 0:00:00
Downloading alembic-1.15.2-py3-none-any.whl (231 kB)
   231.9/231.9 kB 13.8 MB/s eta 0:00:00
Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import optuna
import warnings
warnings.filterwarnings('ignore')

# Load the data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Exploratory Data Analysis
# NOTE: the headings use a real newline ("\n"); the previous "\\n" printed a
# literal backslash-n in front of every heading.
print("\nTrain data info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly instead of being wrapped in print() (which added "None").
train.info()

print("\nTrain data description:")
print(train.describe())

print("\nMissing values in train data:")
print(train.isnull().sum())

# Check the target variable distribution and save the figure to disk
plt.figure(figsize=(10, 6))
sns.histplot(train['Listening_Time_minutes'], kde=True)
plt.title('Distribution of Listening Time')
plt.savefig('listening_time_distribution.png')
plt.close()  # release the figure; only the saved PNG is needed

# Feature Engineering

# Separate the identifier and target columns from the predictors
target = 'Listening_Time_minutes'

# Rows without a target value cannot be used for supervised training, and a NaN
# in y makes the scikit-learn estimators fail at fit time, so drop them before
# the target is split off. The index is reset so X and y stay positionally aligned.
train = train.dropna(subset=[target]).reset_index(drop=True)

train_id = train['id']
test_id = test['id']
y_train = train[target]

# Drop id and target from train, id from test
train = train.drop(['id', target], axis=1)
test = test.drop(['id'], axis=1)

# Identify numeric and categorical features by dtype
numeric_features = train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = train.select_dtypes(include=['object']).columns.tolist()

# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
print(f"\nNumeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")
# Create new features
def create_features(df, numeric_cols=None, categorical_cols=None):
    """Return a copy of ``df`` extended with engineered feature columns.

    The input frame is left untouched, so calling this function repeatedly on
    the same raw frame always yields the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw predictor columns.
    numeric_cols : list of str, optional
        Numeric columns to combine pairwise. Defaults to the module-level
        ``numeric_features`` list.
    categorical_cols : list of str, optional
        Columns for which a string-length feature is added. Defaults to the
        module-level ``categorical_features`` list.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus:
        * ``Release_Year/Month/Day/DayOfWeek`` when a ``Release_Date`` column exists,
        * ``<col>_Length`` for every listed categorical column of object dtype,
        * ``<a>_times_<b>``, ``<a>_plus_<b>``, ``<a>_minus_<b>``, ``<a>_div_<b>``
          for every unordered pair of listed numeric columns.
    """
    # Fall back to the notebook-level feature lists when none are passed in
    if numeric_cols is None:
        numeric_cols = numeric_features
    if categorical_cols is None:
        categorical_cols = categorical_features
    numeric_cols = list(numeric_cols)

    # Work on a copy so the caller's frame is never mutated
    df = df.copy()

    # 1. Calendar parts of the release date, if such a column is present
    if 'Release_Date' in df.columns:
        df['Release_Date'] = pd.to_datetime(df['Release_Date'])
        df['Release_Year'] = df['Release_Date'].dt.year
        df['Release_Month'] = df['Release_Date'].dt.month
        df['Release_Day'] = df['Release_Date'].dt.day
        df['Release_DayOfWeek'] = df['Release_Date'].dt.dayofweek

    # 2. Length of the string representation of each object-typed column
    #    (missing values are stringified first, so they get a length too)
    for col in categorical_cols:
        if df[col].dtype == 'object':
            df[f'{col}_Length'] = df[col].astype(str).apply(len)

    # 3. Pairwise interactions between numeric columns (each pair once)
    for i, col1 in enumerate(numeric_cols):
        for col2 in numeric_cols[i + 1:]:
            df[f'{col1}_times_{col2}'] = df[col1] * df[col2]
            df[f'{col1}_plus_{col2}'] = df[col1] + df[col2]
            df[f'{col1}_minus_{col2}'] = df[col1] - df[col2]
            # Small offset in the denominator avoids division by zero
            df[f'{col1}_div_{col2}'] = df[col1] / (df[col2] + 1e-5)

    return df

# Run the same feature engineering on both splits (train first, then test)
train, test = (create_features(frame) for frame in (train, test))

# The engineered columns change the dtype groups, so recompute both lists
numeric_features = list(train.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(train.select_dtypes(include=['object']).columns)

# Numeric branch: fill gaps with the median, then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the mode, then one-hot encode;
# categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Model Selection and Training

# Function to evaluate models with cross-validation
def evaluate_model(model, X, y, cv=5):
    """Cross-validate ``model`` and summarise its RMSE.

    Uses a shuffled ``KFold`` with ``cv`` splits and a fixed seed of 42.

    Returns
    -------
    tuple
        ``(mean_rmse, std_rmse)`` over the folds.
    """
    splitter = KFold(n_splits=cv, shuffle=True, random_state=42)
    # The scorer returns negated RMSE values, so flip the sign back
    neg_rmse_per_fold = cross_val_score(
        model,
        X,
        y,
        cv=splitter,
        scoring='neg_root_mean_squared_error',
    )
    rmse_per_fold = -neg_rmse_per_fold
    return rmse_per_fold.mean(), rmse_per_fold.std()

# Prepare data for modeling (copies, so later steps cannot alter the originals)
X = train.copy()
y = y_train.copy()

# Candidate models, all with their baseline (untuned) settings
models = {
    'Ridge': Ridge(alpha=1.0, random_state=42),
    'Lasso': Lasso(alpha=0.001, random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    'LightGBM': lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
}

# Evaluate each model with cross-validation; results maps name -> mean RMSE
# ("\n" instead of "\\n" so the heading is preceded by a blank line, not a literal backslash-n)
print("\nModel Evaluation:")
results = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    rmse_mean, rmse_std = evaluate_model(pipeline, X, y)
    results[name] = rmse_mean
    print(f"{name}: RMSE = {rmse_mean:.4f} (±{rmse_std:.4f})")

# The best model is the one with the lowest mean RMSE
best_model_name = min(results, key=results.get)
print(f"\nBest model: {best_model_name} with RMSE = {results[best_model_name]:.4f}")

# Hyperparameter tuning with Optuna for the best model
def objective(trial):
    """Optuna objective: cross-validated RMSE of the best model family.

    Builds an estimator of the family named by the module-level
    ``best_model_name`` with hyperparameters suggested by ``trial``, wraps it
    in a pipeline together with the shared ``preprocessor`` and returns the
    mean RMSE reported by ``evaluate_model`` on ``X`` / ``y`` (lower is better).
    Any name other than the five listed explicitly is treated as LightGBM.
    """
    if best_model_name == 'Ridge':
        model = Ridge(
            alpha=trial.suggest_float('alpha', 0.01, 10.0, log=True),
            random_state=42
        )
    elif best_model_name == 'Lasso':
        model = Lasso(
            alpha=trial.suggest_float('alpha', 0.0001, 1.0, log=True),
            random_state=42
        )
    elif best_model_name == 'RandomForest':
        model = RandomForestRegressor(
            n_estimators=trial.suggest_int('n_estimators', 50, 300),
            max_depth=trial.suggest_int('max_depth', 3, 15),
            min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
            min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
            random_state=42
        )
    elif best_model_name == 'GradientBoosting':
        model = GradientBoostingRegressor(
            n_estimators=trial.suggest_int('n_estimators', 50, 300),
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            max_depth=trial.suggest_int('max_depth', 3, 10),
            min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
            min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
            subsample=trial.suggest_float('subsample', 0.5, 1.0),
            random_state=42
        )
    elif best_model_name == 'XGBoost':
        model = xgb.XGBRegressor(
            n_estimators=trial.suggest_int('n_estimators', 50, 300),
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            max_depth=trial.suggest_int('max_depth', 3, 10),
            subsample=trial.suggest_float('subsample', 0.5, 1.0),
            colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
            min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
            random_state=42
        )
    else:  # LightGBM
        model = lgb.LGBMRegressor(
            n_estimators=trial.suggest_int('n_estimators', 50, 300),
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            max_depth=trial.suggest_int('max_depth', 3, 10),
            num_leaves=trial.suggest_int('num_leaves', 20, 100),
            subsample=trial.suggest_float('subsample', 0.5, 1.0),
            # LightGBM only applies row subsampling when subsample_freq > 0
            # (its default of 0 disables bagging), so it is tuned alongside
            # `subsample`; being a suggested value it is also recorded in
            # study.best_params for whoever rebuilds the model from them.
            subsample_freq=trial.suggest_int('subsample_freq', 1, 7),
            colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
            min_child_samples=trial.suggest_int('min_child_samples', 5, 100),
            random_state=42
        )

    # Same preprocessing as in the baseline comparison, so scores are comparable
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    rmse_mean, _ = evaluate_model(pipeline, X, y)
    return rmse_mean

# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
print("\nHyperparameter tuning for the best model...")
# Seed the sampler so that re-running the notebook proposes the same trials
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

print(f"Best trial: RMSE = {study.best_value:.4f}")
print("Best hyperparameters:", study.best_params)

# Train the final model with the best hyperparameters.
# Every family is built the same way (best params + fixed seed), so a lookup
# table replaces the repeated if/elif chain; as before, any name that is not
# listed falls back to LightGBM.
model_classes = {
    'Ridge': Ridge,
    'Lasso': Lasso,
    'RandomForest': RandomForestRegressor,
    'GradientBoosting': GradientBoostingRegressor,
    'XGBoost': xgb.XGBRegressor,
}
final_model_class = model_classes.get(best_model_name, lgb.LGBMRegressor)
final_model = final_model_class(**study.best_params, random_state=42)

# Create the final pipeline
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', final_model)
])

# Train on the full training data
final_pipeline.fit(X, y)

# Make predictions on the test set
test_predictions = final_pipeline.predict(test)

# Create submission file (one row per test id)
submission = pd.DataFrame({
    'id': test_id,
    'Listening_Time_minutes': test_predictions
})

submission.to_csv('submission.csv', index=False)
print("\nSubmission file created!")

# Feature importance report — only for estimators exposing feature_importances_
if hasattr(final_model, 'feature_importances_'):
    # Rebuild the post-preprocessing column names, branch by branch
    feature_names = []
    for branch_name, branch, branch_columns in preprocessor.transformers_:
        if name_is_numeric := (branch_name == 'num'):
            # Numeric columns keep their names
            feature_names.extend(branch_columns)
        elif branch_name == 'cat':
            # Each categorical column expands to one name per encoder category
            encoder = branch.named_steps['onehot']
            for column, column_categories in zip(branch_columns, encoder.categories_):
                for category in column_categories:
                    feature_names.append(f"{column}_{category}")

    importances = final_model.feature_importances_

    # Only report when every importance can be matched to a name
    if len(feature_names) == len(importances):
        indices = np.argsort(importances)[::-1]
        top_features = []
        for position in indices[:20]:
            top_features.append((feature_names[position], importances[position]))

        print("\\nTop 20 important features:")
        for feature, importance in top_features:
            print(f"{feature}: {importance:.4f}")

        # Horizontal bar chart of the top features, saved to disk
        bar_positions = range(len(top_features))
        bar_labels = [label for label, _ in top_features]
        bar_widths = [width for _, width in top_features]

        plt.figure(figsize=(12, 8))
        plt.title('Feature Importances')
        plt.barh(bar_positions, bar_widths, align='center')
        plt.yticks(bar_positions, bar_labels)
        plt.xlabel('Importance')
        plt.tight_layout()
        plt.savefig('feature_importances.png')
        plt.close()

# Stacking multiple models for better performance
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression

# Define base models (baseline settings, fixed seeds)
base_models = [
    ('ridge', Ridge(alpha=1.0, random_state=42)),
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ('xgb', xgb.XGBRegressor(n_estimators=100, random_state=42)),
    ('lgb', lgb.LGBMRegressor(n_estimators=100, random_state=42))
]

# Create stacking regressor: a linear model combines the base models'
# 5-fold cross-validated predictions
stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=LinearRegression(),
    cv=5
)

# Create stacking pipeline (same preprocessing as the single models)
stacking_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('stacking', stacking_regressor)
])

# Evaluate stacking model
stacking_rmse, stacking_std = evaluate_model(stacking_pipeline, X, y)
# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
print(f"\nStacking Model: RMSE = {stacking_rmse:.4f} (±{stacking_std:.4f})")

# The single-model submission comes from the Optuna-tuned model, so stacking
# has to beat both the untuned baseline score and the tuned score to count as
# better than the best single model.
best_single_rmse = min(results[best_model_name], study.best_value)

# If stacking is better, also write a submission from it
if stacking_rmse < best_single_rmse:
    print("Stacking model is better than the best single model!")

    # Train stacking model on full data
    stacking_pipeline.fit(X, y)

    # Make predictions
    stacking_predictions = stacking_pipeline.predict(test)

    # Create submission file
    stacking_submission = pd.DataFrame({
        'id': test_id,
        'Listening_Time_minutes': stacking_predictions
    })

    stacking_submission.to_csv('stacking_submission.csv', index=False)
    print("Stacking submission file created!")

print("\nModel training and prediction completed!")

Train shape: (662635, 12)
Test shape: (250000, 11)
\nTrain data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 662635 entries, 0 to 662634
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           662635 non-null  int64  
 1   Podcast_Name                 662635 non-null  object 
 2   Episode_Title                662635 non-null  object 
 3   Episode_Length_minutes       585556 non-null  float64
 4   Genre                        662634 non-null  object 
 5   Host_Popularity_percentage   662634 non-null  float64
 6   Publication_Day              662634 non-null  object 
 7   Publication_Time             662634 non-null  object 
 8   Guest_Popularity_percentage  533675 non-null  float64
 9   Number_of_Ads                662633 non-null  float64
 10  Episode_Sentiment            662634 non-null  object 
 11  Listening_Time_minutes       662634 non-null  f