In [None]:
!pip install catboost==1.1
!pip install optuna==3.0
!pip install sktime==0.13 # Although not directly used in this version, keeping as per previous code
!pip install geopy==2.3 # Although not directly used in this version, keeping as per previous code
!pip install holidays==0.18 # Although not directly used in this version, keeping as per previous code
!pip install xgboost==1.6
!pip install lightgbm==3.3
!pip install scikit-learn==1.0

[0m[31mERROR: Could not find a version that satisfies the requirement catboost==1.1 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for catboost==1.1[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement optuna==3.0 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for optuna==3.0[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement sktime==0.13 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for sktime==0.13[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement geopy==2.3 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for geopy==2.3[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement holidays==0.18 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for holidays==0.18[0m[31m
[0m[31mERROR: Could not find a version that satisfies

In [None]:
# ========== Libraries ==========
import pandas as pd  # For handling dataframes
import numpy as np  # For numerical operations
from sklearn.impute import KNNImputer  # For filling missing values
from sklearn.model_selection import StratifiedKFold, cross_val_predict  # For cross-validation
from sklearn.metrics import mean_squared_error  # For model evaluation
from sklearn.ensemble import StackingRegressor  # For ensemble learning
from sklearn.linear_model import Ridge  # Final estimator for stacking
from lightgbm import LGBMRegressor  # LightGBM model
from xgboost import XGBRegressor  # XGBoost model
from catboost import CatBoostRegressor  # CatBoost model
import optuna  # For hyperparameter tuning
import warnings
warnings.filterwarnings("ignore")  # Ignore warnings for cleaner output

# ========== Load Data ==========
# Read the competition's training and test tables from the Kaggle input folder
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/urban-air-pollution-challenge-csec/Train.csv",
        "/kaggle/input/urban-air-pollution-challenge-csec/Test.csv",
    )
)

# ========== Drop Duplicates Function ==========
# Remove duplicated columns with identical values
def drop_duplicate_columns(df):
    """Return a copy of ``df`` without columns that repeat an earlier column.

    Every pair of columns is compared with ``Series.equals``; whenever a pair
    matches, the right-hand (later) column is marked for removal, so the
    first occurrence of each distinct column is the one that survives.
    The input frame itself is not modified.
    """
    labels = list(df.columns)
    # Collect the later member of every matching (earlier, later) pair
    redundant = {
        later
        for position, earlier in enumerate(labels)
        for later in labels[position + 1:]
        if df[earlier].equals(df[later])
    }
    return df.drop(columns=list(redundant))

# ========== Prepare Features ==========
# Identifier columns and target-derived statistics are never used as model inputs
drop_cols = ['Place_ID X Date', 'Date', 'Place_ID', 'target_min', 'target_max', 'target_variance', 'target_count']
# Every remaining training column, apart from the label itself, is a feature
features = train.columns[~train.columns.isin(drop_cols + ['target'])].tolist()

# Stack train on top of test so both receive identical preprocessing,
# then discard any column whose values repeat an earlier column
combined = drop_duplicate_columns(
    pd.concat([train[features], test[features]], axis=0)
)

# ========== Impute Missing Values ==========
# Fill missing entries with a 5-nearest-neighbours imputer fitted on the stacked rows
knn = KNNImputer(n_neighbors=5)
combined_imputed = knn.fit_transform(combined)

# Undo the stacking: the first len(train) rows belong to train, the rest to test
split_at = len(train)
X_train_full, X_test_full = combined_imputed[:split_at], combined_imputed[split_at:]
y = train['target']  # Target variable

# ========== OPTUNA TUNING FUNCTION ==========
# Function to tune hyperparameters of a given model using Optuna
def tune_model(model_type, X, y, n_trials=25, cv=5, seed=42):
    """Search hyperparameters for one boosting regressor with Optuna.

    Each trial samples a configuration, obtains out-of-fold predictions via
    ``cross_val_predict`` and is scored by the RMSE of those predictions
    against ``y``. The study minimises that score.

    Parameters
    ----------
    model_type : str
        ``'lgbm'`` selects LightGBM and ``'xgb'`` selects XGBoost. Any other
        value falls through to the CatBoost search space.
    X : array-like
        Feature matrix, passed unchanged to ``cross_val_predict``.
    y : array-like
        Regression target, passed unchanged to ``cross_val_predict``.
    n_trials : int, default 25
        Number of Optuna trials to run.
    cv : int or cross-validation splitter, default 5
        Forwarded as the ``cv`` argument of ``cross_val_predict``.
    seed : int or None, default 42
        Seed for the Optuna TPE sampler so repeated runs explore the same
        configurations. Pass ``None`` for an unseeded sampler.

    Returns
    -------
    dict
        ``study.best_params``: the sampled values of the best trial, keyed by
        the parameter names used in the search space, ready to be unpacked
        into the corresponding regressor's constructor.
    """
    def objective(trial):
        if model_type == 'lgbm':
            # Define LGBM hyperparameter search space
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 300, 1200),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                'num_leaves': trial.suggest_int('num_leaves', 31, 256),
                'max_depth': trial.suggest_int('max_depth', 4, 16),
                'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                # LightGBM only bags rows when the bagging frequency is non-zero;
                # with the default of 0 the sampled `subsample` would be ignored.
                'subsample_freq': trial.suggest_int('subsample_freq', 1, 7),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
                'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
                'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0)
            }
            model = LGBMRegressor(**params, random_state=42)

        elif model_type == 'xgb':
            # Define XGBoost hyperparameter search space
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 300, 1200),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                'max_depth': trial.suggest_int('max_depth', 4, 16),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
                'gamma': trial.suggest_float('gamma', 0.0, 5.0),
                'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
                'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0)
            }
            model = XGBRegressor(**params, random_state=42, verbosity=0)

        else:  # CatBoost (also the fallback for any unrecognised model_type)
            # Define CatBoost hyperparameter search space
            params = {
                'iterations': trial.suggest_int('iterations', 300, 1200),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                'depth': trial.suggest_int('depth', 4, 10),
                'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
                'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
                'random_strength': trial.suggest_float('random_strength', 0.0, 1.0)
            }
            model = CatBoostRegressor(**params, random_state=42, verbose=0)

        # Out-of-fold predictions for every training row
        preds = cross_val_predict(model, X, y, cv=cv)
        # RMSE computed as sqrt(MSE): the `squared=False` shortcut of
        # mean_squared_error is not available in every scikit-learn release.
        return float(np.sqrt(mean_squared_error(y, preds)))

    # Seeded sampler keeps the sequence of sampled configurations repeatable
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    study.optimize(objective, n_trials=n_trials)
    return study.best_params

# ========== Tune Each Model ==========
# (progress banner, model key) pairs, processed in the listed order
tuning_plan = (
    ("🔧 Tuning LGBM...", 'lgbm'),
    ("🔧 Tuning XGBoost...", 'xgb'),
    ("🔧 Tuning CatBoost...", 'catboost'),
)
best_by_model = {}
for banner, model_key in tuning_plan:
    print(banner)
    best_by_model[model_key] = tune_model(model_key, X_train_full, y)

lgbm_params = best_by_model['lgbm']
xgb_params = best_by_model['xgb']
cat_params = best_by_model['catboost']

# ========== Train Models and Stack ==========
print("✅ Training and stacking...")

# Rebuild each base regressor from the parameters its study selected
lgbm_model = LGBMRegressor(random_state=42, **lgbm_params)
xgb_model = XGBRegressor(random_state=42, verbosity=0, **xgb_params)
cat_model = CatBoostRegressor(random_state=42, verbose=0, **cat_params)

# The three boosters feed a Ridge meta-learner; passthrough=True also hands
# the original features to that final estimator
base_learners = [
    ('lgbm', lgbm_model),
    ('xgb', xgb_model),
    ('cat', cat_model),
]
stack = StackingRegressor(
    estimators=base_learners,
    final_estimator=Ridge(alpha=1.0),
    cv=5,
    passthrough=True,
    n_jobs=-1,
)

# Fit the whole ensemble on the imputed training matrix
stack.fit(X_train_full, y)

# ========== Make Predictions ==========
preds = stack.predict(X_test_full)

# ========== Save Submission ==========
# One row per test identifier, paired with its predicted target
submission_columns = {
    'Place_ID X Date': test['Place_ID X Date'],
    'target': preds,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)
print("✅ Final submission saved as 'submission.csv'")
