In [17]:
    # Colab Environment

# from google.colab import drive
# drive.mount('/content/drive')

# sample_df = pd.read_csv("/content/drive/MyDrive/ml_class/kaggle_1/submission_sample.csv")
# train_df = pd.read_csv("/content/drive/MyDrive/ml_class/kaggle_1/train_dataset.csv")
# test_sample_df = pd.read_csv("/content/drive/MyDrive/ml_class/kaggle_1/test_sample.csv")
# test_df = pd.read_csv('/content/drive/MyDrive/ml_class/kaggle_1/test_dataset.csv')
# station_info_df = pd.read_csv("/content/drive/MyDrive/ml_class/kaggle_1/station_info.csv")

In [18]:
import numpy as np
import pandas as pd
from natsort import natsorted

from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
import optuna



from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.svm import SVR

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import KFold
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder

In [19]:
# Local environment: the four CSV files are read via paths relative to the
# current working directory.
sample_df, train_df, test_df, station_info_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "submission_sample.csv",
        "train_dataset.csv",
        "test_dataset.csv",
        "station_info.csv",
    )
)

In [20]:
# All columns

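# n24 columns (one column per hour, suffix _0 ... _23)
    # cloud_cover_n: 증하층운량(10분위) - mid/low-level cloud amount (tenths)
    # dew_point_n: 이슬점 온도(°C) - dew point temperature (°C)
    # humidity_n: 습도(%) - relative humidity (%)
    # local_pressure_n: 현지기압(hPa) - local (station) pressure (hPa)
    # min_cloud_height_n: 최저운고(100m) - lowest cloud height (units of 100 m)
    # precipitation_n: 강수량(mm) - precipitation (mm)
    # sea_level_pressure_n: 해면기압(hPa) - sea-level pressure (hPa)
    # snow_depth_n: 적설(cm) - snow depth (cm)
    # sunshine_duration_n: 일조(hr) - sunshine duration (hr)
    # surface_temp_n: 지면온도(°C) - ground surface temperature (°C)
    # vapor_pressure_n: 증기압(hPa) - vapor pressure (hPa)
    # visibility_n: 시정(10m) - visibility (units of 10 m)
    # wind_speed_n: 풍속(m/s) - wind speed (m/s)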

# Special columns
    # wind_direction_n
    # climatology_temp
    # date
    # station
    #     station number
    #     Convert to latitude, longitude(maybe drop), elevation, 

# Others
    # id
    # station_name

In [21]:
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """Expand a month-day string column into calendar and cyclical features.

    The date column is parsed by prefixing the placeholder year ``2000`` and
    using the format ``%Y-%m-%d``, so its values must be ``MM-DD`` strings.
    Because the year is a placeholder, ``day_of_week`` and ``week_of_year``
    are those of the year 2000, not of the real observation date.

    Parameters
    ----------
    date_col : str, default 'date'
        Name of the column holding the ``MM-DD`` strings. It is removed from
        the output.

    Output columns (added in this order)
    ------------------------------------
    month, day, day_of_week, quarter : category
    day_of_year : int64, computed with a non-leap-year month table
        (so Feb 29 and Mar 1 both map to 60).
    week_of_year : int32 (ISO week of the placeholder year).
    month_sin/cos, day_sin/cos, dow_sin/cos, week_sin/cos, quarter_sin/cos :
        float sine/cosine encodings with periods 12, 31, 7, 52 and 4.
    """

    def __init__(self, date_col='date'):
        self.date_col = date_col
        # Cumulative number of days before each month in a non-leap year.
        self.days_before_month = [0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334]

    def fit(self, X, y=None):
        """No-op: the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the date column replaced by date features."""
        df = X.copy()

        # Placeholder leap year so that '02-29' parses.
        df['date_dt'] = pd.to_datetime('2000-' + df[self.date_col], format='%Y-%m-%d')
        dates = df['date_dt']

        # Pull every date part exactly once; the integer versions feed the
        # cyclical encodings, the categorical versions are what is stored.
        month = dates.dt.month
        day = dates.dt.day
        weekday = dates.dt.weekday
        quarter = dates.dt.quarter
        iso_week = dates.dt.isocalendar().week.astype('int32')

        # Vectorised table lookup instead of a Python-level row-wise apply over
        # the whole (wide) frame: same values, no dependence on the row dtype.
        month_offsets = np.asarray(self.days_before_month, dtype=np.int64)

        # Column insertion order matches the historical output order.
        df['month'] = month.astype('category')
        df['day'] = day.astype('category')
        df['day_of_year'] = month_offsets[month.to_numpy() - 1] + day.to_numpy()
        df['day_of_week'] = weekday.astype('category')
        df['week_of_year'] = iso_week
        df['quarter'] = quarter.astype('category')

        # Cyclical encodings (a day-of-year encoding was tried and disabled).
        df['month_sin'] = np.sin(2 * np.pi * month / 12)
        df['month_cos'] = np.cos(2 * np.pi * month / 12)

        df['day_sin'] = np.sin(2 * np.pi * day / 31)
        df['day_cos'] = np.cos(2 * np.pi * day / 31)

        df['dow_sin'] = np.sin(2 * np.pi * weekday / 7)
        df['dow_cos'] = np.cos(2 * np.pi * weekday / 7)

        df['week_sin'] = np.sin(2 * np.pi * iso_week / 52)
        df['week_cos'] = np.cos(2 * np.pi * iso_week / 52)

        df['quarter_sin'] = np.sin(2 * np.pi * quarter / 4)
        df['quarter_cos'] = np.cos(2 * np.pi * quarter / 4)

        # Drop the raw string column and the temporary datetime column.
        df.drop(columns=[self.date_col, 'date_dt'], inplace=True)

        return df


In [22]:
class DropFeatureTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed set of columns.

    Parameters
    ----------
    drop_cols : list of str
        Column names to remove. Names absent from the input are skipped
        silently rather than raising.
    """

    def __init__(self, drop_cols):
        self.drop_cols = drop_cols

    def fit(self, X, y=None):
        """No-op: there is nothing to learn. Returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns (input is not modified)."""
        columns_to_remove = self.drop_cols
        trimmed = X.drop(columns=columns_to_remove, errors='ignore')
        return trimmed

In [23]:
# Raw hourly column names per weather variable: "<variable>_n" maps to
# "<variable>_0" ... "<variable>_23". Insertion order is kept as-is because it
# determines the order in which condensed features are produced.
hour_col_map = {
    f"{variable}_n": [f"{variable}_{hour}" for hour in range(24)]
    for variable in (
        "cloud_cover",
        "dew_point",
        "humidity",
        "local_pressure",
        "min_cloud_height",
        "precipitation",
        "sea_level_pressure",
        "snow_depth",
        "sunshine_duration",
        "surface_temp",
        "vapor_pressure",
        "visibility",
        "wind_speed",
        "wind_direction",
    )
}
# The station id is a single column, not an hourly series.
hour_col_map["station_info"] = ["station"]


def _condenser_switches(enabled=(), segment_agg="mean", segments_on=True, **extras):
    """Build one feature's on/off switch dict.

    enabled      -- names of the optional switches that should be True; every
                    other optional switch is stored explicitly as False.
    segment_agg  -- suffix of the six time-of-day keys ("mean" or "sum").
    segments_on  -- value given to all six time-of-day keys.
    extras       -- additional key/value pairs appended verbatim.
    """
    segment_names = (
        "early_night", "late_night", "morning",
        "afternoon", "evening", "late_evening",
    )
    optional_keys = (
        "total_sum", "range", "direction_category",
        "when_min", "when_max", "first_condition_hour", "last_condition_hour",
    )
    switches = {f"{segment}_{segment_agg}": segments_on for segment in segment_names}
    for key in optional_keys:
        switches[key] = key in enabled
    switches.update(extras)
    return switches


# Per-feature switches consumed by Condenser via ``config.get(<key>)``.
# NOTE(review): "direction_category", "estimate_air_temp", "rain_event" and
# "snow_event" are set here, but no Condenser branch visible in this notebook
# reads them - confirm whether they are placeholders.
feature_config = {
    "station_info": {"add_station_info": False},
    "cloud_cover_n": _condenser_switches(
        ("when_min", "when_max", "first_condition_hour", "last_condition_hour"),
        total_mean=True,
    ),
    "dew_point_n": _condenser_switches(
        ("when_min", "when_max"),
        estimate_air_temp=True,
    ),
    "humidity_n": _condenser_switches(
        ("range", "when_min", "when_max"),
        total_mean=True,
    ),
    "local_pressure_n": _condenser_switches(
        ("range", "when_min", "when_max"),
        total_mean=True,
    ),
    "min_cloud_height_n": _condenser_switches(total_mean=True),
    "precipitation_n": _condenser_switches(
        (
            "total_sum", "range", "when_min", "when_max",
            "first_condition_hour", "last_condition_hour",
        ),
        segment_agg="sum",
        rain_event=True,
    ),
    "sea_level_pressure_n": _condenser_switches(("range",), total_mean=True),
    "snow_depth_n": _condenser_switches(
        ("when_max", "first_condition_hour", "last_condition_hour"),
        snow_event=True,
    ),
    "sunshine_duration_n": _condenser_switches(
        ("first_condition_hour", "last_condition_hour"),
        total_mean=True,
    ),
    "surface_temp_n": _condenser_switches(
        ("range", "when_min", "when_max"),
        total_mean=True,
    ),
    # No "total_mean" key for vapor pressure.
    "vapor_pressure_n": _condenser_switches(("range", "when_min", "when_max")),
    "visibility_n": _condenser_switches(
        ("range", "when_min", "when_max"),
        total_mean=True,
    ),
    "wind_speed_n": _condenser_switches(
        ("range", "when_min", "when_max"),
        total_mean=True,
    ),
    # Every switch is off, so the wind-direction columns yield no features.
    "wind_direction_n": _condenser_switches(segments_on=False, total_mean=False),
}



In [24]:
class Condenser(BaseEstimator, TransformerMixin):
    """Condense one variable's hourly columns into per-row summary features.

    Parameters
    ----------
    feature_name : str
        Prefix used for every generated column (e.g. ``"humidity_n"``).
    hour_cols : list of str
        Source columns for this variable. The time-of-day features index this
        list by position (0-23); the "when"/"condition hour" features parse
        the hour from the trailing ``_<digits>`` of the column name.
    config : dict
        Switches read with ``config.get(key)``; a missing or falsy key disables
        the feature. Keys read by ``transform``: ``add_station_info``,
        ``<segment>_mean`` / ``<segment>_sum`` for the six four-hour segments,
        ``first_condition_hour``, ``last_condition_hour``, ``when_max``,
        ``when_min``, ``mean_minus_climatology``, ``total_mean``,
        ``total_sum`` and ``range``. Any other key is ignored.

    Notes
    -----
    ``add_station_info`` reads the module-level ``station_info_df`` frame.
    ``mean_minus_climatology`` reads ``X['climatology_temp']``.
    """

    # (segment name, first position, one-past-last position) into ``hour_cols``.
    _DAY_SEGMENTS = (
        ("early_night", 0, 4),
        ("late_night", 4, 8),
        ("morning", 8, 12),
        ("afternoon", 12, 16),
        ("evening", 16, 20),
        ("late_evening", 20, 24),
    )

    def __init__(self, feature_name, hour_cols, config):
        self.feature_name = feature_name
        self.hour_cols = hour_cols
        self.config = config

    def fit(self, X, y=None):
        """No-op: the transformer is stateless. Returns ``self``."""
        return self

    @staticmethod
    def _hour_from_column_name(col_names):
        """Parse the trailing ``_<digits>`` of each column name into a float hour.

        Entries that are NaN (no column selected) stay NaN.
        """
        return col_names.str.extract(r"_(\d+)$")[0].astype(float)

    def _add_station_info(self, df, result):
        """Attach station latitude and elevation, aligned to the rows of ``df``.

        A column-based ``merge`` returns a frame with a fresh 0..n-1 index, so
        assigning its columns into ``result`` (indexed like ``X``) misaligns
        rows whenever ``X`` does not carry a default RangeIndex. Mapping the
        station id through a lookup keyed by station keeps the original index
        and row count. If a station id is listed more than once in
        ``station_info_df``, its first entry is used.
        """
        lookup = (
            station_info_df[["지점", "위도", "노장해발고도(m)"]]
            .drop_duplicates(subset="지점")
            .set_index("지점")
        )
        station_ids = df["station"]
        result[f"{self.feature_name}_lat"] = station_ids.map(lookup["위도"])
        result[f"{self.feature_name}_height"] = station_ids.map(lookup["노장해발고도(m)"])

    def _extreme_hour(self, df, which):
        """Hour of the row-wise maximum (``which='max'``) or minimum.

        Rows whose hourly values are all missing get -1.
        """
        has_data = df.notna().sum(axis=1) > 0
        col_names = pd.Series(index=df.index, dtype="object")
        observed = df.loc[has_data]
        if which == "max":
            col_names[has_data] = observed.idxmax(axis=1, skipna=True)
        else:
            col_names[has_data] = observed.idxmin(axis=1, skipna=True)
        return self._hour_from_column_name(col_names).where(has_data, -1)

    def transform(self, X):
        """Return a frame, indexed like ``X``, holding the enabled features."""
        df = X[self.hour_cols].copy()
        result = pd.DataFrame(index=X.index)
        name = self.feature_name
        config = self.config

        if config.get("add_station_info"):
            self._add_station_info(df, result)

        # Time-of-day aggregates. All "mean" columns are emitted before any
        # "sum" column, which preserves the historical column order.
        for agg in ("mean", "sum"):
            for segment, start, stop in self._DAY_SEGMENTS:
                key = f"{segment}_{agg}"
                if not config.get(key):
                    continue
                # Positional lookup: raises IndexError if fewer than `stop`
                # hourly columns were supplied, as before.
                cols = [self.hour_cols[i] for i in range(start, stop)]
                block = df[cols]
                if agg == "mean":
                    result[f"{name}_{key}"] = block.mean(axis=1)
                else:
                    result[f"{name}_{key}"] = block.sum(axis=1)

        # First / last hour with a non-missing, non-zero value.
        want_first = config.get("first_condition_hour")
        want_last = config.get("last_condition_hour")
        if want_first or want_last:
            cols = self.hour_cols
            active = df[cols].notna() & (df[cols] != 0)  # zero counts as "no event"
            has_active = active.sum(axis=1) > 0

            if want_first:
                # idxmax on a boolean row returns the first True column; rows
                # with no True at all are blanked to NaN afterwards.
                first_col = active.idxmax(axis=1).where(has_active)
                result[f"{name}_first_condition_hour"] = self._hour_from_column_name(first_col)

            if want_last:
                # Reverse the column order so idxmax finds the last True column.
                last_col = active.iloc[:, ::-1].idxmax(axis=1).where(has_active)
                result[f"{name}_last_condition_hour"] = self._hour_from_column_name(last_col)

        if config.get("when_max"):
            result[f"{name}_when_max"] = self._extreme_hour(df[self.hour_cols], "max")

        if config.get("when_min"):
            result[f"{name}_when_min"] = self._extreme_hour(df[self.hour_cols], "min")

        if config.get("mean_minus_climatology"):
            result[f"{name}_mean_minus_climatology"] = df.mean(axis=1) - X['climatology_temp']

        if config.get("total_mean"):
            result[f"{name}_total_mean"] = df.mean(axis=1)

        if config.get("total_sum"):
            result[f"{name}_total_sum"] = df.sum(axis=1)

        if config.get("range"):
            cols = self.hour_cols
            result[f"{name}_range"] = df[cols].max(axis=1) - df[cols].min(axis=1)

        return result



In [25]:
class FeatureCondensationPipeline(BaseEstimator, TransformerMixin):
    """Run one ``Condenser`` per entry of ``hour_col_map`` and swap the raw
    hourly columns for the condensed features.

    Parameters
    ----------
    hour_col_map : dict
        Maps a feature name to the list of raw columns it is built from.
    feature_config : dict
        Maps each feature name to the config dict handed to its ``Condenser``;
        it must contain every key of ``hour_col_map``.

    ``transform`` returns the condensed columns (in ``hour_col_map`` order)
    followed by the columns of ``X`` that are not listed in ``hour_col_map``.
    The condensers are created in ``fit``; before ``fit`` the list is empty.
    """

    def __init__(self, hour_col_map, feature_config):
        self.hour_col_map = hour_col_map
        self.feature_config = feature_config
        self.condensers = []

    def fit(self, X, y=None):
        """Create and fit one condenser per feature. Returns ``self``."""
        built = []
        for feature, hour_cols in self.hour_col_map.items():
            built.append(Condenser(feature, hour_cols, self.feature_config[feature]))
        self.condensers = built

        # Condenser.fit currently learns nothing; it is still called for every
        # condenser.
        for condenser in self.condensers:
            condenser.fit(X, y)
        return self

    def transform(self, X):
        """Return condensed features concatenated with the untouched columns."""
        condensed_frames = []
        for condenser in self.condensers:
            condensed_frames.append(condenser.transform(X))

        # Every raw column that fed a condenser is removed from the output.
        raw_columns = []
        for cols in self.hour_col_map.values():
            raw_columns.extend(cols)
        passthrough = X.drop(columns=raw_columns)

        return pd.concat([*condensed_frames, passthrough], axis=1)


In [26]:
class IQRBasedOutlierCleaner(BaseEstimator, TransformerMixin):
    """Replace the sentinel values -9999.0 and 20000.0 with NaN everywhere.

    Despite the class name, no IQR computation is performed, and
    ``base_features`` is stored but never read by ``transform``: the
    replacement is applied to every column of the input.

    Parameters
    ----------
    base_features : list of str
        Kept for interface compatibility (e.g. ``["humidity_n", ...]``).
    """

    def __init__(self, base_features):
        self.base_features = base_features

    def fit(self, X, y=None):
        """No-op: the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is left untouched."""
        cleaned = X.copy()
        for sentinel in (-9999.0, 20000.0):
            cleaned = cleaned.replace(sentinel, np.nan)
        return cleaned

In [27]:

class ManualFeatureEngineer(BaseEstimator, TransformerMixin):
    """Add hand-crafted interaction features from the daily means.

    Requires the columns ``humidity_n_total_mean``, ``visibility_n_total_mean``,
    ``surface_temp_n_total_mean``, ``wind_speed_n_total_mean``,
    ``local_pressure_n_total_mean`` and ``sea_level_pressure_n_total_mean``.

    Adds ``fog_factor``, ``heat_index_like``, ``wind_chill`` and
    ``ln_local_sea_level_pressure``; removes the humidity, visibility,
    local-pressure and sea-level-pressure daily means.
    """

    def fit(self, X, y=None):
        """No-op: the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered features applied."""
        X = X.copy()

        humidity = X["humidity_n_total_mean"]
        surface_temp = X["surface_temp_n_total_mean"]

        # Fog factor (visibility x humidity)
        X["fog_factor"] = humidity * X["visibility_n_total_mean"]

        # Apparent-temperature proxy
        X["heat_index_like"] = surface_temp + 0.1 * humidity

        # Wind-chill proxy
        X["wind_chill"] = surface_temp - 0.7 * X["wind_speed_n_total_mean"]

        X.drop(columns=["humidity_n_total_mean", "visibility_n_total_mean"], inplace=True)

        # -1 / ln(local pressure / sea-level pressure)
        P_local = X['local_pressure_n_total_mean']
        P_sea = X['sea_level_pressure_n_total_mean']
        with np.errstate(divide='ignore', invalid='ignore'):
            inverse_log_ratio = -1 / np.log(P_local / P_sea)
        # errstate only silences the warnings. When the two pressures are equal
        # the log is 0 and the quotient is +/-inf; store NaN instead so the
        # feature uses the same missing-value marker as the rest of the frame.
        X['ln_local_sea_level_pressure'] = inverse_log_ratio.replace([np.inf, -np.inf], np.nan)
        X.drop(columns=['local_pressure_n_total_mean', 'sea_level_pressure_n_total_mean'], inplace=True)

        return X


In [28]:
# Main preprocessing pipeline. Step order: drop the non-feature columns,
# turn sentinel values into NaN, expand the date, condense the hourly columns,
# then add the hand-crafted features.
pipeline = Pipeline(
    steps=[
        (
            "drop_features",
            DropFeatureTransformer(["target", "id", "station_name"]),
        ),
        (
            "iqr_cleaning",
            IQRBasedOutlierCleaner(
                base_features=[
                    "humidity",
                    "local_pressure",
                    "precipitation",
                    "sea_level_pressure",
                    "surface_temp",
                    "visibility",
                    "wind_speed",
                ]
            ),
        ),
        ("date_features", DateFeatureExtractor()),
        (
            "feature_engineering",
            FeatureCondensationPipeline(hour_col_map, feature_config),
        ),
        ("manual_features", ManualFeatureEngineer()),
    ]
)


In [35]:
# Preprocess Function

# Build the feature matrix from the full training frame (the pipeline's first
# step removes the "target" column), and keep the target separately.
X_train_ori = pipeline.fit_transform(train_df)
y_train_ori = train_df['target']

# Hold out 20% of the training rows; note that X_test / y_test here are this
# hold-out split, not the competition test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_ori,
    y_train_ori,
    test_size=0.2,
    random_state=42,
)

# Candidate regressors keyed by display name. The many-digit hyperparameters
# are kept verbatim.
# NOTE(review): they look like output of a hyperparameter search - confirm.
models = {
    "CatBoost": CatBoostRegressor(
        boosting_type='Plain',
        grow_policy='SymmetricTree',
        iterations=3600,
        learning_rate=0.054179676266114606,
        depth=9,
        l2_leaf_reg=5.425455174508331,
        random_strength=0.632030143949728,
        bagging_temperature=0.9513670435640664,
        border_count=215,
        verbose=0,
        random_state=42,
    ),
    "XGBoost": XGBRegressor(
        random_state=42,
        tree_method='hist',
        enable_categorical=True,
        n_estimators=4000,
        max_depth=5,
        learning_rate=0.055025307082608235,
        subsample=0.6272611512543017,
        colsample_bytree=0.9867043517730124,
        reg_alpha=0.5543753187724325,
        reg_lambda=4.41395878314679,
    ),
    "LightGBM": LGBMRegressor(
        random_state=42,
        verbose=-1,
        n_estimators=3000,
        max_depth=11,
        learning_rate=0.09216865202064772,
        num_leaves=28,
        min_child_samples=40,
        subsample=0.5276614407170144,
        colsample_bytree=0.8239904218170874,
        reg_alpha=0.0787097082876258,
        reg_lambda=9.831250240403401,
    ),
    "RandomForest": RandomForestRegressor(
        n_estimators=900,  # an earlier setting of 100 was left commented out
        max_depth=None,
        min_samples_split=5,
        min_samples_leaf=1,
        n_jobs=-1,
        random_state=42,
        verbose=0,
        max_features=None,
    ),
    "Ridge": Ridge(alpha=1.0),
    "ElasticNet": ElasticNet(
        alpha=0.1,
        l1_ratio=0.5,
        max_iter=5000,
        random_state=42,
    ),
}

# Pairwise product features per model. Each entry is (left, right) or
# (left, right, column_name); without an explicit name the new column is
# called "<left>_times_<right>". The trailing numbers are carried over from
# the original notebook comments (presumably feature-importance scores).
_CATBOOST_INTERACTIONS = [
    # This product keeps its historical column name (no "_n" after
    # "vapor_pressure"), so the name is spelled out explicitly.
    ("vapor_pressure_n_late_evening_mean", "week_cos",
     "vapor_pressure_late_evening_mean_times_week_cos"),
    ("dew_point_n_late_evening_mean", "humidity_n_late_evening_mean"),  # 2.0209
    ("dew_point_n_late_evening_mean", "humidity_n_afternoon_mean"),     # 1.5551
]

_LIGHTGBM_INTERACTIONS = [
    ("humidity_n_late_evening_mean", "climatology_temp"),               # 31.0000
    ("vapor_pressure_n_late_evening_mean", "week_cos"),                 # 26.0000
    ("dew_point_n_late_evening_mean", "humidity_n_late_evening_mean"),  # 17.0000
    ("dew_point_n_late_evening_mean", "month_cos"),                     # 15.0000
    ("vapor_pressure_n_late_evening_mean", "week_sin"),                 # 14.0000
    ("climatology_temp", "week_of_year"),                               # 14.0000
    ("surface_temp_n_late_evening_mean", "week_sin"),                   # 13.0000
    ("climatology_temp", "week_cos"),                                   # 13.0000
    ("humidity_n_evening_mean", "climatology_temp"),                    # 12.0000
    ("dew_point_n_late_evening_mean", "week_cos"),                      # 11.0000
]

_XGBOOST_INTERACTIONS = [
    ("dew_point_n_late_evening_mean", "humidity_n_afternoon_mean"),     # 0.0821
    ("wind_speed_n_range", "week_of_year"),                             # 0.0219
    ("vapor_pressure_n_late_evening_mean", "week_cos"),                 # 0.0203
    ("snow_depth_n_when_max", "week_sin"),                              # 0.0185
    ("surface_temp_n_evening_mean", "month_sin"),                       # 0.0149
    ("surface_temp_n_afternoon_mean", "quarter_sin"),                   # 0.0132
    ("surface_temp_n_late_evening_mean", "quarter"),                    # 0.0121
    ("dew_point_n_late_evening_mean", "quarter"),                       # 0.0119
    ("climatology_temp", "fog_factor"),                                 # 0.0111
    ("dew_point_n_late_evening_mean", "humidity_n_late_evening_mean"),  # 0.0108
]

# Cyclical encodings that the LightGBM branch removes after it has used some
# of them to build interaction features.
_CYCLICAL_COLUMNS = [
    "month_sin", "month_cos", "day_sin", "day_cos", "doy_sin", "doy_cos",
    "dow_sin", "dow_cos", "week_sin", "week_cos", "quarter_sin", "quarter_cos",
]


def _add_interactions(X_train, X_test, specs):
    """Append product columns to both frames in place, in the order given."""
    for left, right, *alias in specs:
        name = alias[0] if alias else f"{left}_times_{right}"
        for frame in (X_train, X_test):
            frame[name] = frame[left] * frame[right]


def _drop_from_both(X_train, X_test, columns, errors="raise"):
    """Drop the same columns from both frames and return the new pair."""
    return (
        X_train.drop(columns=columns, errors=errors),
        X_test.drop(columns=columns, errors=errors),
    )


def _columns_with_suffix(frame, *suffixes):
    """List the columns of ``frame`` whose name ends with any of ``suffixes``."""
    return [col for col in frame.columns if col.endswith(suffixes)]


def _categorical_columns(frame):
    """List the object- or category-typed columns of ``frame``."""
    return frame.select_dtypes(include=["object", "category"]).columns.tolist()


def _stringify_categoricals(frame, columns):
    """Cast ``columns`` to str in place, labelling missing values "Missing".

    The missing values must be replaced *before* the cast: ``astype(str)``
    turns NaN into the literal string "nan", after which ``fillna`` has
    nothing left to fill.
    """
    for col in columns:
        values = frame[col].astype(object)
        frame[col] = values.where(values.notna(), "Missing").astype(str)


def preprocess_for_model(model_name, X_train, X_test):
    """Apply the model-specific feature tweaks to a train/test pair.

    Both inputs are copied first, so the caller's frames are never modified.

    Parameters
    ----------
    model_name : str
        One of "CatBoost", "LightGBM", "XGBoost", "RandomForest", "Ridge",
        "ElasticNet" or "SVR".
    X_train, X_test : pandas.DataFrame
        Feature frames to transform. Columns to drop are always selected from
        ``X_train`` and then removed from both frames.

    Returns
    -------
    (X_train, X_test, cat_features)
        The transformed frames and the list of categorical column names to
        hand to the estimator (empty for the one-hot encoded branches).

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    KeyError
        If a column required by the selected branch is missing.
    """
    X_train = X_train.copy()
    X_test = X_test.copy()

    if model_name == "CatBoost":
        for col in ("day", "month"):
            X_train[col] = X_train[col].astype(int)
            X_test[col] = X_test[col].astype(int)

        _add_interactions(X_train, X_test, _CATBOOST_INTERACTIONS)
        X_train, X_test = _drop_from_both(
            X_train, X_test, ["ln_local_sea_level_pressure"], errors="ignore"
        )

        # Categorical columns are passed to the estimator as plain strings.
        cat_features = _categorical_columns(X_train)
        _stringify_categoricals(X_train, cat_features)
        _stringify_categoricals(X_test, cat_features)

    elif model_name == "LightGBM":
        _add_interactions(X_train, X_test, _LIGHTGBM_INTERACTIONS)

        # Range and slope columns must exist in both frames (strict drop);
        # the remaining groups are dropped on a best-effort basis.
        X_train, X_test = _drop_from_both(
            X_train, X_test, _columns_with_suffix(X_train, "_range")
        )
        X_train, X_test = _drop_from_both(
            X_train, X_test, _columns_with_suffix(X_train, "_hourly_linear_slope")
        )
        X_train, X_test = _drop_from_both(
            X_train, X_test,
            _columns_with_suffix(X_train, "_when_max", "_when_min"),
            errors="ignore",
        )
        X_train, X_test = _drop_from_both(
            X_train, X_test, _CYCLICAL_COLUMNS, errors="ignore"
        )

        cat_features = _categorical_columns(X_train)

    elif model_name == "XGBoost":
        # "quarter" is multiplied with numeric columns below, so it is cast
        # to int first; none of the other products depend on its dtype.
        X_train["quarter"] = X_train["quarter"].astype(int)
        X_test["quarter"] = X_test["quarter"].astype(int)

        _add_interactions(X_train, X_test, _XGBOOST_INTERACTIONS)

        X_train, X_test = _drop_from_both(
            X_train, X_test,
            _columns_with_suffix(X_train, "_range") + ["ln_local_sea_level_pressure"],
        )
        X_train, X_test = _drop_from_both(
            X_train, X_test, _columns_with_suffix(X_train, "_when_max", "_when_min")
        )

        cat_features = _categorical_columns(X_train)

    elif model_name == "RandomForest":
        X_train, X_test = _drop_from_both(
            X_train, X_test,
            _columns_with_suffix(X_train, "_when_max", "_when_min")
            + ["ln_local_sea_level_pressure"],
        )

        # Collect the categorical columns only after the drop, so a dropped
        # column can never be looked up again below.
        cat_cols = _categorical_columns(X_train)
        _stringify_categoricals(X_train, cat_cols)
        _stringify_categoricals(X_test, cat_cols)

        # One-hot encode each frame, then align so both share the same columns.
        X_train = pd.get_dummies(X_train, columns=cat_cols)
        X_test = pd.get_dummies(X_test, columns=cat_cols)
        X_train, X_test = X_train.align(X_test, join="outer", axis=1, fill_value=0)

        # Missing numeric values are encoded as -1.
        num_cols = X_train.select_dtypes(include=["float", "int"]).columns
        X_train[num_cols] = X_train[num_cols].fillna(-1)
        X_test[num_cols] = X_test[num_cols].fillna(-1)

        cat_features = []  # everything is one-hot encoded

    elif model_name in ["Ridge", "ElasticNet", "SVR"]:
        # Impute numeric gaps in both frames with the training-set means.
        train_means = X_train.mean(numeric_only=True)
        X_train = X_train.fillna(train_means)
        X_test = X_test.fillna(train_means)

        # NOTE(review): each frame is encoded separately with drop_first=True,
        # so for object columns the dropped level depends on the values seen
        # in that frame — confirm the pipeline emits category dtype here.
        cat_cols = X_train.select_dtypes(include=["object", "category"]).columns
        X_train = pd.get_dummies(X_train, columns=cat_cols, drop_first=True)
        X_test = pd.get_dummies(X_test, columns=cat_cols, drop_first=True)

        # Keep exactly the training columns; absent test columns become 0.
        X_train, X_test = X_train.align(X_test, join="left", axis=1, fill_value=0)

        # Anything still missing (e.g. an all-NaN column) is set to 0.
        X_train = X_train.fillna(0)
        X_test = X_test.fillna(0)

        cat_features = []

    else:
        raise ValueError(f"Unsupported model_name: {model_name!r}")

    return X_train, X_test, cat_features



In [36]:
# Main Prediction

# Prediction for Individual Models
# Estimators that are fitted with a plain fit(X, y) call, i.e. without an
# explicit categorical-feature argument.
plain_fit_models = ("XGBoost", "RandomForest", "Ridge", "ElasticNet", "SVR")

for name, model in models.items():
    # Each model gets its own feature set derived from the shared split.
    X_train_mod, X_test_mod, cat_features = preprocess_for_model(name, X_train, X_test)

    if name in plain_fit_models:
        model.fit(X_train_mod, y_train)
    elif name == "LightGBM":
        model.fit(X_train_mod, y_train, categorical_feature=cat_features)
    elif name == "CatBoost":
        model.fit(X_train_mod, y_train, cat_features=cat_features)

    # Score the fitted model on the hold-out split.
    preds = model.predict(X_test_mod)
    r2 = r2_score(y_test, preds)
    print(f"{name} R² score: {r2:.4f}")

    # # Check for feature importance support
    # if hasattr(model, "feature_importances_"):
    #     importances = np.array(model.feature_importances_)
    #     features = X_train_mod.columns

    #     # Sort top 30
    #     sorted_indices = importances.argsort()[::-1][:20]
    #     print(f"\nTop 30 features for {name}:")
    #     for i in sorted_indices:
    #         print(f"{features[i]}: {importances[i]:.4f}")
    # else:
    #     print(f"{name} does not support feature_importances_")

    # print()



# === Level 1 meta model Prediction===
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

meta_models = {
    "RidgeCV": RidgeCV(
        alphas=[0.01, 0.1, 1.0, 10.0, 100.0],
        cv=5,
        scoring='r2',
        fit_intercept=True
    )
}


def _build_meta_features(base_models, X_fit, y_fit, X_eval, splitter):
    """Build the level-1 (stacking) feature matrices for a set of base models.

    For every base model, each fold of ``splitter`` trains a fresh clone on
    the fold's training rows, writes its predictions for the fold's
    validation rows into the out-of-fold matrix, and predicts ``X_eval``.
    The per-fold ``X_eval`` predictions are averaged.

    Parameters
    ----------
    base_models : dict
        Maps a model name understood by ``preprocess_for_model`` to an
        estimator; only clones are fitted, the originals are left as they are.
    X_fit, y_fit : pandas.DataFrame, pandas.Series
        Rows used for the cross-validated fits (indexed positionally).
    X_eval : pandas.DataFrame
        Rows to produce fold-averaged predictions for.
    splitter : sklearn.model_selection.KFold
        Provides ``split`` and ``get_n_splits``.

    Returns
    -------
    (train_meta, test_meta) : tuple of numpy.ndarray
        Shapes ``(len(X_fit), n_models)`` and ``(len(X_eval), n_models)``;
        columns follow the iteration order of ``base_models``.
    """
    train_meta = np.zeros((X_fit.shape[0], len(base_models)))
    test_meta = np.zeros((X_eval.shape[0], len(base_models)))

    for i, (name, model) in enumerate(base_models.items()):
        test_meta_fold = np.zeros((X_eval.shape[0], splitter.get_n_splits()))

        # Preprocess the evaluation rows once per model. The frame is
        # re-aligned in every fold and carried over to the next one.
        X_eval_pre = preprocess_for_model(name, X_fit, X_eval)[1]

        for j, (train_idx, val_idx) in enumerate(splitter.split(X_fit)):
            y_tr = y_fit.iloc[train_idx]
            X_tr, X_val, cat_features = preprocess_for_model(
                name, X_fit.iloc[train_idx], X_fit.iloc[val_idx]
            )

            # Align all three frames so they share identical columns;
            # columns missing from a frame are filled with 0.
            X_tr, X_val = X_tr.align(X_val, join="outer", axis=1, fill_value=0)
            X_tr, X_eval_pre = X_tr.align(X_eval_pre, join="outer", axis=1, fill_value=0)
            X_val, X_eval_pre = X_val.align(X_eval_pre, join="outer", axis=1, fill_value=0)

            model_clone = clone(model)
            if name == "LightGBM":
                model_clone.fit(X_tr, y_tr, categorical_feature=cat_features)
            elif name == "CatBoost":
                model_clone.fit(X_tr, y_tr, cat_features=cat_features)
            else:
                model_clone.fit(X_tr, y_tr)

            train_meta[val_idx, i] = model_clone.predict(X_val)
            test_meta_fold[:, j] = model_clone.predict(X_eval_pre)

        # Average test predictions across folds
        test_meta[:, i] = test_meta_fold.mean(axis=1)

    return train_meta, test_meta


# Validation: stack on the 80% split and score on the 20% hold-out.
train_meta, test_meta = _build_meta_features(models, X_train, y_train, X_test, kf)

for name, model in meta_models.items():
    model.fit(train_meta, y_train)
    stack_preds = model.predict(test_meta)
    stack_r2 = r2_score(y_test, stack_preds)
    print(f"Level 1 meta model {name} R² score: {stack_r2:.4f}")


# Submit logic: refit on the full training data and predict the real test set.
X_train = pipeline.fit_transform(train_df)
y_train = train_df['target']
# Reuse the pipeline fitted on the training data. Calling fit_transform here
# would re-fit every pipeline step on the test rows, so train and test
# features would be built from different fitted state.
X_test = pipeline.transform(test_df)

train_meta, test_meta = _build_meta_features(models, X_train, y_train, X_test, kf)

for name, model in meta_models.items():
    model.fit(train_meta, y_train)
    stack_preds = model.predict(test_meta)
    # NOTE(review): every meta-model writes to the same file, so with more
    # than one entry in meta_models only the last one is kept.
    submit_df = pd.DataFrame({'id': range(len(stack_preds)), 'target': stack_preds})
    submit_df.to_csv('submit.csv', index=False)


CatBoost R² score: 0.8525
XGBoost R² score: 0.8679
LightGBM R² score: 0.8603
RandomForest R² score: 0.7550
Ridge R² score: 0.6586
ElasticNet R² score: 0.6390
Level 1 meta model RidgeCV R² score: 0.8751


In [None]:
CatBoost R² score: 0.8525
XGBoost R² score: 0.8679
LightGBM R² score: 0.8603
RandomForest R² score: 0.7550
Ridge R² score: 0.6586
ElasticNet R² score: 0.6390
Level 1 meta model RidgeCV R² score: 0.8751

# Last hyperparameter tuning
CatBoost R² score: 0.8525
XGBoost R² score: 0.8679
LightGBM R² score: 0.8603
RandomForest R² score: 0.7550
Level 1 meta model RidgeCV R² score: 0.8739

# With station info
CatBoost R² score: 0.8525
XGBoost R² score: 0.8706
LightGBM R² score: 0.8617
RandomForest R² score: 0.7308
Level 1 meta model RidgeCV R² score: 0.8752

# After hyperparameter tuning
CatBoost R² score: 0.8536
XGBoost R² score: 0.8631
LightGBM R² score: 0.8572
RandomForest R² score: 0.7544
Level 1 meta model RidgeCV R² score: 0.8716

CatBoost R² score: 0.8120
XGBoost R² score: 0.8121
LightGBM R² score: 0.7900
RandomForest R² score: 0.7296
Level 1 meta model RidgeCV R² score: 0.8343

CatBoost R² score: 0.8142
XGBoost R² score: 0.8099
LightGBM R² score: 0.7877
RandomForest R² score: 0.7319
Level 1 meta model RidgeCV R² score: 0.8332

CatBoost R² score: 0.8102
XGBoost R² score: 0.8109
LightGBM R² score: 0.7893
RandomForest R² score: 0.7318
Level 1 meta model RidgeCV R² score: 0.8316

CatBoost R² score: 0.8110
XGBoost R² score: 0.8114
LightGBM R² score: 0.7810
RandomForest R² score: 0.7502
Level 1 meta model RidgeCV R² score: 0.8308

# After hyperparameter tuning
CatBoost R² score: 0.8536
XGBoost R² score: 0.8631
LightGBM R² score: 0.8572
RandomForest R² score: 0.7544
Level 1 meta model RidgeCV R² score: 0.8716

CatBoost R² score: 0.8110
XGBoost R² score: 0.8114
LightGBM R² score: 0.7715
RandomForest R² score: 0.7542
Level 1 meta model RidgeCV R² score: 0.8307

CatBoost R² score: 0.8100
XGBoost R² score: 0.8049
LightGBM R² score: 0.7715
RandomForest R² score: 0.7542
Level 1 meta model RidgeCV R² score: 0.8332

CatBoost R² score: 0.8047
XGBoost R² score: 0.8049
LightGBM R² score: 0.7715
RandomForest R² score: 0.7542
Level 1 meta model RidgeCV R² score: 0.8329
...

In [None]:
# Polynomial Feature generation and printing important features

X_train_ori = pipeline.fit_transform(train_df)
y_train_ori = train_df['target']

# PolynomialFeatures needs a purely numeric, NaN-free matrix: category
# columns are cast to int and remaining gaps are filled with 0.
cat_cols = X_train_ori.select_dtypes(['category']).columns
X_poly_process = (
    X_train_ori
    .astype({col: 'int' for col in cat_cols})
    .fillna(0)
)

# Degree-2 expansion: original columns, squares and pairwise products.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_poly = poly.fit_transform(X_poly_process)
feature_names = poly.get_feature_names_out(X_poly_process.columns)

X_poly_df = pd.DataFrame(X_poly, columns=feature_names)
print(X_poly_df.shape)  # (13132, 9179)
# display(X_poly_df.T.head(400))

# Hold out 20% of the expanded rows for scoring.
X_train, X_test, y_train, y_test = train_test_split(
    X_poly_df,
    y_train_ori,
    test_size=0.2,
    random_state=42,
)
# print(X_train.shape)
# print(X_train.shape)

# Models evaluated on the polynomial feature set. Only CatBoost is active;
# the XGBoost and LightGBM entries are kept disabled for reference.
models = {
    "CatBoost": CatBoostRegressor(
        verbose=0,
        random_state=42,
        boosting_type='Plain',
        grow_policy='SymmetricTree',
        iterations=2200,
        learning_rate=0.10479224744093014,
        depth=8,
        l2_leaf_reg=8.969716675882767,
        random_strength=1.05507240655598,
        bagging_temperature=0.3332356005734044,
        border_count=74,
    ),
    # "XGBoost": XGBRegressor(
    #     n_estimators=100,
    #     random_state=42,
    #     tree_method='hist',
    #     enable_categorical=True),

    # "LightGBM": LGBMRegressor(
    #     n_estimators=100,
    #     random_state=42,
    #     verbose=-1)
}


# Per-model results on the full polynomial feature set.
r2_scores_full = {}          # model name -> hold-out R²
X_train_top3000_dict = {}    # model name -> training frame restricted to the top features
top3000_features_dict = {}   # model name -> list of the top feature names

# Each model is trained exactly once: the same fit provides the hold-out
# score and the importances used for the feature ranking. (Fitting twice on
# the same data only repeated an expensive training run.)
for name, model in models.items():
    print(f"\n📌 Training {name} on full feature set...")
    cat_9000_trained = model.fit(X_train, y_train)

    # R2 on full features
    preds = preds_full = model.predict(X_test)
    r2 = r2_score(y_test, preds_full)
    r2_scores_full[name] = r2
    print(f"✅ {name} R² score (full features): {r2:.4f}")

    # Keep the (up to) 3000 features with the highest importance.
    if hasattr(model, "feature_importances_"):
        importances = model.feature_importances_
        sorted_indices = np.argsort(importances)[::-1][:3000]
        top_features = X_train.columns[sorted_indices]

        X_train_top3000_dict[name] = X_train[top_features].copy()
        top3000_features_dict[name] = list(top_features)

        print(f"✅ {name} top 3000 features extracted.")
    else:
        print(f"⚠️ {name} does not support feature_importances_. Skipping top 3000 extraction.")



(13132, 9179)
CatBoost R² score: 0.8552

📌 Training CatBoost on full feature set...


In [30]:
# Random Forest Optuna run
# Run the feature pipeline on the labelled data and hold out 20% of the rows;
# the objective below scores every trial on that hold-out split.
X_train_ori = pipeline.fit_transform(train_df)
y_train_ori = train_df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X_train_ori,
    y_train_ori,
    test_size=0.2,
    random_state=42,
)

def objective(trial):
    """Optuna objective: hold-out R² of a RandomForest with sampled settings.

    Fits on the module-level ``X_train`` / ``y_train`` and scores on
    ``X_test`` / ``y_test``.
    """
    depth_choices = [None] + list(range(5, 51))  # base used None
    sampled = dict(
        n_estimators=trial.suggest_int("n_estimators", 100, 1000, step=100),  # base used 100
        max_depth=trial.suggest_categorical("max_depth", depth_choices),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 20),  # base used 2
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 10),  # base used 1
        max_features=trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        bootstrap=trial.suggest_categorical("bootstrap", [True, False]),  # default is True
    )

    forest = RandomForestRegressor(n_jobs=-1, random_state=42, verbose=0, **sampled)
    forest.fit(X_train, y_train)
    return r2_score(y_test, forest.predict(X_test))

# Search 50 random-forest configurations, maximising the hold-out R².
study = optuna.create_study(direction="maximize")
study.optimize(
    objective,
    n_trials=50,
    show_progress_bar=True,
)

# Report the winning trial.
print("\n🔥 Best R² Score:", study.best_value)
print("🔥 Best Parameters:\n", study.best_params)

    # "RandomForest": RandomForestRegressor(
    #     n_estimators=100,
    #     max_depth=None,
    #     min_samples_split=2,
    #     min_samples_leaf=1,
    #     n_jobs=-1,
    #     random_state=42,
    #     verbose=0
    # ),

[I 2025-05-14 17:44:06,586] A new study created in memory with name: no-name-ba159552-06d1-4dae-9ae9-6bd3009211a6


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-05-14 17:44:09,842] Trial 0 finished with value: 0.6393483198327363 and parameters: {'n_estimators': 1000, 'max_depth': 41, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'log2', 'bootstrap': False}. Best is trial 0 with value: 0.6393483198327363.
[I 2025-05-14 17:44:10,689] Trial 1 finished with value: 0.4752017928091845 and parameters: {'n_estimators': 400, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 0 with value: 0.6393483198327363.
[I 2025-05-14 17:44:16,478] Trial 2 finished with value: 0.6897486872536919 and parameters: {'n_estimators': 200, 'max_depth': 33, 'min_samples_split': 12, 'min_samples_leaf': 10, 'max_features': None, 'bootstrap': True}. Best is trial 2 with value: 0.6897486872536919.
[I 2025-05-14 17:44:16,881] Trial 3 finished with value: 0.30537433127697133 and parameters: {'n_estimators': 300, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_featur

In [32]:
# LightGBM Optuna run
# Pipeline features -> 80/20 split -> LightGBM-specific preprocessing, so the
# study tunes on the same feature set the LightGBM base learner is given.
X_train_ori = pipeline.fit_transform(train_df)
y_train_ori = train_df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X_train_ori,
    y_train_ori,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, cat_features = preprocess_for_model('LightGBM', X_train, X_test)


def objective(trial):
    """Optuna objective: hold-out R² of a LightGBM model with sampled settings.

    Fits on the module-level ``X_train`` / ``y_train`` (passing
    ``cat_features`` as categorical columns) and scores on
    ``X_test`` / ``y_test``.
    """
    sampled = {
        "n_estimators": trial.suggest_int("n_estimators", 300, 4000, step=100),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 10.0),
    }
    # Settings that are the same for every trial.
    fixed = {"random_state": 42, "n_jobs": -1, 'verbose': -1}

    booster = LGBMRegressor(**sampled, **fixed)
    booster.fit(X_train, y_train, categorical_feature=cat_features)
    return r2_score(y_test, booster.predict(X_test))

# Search 50 LightGBM configurations, maximising the hold-out R².
# n_jobs=-1 lets Optuna evaluate trials in parallel.
study = optuna.create_study(direction="maximize")
study.optimize(
    objective,
    n_trials=50,
    show_progress_bar=True,
    n_jobs=-1,
)

# Report the winning trial.
print("\n🔥 Best R² Score:", study.best_value)
print("🔥 Best Parameters:\n", study.best_params)

[I 2025-05-14 18:01:04,863] A new study created in memory with name: no-name-19c80d9a-c587-40c4-8442-143ae5db4820


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-05-14 18:01:17,437] Trial 11 finished with value: 0.5975911518758866 and parameters: {'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.0302178803345446, 'num_leaves': 1721, 'min_child_samples': 15, 'subsample': 0.850569966679041, 'colsample_bytree': 0.9822952605646738, 'reg_alpha': 6.0315370079391, 'reg_lambda': 3.5470412587262743}. Best is trial 11 with value: 0.5975911518758866.
[I 2025-05-14 18:01:34,410] Trial 9 finished with value: 0.7597649823757288 and parameters: {'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.05306215383786456, 'num_leaves': 2926, 'min_child_samples': 92, 'subsample': 0.643628859197829, 'colsample_bytree': 0.6507275558473318, 'reg_alpha': 8.66196021878192, 'reg_lambda': 4.20424289204817}. Best is trial 9 with value: 0.7597649823757288.
[I 2025-05-14 18:02:27,099] Trial 12 finished with value: 0.83794916686368 and parameters: {'n_estimators': 1300, 'max_depth': 7, 'learning_rate': 0.2188314494010197, 'num_leaves': 238, 'min_child_samples

In [33]:
# Optuna run for XGBoost
# Pipeline features -> 80/20 split -> XGBoost-specific preprocessing.
X_train_ori = pipeline.fit_transform(train_df)
y_train_ori = train_df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X_train_ori,
    y_train_ori,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, cat_features = preprocess_for_model('XGBoost', X_train, X_test)


def objective(trial):
    """Optuna objective: hold-out R² of an XGBoost model with sampled settings.

    Fits on the module-level ``X_train`` / ``y_train`` and scores on
    ``X_test`` / ``y_test``.
    """
    sampled = {
        "n_estimators": trial.suggest_int("n_estimators", 300, 4000, step=100),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 10.0),
    }
    # Settings that are the same for every trial.
    fixed = {
        "random_state": 42,
        "verbosity": 0,
        "objective": "reg:squarederror",
        "enable_categorical": True,
    }

    booster = XGBRegressor(**sampled, **fixed)
    booster.fit(X_train, y_train)
    return r2_score(y_test, booster.predict(X_test))

# Search 50 XGBoost configurations, maximising the hold-out R².
study = optuna.create_study(direction="maximize")
study.optimize(
    objective,
    n_trials=50,
    show_progress_bar=True,
)

# Report the winning trial.
print("\n🎯 Best R² Score:", study.best_value)
print("🔥 Best Parameters:\n", study.best_params)

[I 2025-05-14 18:20:03,895] A new study created in memory with name: no-name-75eccbc9-8bcf-4735-aa63-d4e77f0da50e


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-05-14 18:20:07,978] Trial 0 finished with value: 0.8439252659323679 and parameters: {'n_estimators': 1300, 'max_depth': 5, 'learning_rate': 0.0393664371123296, 'subsample': 0.7976482143187783, 'colsample_bytree': 0.9422857783464369, 'reg_alpha': 3.5592832944056396, 'reg_lambda': 7.683368484059959}. Best is trial 0 with value: 0.8439252659323679.
[I 2025-05-14 18:20:13,403] Trial 1 finished with value: 0.7978600520393005 and parameters: {'n_estimators': 3000, 'max_depth': 3, 'learning_rate': 0.024629726414522504, 'subsample': 0.6509878254936378, 'colsample_bytree': 0.890905223776405, 'reg_alpha': 2.925918508026358, 'reg_lambda': 5.841377085962448}. Best is trial 0 with value: 0.8439252659323679.
[I 2025-05-14 18:20:24,246] Trial 2 finished with value: 0.8368628593357522 and parameters: {'n_estimators': 1600, 'max_depth': 10, 'learning_rate': 0.08929134248623896, 'subsample': 0.6159577056560219, 'colsample_bytree': 0.9257683148637802, 'reg_alpha': 1.6839537342751554, 'reg_lambda'

In [35]:
# Optuna run for CatBoost
# Reuses X_train_ori / y_train_ori from an earlier cell: 80/20 split, then
# CatBoost-specific preprocessing of both halves.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_ori,
    y_train_ori,
    test_size=0.2,
    random_state=42,
)
X_train_mod, X_test_mod, cat_features = preprocess_for_model('CatBoost', X_train, X_test)


def objective(trial):
    """Optuna objective for CatBoost: train one sampled configuration and score it.

    Args:
        trial: Optuna trial used to sample the hyperparameters below.

    Returns:
        R² of the fitted model's predictions on ``X_test`` against ``y_test``.

    NOTE(review): the same hold-out split is used both for early stopping
    (``eval_set``) and for the returned score, so the reported R² may be
    optimistic.
    """
    boosting_type = trial.suggest_categorical("boosting_type", ["Plain", "Ordered"])
    # The grow policy is only searched for Plain boosting; Ordered trials are pinned
    # to SymmetricTree, so "grow_policy" does not appear in their sampled parameters.
    if boosting_type == "Ordered":
        grow_policy = "SymmetricTree"
    else:
        grow_policy = trial.suggest_categorical("grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"])

    params = {
        "iterations": trial.suggest_int("iterations", 300, 4000, step=100),
        "early_stopping_rounds": 50,
        "verbose": 0,
        "loss_function": "RMSE",
        "eval_metric": "R2",
        # Listed exactly once: a repeated key in a dict literal silently overrides
        # the earlier entry, which hides which value is actually used.
        "boosting_type": boosting_type,
        "grow_policy": grow_policy,

        # Hyperparameters to tune
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        "random_strength": trial.suggest_float("random_strength", 0.0, 5.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
    }

    # CatBoost pools for better handling of the validation set; categorical
    # columns are detected from the pandas 'category' dtype of the split frames.
    cat_features = [col for col in X_train.columns if X_train[col].dtype.name == 'category']
    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    valid_pool = Pool(X_test, y_test, cat_features=cat_features)

    # train_pool = Pool(X_train, y_train)
    # valid_pool = Pool(X_test, y_test)

    model = CatBoostRegressor(**params)
    model.fit(train_pool, eval_set=valid_pool)

    preds = model.predict(X_test)
    return r2_score(y_test, preds)

# Run the Optuna study: 50 trials, keeping the configuration with the highest
# value returned by `objective` (hold-out R²). Rebinds the notebook-level `study`.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, show_progress_bar=True)

# Print best results: best objective value and the sampled parameters behind it.
print("\n🔥 Best R² Score:", study.best_value)
print("📦 Best Parameters:\n", study.best_params)

[I 2025-05-14 18:29:56,330] A new study created in memory with name: no-name-cfcef2cc-7631-4263-bb29-6ddbc688c46a


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-05-14 18:32:24,219] Trial 0 finished with value: 0.8216588838200614 and parameters: {'boosting_type': 'Ordered', 'iterations': 3800, 'learning_rate': 0.02430451089389866, 'depth': 9, 'l2_leaf_reg': 4.374671539978969, 'random_strength': 1.24363411539355, 'bagging_temperature': 0.07178134164855798, 'border_count': 220}. Best is trial 0 with value: 0.8216588838200614.
[I 2025-05-14 18:32:26,907] Trial 1 finished with value: 0.3897245146133438 and parameters: {'boosting_type': 'Ordered', 'iterations': 400, 'learning_rate': 0.01144089124374292, 'depth': 6, 'l2_leaf_reg': 7.6497446501287305, 'random_strength': 3.426106904956809, 'bagging_temperature': 0.6169544587664344, 'border_count': 163}. Best is trial 0 with value: 0.8216588838200614.
[I 2025-05-14 18:33:09,964] Trial 2 finished with value: 0.8276343770288965 and parameters: {'boosting_type': 'Ordered', 'iterations': 1500, 'learning_rate': 0.14915462815960986, 'depth': 9, 'l2_leaf_reg': 9.67023270059699, 'random_strength': 2.595

# Old code from here

In [2]:
# PolynomialFeatures prediction test

# Engineered feature frames for the labelled training data and the unlabelled test data.
X_train_ori = pipeline.fit_transform(train_df)
# NOTE(review): this re-fits the pipeline on the test rows; if any pipeline step learns
# statistics during fit, `pipeline.transform(test_df)` is probably what is wanted — confirm.
X_test_ori = pipeline.fit_transform(test_df)
y_train_ori = train_df['target']

# Degree-2 expansion restricted to interaction terms (no squared terms, no bias column).
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Work on copies: categorical columns are cast to int and missing values filled with 0
# so both frames are purely numeric before the expansion.
X_poly_process = X_train_ori.copy()
X_test_process = X_test_ori.copy()
cat_cols = X_poly_process.select_dtypes(['category']).columns
X_poly_process[cat_cols] = X_poly_process[cat_cols].astype('int')
X_test_process[cat_cols] = X_test_process[cat_cols].astype('int')
X_poly_process = X_poly_process.fillna(0)
X_test_process = X_test_process.fillna(0)

X_poly = poly.fit_transform(X_poly_process)
# Apply the expansion fitted on the training frame instead of re-fitting on the test
# frame: the test matrix is labelled with the training feature names below, so it must
# come from the same fitted transformer.
X_poly_test = poly.transform(X_test_process)

feature_names = poly.get_feature_names_out(X_poly_process.columns)
X_poly_df = pd.DataFrame(X_poly, columns=feature_names)
X_poly_test_df = pd.DataFrame(X_poly_test, columns=feature_names)
# X_poly_df.drop(columns=X_train_ori.columns, inplace=True)


# Step 1: Train-test split (80/20 hold-out with a fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_poly_df, y_train_ori, test_size=0.2, random_state=42
)
print(X_train.shape)
print(f"X_poly_test_df shape: {X_poly_test_df.shape}")

# Candidate regressors for the polynomial-feature experiment. Only CatBoost is active;
# the other entries are kept commented out for reference.
models = {
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42
                                  # , depth=8, iterations=2500
                                 )
    # ,"XGBoost": XGBRegressor(
    #     n_estimators=100,
    #     random_state=42,
    #     tree_method='hist',
    #     enable_categorical=True
    # ),

    # "LightGBM": LGBMRegressor(
    #     n_estimators=100,
    #     random_state=42,
    #     verbose=-1
    # ),

    # "RandomForest": RandomForestRegressor(
    #     n_estimators=100,
    #     max_depth=None,
    #     min_samples_split=2,
    #     min_samples_leaf=1,
    #     n_jobs=-1,
    #     random_state=42,
    #     verbose=0
    # ),

    # "Ridge": Ridge(alpha=1.0),

    # "ElasticNet": ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=5000, random_state=42)

}

# Fit every active model on the training split and report its R² on the hold-out split.
for name, model in models.items():
    # Train
    model.fit(X_train, y_train)

    # Predict and evaluate
    preds = model.predict(X_test)
    r2 = r2_score(y_test, preds)
    print(f"{name} R² score: {r2:.4f}")

    # Keep a handle on the fitted CatBoost model; the feature-selection cell further
    # down reads its feature_importances_ through this name.
    if name == "CatBoost":
        saved_catboost_model = model

    # Submit logic
    # subs = model.predict(X_poly_test_df)
    # submit_df = pd.DataFrame({'id': range(len(subs)), 'target':subs})
    # submit_df.to_csv('submit.csv', index = False)
    

    # Check for feature importance support
    # if hasattr(model, "feature_importances_"):
    #     importances = np.array(model.feature_importances_)
    #     features = X_train.columns

    #     # Sort top 30
    #     sorted_indices = importances.argsort()[::-1]
    #     print(f"\nTop 30 features for {name}:")
    #     for i in sorted_indices:
    #         print(f"{features[i]}: {importances[i]:.4f}")
    # else:
    #     print(f"{name} does not support feature_importances_")

    # print()

NameError: name 'pipeline' is not defined

In [None]:
import time
import numpy as np

print("🌀 CPU Warm-up (Heavy) starting...")
start = time.time()

# Keep the CPU busy: 200 rounds, each multiplying two freshly drawn random
# 4000x4000 matrices. Raise the round count for a longer warm-up.
for _ in range(200):
    a, b = np.random.rand(4000, 4000), np.random.rand(4000, 4000)
    c = a @ b

end = time.time()
print(f"✅ Warm-up complete in {end - start:.2f} seconds")

🌀 CPU Warm-up (Heavy) starting...


In [30]:
# Step 1: Rank the polynomial features by the saved CatBoost model's importances
# and keep the 3000 highest-ranked column names.
if not hasattr(saved_catboost_model, "feature_importances_"):
    raise ValueError("Model does not support feature_importances_")

importances = np.array(saved_catboost_model.feature_importances_)
features = X_poly_df.columns
sorted_indices = np.argsort(importances)[::-1][:3000]
top_3000_features = [features[idx] for idx in sorted_indices]

# Step 2: Restrict the training split to the selected columns
X_train_top3000 = X_train[top_3000_features]

# Step 3: Retrain CatBoost (same settings as before) on the reduced feature set
model_top3000 = CatBoostRegressor(verbose=0, random_state=42)
model_top3000.fit(X_train_top3000, y_train)

# Step 4: Score on the hold-out split, restricted to the same columns
preds = model_top3000.predict(X_test[top_3000_features])
r2 = r2_score(y_test, preds)
print(f"CatBoost R² with top 3000 features: {r2:.4f}")

CatBoost R² with top 3000 features: 0.8298


In [None]:
CatBoost R² score: 0.8274

Top 30 features for CatBoost:
vapor_pressure_n_late_evening_mean week_cos: 5.9547
dew_point_n_late_evening_mean humidity_n_late_evening_mean: 2.0209
dew_point_n_late_evening_mean humidity_n_afternoon_mean: 1.5551
vapor_pressure_n_evening_mean week_cos: 1.4185
vapor_pressure_n_late_evening_mean month_cos: 1.2828
dew_point_n_late_evening_mean humidity_n_evening_mean: 1.2668
humidity_n_late_evening_mean climatology_temp: 0.9876
surface_temp_n_late_evening_mean week_sin: 0.8565
visibility_n_morning_mean climatology_temp: 0.7640
humidity_n_evening_mean humidity_n_range: 0.7342
vapor_pressure_n_evening_mean climatology_temp: 0.6970
dew_point_n_late_evening_mean week_cos: 0.6267
climatology_temp month: 0.5823
surface_temp_n_when_max vapor_pressure_n_late_evening_mean: 0.5606
surface_temp_n_late_evening_mean month_sin: 0.5475
humidity_n_evening_mean climatology_temp: 0.5304
wind_speed_n_range climatology_temp: 0.5263
vapor_pressure_n_late_evening_mean week_sin: 0.5110
climatology_temp week_of_year: 0.4830
local_pressure_n_afternoon_mean vapor_pressure_n_late_evening_mean: 0.4803
snow_depth_n_when_max climatology_temp: 0.4695
vapor_pressure_n_afternoon_mean climatology_temp: 0.4689
humidity_n_evening_mean wind_speed_n_range: 0.4253
vapor_pressure_n_late_evening_mean climatology_temp: 0.4207
dew_point_n_evening_mean climatology_temp: 0.4083
humidity_n_late_evening_mean week_sin: 0.4040
dew_point_n_late_evening_mean week_of_year: 0.4038
dew_point_n_late_evening_mean visibility_n_early_night_mean: 0.3928
surface_temp_n_late_evening_mean quarter_sin: 0.3727
surface_temp_n_evening_mean climatology_temp: 0.3692
surface_temp_n_afternoon_mean climatology_temp: 0.3680
dew_point_n_morning_mean climatology_temp: 0.3663
sea_level_pressure_n_afternoon_mean climatology_temp: 0.3641
sea_level_pressure_n_late_evening_mean surface_temp_n_evening_mean: 0.3625
dew_point_n_late_evening_mean humidity_n_when_min: 0.3618
surface_temp_n_evening_mean quarter_sin: 0.3562
cloud_cover_n_afternoon_mean dew_point_n_late_evening_mean: 0.3540
dew_point_n_late_evening_mean climatology_temp: 0.3519
vapor_pressure_n_late_night_mean week_sin: 0.3507
surface_temp_n_evening_mean week_cos: 0.3473

XGBoost R² score: 0.9791

Top 30 features for XGBoost:
dew_point_n_late_evening_mean humidity_n_afternoon_mean: 0.0821
wind_speed_n_range week_of_year: 0.0219
vapor_pressure_n_late_evening_mean week_cos: 0.0203
snow_depth_n_when_max week_sin: 0.0185
surface_temp_n_evening_mean month_sin: 0.0149
surface_temp_n_afternoon_mean quarter_sin: 0.0132
surface_temp_n_late_evening_mean quarter: 0.0121
dew_point_n_late_evening_mean quarter: 0.0119
climatology_temp fog_factor: 0.0111
dew_point_n_late_evening_mean humidity_n_late_evening_mean: 0.0108
precipitation_n_when_max vapor_pressure_n_late_evening_mean: 0.0105
precipitation_n_total_sum day_of_year: 0.0101
humidity_n_morning_mean week_sin: 0.0098
precipitation_n_range week_cos: 0.0095
precipitation_n_when_max vapor_pressure_n_evening_mean: 0.0094
sea_level_pressure_n_late_night_mean day_of_year: 0.0077
surface_temp_n_when_min wind_speed_n_evening_mean: 0.0075
dew_point_n_late_evening_mean humidity_n_evening_mean: 0.0072
surface_temp_n_total_mean week_sin: 0.0072
week_sin week_cos: 0.0065
local_pressure_n_range precipitation_n_last_condition_hour: 0.0064
visibility_n_range wind_speed_n_range: 0.0063
sunshine_duration_n_morning_mean vapor_pressure_n_afternoon_mean: 0.0062
sunshine_duration_n_morning_mean surface_temp_n_total_mean: 0.0056
surface_temp_n_morning_mean surface_temp_n_late_evening_mean: 0.0055
visibility_n_morning_mean wind_speed_n_evening_mean: 0.0054
surface_temp_n_late_evening_mean day_of_year: 0.0053
month_sin week_sin: 0.0052
vapor_pressure_n_early_night_mean climatology_temp: 0.0050
dew_point_n_evening_mean sunshine_duration_n_last_condition_hour: 0.0049
surface_temp_n_late_evening_mean month_cos: 0.0048
surface_temp_n_late_evening_mean week_sin: 0.0047
sunshine_duration_n_last_condition_hour surface_temp_n_late_evening_mean: 0.0047
vapor_pressure_n_afternoon_mean wind_speed_n_late_night_mean: 0.0046
local_pressure_n_when_max visibility_n_evening_mean: 0.0045
surface_temp_n_late_evening_mean vapor_pressure_n_afternoon_mean: 0.0044
surface_temp_n_when_min fog_factor: 0.0043
dew_point_n_late_evening_mean climatology_temp: 0.0040
sea_level_pressure_n_range vapor_pressure_n_late_evening_mean: 0.0040
dew_point_n_late_night_mean day_of_year: 0.0039

LightGBM R² score: 0.9093

Top 30 features for LightGBM:
humidity_n_late_evening_mean climatology_temp: 31.0000
vapor_pressure_n_late_evening_mean week_cos: 26.0000
dew_point_n_late_evening_mean humidity_n_late_evening_mean: 17.0000
dew_point_n_late_evening_mean month_cos: 15.0000
vapor_pressure_n_late_evening_mean week_sin: 14.0000
climatology_temp week_of_year: 14.0000
surface_temp_n_late_evening_mean week_sin: 13.0000
climatology_temp week_cos: 13.0000
humidity_n_evening_mean climatology_temp: 12.0000
dew_point_n_late_evening_mean week_cos: 11.0000
dew_point_n_late_evening_mean week_sin: 11.0000
local_pressure_n_when_min climatology_temp: 11.0000
dew_point_n_late_evening_mean climatology_temp: 10.0000
climatology_temp month: 10.0000
surface_temp_n_late_evening_mean week_cos: 10.0000
humidity_n_evening_mean humidity_n_range: 10.0000
dew_point_n_evening_mean climatology_temp: 10.0000
dew_point_n_evening_mean humidity_n_late_evening_mean: 9.0000
humidity_n_late_night_mean humidity_n_late_evening_mean: 9.0000
surface_temp_n_late_evening_mean month_cos: 9.0000
climatology_temp week_sin: 9.0000
sunshine_duration_n_evening_mean vapor_pressure_n_late_evening_mean: 8.0000
climatology_temp month_sin: 8.0000
surface_temp_n_evening_mean week_cos: 8.0000
vapor_pressure_n_evening_mean climatology_temp: 8.0000
dew_point_n_early_night_mean week_cos: 8.0000
month_cos day_sin: 8.0000
dew_point_n_late_evening_mean humidity_n_evening_mean: 8.0000
climatology_temp month_cos: 8.0000
humidity_n_late_night_mean climatology_temp: 7.0000
local_pressure_n_when_min day_sin: 7.0000
climatology_temp day_of_week: 7.0000
day_sin dow_sin: 7.0000
vapor_pressure_n_evening_mean week_cos: 7.0000
vapor_pressure_n_evening_mean week_sin: 7.0000
vapor_pressure_n_late_evening_mean month_cos: 7.0000
humidity_n_early_night_mean climatology_temp: 7.0000
vapor_pressure_n_late_evening_mean wind_speed_n_late_night_mean: 7.0000
vapor_pressure_n_late_evening_mean climatology_temp: 7.0000
snow_depth_n_when_max climatology_temp: 7.0000

RandomForest R² score: 0.9599

Top 30 features for RandomForest:
vapor_pressure_n_late_evening_mean week_cos: 0.1071
dew_point_n_late_evening_mean humidity_n_afternoon_mean: 0.0399
dew_point_n_late_evening_mean humidity_n_evening_mean: 0.0396
dew_point_n_late_evening_mean humidity_n_late_evening_mean: 0.0277
dew_point_n_late_evening_mean climatology_temp: 0.0147
surface_temp_n_late_evening_mean week_sin: 0.0146
precipitation_n_when_max vapor_pressure_n_evening_mean: 0.0103
surface_temp_n_evening_mean week_cos: 0.0069
surface_temp_n_late_evening_mean day_of_year: 0.0058
precipitation_n_when_min vapor_pressure_n_evening_mean: 0.0057
surface_temp_n_total_mean week_sin: 0.0057
surface_temp_n_evening_mean climatology_temp: 0.0054
surface_temp_n_when_min wind_speed_n_evening_mean: 0.0050
wind_speed_n_range day_of_year: 0.0047
surface_temp_n_late_evening_mean month_cos: 0.0046
dew_point_n_late_evening_mean week_cos: 0.0044
precipitation_n_when_max vapor_pressure_n_late_evening_mean: 0.0041
sunshine_duration_n_evening_mean vapor_pressure_n_late_evening_mean: 0.0039
surface_temp_n_late_evening_mean week_cos: 0.0038
humidity_n_range week_sin: 0.0037
dew_point_n_late_evening_mean month_cos: 0.0037
climatology_temp week_cos: 0.0035
precipitation_n_when_min vapor_pressure_n_late_evening_mean: 0.0035
surface_temp_n_evening_mean month_sin: 0.0035
snow_depth_n_when_max week_sin: 0.0034
surface_temp_n_when_min wind_speed_n_afternoon_mean: 0.0033
snow_depth_n_when_max climatology_temp: 0.0032
wind_speed_n_range week_of_year: 0.0031
climatology_temp day_of_year: 0.0029
surface_temp_n_late_evening_mean week_of_year: 0.0029
surface_temp_n_evening_mean week_sin: 0.0029
wind_speed_n_afternoon_mean week_of_year: 0.0029
dew_point_n_evening_mean climatology_temp: 0.0028
surface_temp_n_late_evening_mean month_sin: 0.0028
humidity_n_morning_mean week_sin: 0.0027
wind_speed_n_afternoon_mean day_of_year: 0.0025
vapor_pressure_n_evening_mean climatology_temp: 0.0025
vapor_pressure_n_late_evening_mean climatology_temp: 0.0024
humidity_n_late_evening_mean climatology_temp: 0.0023
dew_point_n_late_evening_mean surface_temp_n_range: 0.0022

In [None]:
# submit logic

# Prediction for Individual Models: fit each base model on the current training split
# with its model-specific preprocessing and report R² on the current evaluation split.
for name, model in models.items():
    X_train_mod, X_test_mod, cat_features = preprocess_for_model(name, X_train, X_test)

    # CatBoost and LightGBM take the categorical columns through different keyword
    # arguments; every other model is fitted on the preprocessed frame as-is. The
    # catch-all branch mirrors the out-of-fold loop below and guarantees that a model
    # added under a new name is fitted before predict() is called on it.
    if name == "CatBoost":
        model.fit(X_train_mod, y_train, cat_features=cat_features)

    elif name == "LightGBM":
        model.fit(X_train_mod, y_train, categorical_feature=cat_features)

    else:
        model.fit(X_train_mod, y_train)

    preds = model.predict(X_test_mod)
    r2 = r2_score(y_test, preds)
    print(f"{name} R² score: {r2:.4f}")

    # importances = model.feature_importances_
    # features = X_train_mod.columns
    # sorted_indices = importances.argsort()[::-1]
    
    # Print top features
    # for i in sorted_indices[:20]:
    #     print(f"{features[i]}: {importances[i]:.4f}")

    # print()



# === Level 1 meta model Prediction===
# Manual stacking: each base model produces out-of-fold predictions for the training
# rows and fold-averaged predictions for the test rows; a meta-model is then fitted on
# the out-of-fold matrix and its test predictions are written to submit.csv.
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

meta_models = {
    "RidgeCV": RidgeCV(
        alphas=[0.01, 0.1, 1.0, 10.0, 100.0],
        cv=5,
        scoring='r2',
        fit_intercept=True
    )
}
# From here on X_train / y_train / X_test are rebound to the full training frame and the
# transformed test_df, replacing whatever split the loop above used.
# NOTE(review): the pipeline is re-fitted on test_df; confirm that transform() is not intended.
X_train = pipeline.fit_transform(train_df)
y_train = train_df['target']
X_test = pipeline.fit_transform(test_df)


# Out-of-fold predictions for train set
# One column per base model: rows of train_meta are filled fold by fold, test_meta
# receives the mean of the per-fold test predictions.
train_meta = np.zeros((X_train.shape[0], len(models)))
test_meta = np.zeros((X_test.shape[0], len(models)))

for i, (name, model) in enumerate(models.items()):
    # Test predictions of this model, one column per fold.
    test_meta_fold = np.zeros((X_test.shape[0], n_folds))

    # Preprocess X_test ONCE per model
    X_test_preprocessed = preprocess_for_model(name, X_train, X_test)[1]


    for j, (train_idx, val_idx) in enumerate(kf.split(X_train)):
        X_tr_raw = X_train.iloc[train_idx]
        X_val_raw = X_train.iloc[val_idx]
        y_tr = y_train.iloc[train_idx]
        y_val = y_train.iloc[val_idx]

        # Preprocess fold data
        # (the empty-list assignment is immediately overwritten by the call below)
        cat_features = []
        X_tr, X_val, cat_features = preprocess_for_model(name, X_tr_raw, X_val_raw)
        # Align all 3 sets to ensure identical columns
        # Outer joins: a column missing from one frame is added there and filled with 0.
        # X_test_preprocessed is rebound here, so it carries the aligned columns into
        # the following folds.
        X_tr, X_val = X_tr.align(X_val, join="outer", axis=1, fill_value=0)
        X_tr, X_test_preprocessed = X_tr.align(X_test_preprocessed, join="outer", axis=1, fill_value=0)
        X_val, X_test_preprocessed = X_val.align(X_test_preprocessed, join="outer", axis=1, fill_value=0)

        # Fresh unfitted copy per fold so no fitted state leaks between folds.
        model_clone = clone(model)

        if name == "LightGBM":
            model_clone.fit(X_tr, y_tr, categorical_feature=cat_features)
        elif name == "CatBoost":
            model_clone.fit(X_tr, y_tr, cat_features=cat_features)
        else:
            model_clone.fit(X_tr, y_tr)

        # Validation rows get predictions from a model that never saw them.
        train_meta[val_idx, i] = model_clone.predict(X_val)
        test_meta_fold[:, j] = model_clone.predict(X_test_preprocessed)

    # Average test predictions across folds
    test_meta[:, i] = test_meta_fold.mean(axis=1)

# Fit and evaluate each meta-model
# Each meta-model writes the same 'submit.csv', so with several entries in meta_models
# only the last one's predictions remain on disk.
for name, model in meta_models.items():
    model.fit(train_meta, y_train)
    stack_preds = model.predict(test_meta)

    # Submission ids are simply the row positions 0..n-1 of the test predictions.
    submit_df = pd.DataFrame({'id': range(len(stack_preds)), 'target':stack_preds})
    submit_df.to_csv('submit.csv', index = False)
    
    # stack_r2 = r2_score(y_test, stack_preds)
    # print(f"Level 1 meta model {name} R² score: {stack_r2:.4f}")


In [None]:
# After hyperparameter tuning
CatBoost R² score: 0.8536
XGBoost R² score: 0.8631
LightGBM R² score: 0.8572
RandomForest R² score: 0.7544
Level 1 meta model RidgeCV R² score: 0.8716

CatBoost R² score: 0.8110
XGBoost R² score: 0.8114
LightGBM R² score: 0.7715
RandomForest R² score: 0.7542
Level 1 meta model RidgeCV R² score: 0.8307

CatBoost R² score: 0.8100
XGBoost R² score: 0.8049
LightGBM R² score: 0.7715
RandomForest R² score: 0.7542
Level 1 meta model RidgeCV R² score: 0.8332

CatBoost R² score: 0.8047
XGBoost R² score: 0.8049
LightGBM R² score: 0.7715
RandomForest R² score: 0.7542
Level 1 meta model RidgeCV R² score: 0.8329

CatBoost R² score: 0.8047
XGBoost R² score: 0.7980
LightGBM R² score: 0.7715
RandomForest R² score: 0.7542
Level 1 meta model RidgeCV R² score: 0.8353

CatBoost R² score: 0.8019
XGBoost R² score: 0.8049
LightGBM R² score: 0.7673
RandomForest R² score: 0.7560
Level 1 meta model RidgeCV R² score: 0.8305

CatBoost R² score: 0.8019
XGBoost R² score: 0.7932
LightGBM R² score: 0.7673
RandomForest R² score: 0.7560
Level 1 meta model RidgeCV R² score: 0.8314

CatBoost R² score: 0.8019
XGBoost R² score: 0.7928
LightGBM R² score: 0.7673
RandomForest R² score: 0.7560
Level 1 meta model RidgeCV R² score: 0.8315

CatBoost R² score: 0.7993
XGBoost R² score: 0.7906
LightGBM R² score: 0.7524
RandomForest R² score: 0.7547
Level 1 meta model RidgeCV R² score: 0.8275

# After station info data exclusion
CatBoost R² score: 0.7982
XGBoost R² score: 0.7880
LightGBM R² score: 0.7557
RandomForest R² score: 0.7557
Level 1 meta model RidgeCV R² score: 0.8277

# With day of year
CatBoost R² score: 0.8095
XGBoost R² score: 0.7996
LightGBM R² score: 0.7635
RandomForest R² score: 0.7333
Ridge R² score: 0.6655
ElasticNet R² score: 0.6367
Level 1 meta model RidgeCV R² score: 0.8330

# with no day of year and sin cos
CatBoost R² score: 0.8054
XGBoost R² score: 0.7965
LightGBM R² score: 0.7573
RandomForest R² score: 0.7298
Ridge R² score: 0.6655
ElasticNet R² score: 0.6367
Level 1 meta model RidgeCV R² score: 0.8331

CatBoost R² score: 0.8082
XGBoost R² score: 0.7996
LightGBM R² score: 0.7635
RandomForest R² score: 0.7348
Ridge R² score: 0.6655
ElasticNet R² score: 0.6367
Level 1 meta model RidgeCV R² score: 0.833

# After station info addition
CatBoost R² score: 0.8066
XGBoost R² score: 0.8005
LightGBM R² score: 0.7640
RandomForest R² score: 0.7348
Ridge R² score: 0.6626
ElasticNet R² score: 0.6351
Level 1 meta model RidgeCV R² score: 0.8336

# CatBoost depth and iteration addition
CatBoost R2 score: 0.8433
XGBoost R2 score: 0.8005
LightGBM R2 score: 0.7640
RandomForest R2 score: 0.7348
Ridge R2 score: 0.6626
ElasticNet R2 score: 0.6351
Level 1 meta model RidgeCV R2 score: 0.8540
Level 2 input model RidgeCV R2 score: 0.8330


CatBoost R² score: 0.8016
XGBoost R² score: 0.7973
LightGBM R² score: 0.7582
RandomForest R² score: 0.7570
Ridge R² score: 0.6595
ElasticNet R² score: 0.6352
Level 1 meta model RidgeCV R² score: 0.8284

# Without day of year
CatBoost R² score: 0.7992
XGBoost R² score: 0.7830
LightGBM R² score: 0.7503
RandomForest R² score: 0.7512
Ridge R² score: 0.6595
ElasticNet R² score: 0.6352
Level 1 meta model RidgeCV R² score: 0.8264

# Without vapor pressure
CatBoost R² score: 0.7999
XGBoost R² score: 0.7864
LightGBM R² score: 0.7544
RandomForest R² score: 0.7584
Ridge R² score: 0.6587
ElasticNet R² score: 0.6315
Level 1 meta model RidgeCV R² score: 0.8292

CatBoost R² score: 0.8016
XGBoost R² score: 0.7973
LightGBM R² score: 0.7582
RandomForest R² score: 0.7570
Level 1 meta model RidgeCV R² score: 0.8278

CatBoost R² score: 0.8016
XGBoost R² score: 0.7909
LightGBM R² score: 0.7554
RandomForest R² score: 0.7570
Level 1 meta model RidgeCV R² score: 0.8281

CatBoost R² score: 0.7985
XGBoost R² score: 0.7909
LightGBM R² score: 0.7554
RandomForest R² score: 0.7570
Level 1 meta model RidgeCV R² score: 0.8273

CatBoost R² score: 0.7967
XGBoost R² score: 0.7571
LightGBM R² score: 0.7450
RandomForest R² score: 0.7538
Level 1 meta model RidgeCV R² score: 0.8117

CatBoost R² score: 0.7855
XGBoost R² score: 0.7973
LightGBM R² score: 0.7582
RandomForest R² score: 0.7482
Manual Stacking RidgeCV R² score: 0.8226

CatBoost R² score: 0.7861
XGBoost R² score: 0.7944
LightGBM R² score: 0.7533
RandomForest R² score: 0.7432
Manual Stacking RidgeCV R² score: 0.8234

CatBoost R² score: 0.7844
XGBoost R² score: 0.7944
LightGBM R² score: 0.7533
RandomForest R² score: 0.7424
Manual Stacking RidgeCV R² score: 0.8220

CatBoost R² score: 0.7844
XGBoost R² score: 0.7944
LightGBM R² score: 0.7533
RandomForest R² score: 0.7408
Manual Stacking RidgeCV R² score: 0.8220


CatBoost R² score: 0.7844
XGBoost R² score: 0.7944
LightGBM R² score: 0.7533
Manual Stacking RidgeCV R² score: 0.8218

CatBoost R² score: 0.7797
XGBoost R² score: 0.7944
LightGBM R² score: 0.7533
Manual Stacking RidgeCV R² score: 0.8185

CatBoost R² score: 0.7709
XGBoost R² score: 0.7847
LightGBM R² score: 0.7505
Manual Stacking RidgeCV R² score: 0.8097

CatBoost R² score: 0.7693
XGBoost R² score: 0.7911
LightGBM R² score: 0.7434
Manual Stacking RidgeCV R² score: 0.8100

CatBoost R² score: 0.7693
XGBoost R² score: 0.7911
LightGBM R² score: 0.7434
Stacked RidgeCV R² score: 0.8113

In [24]:
# Range Histogram
import matplotlib.pyplot as plt

def plot_histograms_grid(df, bins=50, cols_per_row=5, figsize=(20, 4)):
    """Draw one histogram per numeric column of ``df`` in a grid of subplots.

    Args:
        df: DataFrame whose numeric columns are plotted; NaNs are dropped per column.
        bins: Number of histogram bins passed to ``plt.hist``.
        cols_per_row: Number of subplots per grid row.
        figsize: ``(width, height_per_row)``; the figure height is the second
            entry multiplied by the number of grid rows.

    Returns:
        None. The figure is shown with ``plt.show()``. When ``df`` has no numeric
        columns nothing is drawn.
    """
    numeric_cols = df.select_dtypes(include=['number']).columns
    num_cols = len(numeric_cols)
    if num_cols == 0:
        # Without this guard the grid would have zero rows and the figure zero height.
        return

    num_rows = int(np.ceil(num_cols / cols_per_row))

    plt.figure(figsize=(figsize[0], figsize[1] * num_rows))

    # Subplot positions are 1-based, hence the enumerate offset.
    for position, col in enumerate(numeric_cols, start=1):
        plt.subplot(num_rows, cols_per_row, position)
        plt.hist(df[col].dropna(), bins=bins, edgecolor='black')
        plt.title(col)
        # Axis labels are blanked; the column name in the title identifies the panel.
        plt.xlabel('')
        plt.ylabel('')
        plt.grid(True)

    plt.tight_layout()
    plt.show()

# Rebuild the engineered training features; `pipeline` is not defined in this cell and
# must already exist in the kernel. The histogram call is left disabled.
X_train_ori = pipeline.fit_transform(train_df)
# plot_histograms_grid(X_train_ori)

In [None]:
# Graveyard
# Retired second stacking level: the level-1 meta-models are themselves run out-of-fold
# over train_meta, and a fresh clone of each is then fitted on those level-2 predictions.
# Relies on train_meta, test_meta, meta_models, X_test, y_train and y_test from earlier cells.

# Level 2 training meta-data
n_splits = 5
level2_train_meta = np.zeros((train_meta.shape[0], len(meta_models)))
level2_test_meta = np.zeros((X_test.shape[0], len(meta_models)))

kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

for i, (name, model) in enumerate(meta_models.items()):
    # Test-set predictions of this meta-model, one column per fold.
    level2_test_meta_fold = np.zeros((X_test.shape[0], n_splits))

    for fold, (tr_idx, val_idx) in enumerate(kf.split(train_meta)):
        X_tr_meta, X_val_meta = train_meta[tr_idx], train_meta[val_idx]
        # y_val_meta is assigned but not used in this cell.
        y_tr_meta, y_val_meta = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        model_clone = clone(model)
        model_clone.fit(X_tr_meta, y_tr_meta)

        # Predict for validation fold
        level2_train_meta[val_idx, i] = model_clone.predict(X_val_meta)

        # Predict for test set
        level2_test_meta_fold[:, fold] = model_clone.predict(test_meta)

    # Average predictions across folds for final test_meta
    level2_test_meta[:, i] = level2_test_meta_fold.mean(axis=1)

for name, model in meta_models.items():
    # Clone to avoid contamination from previous folds
    model_clone = clone(model)
    
    # Fit on full level 2 training set
    model_clone.fit(level2_train_meta, y_train)
    
    # Predict on level 2 test meta
    preds = model_clone.predict(level2_test_meta)
    
    # Calculate R²
    # NOTE(review): preds has one entry per row of X_test, so this only works while
    # X_test / y_test are a labelled hold-out split. If X_test has been rebound to the
    # transformed test_df (as the stacking cell above does), y_test no longer matches — confirm.
    r2 = r2_score(y_test, preds)
    
    print(f"Level 2 meta model {name:10} R²: {r2:.4f}")

    # r2 = r2_score(y_train, level2_train_meta[:, i])
    # print(f"Level 2 meta model {name} R² score: {r2:.4f}")

# # === STACKED MODEL StackingRegressor===
# # Wrap individual models for stacking
# estimators = [
#     ('cat', models["CatBoost"]),
#     ('xgb', models["XGBoost"]),
#     ('lgb', models["LightGBM"])
# ]

# meta_model = RidgeCV()

# stack = StackingRegressor(
#     estimators=estimators,
#     final_estimator=meta_model,
#     cv=5  # OOF stacking
# )

# # Train stacking model
# stack.fit(X_train, y_train)
# stack_preds = stack.predict(X_test)
# stack_r2 = r2_score(y_test, stack_preds)
# print(f"StackingRegressor RidgeCV R² score: {stack_r2:.4f}")


# # === Level 2 meta model prediction ===

# n_folds = 5
# kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# # Prepare placeholders
# level2_train = np.zeros((train_meta.shape[0], len(meta_models)))
# level2_test = np.zeros((test_meta.shape[0], len(meta_models), n_folds))

# # For each fold, fill in level2_train and test predictions
# for fold, (tr_idx, val_idx) in enumerate(kf.split(train_meta)):
#     print(f"LEVEL 2 FOLD {fold+1}/{n_folds}")
    
#     X_tr, X_val = train_meta[tr_idx], train_meta[val_idx]
#     y_tr, y_val = y_train[tr_idx], y_train[val_idx]

#     for i, (name, model) in enumerate(meta_models.items()):
#         model_clone = clone(model)
#         model_clone.fit(X_tr, y_tr)

#         level2_train[val_idx, i] = model_clone.predict(X_val)
#         level2_test[:, i, fold] = model_clone.predict(test_meta)

# # Average test predictions across folds
# level2_test_mean = np.mean(level2_test, axis=2)

# # Now train final level 2 model on OOF predictions
# final_model = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0])
# final_model.fit(level2_train, y_train)

# final_oof_preds = final_model.predict(level2_train)
# final_test_preds = final_model.predict(level2_test_mean)

# # Evaluation
# final_r2 = r2_score(y_train, final_oof_preds)
# print(f"Level 2 Meta Model RidgeCV R²: {final_r2:.4f}")


In [175]:
# Old code from here on

In [None]:
# Main Code
# Baseline on the engineered features: three boosted-tree models scored individually
# on an 80/20 hold-out split, then combined by a RidgeCV-backed StackingRegressor.

X_train_ori = pipeline.fit_transform(train_df)
# Despite the name, this is the training target column.
y_test_ori = train_df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X_train_ori,
    y_test_ori,
    test_size=0.2,
    random_state=42,
)

# STEP 4: Define models
models = dict(
    CatBoost=CatBoostRegressor(verbose=0, random_state=42),
    XGBoost=XGBRegressor(n_estimators=100, random_state=42),
    LightGBM=LGBMRegressor(n_estimators=100, random_state=42, verbose=-1),
)

# STEP 5: Train and evaluate individual models
# CatBoost and LightGBM receive the categorical columns under different keyword
# names; XGBoost is fitted without one. `cat_features` comes from an earlier cell.
for name, model in models.items():
    if name in ("CatBoost", "LightGBM"):
        keyword = "cat_features" if name == "CatBoost" else "categorical_feature"
        model.fit(X_train, y_train, **{keyword: cat_features})
    else:  # XGBoost
        model.fit(X_train, y_train)

    preds = model.predict(X_test)
    r2 = r2_score(y_test, preds)
    print(f"{name} R² score: {r2:.4f}")

# === STACKED MODEL ===
# Wrap individual models for stacking
estimators = [
    (alias, models[key])
    for alias, key in (("cat", "CatBoost"), ("xgb", "XGBoost"), ("lgb", "LightGBM"))
]

meta_model = RidgeCV()

# cv=5 -> the meta-model is trained on out-of-fold base predictions
stack = StackingRegressor(estimators=estimators, final_estimator=meta_model, cv=5)

# Train stacking model
stack.fit(X_train, y_train)
stack_preds = stack.predict(X_test)
stack_r2 = r2_score(y_test, stack_preds)
print(f"Stacked RidgeCV R² score: {stack_r2:.4f}")


In [None]:
# Refit the stacked ensemble on every training row and write the submission file.
X_train_ori = pipeline.fit_transform(train_df)
# Despite the name, this is the training target column.
y_test_ori = train_df['target']
# Engineered features of the unlabelled test set.
# NOTE(review): the pipeline is re-fitted on test_df here; confirm transform() is not intended.
kaka = pipeline.fit_transform(test_df)

stack.fit(X_train_ori, y_test_ori)
submit_preds = stack.predict(kaka)
# Submission ids are the row positions 0..n-1 of the predictions.
submit_df = pd.DataFrame({
    'id': range(len(submit_preds)),
    'target': submit_preds
})
# The former mid-cell `submit_df.head(200)` was dropped: a bare expression that is not
# the last line of a cell is neither displayed nor stored.
submit_df.to_csv("submission.csv", index = False)

In [None]:
# Time Condensed Features + Original Features (About 400 features)
# Same experiment layout as the main run: individual boosted models on an 80/20
# hold-out split, followed by a RidgeCV-backed StackingRegressor.

X_train_ori = pipeline.fit_transform(train_df)
# Despite the name, this is the training target column.
y_test_ori = train_df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X_train_ori,
    y_test_ori,
    test_size=0.2,
    random_state=42,
)

# STEP 4: Define models
models = dict(
    CatBoost=CatBoostRegressor(verbose=0, random_state=42),
    XGBoost=XGBRegressor(n_estimators=100, random_state=42),
    LightGBM=LGBMRegressor(n_estimators=100, random_state=42, verbose=-1),
)

# STEP 5: Train and evaluate individual models
# Only CatBoost and LightGBM are handed the categorical columns, each through its
# own keyword argument. `cat_features` comes from an earlier cell.
for name, model in models.items():
    if name in ("CatBoost", "LightGBM"):
        keyword = "cat_features" if name == "CatBoost" else "categorical_feature"
        model.fit(X_train, y_train, **{keyword: cat_features})
    else:  # XGBoost
        model.fit(X_train, y_train)

    preds = model.predict(X_test)
    r2 = r2_score(y_test, preds)
    print(f"{name} R² score: {r2:.4f}")

# === STACKED MODEL ===
# Wrap individual models for stacking
estimators = [
    (alias, models[key])
    for alias, key in (("cat", "CatBoost"), ("xgb", "XGBoost"), ("lgb", "LightGBM"))
]

meta_model = RidgeCV()

# cv=5 -> the meta-model is trained on out-of-fold base predictions
stack = StackingRegressor(estimators=estimators, final_estimator=meta_model, cv=5)

# Train stacking model
stack.fit(X_train, y_train)
stack_preds = stack.predict(X_test)
stack_r2 = r2_score(y_test, stack_preds)
print(f"Stacked RidgeCV R² score: {stack_r2:.4f}")


In [20]:
# No feature engineering

# STEP 0: Columns removed from the feature frame whenever they are present
drop_cols = ["target", "id", "station", "station_name"]

# STEP 1: Extract month & day from date column (assumed to be named 'date' in MM-DD format)
def preprocess(df):
    """Return a copy of ``df`` with integer ``month``/``day`` columns and no id/target columns.

    The ``date`` strings are split on "-"; the first piece becomes ``month`` and the
    second ``day``. ``date`` itself and any ``drop_cols`` entries present are removed.
    The input frame is left unmodified.
    """
    date_parts = df["date"].str.split("-")
    enriched = df.assign(
        month=date_parts.str[0].astype(int),
        day=date_parts.str[1].astype(int),
    )
    unwanted = ["date"] + [name for name in drop_cols if name in enriched.columns]
    return enriched.drop(columns=unwanted)



# STEP 2: Prepare data (features from the training frame, target kept aside)
X_train_ori = preprocess(train_df)
y_train_ori = train_df["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X_train_ori, y_train_ori, test_size=0.2, random_state=42
)

# STEP 3: Detect categorical columns
cat_features = list(X_train.select_dtypes(include="object").columns)

# STEP 4: Define models
models = dict(
    CatBoost=CatBoostRegressor(verbose=0, random_state=42),
    XGBoost=XGBRegressor(n_estimators=100, random_state=42),
    LightGBM=LGBMRegressor(n_estimators=100, random_state=42, verbose=-1),
)

# Keyword each library uses to receive the categorical columns;
# models not listed here (XGBoost) are fitted without extra arguments.
categorical_fit_kwargs = {
    "CatBoost": {"cat_features": cat_features},
    "LightGBM": {"categorical_feature": cat_features},
}

# STEP 5: Train and evaluate
for name, model in models.items():
    model.fit(X_train, y_train, **categorical_fit_kwargs.get(name, {}))
    preds = model.predict(X_test)
    r2 = r2_score(y_test, preds)
    print(f"{name} R² score: {r2:.4f}")


CatBoost R² score: 0.7739
XGBoost R² score: 0.7319
LightGBM R² score: 0.7280


In [37]:

# STEP 0: Columns that must never reach the model (label and identifiers)
drop_cols = ["target", "id", "station", "station_name"]

# STEP 1: Extract month & day from date column (assumed to be named 'date' in MM-DD format)
def preprocess(df):
    """Return a new frame with 'month'/'day' derived from 'date'.

    The 'date' column and every column listed in the module-level
    ``drop_cols`` that is present in ``df`` are removed. The input frame
    is left untouched.
    """
    date_parts = df["date"].str.split("-")
    unwanted = ["date"] + [name for name in drop_cols if name in df.columns]
    return df.assign(
        month=date_parts.str[0].astype(int),
        day=date_parts.str[1].astype(int),
    ).drop(columns=unwanted)



# STEP 2: Prepare data -- the whole training frame is used, no hold-out split
X_train = preprocess(train_df)
y_train = train_df["target"]
x_pre = preprocess(test_df)

# STEP 3: Detect categorical columns
cat_features = list(X_train.select_dtypes(include="object").columns)

# Fit a single CatBoost model and predict the test set
model = CatBoostRegressor(verbose=0, random_state=42)
model.fit(X_train, y_train, cat_features=cat_features)
pred = model.predict(x_pre)

# Submission ids are sequential row positions: 0, 1, 2, ...
submission = pd.DataFrame({"id": range(len(pred)), "target": pred})

# Save to CSV
submission.to_csv("cat_boost.csv", index=False)



# STEP 4: Define models
# models = {
#     "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
#     # "XGBoost": XGBRegressor(n_estimators=100, random_state=42),
#     # "LightGBM": LGBMRegressor(n_estimators=100, random_state=42),
# }

# STEP 5: Train and evaluate
# for name, model in models.items():
#     if name == "CatBoost":
#         model.fit(X_train, y_train, cat_features=cat_features)
#     # elif name == "LightGBM":
#     #     model.fit(X_train, y_train, categorical_feature=cat_features)
#     # else:  # XGBoost
#     #     model.fit(X_train, y_train)

#     preds = model.predict(X_test)
#     r2 = r2_score(y_test, preds)
#     print(f"{name} R² score: {r2:.4f}")
    


In [31]:
# target, id, station, station_name
# Display the raw 'date' column of the training frame.
train_df['date']

0        01-01
1        01-02
2        01-03
3        01-04
4        01-05
         ...  
13127    12-26
13128    12-27
13129    12-28
13130    12-29
13131    12-30
Name: date, Length: 13132, dtype: object

In [None]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor  # ← skip this line if not installed
from sklearn.metrics import mean_squared_error

# Assume: train_df, test_df, y_train, y_test are ready and numeric

models = {
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42),
    "LightGBM": LGBMRegressor(n_estimators=100, random_state=42),  # ← skip if not installed
}

# Fit each model and report its root-mean-squared error on the test data.
for name, model in models.items():
    model.fit(train_df, y_train)
    preds = model.predict(test_df)
    # RMSE is taken as sqrt(MSE) explicitly: the `squared=False` shortcut of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # where passing it raises a TypeError.
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print(f"{name} RMSE: {rmse:.4f}")


In [27]:
# Print the path of the Python interpreter this kernel is running on.
import sys
print(sys.executable)


/opt/anaconda3/bin/python


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation. scikit-learn reports the *negated* MSE for this
# scorer, so the sign is flipped before taking the per-fold square root.
scores = cross_val_score(
    pipeline, train_df, train_y, scoring='neg_mean_squared_error', cv=5
)
fold_rmse = np.sqrt(-scores)
print("CV RMSE:", fold_rmse.mean())


In [None]:
# Hourly column names (suffix 0..23) for each raw feature.
cloud_cover_cols = [f"cloud_cover_{i}" for i in range(24)]
humidity_cols = [f"humidity_{i}" for i in range(24)]

# Feature name -> the hourly columns it is condensed from.
feature_definitions = {
    "cloud_cover_n": cloud_cover_cols,
    "humidity_n": humidity_cols,
}

# Feature name -> flags selecting which summaries to generate.
# The flag names must be exactly the keys Condenser.transform looks up;
# it reads "skewness", so a "skew" key would be silently ignored.
feature_config = {
    "cloud_cover_n": {"total_sum": True, "skewness": True},
    "humidity_n": {"total_sum": False, "skewness": False},
}


In [None]:
class Condenser:
    """Collapse one feature's hourly columns into per-row summary features.

    Parameters
    ----------
    feature_name : str
        Prefix used for every generated column name.
    hour_cols : list of str
        Names of the hourly columns read from the input frame.
    config : dict
        Flags selecting which summaries to generate. Keys read by
        ``transform``: ``total_sum``, ``has_condition``, ``valid_count``,
        ``segment_stats``, ``global_stats`` and ``skewness``. Missing or
        falsy flags disable the corresponding output.
    """

    # Rows with fewer non-missing hourly values than this get NaN skewness.
    MIN_SKEW_COUNT = 5

    def __init__(self, feature_name, hour_cols, config):
        self.feature = feature_name
        self.hour_cols = hour_cols
        self.config = config  # Dict of which features to generate (from the table)

    def transform(self, df):
        """Return a DataFrame (indexed like ``df``) with the enabled summaries.

        Raises
        ------
        NotImplementedError
            If ``segment_stats`` or ``global_stats`` is enabled; those
            summaries are still placeholders.
        """
        values = df[self.hour_cols]
        result = pd.DataFrame(index=df.index)

        if self.config.get("total_sum"):
            result[f"{self.feature}_total_sum"] = values.sum(axis=1, skipna=True)

        if self.config.get("has_condition"):
            # 1 when at least one hourly value is strictly positive
            # (NaN compares as False and therefore never counts).
            result[f"{self.feature}_has_condition"] = (values > 0).any(axis=1).astype(int)

        if self.config.get("valid_count"):
            # Number of non-missing hourly values per row.
            result[f"{self.feature}_valid_count"] = values.count(axis=1)

        if self.config.get("segment_stats"):
            result = result.join(self._get_segment_stats(values))

        if self.config.get("global_stats"):
            result = result.join(self._get_global_stats(values))

        if self.config.get("skewness"):
            # Row-wise skewness, blanked out where too few values are present.
            enough_values = values.count(axis=1) >= self.MIN_SKEW_COUNT
            result[f"{self.feature}_skewness"] = values.skew(axis=1).where(enough_values)

        return result

    def _get_segment_stats(self, values):
        """Placeholder for morning/afternoon/evening/night segment statistics.

        Intended to return a DataFrame with segment-based mean, std, range, etc.
        Fails loudly instead of returning None into ``DataFrame.join``.
        """
        raise NotImplementedError("segment_stats is not implemented yet")

    def _get_global_stats(self, values):
        """Placeholder for whole-day statistics (mean, std, range, hour_max_jump, ...).

        Intended to return a DataFrame. Fails loudly instead of returning
        None into ``DataFrame.join``.
        """
        raise NotImplementedError("global_stats is not implemented yet")


In [None]:
# Condense every configured feature of the training frame, one output
# DataFrame per feature, collected in definition order.
condensed_dfs = []

for feature, hour_cols in feature_definitions.items():
    config = feature_config[feature]
    condenser = Condenser(feature, hour_cols, config)
    condensed_df = condenser.transform(train_df)
    condensed_dfs.append(condensed_df)

# Display the collected frames (the original misspelt name raised NameError).
condensed_dfs

In [20]:
# Show the row index of the training frame.
train_df.index

RangeIndex(start=0, stop=13132, step=1)

In [25]:
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
# train_df.head(500).T

In [None]:
# `from sklearn.<module> import ...` does not bind the name `sklearn`,
# so import the package itself before reading its version.
import sklearn
print(sklearn.__version__)

1.6.1


In [None]:
# Features

## Original features
# id: 순서
# station: 지상관측소 번호
# station_name: 지상관측소 이름
# date: 날짜(월-일)
# cloud_cover_n: 중하층운량(10분위)
# dew_point_n: 이슬점 온도(°C)
# humidity_n: 습도(%)
# local_pressure_n: 현지기압(hPa)
# min_cloud_height_n: 최저운고(100m)
# precipitation_n: 강수량(mm)
# sea_level_pressure_n: 해면기압(hPa)
# snow_depth_n: 적설(cm)
# sunshine_duration_n: 일조(hr)
# surface_temp_n: 지면온도(°C)
# vapor_pressure_n: 증기압(hPa)
# visibility_n: 시정(10m)
# wind_speed_n: 풍속(m/s)
# wind_direction_n: 풍향(°)
# climatology_temp:

## Added features
# year


## Utilizing features
# cloud_cover_n: 중하층운량(10분위)
# dew_point_n: 이슬점 온도(°C)
# humidity_n: 습도(%)
# local_pressure_n: 현지기압(hPa)
# min_cloud_height_n: 최저운고(100m)
# precipitation_n: 강수량(mm)
# sea_level_pressure_n: 해면기압(hPa)
# snow_depth_n: 적설(cm)
# sunshine_duration_n: 일조(hr)
# surface_temp_n: 지면온도(°C)
# vapor_pressure_n: 증기압(hPa)
# visibility_n: 시정(10m)
# wind_speed_n: 풍속(m/s)
# wind_direction_n: 풍향(°)
# climatology_temp:
# year
# is_weekend

# is_weekend, day, month, wind_direction convert
# features with nan convert
# station longitude, latitude


## Omit features
# id, station, station_name




In [None]:



def is_leap_year(year):
  """Return True when ``year`` is a leap year under the Gregorian rules."""
  # Century years are leap only when divisible by 400.
  if year % 400 == 0:
    return True
  if year % 100 == 0:
    return False
  return year % 4 == 0

def add_year_column(df, year = 2019):
  """Return a copy of ``df`` with a reconstructed 'year' column.

  Within each 'station', rows are read in their existing order. The counter
  starts at ``year - 1`` and is incremented on every row whose 'date' equals
  '01-01', so a station whose records begin on January 1st starts at ``year``.

  The years are computed with a per-station cumulative count that stays
  aligned to the original row index. Collecting them group by group and
  assigning the flat list back only worked when rows were already stored
  contiguously in sorted station order.

  Parameters:
    df: frame with 'station' and 'date' (MM-DD strings) columns.
    year: year assigned from a station's first '01-01' row onward.

  Returns:
    A new DataFrame; ``df`` itself is not modified.
  """
  year_added_df = df.copy()
  new_year_flags = (year_added_df['date'] == '01-01').astype(int)
  # Number of January-1st rows seen so far within each station.
  years_elapsed = new_year_flags.groupby(year_added_df['station'], sort=False).cumsum()
  year_added_df['year'] = (year - 1) + years_elapsed
  return year_added_df

def list_n_24_add(name):
  """Return the 24 hourly column names ``<name>_0`` through ``<name>_23``."""
  return [name + '_' + str(hour) for hour in range(24)]

# Attach the reconstructed year to both splits
year_train_df = add_year_column(train_df)
test_df = add_year_column(test_df)

# Hourly wind columns (direction and speed) are excluded from this feature set
wind_directions = list_n_24_add('wind_direction')
wind_speeds = list_n_24_add('wind_speed')

# Column order: fixed metadata columns first, then the rest in natural sort order
col_all = list(year_train_df.columns)
col_im = ['id', 'date', 'year', 'station', 'station_name', 'climatology_temp', 'target']
col_other = natsorted([c for c in col_all if c not in col_im])
edited_col = col_im + col_other

# Identifier columns removed from both splits before modelling
id_like_cols = ['id', 'date', 'station', 'station_name']
wind_cols = wind_directions + wind_speeds

# Training split: reorder, keep the target aside, then drop what the model must not see
year_train_df = year_train_df[edited_col].copy()
year_target_df = year_train_df['target'].copy()
year_train_df = year_train_df.drop(columns=id_like_cols + ['target'] + wind_cols)

# Test split: same ordering without the (absent) target column
test_col_order = [c for c in col_im if c != 'target'] + col_other
test_df = test_df[test_col_order].drop(columns=id_like_cols + wind_cols)

In [None]:
# Convert nan to separate column and set the value to -1
def nan_handle(df):
  """Return a copy of ``df`` with NaNs replaced by -1 plus indicator columns.

  For every column that contains at least one NaN, an integer column named
  ``<column>_is_nan`` (1 where the value was missing, else 0) is appended
  after the original columns, in the original column order.

  The input frame is not modified: filling in place would silently change
  the caller's data in addition to returning the new frame.
  """
  nan_mask = df.isna()
  # Keep indicators only for columns that actually have missing values.
  indicators = nan_mask.loc[:, nan_mask.any()].astype(int).add_suffix('_is_nan')
  return pd.concat([df.fillna(-1), indicators], axis = 1)

# Apply the NaN handling to both splits; each name is rebound to the returned frame.
year_train_df = nan_handle(year_train_df)
test_df = nan_handle(test_df)

In [None]:
# Sanity check: prints True if any NaN is left in the frame, False otherwise
print(year_train_df.isna().any().any())
print(test_df.isna().any().any())

False
False


In [None]:


# 70/30 hold-out split of the year-augmented training data
x_train, x_test, y_train, y_test = train_test_split(
    year_train_df, year_target_df, test_size=0.3, random_state=67
)

# Random forest with unrestricted depth, using all available cores
graaa = RandomForestRegressor(
    n_estimators=100, max_depth=None, random_state=67, n_jobs=-1
)
graaa.fit(x_train, y_train)
y_pred = graaa.predict(x_test)

# Hold-out metrics
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f'mse is {mse}')
print(f'r2 score is {r2}')

mse is 2.471875641788321
r2 score is 0.7160905189217839
