In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.linear_model import Ridge, Lasso

# Read the train/test CSVs and derive calendar columns from 'date'
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')

for frame in (train_data, test_data):
    # Parse the date column once and reuse the parsed series for each component.
    timestamps = pd.to_datetime(frame['date'])
    frame['date'] = timestamps
    frame['year'] = timestamps.dt.year
    frame['month'] = timestamps.dt.month
    frame['day'] = timestamps.dt.day

# Column roles used by the rest of the script.
# NOTE(review): only `categorical_features` and `numerical_features` are routed
# through the ColumnTransformer built further down; it is constructed without a
# `remainder` argument, so every other column (including the ones imputed
# below) is dropped before reaching the models. Confirm that is intended.
drop_columns = ['id', 'site_id', 'date']
categorical_features = ['city', 'country']
numerical_features = ['year', 'month', 'day']

# Median-impute missing values in every numeric column, separately per frame.
for data in [train_data, test_data]:
    for col in data.select_dtypes(include=np.number).columns:
        if data[col].isnull().any():
            # Assign the filled column back instead of calling
            # `data[col].fillna(..., inplace=True)`: that form mutates a
            # temporary under pandas copy-on-write and leaves `data` unchanged
            # (and raises a FutureWarning on pandas 2.x).
            data[col] = data[col].fillna(data[col].median())

# Separate features from the target and keep the test ids for the submission.
X_train = train_data.drop(columns=drop_columns + ['pm2_5'])
y_train = train_data['pm2_5']
X_test = test_data.drop(columns=drop_columns)
ids_test = test_data['id']

# Preprocessing: one-hot encode the categorical columns, standardise the numeric ones
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features),
    ('num', numerical_transformer, numerical_features),
])

# Base regressors (tree models are seeded for reproducibility)
svr = SVR()
lgbm = LGBMRegressor(random_state=42)
xgb = XGBRegressor(random_state=42)
ridge = Ridge()
lasso = Lasso()


def _with_preprocessing(step_name, model):
    """Return a Pipeline that runs `preprocessor` and then `model` under `step_name`."""
    return Pipeline(steps=[('preprocessor', preprocessor), (step_name, model)])


# One pipeline per regressor; the step name is the prefix used by the
# hyperparameter grids (e.g. 'svr__C').
svr_pipeline = _with_preprocessing('svr', svr)
lgbm_pipeline = _with_preprocessing('lgbm', lgbm)
xgb_pipeline = _with_preprocessing('xgb', xgb)
ridge_pipeline = _with_preprocessing('ridge', ridge)
lasso_pipeline = _with_preprocessing('lasso', lasso)

# Hyperparameter search spaces.
# Keys follow the '<pipeline step>__<parameter>' convention so they address the
# estimator step inside each pipeline.
svr_param_grid = dict(
    svr__kernel=['rbf', 'poly', 'sigmoid'],
    svr__C=[0.1, 1, 10, 100],
    svr__gamma=['scale', 'auto'],
    svr__epsilon=[0.01, 0.1, 0.5],
)

lgbm_param_grid = dict(
    lgbm__num_leaves=[31, 63, 127],
    lgbm__max_depth=[5, 10, 15],
    lgbm__learning_rate=[0.01, 0.1, 0.2],
    lgbm__n_estimators=[100, 200, 300],
)

xgb_param_grid = dict(
    xgb__max_depth=[3, 5, 7],
    xgb__n_estimators=[100, 200, 300],
    xgb__learning_rate=[0.01, 0.1, 0.2],
)

# Regularisation strength for the two linear models.
ridge_param_grid = dict(ridge__alpha=[0.1, 1, 10])

lasso_param_grid = dict(lasso__alpha=[0.01, 0.1, 1])

# Random search with cross-validation
def _build_random_search(pipeline, param_grid, max_iter=100):
    """Return an unfitted RandomizedSearchCV for `pipeline` over `param_grid`.

    `param_grid` maps parameter names to lists of candidate values. The number
    of sampled candidates is capped at the number of distinct combinations in
    the grid: asking for more than exist makes scikit-learn emit a UserWarning
    and truncate to the grid size anyway, so the cap keeps the evaluated
    candidates the same while making the actual search size explicit.
    """
    grid_size = 1
    for candidates in param_grid.values():
        grid_size *= len(candidates)

    return RandomizedSearchCV(
        pipeline,
        param_grid,
        n_iter=min(max_iter, grid_size),
        scoring='neg_root_mean_squared_error',  # higher (closer to 0) is better
        cv=5,
        n_jobs=-1,
        random_state=42,
    )


svr_random_search = _build_random_search(svr_pipeline, svr_param_grid)
lgbm_random_search = _build_random_search(lgbm_pipeline, lgbm_param_grid)
xgb_random_search = _build_random_search(xgb_pipeline, xgb_param_grid)
ridge_random_search = _build_random_search(ridge_pipeline, ridge_param_grid)
lasso_random_search = _build_random_search(lasso_pipeline, lasso_param_grid)

# Run each search on the training data and keep the best pipeline it found.
# `fit` returns the search object itself, so the best estimator can be read
# off the same expression.
svr_best = svr_random_search.fit(X_train, y_train).best_estimator_

lgbm_best = lgbm_random_search.fit(X_train, y_train).best_estimator_

xgb_best = xgb_random_search.fit(X_train, y_train).best_estimator_

ridge_best = ridge_random_search.fit(X_train, y_train).best_estimator_

lasso_best = lasso_random_search.fit(X_train, y_train).best_estimator_

# Ensemble models: both ensembles combine the same five tuned pipelines
base_estimators = [
    ('svr', svr_best),
    ('lgbm', lgbm_best),
    ('xgb', xgb_best),
    ('ridge', ridge_best),
    ('lasso', lasso_best),
]

# Each ensemble receives its own copy of the (name, estimator) list.
voting_ensemble = VotingRegressor(estimators=list(base_estimators))

stacking_ensemble = StackingRegressor(
    estimators=list(base_estimators),
    final_estimator=Ridge(),
)

# Train both ensembles on the full training set
voting_ensemble.fit(X_train, y_train)
stacking_ensemble.fit(X_train, y_train)

# Predict on the test features with each ensemble
voting_predictions = voting_ensemble.predict(X_test)
stacking_predictions = stacking_ensemble.predict(X_test)

# Write the submission file: one row per test id with the voting ensemble's
# prediction (the stacking predictions are not written here).
submission_columns = {'id': ids_test, 'pm2_5': voting_predictions}
predictions_df = pd.DataFrame(submission_columns)
predictions_df.to_csv(
    'test_predictions_ensemble_optimized_claude_full.csv',
    index=False,
)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000132 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64
[LightGBM] [Info] Number of data points in the train set: 8071, number of used features: 11
[LightGBM] [Info] Start training from score 24.639296




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000097 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64
[LightGBM] [Info] Number of data points in the train set: 8071, number of used features: 11
[LightGBM] [Info] Start training from score 24.639296
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000038 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64
[LightGBM] [Info] Number of data points in the train set: 8071, number of used features: 11
[LightGBM] [Info] Start training from score 24.639296
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y