In [80]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingRegressor

In [81]:
# Read the competition training and test tables from the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('Train.csv', 'Test.csv')
)

In [82]:
# Parse the 'date' column of both frames and expand it into integer
# calendar parts. The frames are modified in place, so later cells see the
# new 'year', 'month' and 'day' columns.
for data in (train_data, test_data):
    data['date'] = pd.to_datetime(data['date'])
    for date_part in ('year', 'month', 'day'):
        # Same as data['date'].dt.year / .dt.month / .dt.day, in that order.
        data[date_part] = getattr(data['date'].dt, date_part)

In [83]:
# Columns removed from the feature matrices before modelling.
drop_columns = [
    'id',
    'site_id',
    'date',
]

# Column groups handed to the ColumnTransformer further down:
# categorical columns are one-hot encoded, numerical columns are standardised.
categorical_features = [
    'city',
    'country',
]
numerical_features = [
    'year',
    'month',
    'day',
]

In [84]:
# Fill missing values in every numeric column with that column's own median,
# separately for the training and the test frame.
#
# The filled column is assigned back to the frame rather than calling
# `data[col].fillna(..., inplace=True)`: that form is a chained in-place
# operation on a column selection, which raises a FutureWarning on recent
# pandas and stops modifying the parent frame under Copy-on-Write.
#
# NOTE(review): the loop covers every numeric column, so a numeric target
# column containing nulls would be imputed as well — confirm that is intended.
for data in [train_data, test_data]:
    for col in data.select_dtypes(include=np.number).columns:
        if data[col].isnull().any():
            data[col] = data[col].fillna(data[col].median())


In [85]:
# Training target, and training features with the identifier/date columns
# and the target itself removed.
y_train = train_data['pm2_5']
train_exclusions = drop_columns + ['pm2_5']
X_train = train_data.drop(columns=train_exclusions)

# Keep the test ids for the submission file, then drop the same
# identifier/date columns from the test features.
ids_test = test_data['id']
X_test = test_data.drop(columns=drop_columns)

In [86]:
# Per-group transformers: one-hot encode the categorical columns (categories
# unseen during fit are ignored at transform time) and standardise the
# numerical columns.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# NOTE(review): no `remainder` argument is passed, so the ColumnTransformer
# default applies. Per the scikit-learn docs that default is 'drop', i.e.
# columns not listed in either feature group would not reach the models —
# confirm this is intended.
column_steps = [
    ('cat', categorical_transformer, categorical_features),
    ('num', numerical_transformer, numerical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [87]:
# Base regressors: a support vector regressor with library defaults and a
# LightGBM regressor with a fixed seed.
svr, lgbm = SVR(), LGBMRegressor(random_state=42)


In [88]:
# One pipeline per model: the preprocessing step followed by the regressor.
# The step names ('svr', 'lgbm') are the prefixes used by the search grids.
svr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('svr', svr),
])
lgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lgbm', lgbm),
])

In [89]:
# Candidate SVR hyper-parameters, addressed through the 'svr' pipeline step.
# 3 kernels x 4 values of C x 2 gamma settings = 24 combinations.
svr_param_grid = {
    'svr__kernel': ['rbf', 'poly', 'sigmoid'],  # kernel function
    'svr__C': [0.1, 1, 10, 100],                # regularisation parameter C
    'svr__gamma': ['scale', 'auto'],            # kernel coefficient setting
}

In [90]:
# Randomised hyper-parameter search for the SVR pipeline, scored by negated
# RMSE with 5-fold cross-validation on all available cores.
#
# Every entry of svr_param_grid is a finite list, so the search space is
# finite. Asking for more iterations than there are combinations only makes
# scikit-learn warn and fall back to the grid size, so n_iter is clamped to
# the number of combinations up front.
svr_grid_size = int(np.prod([len(values) for values in svr_param_grid.values()]))
svr_random_search = RandomizedSearchCV(
    svr_pipeline,
    svr_param_grid,
    n_iter=min(100, svr_grid_size),
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42,
)

In [91]:
# Run the SVR search on the training data and keep the best pipeline found
# (fit() returns the search object itself).
svr_best = svr_random_search.fit(X_train, y_train).best_estimator_



In [92]:
# Candidate LightGBM hyper-parameters, addressed through the 'lgbm' pipeline
# step. 3 x 3 x 3 = 27 combinations.
lgbm_param_grid = {
    'lgbm__num_leaves': [31, 63, 127],
    'lgbm__max_depth': [5, 10, 15],
    'lgbm__learning_rate': [0.01, 0.1, 0.2]
}

# Randomised search scored by negated RMSE with 5-fold cross-validation.
# All grid entries are finite lists, so n_iter is clamped to the number of
# combinations; requesting more only makes scikit-learn warn and fall back
# to the grid size.
lgbm_grid_size = int(np.prod([len(values) for values in lgbm_param_grid.values()]))
lgbm_random_search = RandomizedSearchCV(
    lgbm_pipeline,
    lgbm_param_grid,
    n_iter=min(100, lgbm_grid_size),
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42,
)


In [93]:
# Run the LightGBM search on the training data and keep the best pipeline
# found (fit() returns the search object itself).
lgbm_best = lgbm_random_search.fit(X_train, y_train).best_estimator_



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000044 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64
[LightGBM] [Info] Number of data points in the train set: 8071, number of used features: 11
[LightGBM] [Info] Start training from score 24.639296


In [94]:
# Combine the two tuned pipelines in a voting regressor.
ensemble_members = [
    ('svr', svr_best),
    ('lgbm', lgbm_best),
]
ensemble = VotingRegressor(ensemble_members)

In [95]:
# Fit the ensemble on the full training set.
ensemble.fit(X=X_train, y=y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000046 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64
[LightGBM] [Info] Number of data points in the train set: 8071, number of used features: 11
[LightGBM] [Info] Start training from score 24.639296


In [96]:
# Predict pm2_5 for the test features with the fitted ensemble.
predictions = ensemble.predict(X=X_test)

In [97]:
# Pair each test id with its predicted pm2_5 value.
submission_columns = {'id': ids_test, 'pm2_5': predictions}
predictions_df = pd.DataFrame(submission_columns)

# Write the submission file without the DataFrame index column.
predictions_df.to_csv('test_predictions_ensemble_claude.csv', index=False)