In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge

In [2]:
train_data = pd.read_csv('/kaggle/input/zindi-african-air-quality-prediction-challenge/Train.csv')
test_data = pd.read_csv('/kaggle/input/zindi-african-air-quality-prediction-challenge/Test.csv')

In [3]:
def add_date_features(df):
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['weekday'] = df['date'].dt.weekday

for data in [train_data, test_data]:
    add_date_features(data)

# Handle missing values
for data in [train_data, test_data]:
    for col in data.columns:
        if data[col].isnull().any():
            data[col].fillna(data[col].median(), inplace=True)

# Preprocessing
drop_columns = ['id', 'site_id', 'date']
categorical_features = ['city', 'country']
numerical_features = ['year', 'month', 'day', 'weekday']

X_train = train_data.drop(columns=drop_columns + ['pm2_5'])
y_train = train_data['pm2_5']
X_test = test_data.drop(columns=drop_columns)
ids_test = test_data['id']

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

In [4]:
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ])

In [5]:
svr = SVR()

svr_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('svr', svr)])

# Parameter tuning
svr_param_grid = {
    'svr__kernel': ['rbf'],
    'svr__C': [400],
    'svr__gamma': ['scale', 'auto']
}

In [8]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

param_grid = {
    'svr__C': Categorical([359.9463714999908]),
    'svr__gamma': Categorical([0.47732892632361296]),
    'svr__kernel': Categorical(['rbf'])
}

bayes_search = BayesSearchCV(svr_pipeline, param_grid, n_iter=32, scoring='neg_root_mean_squared_error', n_jobs=-1)
bayes_search.fit(X_train, y_train)



In [10]:
# Random search setup
# svr_random_search = RandomizedSearchCV(svr_pipeline, svr_param_grid, n_iter=50,
#                                       scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1, random_state=42)

# Training
# svr_random_search.fit(X_train, y_train)

# svr_best = svr_random_search.best_estimator_
svr_best = bayes_search.best_estimator_

# Stacking Ensemble
estimators = [('svr', svr_best)]
final_estimator = Ridge()
stacking_ensemble = StackingRegressor(estimators=estimators, final_estimator=final_estimator)
stacking_ensemble.fit(X_train, y_train)


In [11]:
# Blend model predictions
svr_pred = svr_best.predict(X_train)

# Evaluate blended model
svr_rmse = mean_squared_error(y_train, svr_pred, squared=False)
print("Blended RMSE on training set:", svr_rmse)

# Predictions on test set
final_svr_pred = svr_best.predict(X_test)

predictions_df = pd.DataFrame({
    'id': ids_test,
    'pm2_5': final_svr_pred
})

# Save the predictions to a CSV file
predictions_df.to_csv('/kaggle/working/test_predictions_proper_svr_bayes.csv', index=False)

Blended RMSE on training set: 22.04743326473132


In [12]:
svr_best 