In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, VotingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

In [2]:
# Read the training and test tables from CSV files in the working directory.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('Train.csv', 'Test.csv'))

In [3]:
# Calendar components pulled out of the parsed date, in the order the
# columns are written to each frame.
date_parts = ('year', 'month', 'day')

for data in (train_data, test_data):
    # Parse the raw date column, then expose each component as its own column.
    data['date'] = pd.to_datetime(data['date'])
    for part in date_parts:
        data[part] = getattr(data['date'].dt, part)

# Column groups referenced by later cells.
drop_columns = ['id', 'site_id', 'date']
categorical_features = ['city', 'country']
numerical_features = ['year', 'month', 'day']

In [4]:
# Fill missing values in every numeric column with that column's median.
# Each frame is imputed using its own medians.
for data in [train_data, test_data]:
    numeric_cols = data.select_dtypes(include=np.number).columns
    for col in numeric_cols:
        if data[col].isnull().any():
            # Assign the filled column back to the frame. Calling
            # `data[col].fillna(..., inplace=True)` operates on an intermediate
            # object: pandas 2.x emits a FutureWarning for it and, with
            # Copy-on-Write enabled, the parent frame is left unchanged.
            data[col] = data[col].fillna(data[col].median())

In [5]:
# Target and features for training: everything except the identifier/date
# columns and the target itself is kept as a feature.
y_train = train_data['pm2_5']
X_train = train_data.drop(columns=[*drop_columns, 'pm2_5'])

# Test features use the same drop list; the ids are kept aside for the submission file.
ids_test = test_data['id']
X_test = test_data.drop(columns=drop_columns)


In [6]:
# Per-type transformers: standardize numeric columns, one-hot encode
# categorical ones (handle_unknown='ignore' is passed so unseen categories
# do not raise at transform time).
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [7]:
# Route each column group to its transformer.
column_routes = [
    ('cat', categorical_transformer, categorical_features),
    ('num', numerical_transformer, numerical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [8]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [9]:
# Hyperparameter search spaces, one per model family. Each maps a
# constructor argument name to the list of candidate values to sample from.

# Support vector regressor.
svr_params = dict(
    C=[0.1, 1, 10, 100],
    kernel=['rbf', 'poly', 'sigmoid'],
    gamma=['scale', 'auto'],
)

# LightGBM.
lgbm_params = dict(
    num_leaves=[31, 63, 127],
    max_depth=[5, 10, 15],
    learning_rate=[0.01, 0.1, 0.2],
)

# Random forest.
rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# scikit-learn gradient boosting.
gbm_params = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 10],
)

# Extra trees.
et_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# XGBoost.
xgb_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 10],
    learning_rate=[0.01, 0.1, 0.2],
)

In [10]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

# Preprocessor for categorical and numerical data
categorical_features = ['city', 'country']  # adjust this based on your dataset
# Select every numeric dtype rather than only int64/float64, so narrower
# numeric columns (e.g. the int32 values that `.dt.year` / `.dt.day` return on
# pandas >= 2.0) are not silently left out of the scaled feature list.
numerical_features = X_train.select_dtypes(include=np.number).columns.tolist()

# One-hot encode categoricals (unknown categories ignored at transform time)
# and standardize numeric columns.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ])


In [11]:
# Remove the categorical columns from the training features.
# errors='ignore' keeps this cell re-runnable: X_train is reassigned from
# itself, so without it a second execution raises KeyError because the
# columns have already been dropped.
X_train = X_train.drop(columns=['city', 'country'], errors='ignore')
X_train.head()

Unnamed: 0,site_latitude,site_longitude,hour,sulphurdioxide_so2_column_number_density,sulphurdioxide_so2_column_number_density_amf,sulphurdioxide_so2_slant_column_number_density,sulphurdioxide_cloud_fraction,sulphurdioxide_sensor_azimuth_angle,sulphurdioxide_sensor_zenith_angle,sulphurdioxide_solar_azimuth_angle,...,cloud_cloud_base_pressure,cloud_cloud_base_height,cloud_cloud_optical_depth,cloud_surface_albedo,cloud_sensor_azimuth_angle,cloud_sensor_zenith_angle,cloud_solar_azimuth_angle,cloud_solar_zenith_angle,year,day
0,6.53257,3.39936,13,1.2e-05,0.669632,7e-06,0.123876,71.65316,42.564364,-95.848477,...,60432.792969,4389.787844,8.752905,0.257323,-97.477511,49.187592,-74.597511,29.002745,2023,25
1,6.53257,3.39936,12,1.2e-05,0.669632,7e-06,0.123876,71.65316,42.564364,-95.848477,...,60432.792969,4389.787844,8.752905,0.257323,-97.477511,49.187592,-74.597511,29.002745,2023,2
2,6.53257,3.39936,13,1.2e-05,0.669632,7e-06,0.123876,71.65316,42.564364,-95.848477,...,51171.802486,5791.682829,11.816715,0.192757,-96.41189,61.045123,-121.307414,41.898269,2023,3
3,6.53257,3.39936,14,1.2e-05,0.669632,7e-06,0.123876,71.65316,42.564364,-95.848477,...,60432.792969,4389.787844,8.752905,0.257323,-97.477511,49.187592,-74.597511,29.002745,2023,8
4,6.53257,3.39936,13,0.000267,0.774656,0.000207,0.223403,-97.811241,49.513344,-126.064468,...,96215.90625,451.050598,10.521009,0.153114,-97.811241,49.513439,-126.064453,40.167355,2023,9


In [12]:
# Remove the categorical columns from the test features.
# errors='ignore' keeps this cell re-runnable: X_test is reassigned from
# itself, so without it a second execution raises KeyError because the
# columns have already been dropped.
X_test = X_test.drop(columns=['city', 'country'], errors='ignore')
X_test.head()

Unnamed: 0,site_latitude,site_longitude,hour,sulphurdioxide_so2_column_number_density,sulphurdioxide_so2_column_number_density_amf,sulphurdioxide_so2_slant_column_number_density,sulphurdioxide_cloud_fraction,sulphurdioxide_sensor_azimuth_angle,sulphurdioxide_sensor_zenith_angle,sulphurdioxide_solar_azimuth_angle,...,cloud_cloud_base_pressure,cloud_cloud_base_height,cloud_cloud_optical_depth,cloud_surface_albedo,cloud_sensor_azimuth_angle,cloud_sensor_zenith_angle,cloud_solar_azimuth_angle,cloud_solar_zenith_angle,year,day
0,5.61252,-0.22955,13,-7.2e-05,0.762543,-5.5e-05,0.079645,-100.330299,26.92642,-86.879776,...,83569.504246,1710.544483,3.063105,0.263193,-100.317077,27.059646,-86.88567,25.530511,2023,6
1,5.61252,-0.22955,13,-3e-06,0.633956,-2e-06,0.072231,70.071861,41.864498,-71.3769,...,59878.560547,4485.485268,6.99682,0.263927,-97.247214,53.075366,-68.727708,29.742252,2023,7
2,5.61252,-0.22955,13,-5.1e-05,1.004265,-5.1e-05,0.16316,73.117264,43.112466,-89.089083,...,59878.560547,4485.485268,6.99682,0.263927,-97.247214,53.075366,-68.727708,29.742252,2023,8
3,5.61252,-0.22955,12,-3e-06,0.633956,-2e-06,0.072231,70.071861,41.864498,-71.3769,...,52160.980469,5585.034668,29.145922,0.314945,70.680077,61.874222,-90.875603,11.865201,2023,9
4,5.61252,-0.22955,12,-0.000634,0.632173,-0.000401,0.0,70.066956,66.014107,-111.396515,...,59878.560547,4485.485268,6.99682,0.263927,-97.247214,53.075366,-68.727708,29.742252,2023,20


In [13]:
def train_tune_model(model, params, X_train, y_train, n_iter=10, cv=5, random_state=42):
    """Tune ``model`` with a randomized hyperparameter search and return the best fit.

    Parameters
    ----------
    model : estimator
        Unfitted estimator to tune.
    params : dict
        Mapping of parameter name to the candidate values to sample from.
    X_train, y_train
        Training features and target passed to the search's ``fit``.
    n_iter : int, default 10
        Number of parameter settings sampled (10 is also what the search
        used before this argument was exposed).
    cv : int, default 5
        Number of cross-validation folds.
    random_state : int, default 42
        Seed for the parameter sampling.

    Returns
    -------
    estimator
        The search's ``best_estimator_``, selected by negated RMSE.
    """
    search = RandomizedSearchCV(
        model,
        params,
        n_iter=n_iter,
        cv=cv,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,  # use all available cores
        random_state=random_state,
        verbose=1,
    )
    search.fit(X_train, y_train)
    print(f"Best parameters for {model.__class__.__name__}: {search.best_params_}")
    return search.best_estimator_

In [None]:
# SVR is sensitive to feature scale, and the features here span many orders of
# magnitude (column densities around 1e-5 next to pressures around 1e5), so it
# is tuned inside a pipeline that standardizes the inputs first.
svr_pipeline = Pipeline([('scaler', StandardScaler()), ('svr', SVR())])
# Pipeline hyperparameters are addressed as '<step name>__<parameter>'.
svr_pipeline_params = {f'svr__{name}': values for name, values in svr_params.items()}
svr_best = train_tune_model(svr_pipeline, svr_pipeline_params, X_train, y_train)

# The remaining models are tuned directly on the unscaled features.
lgbm_best = train_tune_model(LGBMRegressor(random_state=42), lgbm_params, X_train, y_train)
rf_best = train_tune_model(RandomForestRegressor(random_state=42), rf_params, X_train, y_train)
gbm_best = train_tune_model(GradientBoostingRegressor(random_state=42), gbm_params, X_train, y_train)
et_best = train_tune_model(ExtraTreesRegressor(random_state=42), et_params, X_train, y_train)
xgb_best = train_tune_model(XGBRegressor(random_state=42), xgb_params, X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
from sklearn.ensemble import VotingRegressor
# Combine the six tuned regressors into one voting ensemble (no weights are
# passed). The dict keys become the estimator names.
tuned_models = {
    'svr': svr_best,
    'lgbm': lgbm_best,
    'rf': rf_best,
    'gbm': gbm_best,
    'et': et_best,
    'xgb': xgb_best,
}
ensemble = VotingRegressor(estimators=list(tuned_models.items()))

In [None]:
# Fit the voting ensemble on the full training set. The call is left as the
# cell's last expression, so its return value is what the notebook displays.
ensemble.fit(X_train, y_train)

In [None]:
# Predict on the test features, then pair each prediction with its row id
# in the two-column layout written to the submission file.
predictions = ensemble.predict(X_test)
predictions_df = ids_test.to_frame(name='id').assign(pm2_5=predictions)

In [None]:
# Write the id/prediction table to disk without the DataFrame index.
submission_path = 'submission_chat_ensemble_1_optimized.csv'
predictions_df.to_csv(submission_path, index=False)