In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.pipeline import Pipeline

In [10]:
# Read the training and test tables from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('Train.csv', 'Test.csv')
)

In [11]:
# Parse the 'date' column once per frame and derive calendar-part columns
# from it. The frames are modified in place so later cells see the new columns.
for data in (train_data, test_data):
    parsed_dates = pd.to_datetime(data['date'])
    data['date'] = parsed_dates
    for part in ('year', 'month', 'day'):
        data[part] = getattr(parsed_dates.dt, part)

In [12]:
# Columns removed from the feature matrices before modelling.
drop_columns = [
    'id',
    'site_id',
    'date',
]

# Columns routed to the categorical transformer of the preprocessor.
categorical_features = [
    'city',
    'country',
]

# Columns routed to the numerical transformer of the preprocessor
# (the calendar parts derived from 'date').
numerical_features = [
    'year',
    'month',
    'day',
]

In [13]:
# Median-impute missing values in every numeric column of both frames.
#
# Two deliberate choices:
#   * The fill statistic is learned from the training set and reused for the
#     test set, so both frames are transformed identically.
#   * The filled column is assigned back (`data[col] = ...`) instead of calling
#     `fillna(..., inplace=True)` on a column selection. The in-place form is
#     chained assignment: it warns on pandas 2.x and silently does nothing
#     under Copy-on-Write, which would leave the NaNs in place.
#
# NOTE(review): this loop touches every numeric column of train_data, which
# would include the target 'pm2_5' if it is numeric and has gaps - confirm
# that imputing the target is intended.
train_medians = train_data.select_dtypes(include=np.number).median()

for data in [train_data, test_data]:
    for col in data.select_dtypes(include=np.number).columns:
        if data[col].isnull().any():
            fill_value = train_medians.get(col)
            # Fall back to the frame's own median when the training set has no
            # usable statistic for this column (column absent or entirely NaN).
            if fill_value is None or pd.isna(fill_value):
                fill_value = data[col].median()
            data[col] = data[col].fillna(fill_value)

In [14]:
# Training set: take the target column out, then remove it together with the
# identifier/date columns to obtain the feature matrix.
y_train = train_data['pm2_5']
X_train = train_data.drop(columns=[*drop_columns, 'pm2_5'])

# Test set: keep the ids for the output files; features drop the same
# identifier/date columns.
ids_test = test_data['id']
X_test = test_data.drop(columns=drop_columns)

In [15]:
# One-hot encode the categorical columns; categories unseen during fit are
# ignored at transform time instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
# Standardise the numerical columns.
numerical_transformer = StandardScaler()

# (name, transformer, columns) triples consumed by the ColumnTransformer.
column_steps = [
    ('cat', categorical_transformer, categorical_features),
    ('num', numerical_transformer, numerical_features),
]

# NOTE(review): no `remainder` is passed, so scikit-learn's default applies
# (documented as 'drop'). Any column of X_train/X_test not named in
# categorical_features or numerical_features therefore never reaches the
# models - confirm this is intended.
preprocessor = ColumnTransformer(transformers=column_steps)

In [16]:
# Gradient-boosted tree regressor with a fixed seed, chained after the shared
# preprocessing step.
xgb = XGBRegressor(random_state=42)
xgb_steps = [('preprocessor', preprocessor), ('xgb', xgb)]
xgb_pipeline = Pipeline(steps=xgb_steps)

# Pipeline.fit returns the fitted pipeline, so fitting and predicting chain.
xgb_predictions = xgb_pipeline.fit(X_train, y_train).predict(X_test)

In [18]:
# Reshape the target into a single-column 2-D array before fitting TabNet.
# np.asarray accepts both the original pandas Series and an array that has
# already been reshaped, so this cell can be re-run without failing (the
# previous `y_train.values` raised AttributeError on a second run because
# y_train was by then an ndarray). y_train is still rebound to the 2-D array,
# as before.
y_train = np.asarray(y_train).reshape(-1, 1)

# TabNet regressor with a fixed seed, chained after the preprocessing step.
# `preprocessor` is the same object used by the XGBoost pipeline; fitting this
# pipeline fits it again on X_train.
tabnet = TabNetRegressor(seed=42)
tabnet_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('tabnet', tabnet)])
tabnet_pipeline.fit(X_train, y_train)
tabnet_predictions = tabnet_pipeline.predict(X_test)



epoch 0  | loss: 1128.46497|  0:00:00s
epoch 1  | loss: 1044.66548|  0:00:01s
epoch 2  | loss: 895.12332|  0:00:01s
epoch 3  | loss: 682.63427|  0:00:02s
epoch 4  | loss: 614.97234|  0:00:02s
epoch 5  | loss: 516.15947|  0:00:02s
epoch 6  | loss: 530.74181|  0:00:03s
epoch 7  | loss: 515.57323|  0:00:03s
epoch 8  | loss: 532.18896|  0:00:04s
epoch 9  | loss: 546.98962|  0:00:04s
epoch 10 | loss: 527.54711|  0:00:05s
epoch 11 | loss: 512.62007|  0:00:05s
epoch 12 | loss: 508.3286|  0:00:06s
epoch 13 | loss: 513.76862|  0:00:06s
epoch 14 | loss: 524.97698|  0:00:07s
epoch 15 | loss: 505.27636|  0:00:07s
epoch 16 | loss: 507.66772|  0:00:08s
epoch 17 | loss: 541.45958|  0:00:08s
epoch 18 | loss: 532.8063|  0:00:09s
epoch 19 | loss: 463.72716|  0:00:09s
epoch 20 | loss: 509.71211|  0:00:10s
epoch 21 | loss: 496.12171|  0:00:10s
epoch 22 | loss: 490.74509|  0:00:11s
epoch 23 | loss: 511.87831|  0:00:11s
epoch 24 | loss: 537.27157|  0:00:11s
epoch 25 | loss: 466.3627|  0:00:12s
epoch 26 | lo

In [21]:
# XGBoost predictions: pair each test id with its predicted pm2_5 and write
# the table to CSV without the index column.
xgb_predictions_df = pd.DataFrame(dict(id=ids_test, pm2_5=xgb_predictions))
xgb_predictions_df.to_csv('test_predictions_xgb_cld.csv', index=False)

# TabNet predictions: drop length-1 axes from the prediction array first,
# then build and write the table the same way.
tabnet_predictions = tabnet_predictions.squeeze()
tabnet_predictions_df = pd.DataFrame(dict(id=ids_test, pm2_5=tabnet_predictions))
tabnet_predictions_df.to_csv('test_predictions_tabnet_cld.csv', index=False)