In [1]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
# Load the training and test tables from CSV files (paths are relative to the working directory)
df, df1 = (pd.read_csv(csv_name) for csv_name in ('Train.csv', 'Test.csv'))

In [3]:
# Parse the 'date' column of both frames as datetimes, then derive
# separate integer 'year', 'month' and 'day' columns from it
for data in (df, df1):
    parsed_dates = pd.to_datetime(data['date'])
    data['date'] = parsed_dates
    for part in ('year', 'month', 'day'):
        data[part] = getattr(parsed_dates.dt, part)

In [4]:
# Columns that are removed when the feature matrices are built
drop_columns = [
    'id',
    'site_id',
    'date',
    'city',
    'country',
]

In [5]:
# Fill missing values in every numeric column with that column's median.
# Each frame is imputed from its own medians.
#
# The filled column is assigned back to the frame instead of calling
# `data[col].fillna(..., inplace=True)`: that form mutates an intermediate
# Series obtained by chained selection, which emits a FutureWarning on
# pandas 2.x and leaves the frame unchanged once Copy-on-Write is active.
for data in [df, df1]:
    for col in data.select_dtypes(include=np.number).columns:
        if data[col].isnull().any():
            data[col] = data[col].fillna(data[col].median())

In [6]:
# Build the model inputs: target and features from the training frame,
# ids and features from the test frame
y_train = df['pm2_5']
X_train = df.drop(columns=[*drop_columns, 'pm2_5'])

ids_test = df1['id']
X_test = df1.drop(columns=drop_columns)

In [7]:
# Configure a gradient boosting regressor and fit it on the training set
gbm_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 3,
    'random_state': 42,  # fixed seed for reproducible fits
}
gbm = GradientBoostingRegressor(**gbm_params)
gbm.fit(X_train, y_train)

In [8]:
# Predict the target for every row of the test feature matrix
pred = gbm.predict(X_test)

# Pair each test id with its predicted pm2_5 value
submission = ids_test.to_frame(name='id').assign(pm2_5=pred)

# Write the submission table to CSV, omitting the index column
submission.to_csv('submission_gbm.csv', index=False)
