### Import libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


### Load data

In [13]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
sample_submission = pd.read_csv("../data/sample_submission.csv")

print(train.shape, test.shape, sample_submission.shape)
train.head()


(30676, 12) (13148, 11) (13148, 2)


Unnamed: 0,No,DEWP,TEMP,PRES,Iws,Is,Ir,datetime,cbwd_NW,cbwd_SE,cbwd_cv,pm2.5
0,1,-1.580878,-1.92225,0.443328,-0.441894,-0.069353,-0.137667,2010-01-01 00:00:00,1.448138,-0.732019,-0.522096,
1,2,-1.580878,-2.004228,0.345943,-0.379306,-0.069353,-0.137667,2010-01-01 01:00:00,1.448138,-0.732019,-0.522096,
2,3,-1.580878,-1.92225,0.248559,-0.343514,-0.069353,-0.137667,2010-01-01 02:00:00,1.448138,-0.732019,-0.522096,
3,4,-1.580878,-2.168183,0.248559,-0.280926,-0.069353,-0.137667,2010-01-01 03:00:00,1.448138,-0.732019,-0.522096,
4,5,-1.511594,-2.004228,0.151174,-0.218339,-0.069353,-0.137667,2010-01-01 04:00:00,1.448138,-0.732019,-0.522096,


### Preprocess data

In [14]:
# Convert datetime
train['datetime'] = pd.to_datetime(train['datetime'])
test['datetime'] = pd.to_datetime(test['datetime'])


for df in [train, test]:
    df['year'] = df['datetime'].dt.year
    df['month'] = df['datetime'].dt.month
    df['day'] = df['datetime'].dt.day
    df['hour'] = df['datetime'].dt.hour
    df['dayofweek'] = df['datetime'].dt.dayofweek


In [16]:
print(train.columns)


Index(['No', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'datetime', 'cbwd_NW',
       'cbwd_SE', 'cbwd_cv', 'pm2.5', 'year', 'month', 'day', 'hour',
       'dayofweek'],
      dtype='object')


### Define features and target

In [20]:
from sklearn.model_selection import train_test_split

train = train.dropna(subset=['pm2.5'])

X = train.drop(columns=['pm2.5', 'datetime', 'No'], errors='ignore')
y = train['pm2.5']

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Shapes:")
print("X_train:", X_train.shape, "X_valid:", X_valid.shape)
print("y_train:", y_train.shape, "y_valid:", y_valid.shape)


Shapes:
X_train: (23004, 14) X_valid: (5751, 14)
y_train: (23004,) y_valid: (5751,)


### Train a Baseline Model (Linear Regression)

In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

lr_model = LinearRegression()

lr_model.fit(X_train, y_train)

y_pred = lr_model.predict(X_valid)

mse = mean_squared_error(y_valid, y_pred)
rmse = np.sqrt(mse)

print("Linear Regression RMSE:", rmse)


Linear Regression RMSE: 77.76808734390757


### Try a More Powerful Model (Random Forest)

In [22]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_valid)

mse_rf = mean_squared_error(y_valid, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

print("Random Forest RMSE:", rmse_rf)


Random Forest RMSE: 34.22424152503358


### Train Final Model on All Data

In [24]:

final_model = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

final_model.fit(X, y)

print("Final model trained on full dataset.")


Final model trained on full dataset.


### Make Predictions on Test Set

In [25]:
X_test = test.drop(columns=['datetime', 'No'], errors='ignore')


test_predictions = final_model.predict(X_test)

print("Predictions ready. Shape:", test_predictions.shape)


Predictions ready. Shape: (13148,)


### Create Submission File

In [26]:
submission = sample_submission.copy()
submission['pm2.5'] = test_predictions

submission.to_csv("submission.csv", index=False)

print("✅ Submission file saved as submission.csv")
submission.head()


✅ Submission file saved as submission.csv


Unnamed: 0,row ID,pm2.5
0,2013-07-02 4:00:00,27.55
1,2013-07-02 5:00:00,24.86
2,2013-07-02 6:00:00,22.465
3,2013-07-02 7:00:00,17.525
4,2013-07-02 8:00:00,20.51
