In [14]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler


In [5]:
df = pd.read_csv('../../Data/Mendalay.csv')
df.head()

Unnamed: 0,Timestamp,PM2.5 [ug/m3],PM10[ug/m3],Ozone [ppb],Carbon_Monoxide [ppb],Temperature [°C],Relative_Humidity [%]
0,31/01/2022 12:00:00,13.181818,14.181818,456.090909,371.909091,20.413636,28.798182
1,31/01/2022 13:00:00,16.333333,17.333333,78.833333,405.75,22.851667,22.753333
2,31/01/2022 14:00:00,25.48,26.893333,44.28,407.946667,24.910667,19.626133
3,31/01/2022 15:00:00,26.701058,28.343915,39.462963,405.02381,25.443915,19.002831
4,31/01/2022 16:00:00,32.166667,34.166667,33.916667,399.416667,23.1525,22.096667


In [6]:
df = df.drop(columns=['Timestamp'])

df.rename(columns={
    'PM2.5 [ug/m3]': 'PM25', 
    'PM10[ug/m3]': 'PM10', 
    'Ozone [ppb]': 'Ozone', 
    'Carbon_Monoxide [ppb]': 'CO', 
    'Temperature [°C]': 'Temp', 
    'Relative_Humidity [%]': 'Humidity'
}, inplace=True)

In [7]:
df.head()

Unnamed: 0,PM25,PM10,Ozone,CO,Temp,Humidity
0,13.181818,14.181818,456.090909,371.909091,20.413636,28.798182
1,16.333333,17.333333,78.833333,405.75,22.851667,22.753333
2,25.48,26.893333,44.28,407.946667,24.910667,19.626133
3,26.701058,28.343915,39.462963,405.02381,25.443915,19.002831
4,32.166667,34.166667,33.916667,399.416667,23.1525,22.096667


In [8]:
# features for AQI calculation
features = ['PM25', 'PM10', 'Ozone', 'CO', 'Temp', 'Humidity']

scaler = MinMaxScaler()
df[features] = scaler.fit_transform(df[features])

weights = {'PM25': 0.4, 'PM10': 0.3, 'Ozone': 0.2, 'CO': 0.1}

df['AQI'] = (
    df['PM25'] * weights['PM25'] +
    df['PM10'] * weights['PM10'] +
    df['Ozone'] * weights['Ozone'] +
    df['CO'] * weights['CO']
)

threshold = 0.5
df['AQ_class'] = df['AQI'].apply(lambda x: 'Good' if x <= threshold else 'Bad')

In [9]:
df.head()

Unnamed: 0,PM25,PM10,Ozone,CO,Temp,Humidity,AQI,AQ_class
0,2.636096e-08,0.000259,0.475653,0.098891,0.51187,0.263175,0.105098,Good
1,3.266335e-08,0.000321,0.067292,0.138074,0.595577,0.200532,0.027362,Good
2,5.095482e-08,0.000509,0.02989,0.140618,0.66627,0.168125,0.020193,Good
3,5.339669e-08,0.000538,0.024676,0.137233,0.684578,0.161666,0.01882,Good
4,6.432679e-08,0.000653,0.018672,0.130741,0.605905,0.193727,0.017004,Good


In [10]:
X_humidity = df[['Temp', 'PM25', 'PM10', 'Ozone', 'CO']]
y_humidity = df['Humidity']

X_temp = df[['Humidity', 'PM25', 'PM10', 'Ozone', 'CO']]
y_temp = df['Temp']

X_train_humidity, X_test_humidity, y_train_humidity, y_test_humidity = train_test_split(X_humidity, y_humidity, test_size=0.2, random_state=42)
X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

In [11]:
# Initialize XGBoost regressor models
model_humidity = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, learning_rate=0.01, max_depth=5, subsample=0.8)
model_temp = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, learning_rate=0.01, max_depth=5, subsample=0.8)

# Fit the models
model_humidity.fit(X_train_humidity, y_train_humidity, eval_set=[(X_test_humidity, y_test_humidity)], verbose=True)
model_temp.fit(X_train_temp, y_train_temp, eval_set=[(X_test_temp, y_test_temp)], verbose=True)

# Predict with the models
y_pred_humidity = model_humidity.predict(X_test_humidity)
y_pred_temp = model_temp.predict(X_test_temp)

[0]	validation_0-rmse:0.23097
[1]	validation_0-rmse:0.22944
[2]	validation_0-rmse:0.22795
[3]	validation_0-rmse:0.22647
[4]	validation_0-rmse:0.22502
[5]	validation_0-rmse:0.22358
[6]	validation_0-rmse:0.22217
[7]	validation_0-rmse:0.22076
[8]	validation_0-rmse:0.21939
[9]	validation_0-rmse:0.21802
[10]	validation_0-rmse:0.21669
[11]	validation_0-rmse:0.21536
[12]	validation_0-rmse:0.21406
[13]	validation_0-rmse:0.21276
[14]	validation_0-rmse:0.21148
[15]	validation_0-rmse:0.21022
[16]	validation_0-rmse:0.20898
[17]	validation_0-rmse:0.20776
[18]	validation_0-rmse:0.20656
[19]	validation_0-rmse:0.20535
[20]	validation_0-rmse:0.20417
[21]	validation_0-rmse:0.20301
[22]	validation_0-rmse:0.20185
[23]	validation_0-rmse:0.20073
[24]	validation_0-rmse:0.19962
[25]	validation_0-rmse:0.19853
[26]	validation_0-rmse:0.19745
[27]	validation_0-rmse:0.19637
[28]	validation_0-rmse:0.19532
[29]	validation_0-rmse:0.19428
[30]	validation_0-rmse:0.19325
[31]	validation_0-rmse:0.19224
[32]	validation_0-

In [15]:
# Evaluate the models
mae_humidity = mean_absolute_error(y_test_humidity, y_pred_humidity)
mse_humidity = mean_squared_error(y_test_humidity, y_pred_humidity)
r2_humidity = r2_score(y_test_humidity, y_pred_humidity)

mae_temp = mean_absolute_error(y_test_temp, y_pred_temp)
mse_temp = mean_squared_error(y_test_temp, y_pred_temp)
r2_temp = r2_score(y_test_temp, y_pred_temp)

# Print results
print(f"MAE for Humidity Prediction: {mae_humidity}")
print(f"MSE for Humidity Prediction: {mse_humidity}")
print(f"R² for Humidity Prediction: {r2_humidity}")

print(f"MAE for Temperature Prediction: {mae_temp}")
print(f"MSE for Temperature Prediction: {mse_temp}")
print(f"R² for Temperature Prediction: {r2_temp}")

# Print the first few rows of the updated dataframe with AQI and AQ_Class
print(df[['Temp', 'Humidity', 'AQI', 'AQ_class']].head())

MAE for Humidity Prediction: 0.08976076555000341
MSE for Humidity Prediction: 0.014433788044333155
R² for Humidity Prediction: 0.732955737379768
MAE for Temperature Prediction: 0.06209208130212408
MSE for Temperature Prediction: 0.006625067243971364
R² for Temperature Prediction: 0.7678539661908974
       Temp  Humidity       AQI AQ_class
0  0.511870  0.263175  0.105098     Good
1  0.595577  0.200532  0.027362     Good
2  0.666270  0.168125  0.020193     Good
3  0.684578  0.161666  0.018820     Good
4  0.605905  0.193727  0.017004     Good


In [13]:
# Save the models in native XGBoost format
model_humidity.save_model('model_humidity.xgb')
model_temp.save_model('model_temp.xgb')

  self.get_booster().save_model(fname)
  self.get_booster().save_model(fname)
