# Gradient Boosting Regression Model (Version 1)

Regression counterpart of Extreme Gradient Boosting (XGBoost), an optimised distributed gradient boosting library. The model in this notebook is built with the `xgboost.XGBRegressor` class; see also getml's [`XGBoostRegressor`](https://docs.getml.com/1.1.0/api/getml.predictors.XGBoostRegressor.html) documentation.

### Summary

| Techniques                     | Used / Description           |
| ------------------------------ | ---------------------------- |
| Handling Unknown Variables     | Drop Rows                    |
| Handling Categorical Variables | Drop Columns (Drop Features) |
| Handling Class Imbalance       | Not Applied                  |
| Handling Outliers              | Not Applied                  |

### Results

| Metric                 | Value   |
| ---------------------- | ------- |
| RMSE (Lower is better) | 0.92064 |
| R2 (Higher is better)  | 0.34699 |

### NOTE

If you run into unexpected errors, use the `xgboost-script.py` file to debug. For some reason, the error messages shown when running the Python file are more comprehensive than those shown in the Jupyter notebook.


### Preprocessing Stage

In [88]:
import numpy as np
import pandas as pd
import random

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score


In [89]:
# Load the pre-cleaned train / test splits (features and targets are stored
# in separate CSV files).
X_train, y_train = (
    pd.read_csv('../../cleaned-data/X_train.csv'),
    pd.read_csv('../../cleaned-data/y_train.csv'),
)
X_test, y_test = (
    pd.read_csv('../../cleaned-data/X_test.csv'),
    pd.read_csv('../../cleaned-data/y_test.csv'),
)

In [90]:
# Preview the first rows of the training features. Note that some of the
# land-cover / NDVI columns hold a '-' placeholder instead of a number
# (see row 3 in the output below).
X_train.head()

Unnamed: 0,latitude,longitude,land_use_label,distance_to_waterbody,distance_to_open_space,subzone,planning_area,region,elevation,temp_2024_04_07_min,...,built-up,bare / sparse vegetation,snow and ice,permanent water bodies,herbaceous wetland,mangroves,moss and lichen,min_ndvi,mean_ndvi,max_ndvi
0,1.327345,103.776261,ROAD,0.005491,0.000305,HOLLAND ROAD,BUKIT TIMAH,CENTRAL REGION,34,28.880736,...,128,1,0,1,0,0,0,0.1176063463,0.2107233339,0.3355351585
1,1.36231,103.885041,RESIDENTIAL,0.002163,0.002288,KOVAN,HOUGANG,NORTH-EAST REGION,14,33.603571,...,183,1,0,0,0,0,0,0.06873453002,0.1237388913,0.1772913102
2,1.304792,103.740678,BUSINESS 2,0.00166,0.001437,PENJURU CRESCENT,JURONG EAST,WEST REGION,10,28.880736,...,251,8,0,33,0,0,0,0.03399855502,0.07334574643,0.1149060753
3,1.432131,103.793028,ROAD,0.002688,0.002472,WOODLANDS SOUTH,WOODLANDS,NORTH REGION,32,30.168782,...,-,-,-,-,-,-,-,-,-,-
4,1.30353,103.820861,CIVIC & COMMUNITY INSTITUTION,0.011124,0.004127,RIDOUT,TANGLIN,CENTRAL REGION,17,30.168782,...,63,1,0,0,0,0,0,0.09017470784,0.2076336658,0.3255961435


In [91]:
# Attach the target to the feature frames so that the row filtering done
# further down removes the matching target values as well, keeping X and y
# aligned (no oversampling is applied in this version).
X_train = pd.concat((X_train, y_train), axis='columns')
X_test = pd.concat((X_test, y_test), axis='columns')

- Drop subzone, planning area and region columns
- Drop land use label (in this version categorical features are dropped rather than one-hot encoded)
- Drop temperature data, since they are not independent variables

In [92]:
# List every column name of the combined frame (features plus target)
X_train.columns

Index(['latitude', 'longitude', 'land_use_label', 'distance_to_waterbody',
       'distance_to_open_space', 'subzone', 'planning_area', 'region',
       'elevation', 'temp_2024_04_07_min', 'temp_2024_04_07_max',
       'temp_2024_04_07_median', 'temp_2024_04_08_min', 'temp_2024_04_08_max',
       'temp_2024_04_08_median', 'temp_2024_04_09_min', 'temp_2024_04_09_max',
       'temp_2024_04_09_median', 'temp_2024_04_10_min', 'temp_2024_04_10_max',
       'temp_2024_04_10_median', 'Total_x', 'HDB Total',
       'Condominiums & Other Apartments', 'Landed Properties_x',
       'Other Dwellings_x', 'Floor_below_60', 'Floor_60-80', 'Floor_80-100',
       'Floor_100-120', 'Floor_above_120', 'Below $1,000', '$1,000 - $1,999',
       '$2,000 - $2,999', '$3,000 - $3,999', '$4,000 - $4,999',
       '$5,000 - $5,999', '$6,000 - $6,999', '$7,000 - $7,999',
       '$8,000 - $8,999', '$9,000 - $9,999', '$10,000 - 10,999',
       '$11,000 - 11,999', '$12,000 - $14,999', '$15,000 & Over', 'tree cover',
 

In [93]:
# Columns excluded from this model version: the categorical land-use /
# location labels and the per-day temperature statistics.
columns_to_drop = [
    'land_use_label',
    'subzone',
    'planning_area',
    'region',
    'temp_2024_04_07_min', 'temp_2024_04_07_max', 'temp_2024_04_07_median',
    'temp_2024_04_08_min', 'temp_2024_04_08_max', 'temp_2024_04_08_median',
    'temp_2024_04_09_min', 'temp_2024_04_09_max', 'temp_2024_04_09_median',
    'temp_2024_04_10_min', 'temp_2024_04_10_max', 'temp_2024_04_10_median',
]

# Apply the same column removal to both splits
X_train = X_train.drop(columns_to_drop, axis='columns')
X_test = X_test.drop(columns_to_drop, axis='columns')

In [94]:
# Discard rows whose min_ndvi value is the '-' placeholder
X_train = X_train.loc[X_train['min_ndvi'].ne('-')]
X_test = X_test.loc[X_test['min_ndvi'].ne('-')]

In [95]:
# Separate the target column (avg_temp) from the features again, now that
# both have gone through the same row filtering.
y_train, X_train = X_train['avg_temp'], X_train.drop(columns='avg_temp')
y_test, X_test = X_test['avg_temp'], X_test.drop(columns='avg_temp')

## Model Training

In [96]:
# Inspect the training features after the column drops and '-' row removal
X_train

Unnamed: 0,latitude,longitude,distance_to_waterbody,distance_to_open_space,elevation,Total_x,HDB Total,Condominiums & Other Apartments,Landed Properties_x,Other Dwellings_x,...,built-up,bare / sparse vegetation,snow and ice,permanent water bodies,herbaceous wetland,mangroves,moss and lichen,min_ndvi,mean_ndvi,max_ndvi
0,1.327345,103.776261,0.005491,0.000305,34,10840,0,6540,4220,90,...,128,1,0,1,0,0,0,0.1176063463,0.2107233339,0.3355351585
1,1.362310,103.885041,0.002163,0.002288,14,24810,3870,8870,11770,290,...,183,1,0,0,0,0,0,0.06873453002,0.1237388913,0.1772913102
2,1.304792,103.740678,0.001660,0.001437,10,10,0,0,0,0,...,251,8,0,33,0,0,0,0.03399855502,0.07334574643,0.1149060753
4,1.303530,103.820861,0.011124,0.004127,17,1520,0,630,850,40,...,63,1,0,0,0,0,0,0.09017470784,0.2076336658,0.3255961435
5,1.355391,103.871259,0.000000,0.000513,30,24760,19330,4050,1250,130,...,94,1,0,1,0,0,0,0.06070118587,0.1198349719,0.173249596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
926,1.353420,103.741808,0.004249,0.000980,28,19820,19800,0,0,10,...,69,2,0,0,0,0,0,0.07285133006,0.1611562385,0.2541778707
928,1.307712,103.770770,0.003321,0.000543,13,16000,12400,2920,280,400,...,64,3,0,1,0,0,0,0.07318084436,0.1464278546,0.2249178424
929,1.400962,103.746179,0.004042,0.000303,32,40390,35710,4670,0,10,...,74,1,0,3,0,0,0,0.0651594473,0.1558659119,0.225600469
933,1.335135,103.887696,0.000849,0.001990,10,8150,4850,1810,1380,120,...,92,1,0,0,0,0,0,0.05514517186,0.1176944237,0.1701961006


In [97]:
def set_data_types(X_train):
    """Cast the land-cover and NDVI columns to numeric dtypes.

    The land-cover count columns are cast to ``int`` and the NDVI columns to
    ``float``; the 'snow and ice', 'mangroves' and 'moss and lichen' columns
    are removed. All other columns are passed through untouched.

    The input frame is NOT modified: a new DataFrame is returned, so the
    function can safely be applied to a frame that is still needed elsewhere.

    Args:
        X_train: Feature DataFrame (train or test split) containing all of
            the columns named below.

    Returns:
        A new DataFrame with the converted dtypes and without the three
        removed columns.

    Raises:
        KeyError: If any of the expected columns is missing.
        ValueError: If a value cannot be parsed as a number (for example a
            leftover '-' placeholder).
    """
    int_columns = [
        'tree cover',
        'grassland',
        'shrubland',
        'cropland',
        'built-up',
        'permanent water bodies',
        'herbaceous wetland',
        'bare / sparse vegetation',
    ]
    float_columns = ['min_ndvi', 'mean_ndvi', 'max_ndvi']
    # NOTE(review): these look constant (all zeros) in the previewed rows,
    # which is presumably why they are dropped; confirm against full data.
    unused_columns = ['snow and ice', 'mangroves', 'moss and lichen']

    dtype_by_column = {
        **dict.fromkeys(int_columns, 'int'),
        **dict.fromkeys(float_columns, 'float'),
    }
    # astype() with a mapping converts only the listed columns and returns a
    # new frame, leaving the caller's DataFrame as it was.
    return X_train.astype(dtype_by_column).drop(columns=unused_columns)

In [98]:
X_train = set_data_types(X_train)
X_test = set_data_types(X_test)

In [101]:
# Train an XGBoost regressor with the library's default hyperparameters
# (no arguments are passed to the constructor).
regressor = XGBRegressor()
regressor.fit(X_train, y_train)

In [102]:
# Score the fitted model on the test split
y_pred = regressor.predict(X_test)

# Root-mean-squared error (lower is better)
rmse = np.sqrt(np.square(y_test - y_pred).mean())
print(f"RMSE: {rmse}")

# Coefficient of determination (higher is better)
r2 = r2_score(y_test, y_pred)
print(f"R2: {r2}")

RMSE: 0.9206373408992247
R2: 0.3469938962044975
