# Multilayer Perceptron Regression Model (Version 1)

Regression with MLP using ScikitLearn's MLP Regressor. Feel free to adapt to using PyTorch/TensorFlow multilayer perceptron if deemed suitable.

Disclaimer: As a _suay_ kid, I can't do hyperparameter tuning, so this is just a baseline implementation with Optuna for automated hyperparameter search.

### Summary

| Techniques                     | Used / Description           |
| ------------------------------ | ---------------------------- |
| Handling Unknown Variables     | Drop Rows                    |
| Handling Categorical Variables | Drop Columns (Drop Features) |
| Handling Class Imbalance       | Not Applied                  |
| Handling Outliers              | Not Applied                  |

### Results

| Metric                 | Value  |
| ---------------------- | ------ |
| RMSE (Lower is better) | 0.8562 |
| R2 (Higher is better)  | 0.4351 |


### Preprocessing Stage

In [2]:
import optuna
import numpy as np
import pandas as pd

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
X_train = pd.read_csv('./data/X_train.csv')
y_train = pd.read_csv('./data/y_train.csv')

X_test = pd.read_csv('./data/X_test.csv')
y_test = pd.read_csv('./data/y_test.csv')

In [4]:
X_train.head()

Unnamed: 0,latitude,longitude,land_use_label,distance_to_waterbody,distance_to_open_space,subzone,planning_area,region,elevation,temp_2024_04_07_min,...,built-up,bare / sparse vegetation,snow and ice,permanent water bodies,herbaceous wetland,mangroves,moss and lichen,min_ndvi,mean_ndvi,max_ndvi
0,1.327345,103.776261,ROAD,0.005491,0.000305,HOLLAND ROAD,BUKIT TIMAH,CENTRAL REGION,34,28.880736,...,128,1,0,1,0,0,0,0.1176063463,0.2107233339,0.3355351585
1,1.36231,103.885041,RESIDENTIAL,0.002163,0.002288,KOVAN,HOUGANG,NORTH-EAST REGION,14,33.603571,...,183,1,0,0,0,0,0,0.06873453002,0.1237388913,0.1772913102
2,1.304792,103.740678,BUSINESS 2,0.00166,0.001437,PENJURU CRESCENT,JURONG EAST,WEST REGION,10,28.880736,...,251,8,0,33,0,0,0,0.03399855502,0.07334574643,0.1149060753
3,1.432131,103.793028,ROAD,0.002688,0.002472,WOODLANDS SOUTH,WOODLANDS,NORTH REGION,32,30.168782,...,-,-,-,-,-,-,-,-,-,-
4,1.30353,103.820861,CIVIC & COMMUNITY INSTITUTION,0.011124,0.004127,RIDOUT,TANGLIN,CENTRAL REGION,17,30.168782,...,63,1,0,0,0,0,0,0.09017470784,0.2076336658,0.3255961435


In [5]:
# Combine X and y to make sure that the oversampling is done correctly
X_train = pd.concat([X_train, y_train], axis=1)
X_test = pd.concat([X_test, y_test], axis=1)

- Drop subzone and planning area columns
- Replace land use label by one hot encoding
- Drop temperature data, since they are not independent variables

In [6]:
X_train.columns

Index(['latitude', 'longitude', 'land_use_label', 'distance_to_waterbody',
       'distance_to_open_space', 'subzone', 'planning_area', 'region',
       'elevation', 'temp_2024_04_07_min', 'temp_2024_04_07_max',
       'temp_2024_04_07_median', 'temp_2024_04_08_min', 'temp_2024_04_08_max',
       'temp_2024_04_08_median', 'temp_2024_04_09_min', 'temp_2024_04_09_max',
       'temp_2024_04_09_median', 'temp_2024_04_10_min', 'temp_2024_04_10_max',
       'temp_2024_04_10_median', 'Total_x', 'HDB Total',
       'Condominiums & Other Apartments', 'Landed Properties_x',
       'Other Dwellings_x', 'Floor_below_60', 'Floor_60-80', 'Floor_80-100',
       'Floor_100-120', 'Floor_above_120', 'Below $1,000', '$1,000 - $1,999',
       '$2,000 - $2,999', '$3,000 - $3,999', '$4,000 - $4,999',
       '$5,000 - $5,999', '$6,000 - $6,999', '$7,000 - $7,999',
       '$8,000 - $8,999', '$9,000 - $9,999', '$10,000 - 10,999',
       '$11,000 - 11,999', '$12,000 - $14,999', '$15,000 & Over', 'tree cover',
 

In [7]:
columns_to_drop = ['land_use_label', 'subzone', 'planning_area', 'region',
       'temp_2024_04_07_min', 'temp_2024_04_07_max',
       'temp_2024_04_07_median', 'temp_2024_04_08_min', 'temp_2024_04_08_max',
       'temp_2024_04_08_median', 'temp_2024_04_09_min', 'temp_2024_04_09_max',
       'temp_2024_04_09_median', 'temp_2024_04_10_min', 'temp_2024_04_10_max',
       'temp_2024_04_10_median']

X_train = X_train.drop(columns=columns_to_drop)
X_test = X_test.drop(columns=columns_to_drop)

In [8]:
# Remove rows where min_ndvi values is -
X_train = X_train[X_train['min_ndvi'] != '-']
X_test = X_test[X_test['min_ndvi'] != '-']

In [9]:
# Split X and y
y_train = X_train['avg_temp']
X_train = X_train.drop(columns=['avg_temp'])

y_test = X_test['avg_temp']
X_test = X_test.drop(columns=['avg_temp'])

## Model Training

In [19]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def objective(trial):
    # Convert tuple strings back to tuple
    layer_choices = ['(50,)', '(100,)', '(100, 50)', '(50, 50, 50)']
    hidden_layer_sizes = trial.suggest_categorical('hidden_layer_sizes', layer_choices)
    hidden_layer_sizes = eval(hidden_layer_sizes)

    activation = trial.suggest_categorical('activation', ['tanh', 'relu'])
    solver = trial.suggest_categorical('solver', ['sgd', 'adam'])
    alpha = trial.suggest_float('alpha', 1e-5, 1e-1, log=True)
    learning_rate_init = trial.suggest_float('learning_rate_init', 1e-5, 1e-1, log=True)

    regressor = MLPRegressor(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        solver=solver,
        alpha=alpha,
        learning_rate_init=learning_rate_init,
        max_iter=1000,  # Increased max iterations
        random_state=42
    )
    regressor.fit(X_train_scaled, y_train)

    y_pred = regressor.predict(X_test_scaled)
    score = r2_score(y_test, y_pred)
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("Best trial:")
trial = study.best_trial
print(f"  R2 score: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[32m[I 2024-04-17 02:11:11,194][0m A new study created in memory with name: no-name-1321d41a-3000-41ef-95bc-099d4d351a11[0m


[32m[I 2024-04-17 02:11:15,832][0m Trial 0 finished with value: -0.9200477148860409 and parameters: {'hidden_layer_sizes': '(50, 50, 50)', 'activation': 'relu', 'solver': 'adam', 'alpha': 0.0264108471675485, 'learning_rate_init': 0.0005369568273154539}. Best is trial 0 with value: -0.9200477148860409.[0m
[32m[I 2024-04-17 02:11:17,042][0m Trial 1 finished with value: -0.20274229882354367 and parameters: {'hidden_layer_sizes': '(50,)', 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.0173560714487939, 'learning_rate_init': 0.0006093858420583486}. Best is trial 1 with value: -0.20274229882354367.[0m
[32m[I 2024-04-17 02:11:17,078][0m Trial 2 finished with value: -37685263321717.52 and parameters: {'hidden_layer_sizes': '(50,)', 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.004426439211073496, 'learning_rate_init': 0.05706641605071665}. Best is trial 1 with value: -0.20274229882354367.[0m
[32m[I 2024-04-17 02:11:19,485][0m Trial 3 finished with value: -0.4520082044943876 a

Best trial:
  R2 score: 0.43334430005359004
  Params: 
    hidden_layer_sizes: (100, 50)
    activation: tanh
    solver: sgd
    alpha: 2.7626991540211166e-05
    learning_rate_init: 0.0027471904190204772


In [20]:

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print("Best trial:")
trial = study.best_trial
print(f"  R2 score: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[32m[I 2024-04-17 02:13:51,082][0m A new study created in memory with name: no-name-fceedcfa-46ba-45f5-ab35-9a238fdadca7[0m
[32m[I 2024-04-17 02:13:53,746][0m Trial 0 finished with value: 0.1986636603297448 and parameters: {'hidden_layer_sizes': '(50, 50, 50)', 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.0003863893565745369, 'learning_rate_init': 0.00417926892466658}. Best is trial 0 with value: 0.1986636603297448.[0m
[32m[I 2024-04-17 02:13:59,748][0m Trial 1 finished with value: -6.9506184406526454 and parameters: {'hidden_layer_sizes': '(100, 50)', 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.024220879223789557, 'learning_rate_init': 0.00010671664255828329}. Best is trial 0 with value: 0.1986636603297448.[0m
[32m[I 2024-04-17 02:14:04,231][0m Trial 2 finished with value: -0.9401448308293352 and parameters: {'hidden_layer_sizes': '(50, 50, 50)', 'activation': 'relu', 'solver': 'adam', 'alpha': 3.0407603296407804e-05, 'learning_rate_init': 0.0003440751832872817

Best trial:
  R2 score: 0.4351459075532992
  Params: 
    hidden_layer_sizes: (100, 50)
    activation: tanh
    solver: sgd
    alpha: 0.00038926431797372033
    learning_rate_init: 0.0026858035643917587


In [25]:
regressor = MLPRegressor(
    hidden_layer_sizes=(100, 50),
    activation='tanh',
    solver='sgd',
    alpha=0.00038926431797372033,
    learning_rate_init=0.0026858035643917587,
    # alpha=2.7626991540211166e-05,
    # learning_rate_init=0.0027471904190204772,
    max_iter=1000,  # Increased max iterations
    random_state=42
)

regressor.fit(X_train_scaled, y_train)

y_pred = regressor.predict(X_test_scaled)
score = r2_score(y_test, y_pred)

# Calculate the RMSE
rmse = np.sqrt(np.mean((y_test - y_pred)**2))
print(f"RMSE: {rmse}")

# Calculate the R2
r2 = r2_score(y_test, y_pred)
print(f"R2: {r2}")

RMSE: 0.856245111992063
R2: 0.4351459075532992


### Approach 2: Minimise RMSE

In [11]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def objective(trial):
    # Convert tuple strings back to tuple
    layer_choices = ['(50,)', '(100,)', '(100, 50)', '(50, 50, 50)']
    hidden_layer_sizes = trial.suggest_categorical('hidden_layer_sizes', layer_choices)
    hidden_layer_sizes = eval(hidden_layer_sizes)

    activation = trial.suggest_categorical('activation', ['tanh', 'relu'])
    solver = trial.suggest_categorical('solver', ['sgd', 'adam'])
    alpha = trial.suggest_float('alpha', 1e-5, 1e-1, log=True)
    learning_rate_init = trial.suggest_float('learning_rate_init', 1e-5, 1e-1, log=True)

    regressor = MLPRegressor(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        solver=solver,
        alpha=alpha,
        learning_rate_init=learning_rate_init,
        max_iter=1000,  # Increased max iterations
        random_state=42
    )
    regressor.fit(X_train_scaled, y_train)

    y_pred = regressor.predict(X_test_scaled)
    score = np.sqrt(np.mean((y_test - y_pred)**2))
    return score

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

print("Best trial:")
trial = study.best_trial
print(f"  R2 score: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[32m[I 2024-04-17 02:25:21,558][0m A new study created in memory with name: no-name-2b5670a9-cd75-45a7-b6ed-d3ed739cebd8[0m


[32m[I 2024-04-17 02:25:25,754][0m Trial 0 finished with value: 1.0658188926354724 and parameters: {'hidden_layer_sizes': '(50,)', 'activation': 'tanh', 'solver': 'sgd', 'alpha': 8.320805039687442e-05, 'learning_rate_init': 3.143884104443272e-05}. Best is trial 0 with value: 1.0658188926354724.[0m
[32m[I 2024-04-17 02:25:28,260][0m Trial 1 finished with value: 1.466391562776176 and parameters: {'hidden_layer_sizes': '(50, 50, 50)', 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.026284463421248946, 'learning_rate_init': 0.00011814573539929562}. Best is trial 0 with value: 1.0658188926354724.[0m
[32m[I 2024-04-17 02:25:34,228][0m Trial 2 finished with value: 2.9497303585828076 and parameters: {'hidden_layer_sizes': '(100, 50)', 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.00018700682370942286, 'learning_rate_init': 0.00011007367777070534}. Best is trial 0 with value: 1.0658188926354724.[0m
[32m[I 2024-04-17 02:25:34,498][0m Trial 3 finished with value: 1.7189292297424

Best trial:
  R2 score: 0.8766006917434296
  Params: 
    hidden_layer_sizes: (100, 50)
    activation: tanh
    solver: sgd
    alpha: 0.004315134812016797
    learning_rate_init: 0.0005642261095426077


In [12]:
regressor = MLPRegressor(
    hidden_layer_sizes=(100, 50),
    activation='tanh',
    solver='sgd',
    alpha=0.004315134812016797,
    learning_rate_init=0.0005642261095426077,
    # alpha=2.7626991540211166e-05,
    # learning_rate_init=0.0027471904190204772,
    max_iter=1000,  # Increased max iterations
    random_state=42
)

regressor.fit(X_train_scaled, y_train)

y_pred = regressor.predict(X_test_scaled)
score = r2_score(y_test, y_pred)

# Calculate the RMSE
rmse = np.sqrt(np.mean((y_test - y_pred)**2))
print(f"RMSE: {rmse}")

# Calculate the R2
r2 = r2_score(y_test, y_pred)
print(f"R2: {r2}")

RMSE: 0.8766006917434296
R2: 0.4079700370621313
