# TODO
- Try `RBFSampler` to get a differentiable model that can help the minimizer?
- Try other models?

In [33]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, SplineTransformer
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import HistGradientBoostingRegressor
from scipy.optimize import minimize

from ucimlrepo import fetch_ucirepo

from IPython.display import display

In [None]:
# Fetch dataset id 242 from the UCI ML repository (requires network access).
energy_efficiency = fetch_ucirepo(id=242)

# Fix the random state so that the train/test split below, and any later draws
# from NumPy's global generator, are repeatable under Restart & Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Column roles used throughout the notebook. Every feature frame is ordered
# discrete-first (disc_vars + cts_vars) so positional code can rely on it.
disc_vars = ["X6", "X8"]
cts_vars = ["X1", "X2", "X3", "X4", "X5", "X7"]

X = energy_efficiency.data.features[disc_vars + cts_vars]

# Single regression target: the sum of the two provided targets (for now).
# Naming the Series makes the summaries below display "Y" instead of 0.
targets = energy_efficiency.data.targets
y = (targets.Y1 + targets.Y2).rename("Y")

# Same data as X plus the combined target, built with `assign` so the
# library-owned `original` frame is not mutated in place.
df = energy_efficiency.data.original.assign(
    Y=lambda frame: frame["Y1"] + frame["Y2"]
)[disc_vars + cts_vars + ["Y"]]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_SEED
)

# Shape bookkeeping shared by the cells below.
features = list(X.columns)
nfeatures = X.shape[1]
nrows = X.shape[0]
ndiscrete = len(disc_vars)
ncontinuous = len(cts_vars)

print("variables breakdown")
display(energy_efficiency.variables)
print("features description")
display(X.describe())
print("label description")
display(y.describe().to_frame().T)
print("minimum labels")
display(y.sort_values().head().to_frame().T)
print("value counts")
for feature in features:
  print(feature)
  display(X[feature].value_counts().to_frame().T)

In [114]:
def make_model():
    """Build a fresh, unfitted preprocessing + SVR pipeline.

    The columns listed in ``disc_vars`` are one-hot encoded and the columns
    listed in ``cts_vars`` are standardised; the result feeds an RBF-kernel
    support vector regressor with ``C=50.0``.

    Returns:
        sklearn.pipeline.Pipeline: a new pipeline on every call, so separate
        fits (hold-out vs. full data) never share state.
    """
    preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(), disc_vars),
        ('num', StandardScaler(), cts_vars),
    ])
    # `degree` is only used by kernel="poly" and is ignored by the RBF kernel,
    # so it is deliberately not passed here.
    regressor = SVR(kernel="rbf", C=50.0)
    return Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', regressor),
    ])

# Hold-out check: fit on the training split only and score both splits.
# mean_squared_error's signature is (y_true, y_pred); MSE is symmetric, but
# keeping the documented order avoids a silent error if the metric is ever
# swapped for an asymmetric one.
model = make_model()
model.fit(X_train, y_train)
print(f"train mse:   {mean_squared_error(y_train, model.predict(X_train)):06f}")
print(f"test mse:    {mean_squared_error(y_test, model.predict(X_test)):06f}")

# Refit on every row; this is the `model` that later cells use. The score
# below is therefore an in-sample error, not a generalisation estimate.
model = make_model()
model.fit(X, y)
print(f"overall mse: {mean_squared_error(y, model.predict(X)):06f}")

train mse:   16.129024
test mse:    20.397660
overall mse: 12.977848


In [117]:
def partial_energy_load(model, disc_vals):
    """Fix the discrete inputs and return an objective over the continuous ones.

    Args:
        model: fitted estimator whose ``predict`` accepts a DataFrame with the
            columns in ``features``.
        disc_vals: 1-D sequence of values for ``disc_vars``, in that order.

    Returns:
        A function ``energy_load(cts_vals)`` that takes a 1-D sequence of
        values for ``cts_vars`` (in that order) and returns the model's
        prediction for that single point.
    """
    def energy_load(cts_vals):
        # np.concatenate instead of np.concat: the latter alias only exists in
        # NumPy >= 2.0 and raises AttributeError on older installs.
        point = np.concatenate((disc_vals, cts_vals))
        # Named `row` rather than `df` so the notebook-level `df` is not shadowed.
        row = pd.DataFrame([point], columns=features)
        # Concatenation promotes everything to float; cast the discrete columns
        # back to integers before they reach the model.
        row[disc_vars] = row[disc_vars].astype(np.int32)
        return model.predict(row)[0]
    return energy_load

# TODO: make differentiable/symbolic?
# TODO: smarter picking of discrete values?
def minimize_model(model, niters, constraints=None):
    """Search for low-prediction inputs using random restarts of SLSQP.

    For each restart a random combination of discrete values is drawn and held
    fixed, a random starting point is drawn for the continuous variables, and
    SLSQP minimises the model prediction over the continuous variables within
    the ranges observed in ``X``.

    Args:
        model: fitted estimator passed on to ``partial_energy_load``.
        niters: number of random restarts (one result row per restart).
        constraints: optional constraint dict, or sequence of dicts, in the
            format accepted by ``scipy.optimize.minimize``; the functions
            receive the continuous-variable vector (``cts_vars`` order).
            ``None`` means unconstrained.

    Returns:
        pandas.DataFrame with columns ``features + ["Y"]``, one row per
        restart, sorted by ascending ``Y``. Rows are recorded whether or not
        the optimiser reported success.
    """
    # Previously this argument was accepted but never reached the optimiser.
    # An empty tuple is scipy's own "no constraints" default.
    constraints = () if constraints is None else constraints

    bounds = [(X[col].min(), X[col].max()) for col in cts_vars]
    results = np.empty((niters, nfeatures+1))
    # `high` is exclusive for randint, hence the +1 to include the observed maximum.
    disc_vals = np.random.randint(low=X[disc_vars].min(), high=X[disc_vars].max()+1, size=(niters, ndiscrete))
    cts_vals = np.random.uniform(low=X[cts_vars].min(), high=X[cts_vars].max(), size=(niters, ncontinuous))
    for i in range(niters):
        objective = partial_energy_load(model, disc_vals[i, :])
        result = minimize(
            objective,
            cts_vals[i, :],
            method='SLSQP',
            bounds=bounds,
            constraints=constraints,
        )
        # Row layout: discrete values, optimised continuous values, objective.
        results[i, :ndiscrete] = disc_vals[i, :]
        results[i, ndiscrete:-1] = result.x
        results[i, -1] = result.fun
    results = pd.DataFrame(results, columns=features+["Y"])
    return results.sort_values("Y")

# assumes inequality constraints
def minimize_samples(df, constraints=None):
    """Return the rows of ``df`` that satisfy all constraints, lowest ``Y`` first.

    Args:
        df: DataFrame containing the ``cts_vars`` columns and a ``"Y"`` column.
        constraints: optional constraint dict, or sequence of dicts, using the
            ``scipy.optimize.minimize`` layout. Each ``constraint['fun']`` is
            called once per row with that row's continuous variables as a 1-D
            NumPy array (``cts_vars`` order), followed by any
            ``constraint['args']``. Every constraint is treated as an
            inequality: a row is kept only when all returned values are >= 0.
            ``None`` keeps every row.

    Returns:
        pandas.DataFrame: the feasible rows sorted by ascending ``Y``.
    """
    if constraints is None:
        constraints = []
    elif isinstance(constraints, dict):
        # scipy also accepts a single bare dict; normalise it to a list.
        constraints = [constraints]

    # Size the mask from the frame actually passed in, not the global row count.
    mask = np.ones(len(df), dtype=bool)
    points = df[cts_vars].to_numpy()
    for constraint in constraints:
        fun = constraint['fun']
        args = constraint.get('args', ())
        # Evaluate row by row: the previous column-wise DataFrame.apply
        # produced a result that could not be combined with the row mask.
        # np.all covers vector-valued constraint functions.
        feasible = np.fromiter(
            (np.all(np.asarray(fun(point, *args)) >= 0) for point in points),
            dtype=bool,
            count=len(points),
        )
        mask &= feasible
    return df[mask].sort_values("Y")

# Unconstrained baseline: ten optimiser restarts on the fitted model versus
# the best rows actually present in the dataset.
results_model_unconstrained = minimize_model(model, niters=10)
results_samples_unconstrained = minimize_samples(df)

# Show the ten lowest-Y rows from each source, model-based search first.
for caption, table in (
    ("model unconstrained", results_model_unconstrained),
    ("sampled unconstrained", results_samples_unconstrained),
):
    print(caption)
    display(table.head(10))

model unconstrained


Unnamed: 0,X6,X8,X1,X2,X3,X4,X5,X7,Y
9,4.0,0.0,0.62,770.707118,286.615012,220.5,3.5,0.02454434,15.551724
6,5.0,0.0,0.62,778.93273,287.402128,220.5,3.5,0.02625316,15.629547
2,2.0,0.0,0.69079,669.612214,245.0,217.924317,3.889755,0.01154459,16.597731
3,2.0,0.0,0.690819,668.464353,245.0,217.767825,3.895014,0.01193818,16.598372
7,3.0,4.0,0.62,605.69734,245.0,182.53869,6.330054,0.1384709,20.323474
5,4.0,1.0,0.62,791.091096,286.317122,220.5,3.5,0.00885333,20.820909
8,5.0,5.0,0.62,607.843202,245.0,184.369889,6.173496,0.1593737,21.165319
1,4.0,1.0,0.98,514.5,245.0,110.25,3.5,0.0,39.381544
4,3.0,5.0,0.98,808.5,416.5,220.5,3.5,3.430176e-09,40.98225
0,2.0,1.0,0.98,659.876313,416.5,220.5,3.5,4.263246e-14,45.379651


sampled unconstrained


Unnamed: 0,X6,X8,X1,X2,X3,X4,X5,X7,Y
26,4,0,0.74,686.0,245.0,220.5,3.5,0.0,16.95
24,2,0,0.74,686.0,245.0,220.5,3.5,0.0,16.97
27,5,0,0.74,686.0,245.0,220.5,3.5,0.0,17.21
25,3,0,0.74,686.0,245.0,220.5,3.5,0.0,17.24
28,2,0,0.71,710.5,269.5,220.5,3.5,0.0,17.64
30,4,0,0.71,710.5,269.5,220.5,3.5,0.0,17.66
31,5,0,0.71,710.5,269.5,220.5,3.5,0.0,18.07
29,3,0,0.71,710.5,269.5,220.5,3.5,0.0,18.12
34,4,0,0.69,735.0,294.0,220.5,3.5,0.0,18.5
32,2,0,0.69,735.0,294.0,220.5,3.5,0.0,18.59
