This is a pathfinder notebook, where we will search for a better class structure to implement 4 things:
- continuous features binarisation
- categorical features LeaveOneOut encoding
- feature standardisation
- polynomial features transformation

They need to be implemented in the same order as listed.

In [14]:
import pandas as pd
from category_encoders import LeaveOneOutEncoder

from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge


In [4]:
# Name of the column used as the regression target.
raw_label = 'potential_kwh_total'

# Read the CSV, discard every row containing a missing value, and rebuild a
# clean 0..n-1 index so the dropped rows leave no gaps.
raw_df = (
    pd.read_csv('datasets/geo_solar_panels.csv')
    .dropna()
    .reset_index(drop=True)
)

In [29]:
# Columns routed through the numeric branch of the preprocessor.
numeric_features = [
    'lat_max',
    'lat_min',
    'long_max',
    'long_min',
    'number_of_panels_north',
    'number_of_panels_south',
    'number_of_panels_east',
    'number_of_panels_west',
    'number_of_panels_flat',
    'number_of_panels_total',
]

# Numeric branch: quantile-map every column, then rescale it robustly.
# QuantileTransformer randomly subsamples large inputs when estimating the
# quantiles, so random_state is pinned to keep the fitted transform (and the
# downstream score) repeatable between runs.
# NOTE(review): the step is named "binariser", but QuantileTransformer yields a
# continuous quantile mapping rather than discrete bins — confirm whether a
# binning transformer (e.g. KBinsDiscretizer) was intended. The step name is
# kept as-is so existing parameter paths keep working.
numeric_transformer = Pipeline(
    steps=[
        ("binariser", QuantileTransformer(random_state=42)),
        ("scaler", RobustScaler(unit_variance=True)),
    ]
)

# Columns routed through the categorical branch of the preprocessor.
categorical_features = ['state']

# Categorical branch: leave-one-out encoding of the category column.
categorical_transformer = Pipeline(
    steps=[("encoder", LeaveOneOutEncoder())]
)

# Apply each branch to its own column subset. Columns not listed in either
# feature list are dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# NOTE(review): the notebook intro lists a polynomial-features stage after
# standardisation; it is not wired into this pipeline yet.
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("regressor", Ridge())])

# Fit the pipeline on a robust-scaled copy of the target; predictions are
# mapped back to the original target scale by TransformedTargetRegressor.
regr = TransformedTargetRegressor(regressor=pipeline, transformer=RobustScaler(unit_variance=True))

In [30]:
# Restrict the frame to the columns the preprocessor consumes, and pull out
# the target column.
X = raw_df[numeric_features + categorical_features]
y = raw_df[raw_label]

# Hold out 20% of the rows for evaluation. random_state is pinned so the same
# rows land in each partition on every run; without it the split (and hence
# the printed score) changes on each execution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

regr.fit(X_train, y_train)
# For a regressor, score() reports R^2 on the data passed in — here the
# held-out test partition.
print("model score: %.3f" % regr.score(X_test, y_test))

model score: 0.865
