This is a pathfinder notebook, where we will search for a better class structure to implement 4 things:
- continuous features binarisation
- categorical features LeaveOneOut encoding
- feature standardisation
- polynomial features transformation

They need to be implemented in the same order as listed.

In [157]:
import pandas as pd
import numpy as np
from category_encoders import LeaveOneOutEncoder

from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

import seaborn as sns


In [161]:
# Column of the dataset that holds the regression target.
raw_label = 'potential_kwh_total'

# Load the dataset, discard every row containing a missing value and
# renumber the remaining rows from zero.
raw_df = (
    pd.read_csv('datasets/geo_solar_panels.csv')
    .dropna()
    .reset_index(drop=True)
)

In [145]:
# pca_pipe = Pipeline(
#     steps=[
#     ("scaler", RobustScaler(unit_variance=True)), 
#     ('pca', PCA())
#     ]
# )

# numeric_union = FeatureUnion([
#     ("scaler", RobustScaler(unit_variance=True)),
#     ("binariser", QuantileTransformer()),
#     ('pca', pca_pipe)
# ])

# numeric_polinomisation = Pipeline(
#     steps=[
#     ('numeric_union', numeric_union), 
#     ('poly', PolynomialFeatures(degree=2))
#     ]
# )

# categorical_transformer = Pipeline(
#     steps=[("encoder", LeaveOneOutEncoder())]
# )

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('numeric_polinomisation', numeric_polinomisation, numeric_features),
#         ('cat', categorical_transformer, categorical_features),
#     ]
# )



In [168]:
class Experiment:
    """Cross-validate an estimator on a dataset and report averaged metrics.

    The constructor only stores its arguments; nothing is fitted until
    ``run_exp`` is called.

    Args:
        X: Feature table passed to ``cross_validate`` as ``X``.
        label: Target values passed to ``cross_validate`` as ``y``. A
            one-column DataFrame is accepted as well as a 1-D array/Series.
        pipe: Estimator (e.g. a pipeline) to be cross-validated.
    """

    def __init__(self, X: "pd.DataFrame | np.ndarray",
                 label: "pd.DataFrame | pd.Series | np.ndarray", pipe):
        self.label = label
        self.pipe = pipe
        self.X = X

    def _score(self, estimator, X_test, y_test):
        """Score a fitted estimator on one held-out fold.

        Only meant to be used as the ``scoring`` callable of ``run_exp``.

        Returns:
            dict with ``'nrmse'`` (RMSE divided by the population standard
            deviation of ``y_test``) and ``'r2'``.
        """
        y_pred = estimator.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
        # Flatten to a plain ndarray first: np.std on a DataFrame is handed
        # over to pandas and can come back as a per-column Series instead of
        # the scalar that the scorer has to return.
        target_std = np.std(np.asarray(y_test))
        return {'nrmse': rmse / target_std,
                'r2': r2_score(y_true=y_test, y_pred=y_pred)}

    def run_exp(self, *, n_splits=5, n_repeats=1, random_state=None):
        """Run repeated k-fold cross validation and average the fold scores.

        Args:
            n_splits: Number of folds per repetition.
            n_repeats: Number of times the k-fold split is repeated.
            random_state: Seed forwarded to ``RepeatedKFold``; pass an int to
                make the splits (and therefore the scores) reproducible.

        Returns:
            dict with the mean ``'NRMSE'`` and mean ``'R2'`` over all folds.
        """
        kfold = RepeatedKFold(n_splits=n_splits,
                              n_repeats=n_repeats,
                              random_state=random_state)
        test_res = cross_validate(self.pipe,
                                  X=self.X,
                                  y=self.label,
                                  scoring=self._score,
                                  cv=kfold,
                                  # fail loudly instead of recording NaN for a broken fold
                                  error_score='raise')
        return {'NRMSE': test_res['test_nrmse'].mean(), 'R2': test_res['test_r2'].mean()}

In [169]:
# Columns sent through the leave-one-out encoder.
categorical_features = ['state']

# Columns sent through the numeric branch (order is preserved in the output).
numeric_features = [
    'lat_max',
    'lat_min',
    'long_max',
    'long_min',
    'number_of_panels_north',
    'number_of_panels_south',
    'number_of_panels_east',
    'number_of_panels_west',
    'number_of_panels_flat',
    'number_of_panels_total',
]

# Branch 3 of the union: robust-scale the inputs, then project them with PCA.
_scaled_pca = Pipeline(steps=[
    ("scaler", RobustScaler(unit_variance=True)),
    ('pca', PCA()),
])

# Three views of the numeric columns, concatenated column-wise:
# robust-scaled values, quantile-transformed values and PCA components.
_numeric_union = FeatureUnion([
    ("scaler", RobustScaler(unit_variance=True)),
    ("binariser", QuantileTransformer()),
    ('pca', _scaled_pca),
])

# Degree-2 polynomial expansion is applied on top of the combined views.
numeric_transformer = Pipeline(steps=[
    ('numeric_union', _numeric_union),
    ('poly', PolynomialFeatures(degree=2)),
])

categorical_transformer = Pipeline(steps=[("encoder", LeaveOneOutEncoder())])

# Route each column group to its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('numeric_polinomisation', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [170]:
# Model inputs: numeric columns first, then the categorical ones.
X = raw_df.loc[:, numeric_features + categorical_features]
y = raw_df[raw_label]

# Hold out 20% of the rows for a quick sanity check of the fitted model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", Ridge()),  # alternative tried: Lasso(alpha=.005)
    ]
)

# Last expression of the cell: renders the pipeline in the notebook.
pipeline

In [171]:
# Wrap the pipeline so the target is robust-scaled before fitting and the
# predictions are mapped back to the original scale.
target_scaler = RobustScaler(unit_variance=True)
regr = TransformedTargetRegressor(regressor=pipeline, transformer=target_scaler)

regr.fit(X_train, y_train)

# Score of the fitted regressor on the held-out split.
holdout_score = regr.score(X_test, y_test)
print("model score: %.3f" % holdout_score)

model score: 0.997


In [172]:
# Shape of the matrix the fitted preprocessor produces for the full dataset.
fitted_preprocessor = regr.regressor_.named_steps['preprocessor']
fitted_preprocessor.transform(X).shape

(11490, 497)

In [173]:
# Every column except the target is offered as a feature; the target is
# passed as a one-column frame.
experiment_columns = [name for name in raw_df.columns if name != raw_label]

naive_exp = Experiment(
    X=raw_df[experiment_columns],
    label=raw_df[[raw_label]],
    pipe=regr,
)

# Last expression of the cell: shows the averaged cross-validation metrics.
naive_exp.run_exp()

{'NRMSE': 0.05988006962703679, 'R2': 0.9964092896369966}