This is a pathfinder notebook in which we search for a better class structure to implement four things:
- binarisation of continuous features
- LeaveOneOut encoding of categorical features
- feature standardisation
- polynomial feature transformation

They need to be applied in the same order as listed.

In [1]:
import pandas as pd
import numpy as np
from category_encoders import LeaveOneOutEncoder

from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import make_pipeline

import seaborn as sns


In [2]:
# Raw dataset: rows with missing values are dropped and the index is rebuilt.
raw_df = (
    pd.read_csv('datasets/geo_solar_panels.csv')
    .dropna()
    .reset_index(drop=True)
)
raw_label = 'potential_kwh_total'

# EDA features before manual scaling, cleaned the same way as the raw dataset.
prescaled_df = (
    pd.read_csv('datasets/prescaled_modeling_df.csv')
    .dropna()
    .reset_index(drop=True)
)
prescaled_label = 'potential_kwh_total'

In [7]:
# Show the class of a default-constructed ElasticNet instance.
type(ElasticNet())

sklearn.linear_model._coordinate_descent.ElasticNet

In [3]:
def set_up_pipeline(estimator=None):
    """Create the data preprocessing pipeline with the passed estimator at the end.

    Numeric columns (module-level ``numeric_features``) go through a
    ``FeatureUnion`` of three branches -- a ``QuantileTransformer``, the
    untouched original values, and ``StandardScaler`` followed by a
    7-component ``PCA`` -- and the union output is expanded with degree-3
    ``PolynomialFeatures`` and standardised.

    Categorical columns (module-level ``categorical_features``) are cast to
    the ``category`` dtype by ``CategoryTransformer`` and then encoded with
    ``LeaveOneOutEncoder``.

    Both column lists are looked up when this function is called, so they
    must be defined before the call.

    Args:
        estimator: Regressor placed at the end of the pipeline. When ``None``
            (the default) a new ``ElasticNet(alpha=.001, tol=1e-3)`` is
            created for this call.

    Returns:
        A ``TransformedTargetRegressor`` wrapping the full pipeline, with the
        target scaled by a ``StandardScaler``.
    """
    if estimator is None:
        # Build the default inside the call: a default written in the
        # signature is created once at definition time and would be the same
        # mutable object in every pipeline that relies on it.
        estimator = ElasticNet(alpha=.001, tol=1e-3)

    numeric_transformer = Pipeline(steps=[
        ('numeric_union', FeatureUnion([
                                ("binariser", QuantileTransformer()),
                                ("original", make_pipeline('passthrough')),
                                ('pca', Pipeline(
                                            steps=[
                                            ("scaler", StandardScaler()),
                                            ('pca', PCA(n_components=7))
                                            ]
                                        ))
                            ])),
        ('poly', PolynomialFeatures(degree=3)),
        ('scaler', StandardScaler()),
        ]
    )

    categorical_transformer = Pipeline(
        steps=[('CategoryTransformer', CategoryTransformer()),
               ('encoder', LeaveOneOutEncoder())]
    )

    # Route each group of columns to its own transformer.
    all_feats_transform = ColumnTransformer(
        transformers=[
            ('numeric_transformer', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
        ]
    )

    X_pipe = Pipeline(steps=[('preprocessor', all_feats_transform),
                             ('regressor', estimator)])

    return TransformedTargetRegressor(regressor=X_pipe, transformer=StandardScaler())

class CategoryTransformer(TransformerMixin, BaseEstimator):
    """Stateless transformer that casts its input to the ``category`` dtype."""

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself so calls can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with its columns converted to the ``category`` dtype."""
        as_category = X.astype('category')
        return as_category


class Experiment:
    """Conduct and score a cross-validated experiment for a model and a dataset.

    Attributes are plain references to the constructor arguments:
        X: Feature data handed to ``cross_validate``.
        label: Target values handed to ``cross_validate``.
        pipe: Estimator (e.g. the object returned by ``set_up_pipeline``).
    """

    def __init__(self, X: np.ndarray, label: np.ndarray,  pipe):
        self.label = label
        self.pipe = pipe
        self.X = X

    def _score(self, estimator, X_test, y_test):
        """Score a fitted estimator on one test fold.

        Only meant to be used as the ``scoring`` callable in ``run_exp``.

        Returns:
            Dict with ``'nrmse'`` (RMSE divided by the standard deviation of
            ``y_test``) and ``'r2'``.
        """
        y_pred = estimator.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_test))
        return {'nrmse': rmse / np.std(y_test),
                'r2': r2_score(y_pred=y_pred, y_true=y_test)}

    def run_exp(self, *, n_splits=5, n_repeats=1, random_state=None):
        """Run repeated k-fold cross validation and summarise the fold scores.

        Args:
            n_splits: Number of folds per repetition.
            n_repeats: Number of times the k-fold split is repeated.
            random_state: Seed forwarded to ``RepeatedKFold``; pass an int to
                make the splits (and therefore the scores) reproducible.

        Returns:
            Dict with the median over all folds of each metric:
            ``{'NRMSE': ..., 'R2': ...}``.
        """
        kfold = RepeatedKFold(n_splits=n_splits,
                              n_repeats=n_repeats,
                              random_state=random_state)
        test_res = cross_validate(self.pipe,
                                  X=self.X,
                                  y=self.label,
                                  scoring=self._score,
                                  cv=kfold,
                                  # Fail loudly instead of recording a NaN score.
                                  error_score='raise')
        return {'NRMSE': np.median(test_res['test_nrmse']), 'R2': np.median(test_res['test_r2'])}

In [4]:
# Columns sent through the numeric branch of the pipeline.
numeric_features = [
    # lat_* / long_* columns
    'lat_max', 'lat_min', 'lat_mean',
    'long_max', 'long_min', 'long_mean',
    # zip-level columns ('approx_zips' is left out here and listed under
    # categorical_features below)
    'zip_area', 'dens', 'zip_lat', 'zip_long',
    # state-level columns
    'state_size', 'state_lat', 'state_long',
    # panel counts
    'number_of_panels_north', 'number_of_panels_south',
    'number_of_panels_east', 'number_of_panels_west',
    'number_of_panels_flat', 'number_of_panels_total',
]

# Columns sent through the categorical branch ('state' is currently left out).
categorical_features = ['is_usa', 'approx_zips']

# Model under test: the preprocessing pipeline with an ElasticNet at its end.
regr = set_up_pipeline(
    estimator=ElasticNet(alpha=.001, tol=1e-3),
)


In [5]:
# Display the estimator built by set_up_pipeline.
regr

In [6]:
# Cross-validate the model on the prescaled dataset.
df = prescaled_df
label = prescaled_label

naive_exp = Experiment(
    pipe=regr,
    label=df[label],
    X=df[[*numeric_features, *categorical_features]],
)

# Last expression of the cell: the summary dict is shown as the cell output.
naive_exp.run_exp()

{'NRMSE': 0.0470362335521312, 'R2': 0.9977875927332294}

In [None]:
# Hold-out check on the prescaled data; switch to raw_df / raw_label to use
# the raw dataset instead.
df = prescaled_df
label = prescaled_label

y = df[label]
X = df[[*numeric_features, *categorical_features]]

# Shuffled split that keeps 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    test_size=0.2,
)

regr.fit(X_train, y_train)
print("model score: %.3f" % regr.score(X_test, y_test))

In [None]:
# Shape of the preprocessed feature matrix. transform() is used instead of
# fit_transform() so the preprocessor already fitted inside `regr` is only
# applied, not refitted on the full X and y as a side effect of this check.
regr.regressor_['preprocessor'].transform(X).shape