In [1]:
%matplotlib inline
import os
import pandas as pd
import numpy as np

In [2]:
# Fetch the housing CSV; header=None keeps the first row as data instead of using it as column names.
df = pd.read_csv(
    "https://raw.githubusercontent.com/jmquintana79/Datasets/master/housing.csv",
    header=None,
)
# Label every column except the last as x0, x1, ...; the last column becomes the target "y".
cols_x = ["x" + str(i) for i in range(len(df.columns) - 1)]
df.columns = [*cols_x, "y"]
df.shape

(506, 14)

In [3]:
# Render the frame; the recorded display below is truncated to its first and last rows.
df

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,y
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Stateless scikit-learn transformer that keeps only a given set of columns of a pandas DataFrame.

    Parameters
    ----------
    cols : list
        Names of the columns to keep; the output follows this order.
    return_array : bool, default True
        When True, `transform` returns the `.values` of the selection;
        when False, it returns the selected DataFrame itself.
    """

    def __init__(self, cols:list, return_array:bool = True):
        # Parameters are stored untouched under their own names.
        self.cols = cols
        self.return_array = return_array

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """
        Select `self.cols` from `X`.

        Raises AssertionError if `X` is not a DataFrame or if any requested column is absent
        (the first missing column, in `self.cols` order, is reported).
        """
        assert isinstance(X, pd.DataFrame), "It is required a dataframe as input."
        # Column labels are looked up once and reused for every membership check.
        available = X.columns.tolist()
        for name in self.cols:
            assert name in available, f"Column '{name}' is not in the input dataframe."
        selected = X[self.cols]
        return selected.values if self.return_array else selected

In [20]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

class PipelineTools():
    """
    Helpers that validate their input and then build scikit-learn composition objects
    (Pipeline, FeatureUnion, ColumnTransformer).

    Every helper is a static method, so the class works both as a namespace
    (`PipelineTools.create(...)`) and through an instance (`PipelineTools().create(...)`).
    Validation failures raise AssertionError.
    """

    def __init__(self):
        pass

    @staticmethod
    def _validate_entries(entries:list, size:int, kind:str):
        """
        Check that `entries` is non-empty and that EVERY entry has length `size`
        and a string in first position.

        entries -- list of tuples to validate.
        size -- required length of each tuple.
        kind -- word used in the error messages ("step" or "pipeline").

        Each entry is inspected individually: looking only at one element of the set of
        lengths/types (as a `list(set(...))[0]` check does) lets malformed entries through
        whenever a well-formed one happens to come first in the set.
        """
        assert len(entries) > 0, f"At least one {kind} is required."
        assert all(len(entry) == size for entry in entries), f"Any {kind} don't have a correct dimession."
        assert all(isinstance(entry[0], str) for entry in entries), f"Any {kind} don't have a correct name description."

    @staticmethod
    def create(list_of_steps:list):
        """
        Build a Pipeline from a list of (name, estimator) steps.

        list_of_steps -- non-empty list of 2-tuples whose first item is a string.
        return -- Pipeline built from the given steps.
        """
        # validate input
        PipelineTools._validate_entries(list_of_steps, 2, "step")
        # return pipeline
        return Pipeline(list_of_steps)

    @staticmethod
    def merge_features(list_of_pipelines:list):
        """
        Build a FeatureUnion from a list of (name, transformer) entries.

        list_of_pipelines -- non-empty list of 2-tuples whose first item is a string.
        return -- FeatureUnion built with the entries as `transformer_list`.
        """
        # validate input
        PipelineTools._validate_entries(list_of_pipelines, 2, "pipeline")
        # return pipeline
        return FeatureUnion(transformer_list=list_of_pipelines)

    @staticmethod
    def merge_features_by_columns_lists(list_of_pipelines_by_columns_lists:list):
        """
        Build a ColumnTransformer from a list of (name, transformer, columns) entries.

        list_of_pipelines_by_columns_lists -- non-empty list of 3-tuples whose first item is a
                                              string and whose third item is a list of columns.
        return -- ColumnTransformer built from the given entries.
        """
        # validate input
        PipelineTools._validate_entries(list_of_pipelines_by_columns_lists, 3, "pipeline")
        assert all(isinstance(entry[2], list) for entry in list_of_pipelines_by_columns_lists), "Any pipeline don't have a correct list of columns."
        # return pipeline
        return ColumnTransformer(list_of_pipelines_by_columns_lists)

In [28]:
# Separate the target column(s) from the feature columns.
cols_y = ["y"]
cols_x = [name for name in df.columns if name not in cols_y]
# Selecting with lists keeps both X and y as DataFrames.
X, y = df[cols_x], df[cols_y]

In [31]:
from sklearn.preprocessing import StandardScaler
pltools = PipelineTools()

# Preprocessing steps, in order: keep the feature columns, then standardize them.
lpreprocessing = [("selector", ColumnSelector(cols_x)), ("scaler", StandardScaler())]
ppl_preprocessing = pltools.create(lpreprocessing)

In [36]:
from models.base import Regressor
from sklearn.linear_model import LinearRegression

In [37]:
# Wrap a plain scikit-learn LinearRegression in the project's Regressor helper (from models.base).
clf = Regressor(LinearRegression())

[info] Create LinearRegression(regressor): {'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': 'deprecated', 'positive': False}


In [39]:
# Inspect which attributes and methods the Regressor wrapper exposes right after construction.
dir(clf)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_fit_gridsearchcv',
 'clf',
 'fit',
 'fit_exhaustive_tuning',
 'get_params',
 'metric_cv',
 'metrics',
 'metrics_train_cv',
 'name',
 'predict',
 'scores_cv',
 'scores_cv_exhaustive_tuning',
 'set_params',
 'type',
 'valida']

In [40]:
# Fit on the whole dataset, using the raw (unscaled) feature columns.
clf.fit(X, y)

[info] Fitting...


In [45]:
# Cross-validated scoring; the log below reports neg_mean_absolute_error as "value +/- spread"
# (presumably mean and standard deviation across folds -- confirm in models.base).
clf.scores_cv(X,y)

[info] CV scoring...
[info] cv scores(neg_mean_absolute_error): -4.250 +/- 0.979 


In [48]:
# Hyper-parameter grid to search: fit the model with and without an intercept.
dparam_grid = dict(fit_intercept=[True, False])
clf.scores_cv_exhaustive_tuning(X, y, dparam_grid)

[info] CV scoring with exhaustive tuning...
[info] cv scores(neg_mean_absolute_error): -4.096 +/- 1.063 


In [50]:
# Inspect the wrapper again after the fit/scoring calls: the listing below now also shows `X_` and `y_`.
dir(clf)

['X_',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_fit_gridsearchcv',
 'clf',
 'fit',
 'fit_exhaustive_tuning',
 'get_params',
 'metric_cv',
 'metrics',
 'metrics_train_cv',
 'name',
 'predict',
 'scores_cv',
 'scores_cv_exhaustive_tuning',
 'set_params',
 'type',
 'valida',
 'y_']

In [57]:
# Run the preprocessing pipeline (column selection followed by standardization) on the features.
X_processed = ppl_preprocessing.fit_transform(X)

In [58]:
# Baseline: train and cross-validation metrics on the raw (unscaled) features.
clf.metrics_train_cv(X, y)

[info] Calculating train/cv metrics...
[info] Metrics(train): bias = 0.000  mae = 3.271   r2 = 0.734
[info] Metrics(cv): bias = 0.075  mae = 4.247   r2 = 0.549


{'train': {'bias': 7.66712121986195e-15,
  'mae': 3.2708628109003164,
  'r2': 0.733789726372463},
 'cv': {'bias': 0.07462323987717547,
  'mae': 4.246751028599074,
  'r2': 0.549122407684909}}

In [59]:
# Same evaluation on the standardized features. The metrics logged below match the run on raw X
# up to floating-point noise; this is expected if Regressor fits the wrapped LinearRegression
# as-is, since least squares with an intercept predicts the same values after per-column
# standardization (NOTE(review): confirm Regressor adds no preprocessing of its own).
clf.metrics_train_cv(X_processed, y)

[info] Calculating train/cv metrics...
[info] Metrics(train): bias = 0.000  mae = 3.271   r2 = 0.734
[info] Metrics(cv): bias = 0.075  mae = 4.247   r2 = 0.549


{'train': {'bias': 7.0071309317053356e-15,
  'mae': 3.270862810900317,
  'r2': 0.7337897263724628},
 'cv': {'bias': 0.07462323987725343,
  'mae': 4.246751028599062,
  'r2': 0.5491224076849099}}

In [None]:
# Reference: https://www.kaggle.com/travelcodesleep/end-to-end-regression-pipeline-using-scikitlearn