In [20]:
import sys

sys.path.append('../../globalfunction')  # setting path
import globalfunction.vv as vv  # importing

import numpy as np
import pandas as pd
import ast
import math

import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics

import sklearn.metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor

from sklearn import model_selection

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

import seaborn as sns
import matplotlib.pyplot as plt



In [None]:
# Load the modelling dataset for one data iteration.
# ROW_LIMIT caps how many rows are requested so the hyper-parameter search
# stays quick while iterating; earlier runs of this notebook used 100, 500
# and 0 (0 presumably means "no limit" -- confirm against
# vv.dataset_modelling_version).
ITERATION_CODE = "0011_20220703"
ROW_LIMIT = 8000

df, data_version_description, numeric_cols, cat_cols = vv.dataset_modelling_version(
    iteration_code=ITERATION_CODE, row_limit=ROW_LIMIT)
print(data_version_description)
print(df.shape)
# Seeded so the preview shows the same rows on every Restart & Run All;
# min() keeps the preview working when fewer than 20 rows were loaded.
df.sample(min(20, len(df)), random_state=101)

In [None]:
# Clean the loaded frame with the shared project helper. Going by the keyword
# names (the helper itself is not visible here -- confirm): latitude is coerced
# to float, missing bedroom/bathroom counts are filled with the median, and no
# columns or rows are dropped at this stage.
df = vv.tidy_dataset(
    df,
    coerce_to_float=['location.latitude'],
    na_infer_median=['bedrooms_model', 'bathrooms_model'],
    na_drop_column=[],
    na_drop_rows=[],
)

In [None]:
# Print the column dtypes and non-null counts of the tidied frame.
df.info()

In [None]:
# Collect every row that still has at least one missing value so the gaps can
# be inspected.
sample_incomplete_rows = df.loc[df.isna().any(axis='columns')]
sample_incomplete_rows

In [None]:
# Drop the rows that still contain missing values. `df` is rebound instead of
# mutated with inplace=True, so the cell never writes into a frame that may be
# a view of another object and re-running it is harmless.
df = df.dropna()
# Fail here with a clear message rather than later inside train_test_split.
assert not df.empty, "every row contained a missing value; nothing left to model"

In [None]:
# Separate the target column (Price) from the features, then hold out 5% of
# the rows as a test set; the fixed seed keeps the split reproducible.
X = df.drop(columns=['Price'])
y = df['Price']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=101,
)
(X_train.shape, X_test.shape)

In [None]:
# Numeric columns fed to the model. Both preprocessors below use this single
# list so the "scaled" and "noscale" pipelines differ ONLY in the scaling step
# (previously the noscale variant silently lacked 'property_age', which made
# the scaled-vs-not comparison unfair).
NUMERIC_FEATURES = [
    'location.latitude', 'location.longitude', 'distance_to_any_train',
    'bedrooms_model', 'bathrooms_model',
    'analyticsProperty.imageCount', 'analyticsProperty.added',
    'floorplan_count', 'property_age',
]

# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Numeric columns without scaling: median imputation only.
numeric_no_scale_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical option 1: one-hot encode; categories not seen during fit are
# ignored instead of raising.
# NOTE(review): `sparse=` was renamed `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4 -- switch the keyword when the environment is upgraded.
categorical_transformer1 = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

# Categorical option 2: only fill gaps with the most frequent value.
categorical_transformer2 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# The two categorical slots are wired in with empty column lists, so they are
# currently placeholders that transform nothing.
features_preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, NUMERIC_FEATURES),
        ('categorical1', categorical_transformer1, []),
        ('categorical2', categorical_transformer2, [])
    ])
features_noscale_preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_no_scale_transformer, NUMERIC_FEATURES),
        ('categorical1', categorical_transformer1, []),
        ('categorical2', categorical_transformer2, [])
    ])

features_preprocessor

In [None]:
# Display the preprocessor variant that imputes but does not scale.
features_noscale_preprocessor

In [None]:
class EstimatorPipeSelectionHelper:
    """Run a hyper-parameter search for several named estimator configurations.

    ``models_and_params`` maps a display name to a dict with three entries:

    * ``"model"``   -- the estimator placed in the pipeline's ``'estimator'`` step
    * ``"params"``  -- the search space handed to the search object
    * ``"cv_type"`` -- ``'grid'`` (GridSearchCV) or ``'random'`` (RandomizedSearchCV)

    A name containing the substring ``'noscale'`` is paired with the
    module-level ``features_noscale_preprocessor``; every other name is paired
    with ``features_preprocessor``.

    After :meth:`fit`, ``grid_searches`` maps each name to its fitted search.
    """

    _SEARCH_KINDS = ('grid', 'random')

    def __init__(self, models_and_params):
        self.keys = models_and_params.keys()
        self.models_and_params = models_and_params
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False, n_iter=10,
            random_state=None):
        """Fit one search per configured name and store it in ``grid_searches``.

        Parameters
        ----------
        X, y : training features and target, passed straight to the search.
        cv, n_jobs, verbose, scoring, refit : forwarded to the search object.
        n_iter : number of sampled candidates (``'random'`` searches only).
        random_state : seed for candidate sampling (``'random'`` searches only).

        Raises
        ------
        ValueError
            If any configuration has a ``cv_type`` other than ``'grid'`` or
            ``'random'``. This is checked before any search starts so a typo
            does not surface only after earlier (slow) searches have finished.
        """
        for key in self.keys:
            cv_type = self.models_and_params[key]["cv_type"]
            if cv_type not in self._SEARCH_KINDS:
                raise ValueError(
                    "Unknown cv_type %r for %r; expected one of %s."
                    % (cv_type, key, list(self._SEARCH_KINDS)))

        for key in self.keys:
            config = self.models_and_params[key]
            model = config["model"]
            params = config["params"]

            # The configuration name decides which preprocessing variant is used.
            if 'noscale' in key:
                preprocessor = features_noscale_preprocessor
            else:
                preprocessor = features_preprocessor
            pipe = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('estimator', model),
            ])

            shared_options = dict(cv=cv, n_jobs=n_jobs, verbose=verbose, scoring=scoring,
                                  refit=refit, return_train_score=True)
            if config["cv_type"] == 'grid':
                print("Running GridSearchCV for %s." % key)
                search = GridSearchCV(pipe, params, **shared_options)
            else:
                print("Running RandomizedSearchCV for %s." % key)
                search = RandomizedSearchCV(pipe, params, n_iter=n_iter,
                                            random_state=random_state, **shared_options)

            search.fit(X, y)
            self.grid_searches[key] = search

    def score_summary(self, sort_by='mean_score'):
        """Return one row per evaluated candidate, best ``sort_by`` value first.

        Each row holds the configuration name, the min/mean/max/std of the
        per-fold test scores, one column per searched parameter and
        ``params_full`` (the parameter dict as a string).

        Raises
        ------
        ValueError
            If no search has been fitted yet.
        """
        if not self.grid_searches:
            raise ValueError("No fitted searches to summarise; call fit() first.")

        def row(key, scores, params):
            d = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': np.mean(scores),
                'std_score': np.std(scores),
            }
            return pd.Series({**params, **d, 'params_full': str(params)})

        rows = []
        for name, search in self.grid_searches.items():
            results = search.cv_results_
            params = results['params']
            # n_splits_ is the fold count the fitted search actually used; unlike
            # `search.cv` it is an int even when cv was None or a splitter object.
            fold_scores = [results["split{}_test_score".format(i)]
                           for i in range(search.n_splits_)]
            # Shape (n_candidates, n_folds): one row of fold scores per candidate.
            all_scores = np.column_stack(fold_scores)
            for p, s in zip(params, all_scores):
                rows.append(row(name, s, p))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        # Summary columns first, searched-parameter columns after them.
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

In [None]:
# Candidate values sampled by the randomized searches.
options__maxdepth = [3, 5, 10, 50, 100, 200, 250, 300, 350, 400, 500]
options__nestimators = [5, 50, 100, 500, 1000, 2000, 5000]

# One entry per experiment. A name containing 'noscale' is run through the
# unscaled preprocessor; every other name uses the scaled one.
# Every model gets the same random_state so that differences between entries
# come from the preprocessing / hyper-parameters and not from the forest's seed
# (the 'RF noscale random' entry used to be unseeded).
models_and_params = {
    'RF scaled random': {
        'model': RandomForestRegressor(random_state=101),
        'params': {'estimator__max_depth': options__maxdepth,
                   'estimator__n_estimators': options__nestimators},
        'cv_type': 'random'
    },
    'RF noscale random': {
        'model': RandomForestRegressor(random_state=101),
        'params': {'estimator__max_depth': options__maxdepth,
                   'estimator__n_estimators': options__nestimators},
        'cv_type': 'random'
    },
    # Two identically configured scaled runs plus one unscaled run with the same
    # single-candidate grid; presumably the duplicate is a repeatability check.
    'RF scaled 1 (scaled vs not)': {
        'model': RandomForestRegressor(random_state=101),
        'params': {'estimator__n_estimators': [5]},
        'cv_type': 'grid'
    },
    'RF scaled 2 (scaled vs not)': {
        'model': RandomForestRegressor(random_state=101),
        'params': {'estimator__n_estimators': [5]},
        'cv_type': 'grid'
    },
    'RF noscaled (scaled vs not)': {
        'model': RandomForestRegressor(random_state=101),
        'params': {'estimator__n_estimators': [5]},
        'cv_type': 'grid'
    }
}

helper = EstimatorPipeSelectionHelper(models_and_params)
helper.fit(X_train, y_train, scoring='neg_root_mean_squared_error', n_jobs=2, cv=2, n_iter=5)

# Scores are negated RMSE, so sorting descending puts the best candidate first.
score_summary = helper.score_summary(sort_by='max_score')
score_summary

In [None]:
if False:  # disabled: earlier plain-matplotlib version of the best/worst plots
    best_estimator = score_summary.iloc[0]
    worst_estimator = score_summary.iloc[-1]

    # One "actual vs predicted" figure each for the first and last summary row.
    for label, chosen in (("best", best_estimator), ("worst", worst_estimator)):
        name_chosen = chosen["estimator"]
        # params_full holds str(dict); literal_eval turns it back into the dict.
        params_chosen = ast.literal_eval(chosen["params_full"])
        print(label, params_chosen)

        if 'noscale' in name_chosen:
            preprocessor = features_noscale_preprocessor
        else:
            preprocessor = features_preprocessor
        # The model is looked up in models_and_params (this block used to read
        # an undefined `models1` dict and would have raised NameError).
        pipe = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('estimator', models_and_params[name_chosen]["model"]),
        ])
        predictions = pipe.set_params(**params_chosen).fit(X_train, y_train).predict(X_test)

        fig, ax = plt.subplots()
        ax.scatter(y_test, predictions, edgecolors=(0, 0, 1))
        # Dashed diagonal = perfect prediction.
        ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=3)
        ax.set_ylabel('Predicted')
        ax.set_xlabel('Actual')
        ax.title.set_text(f'{label} model: {name_chosen} {params_chosen}')


def make_pipe(name):
    """Build a fresh, unfitted preprocessing + estimator pipeline for ``name``.

    ``name`` must be a key of the module-level ``models_and_params``. A name
    containing ``'noscale'`` gets ``features_noscale_preprocessor``; any other
    name gets ``features_preprocessor``.

    The pipeline is returned as a clone, so calling ``set_params`` / ``fit`` on
    it does not modify the shared model in ``models_and_params`` or the shared
    preprocessor, and two pipelines built for the same name are independent.

    Raises ``KeyError`` if ``name`` is not in ``models_and_params``.
    """
    # Local import: the notebook's import cell does not bring `clone` into scope.
    from sklearn.base import clone

    if 'noscale' in name:
        preprocessor = features_noscale_preprocessor
    else:
        preprocessor = features_preprocessor

    template = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('estimator', models_and_params[name]["model"]),
    ])
    return clone(template)


PLOT_BEST_VS_WORST = True  # set to False to skip this figure (it refits two models)

if PLOT_BEST_VS_WORST:
    sns.set_theme(font_scale=1, rc=None)

    fig, axes = plt.subplots(ncols=3, figsize=(15, 5))
    plt.subplots_adjust(hspace=0.2, wspace=0.2)

    # score_summary is sorted best-first, so the first / last rows are the
    # best / worst candidates.
    best_estimator = score_summary.iloc[0]
    worst_estimator = score_summary.iloc[-1]

    name_best = best_estimator["estimator"]
    # params_full holds str(dict); literal_eval turns it back into the dict.
    params_best = ast.literal_eval(best_estimator["params_full"])

    name_worst = worst_estimator["estimator"]
    params_worst = ast.literal_eval(worst_estimator["params_full"])

    best_pipe = make_pipe(name_best)
    worst_pipe = make_pipe(name_worst)

    # Fit each pipeline exactly once and reuse its predictions in every panel.
    # Refitting per panel doubled the training time and, for an unseeded model,
    # made the comparison panel disagree with the individual panels.
    best_predictions = best_pipe.set_params(**params_best).fit(X_train, y_train).predict(X_test)
    worst_predictions = worst_pipe.set_params(**params_worst).fit(X_train, y_train).predict(X_test)

    # End points of the red "perfect prediction" diagonal.
    diagonal = [y_test.min(), y_test.max()]

    sns.lineplot(x=diagonal, y=diagonal, ax=axes[0], color='red')
    sns.scatterplot(x=y_test, y=best_predictions, ax=axes[0], s=100).set(
        title=f'"BEST" model: {name_best} \n{params_best}', xlabel='Actual', ylabel='Predicted')

    sns.lineplot(x=diagonal, y=diagonal, ax=axes[1], color='red')
    sns.scatterplot(x=y_test, y=worst_predictions, ax=axes[1], s=100).set(
        title=f'"WORST" model: {name_worst} \n{params_worst}', xlabel='Actual', ylabel='Predicted')

    # Overlay: worst drawn first so the semi-transparent best points sit on top.
    sns.scatterplot(x=y_test, y=worst_predictions, ax=axes[2], s=100, color='orange')
    sns.scatterplot(x=y_test, y=best_predictions, ax=axes[2], s=100, alpha=0.6, color='black').set(
        title='best (black) vs worst (orange)', xlabel='Actual', ylabel='Predicted')

    fig.tight_layout()
    plt.show()

PLOT_ALL_CANDIDATES = True  # set to False to skip (refits every candidate once)

if PLOT_ALL_CANDIDATES:
    max_horizontal = 3   # panels per row
    rows_per_figure = 2
    panels_per_figure = max_horizontal * rows_per_figure

    # Single theme call; leaves the same font scale and default figure size in
    # place as the previous sns.set(...) + two set_theme(...) calls did.
    sns.set_theme(font_scale=1, rc={"figure.figsize": (20, 20)})

    total_graphs = len(score_summary)
    # End points of the red "perfect prediction" diagonal.
    diagonal = [y_test.min(), y_test.max()]

    # One figure per page of `panels_per_figure` candidates, in summary order.
    for page_start in range(0, total_graphs, panels_per_figure):
        page = score_summary.iloc[page_start:page_start + panels_per_figure]

        fig, axes = plt.subplots(nrows=rows_per_figure, ncols=max_horizontal, figsize=(15, 10))
        plt.subplots_adjust(hspace=0.2, wspace=0.2)
        # Row-major flattening: panel i lands in row i // max_horizontal,
        # column i % max_horizontal.
        panel_axes = axes.flatten()

        for offset, (_, next_estimator) in enumerate(page.iterrows()):
            index = page_start + offset  # overall rank within score_summary
            name_next = next_estimator["estimator"]
            # params_full holds str(dict); literal_eval turns it back into the dict.
            params_next = ast.literal_eval(next_estimator["params_full"])

            pipe = make_pipe(name_next)
            predictions = pipe.set_params(**params_next).fit(X_train, y_train).predict(X_test)

            coordinates = panel_axes[offset]
            sns.lineplot(x=diagonal, y=diagonal, ax=coordinates, color='red')
            sns.scatterplot(x=y_test, y=predictions, ax=coordinates, s=100).set(
                title=f'({index}) {"BEST" if index == 0 else "next"} model: {name_next} \n{params_next}',
                xlabel='Actual', ylabel='Predicted')

        # Hide the panels a partially filled last page does not use.
        for unused in panel_axes[len(page):]:
            unused.set_axis_off()

        fig.tight_layout()
        plt.show()
        plt.close(fig)  # release the figure; many pages would otherwise pile up