From ad1d128627c19413d29f1be7fc474986fb81f7fe Mon Sep 17 00:00:00 2001 From: Nic Dalmasso Date: Thu, 20 Jun 2019 21:31:43 -0400 Subject: [PATCH 1/5] Changes to the README.md to reflect changes in code --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1046a42..589ac9c 100644 --- a/README.md +++ b/README.md @@ -120,11 +120,11 @@ z_validation = z_data[perm[n_train:]] # Fit the model model = flexcode.FlexCodeModel(XGBoost, max_basis=40, basis_system='cosine', - regression_params={"num_round":2000}) + regression_params={"max_depth": 8}) model.fit(x_train, z_train) model.tune(x_validation, z_validation) # Make predictions cdes, z_grid = model.predict(x_test, n_grid=200) -``` \ No newline at end of file +``` From 07ff007352204efdf3b6676c3544d8c8bcfeb81d Mon Sep 17 00:00:00 2001 From: Nic Dalmasso Date: Mon, 8 Jul 2019 11:09:16 -0400 Subject: [PATCH 2/5] Adding Lasso to Flexcode, in the same way it is done in the R package - i.e. using ElasticNet implementation --- src/flexcode/regression_models.py | 47 +++++++++++++++++++++++++++++++ tests/test_examples.py | 31 +++++++++++++++++++- 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/src/flexcode/regression_models.py b/src/flexcode/regression_models.py index 6f8d780..fa46474 100644 --- a/src/flexcode/regression_models.py +++ b/src/flexcode/regression_models.py @@ -12,6 +12,7 @@ import sklearn.neighbors import sklearn.multioutput import sklearn.model_selection + import sklearn.linear_model SKLEARN_AVAILABLE = True except ImportError: SKLEARN_AVAILABLE = False @@ -159,3 +160,49 @@ def predict(self, x_test): coefs = self.models.predict(x_test) return coefs + +class Lasso(FlexCodeRegression): + def __init__(self, max_basis, params): + if not SKLEARN_AVAILABLE: + raise Exception("Lasso requires sklearn to be installed") + super(Lasso, self).__init__(max_basis) + + # Also, set the default values if not passed + params['alpha'] = params.get("alpha", 1.0) + 
params['l1_ratio'] = params.get("l1_ratio", 1.0) + + params_opt, opt_flag = params_dict_optim_decision(params, multi_output=True) + self.params = params_opt + self.models = None if opt_flag else sklearn.multioutput.MultiOutputRegressor( + sklearn.linear_model.ElasticNet(**self.params), n_jobs=-1 + ) + + def fit(self, x_train, z_basis, weight=None): + + if weight is not None: + raise ValueError('Weights are not supported in the ElasticNet/Lasso ' + 'implementation in sklearn.') + + if self.models is None: + self.cv_optim(x_train, z_basis) + + self.models.fit(x_train, z_basis) + + def cv_optim(self, x_train, z_basis): + xgb_obj = sklearn.multioutput.MultiOutputRegressor( + sklearn.linear_model.ElasticNet(), n_jobs=-1 + ) + clf = sklearn.model_selection.GridSearchCV( + xgb_obj, self.params, cv=5, scoring='neg_mean_squared_error', verbose=2 + ) + clf.fit(x_train, z_basis) + + self.params = params_name_format(clf.best_params_, str_rem='estimator__') + self.models = sklearn.multioutput.MultiOutputRegressor( + sklearn.linear_model.ElasticNet(**self.params), n_jobs=-1 + ) + + def predict(self, x_test): + coefs = self.models.predict(x_test) + return coefs + diff --git a/tests/test_examples.py b/tests/test_examples.py index c8db826..930bb82 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -1,6 +1,6 @@ import numpy as np import flexcode -from flexcode.regression_models import NN, RandomForest, XGBoost +from flexcode.regression_models import NN, RandomForest, XGBoost, Lasso def test_example(): # Generate data p(z | x) = N(x, 1) @@ -143,3 +143,32 @@ def generate_data(n_draws): cdes_coefs = coefs.evaluate(z_grid) assert np.max(np.abs(cdes_predict - cdes_coefs)) <= 1e-4 + + +def test_coef_predict_same_as_predict_lasso(): + # Generate data p(z | x) = N(x, 1) + def generate_data(n_draws): + x = np.random.normal(0, 1, n_draws) + z = np.random.normal(x, 1, n_draws) + return x, z + + x_train, z_train = generate_data(10000) + x_validation, z_validation = 
generate_data(10000) + x_test, z_test = generate_data(10000) + + # Parameterize model + model = flexcode.FlexCodeModel(Lasso, max_basis=31, basis_system="cosine", + regression_params={"alpha": 1.0}) + + # Fit and tune model + model.fit(x_train, z_train) + model.tune(x_validation, z_validation, + bump_threshold_grid=np.linspace(0, 0.2, 3), + sharpen_grid=np.linspace(0.5, 1.5, 3)) + + cdes_predict, z_grid = model.predict(x_test, n_grid=200) + + coefs = model.predict_coefs(x_test) + cdes_coefs = coefs.evaluate(z_grid) + + assert np.max(np.abs(cdes_predict - cdes_coefs)) <= 0.1 From 8f067d1e3f52d81ad8200e2095dc8a77b5ca58fd Mon Sep 17 00:00:00 2001 From: Nic Dalmasso Date: Mon, 8 Jul 2019 15:13:08 -0400 Subject: [PATCH 3/5] Add a custom class implementation. The user can now pass its favorite regression method to be used within FlexCode. The model needs to have a and method implemented to work --- .gitignore | 1 + src/flexcode/core.py | 16 +++++---- src/flexcode/regression_models.py | 55 +++++++++++++++++++++++++++---- tests/test_cv_optim.py | 35 +++++++++++++++++++- tests/test_examples.py | 38 ++++++++++++++++++++- 5 files changed, 130 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 68f147a..70ee039 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ tests/__pycache__/* build/* dist/* src/flexcode.egg-info/* +.eggs/ diff --git a/src/flexcode/core.py b/src/flexcode/core.py index ca92cf1..c17ec81 100644 --- a/src/flexcode/core.py +++ b/src/flexcode/core.py @@ -8,7 +8,8 @@ class FlexCodeModel(object): def __init__(self, model, max_basis, basis_system="cosine", - z_min=None, z_max=None, regression_params={}): + z_min=None, z_max=None, regression_params={}, + custom_model=None): """Initialize FlexCodeModel object :param model: A FlexCodeRegression object @@ -20,12 +21,13 @@ def __init__(self, model, max_basis, basis_system="cosine", to the maximum of the training values :param regression_params: A dictionary of tuning parameters for the 
regression model - + :param custom_model: a sklearn-type model, i.e. with fit and + predict method. """ self.max_basis = max_basis self.best_basis = range(max_basis) self.basis_system = basis_system - self.model = model(max_basis, regression_params) + self.model = model(max_basis, regression_params, custom_model) self.z_min = z_min self.z_max = z_max @@ -33,7 +35,7 @@ def __init__(self, model, max_basis, basis_system="cosine", self.bump_threshold = None self.sharpen_alpha = None - def fit(self, x_train, z_train, weight = None): + def fit(self, x_train, z_train, weight=None): """Fits basis function regression models. :param x_train: a numpy matrix of training covariates. @@ -58,8 +60,8 @@ def fit(self, x_train, z_train, weight = None): self.model.fit(x_train, z_basis, weight) - def tune(self, x_validation, z_validation, bump_threshold_grid = - None, sharpen_grid = None, n_grid=1000): + def tune(self, x_validation, z_validation, bump_threshold_grid=None, + sharpen_grid=None, n_grid=1000): """Set tuning parameters to minimize CDE loss Sets best_basis, bump_delta, and sharpen_alpha values attributes @@ -149,7 +151,7 @@ def predict(self, x_new, n_grid): cdes /= self.z_max - self.z_min return cdes, make_grid(n_grid, self.z_min, self.z_max) - def estimate_error(self, x_test, z_test, n_grid = 1000): + def estimate_error(self, x_test, z_test, n_grid=1000): """Estimates CDE loss on test data :param x_test: A numpy matrix of covariates diff --git a/src/flexcode/regression_models.py b/src/flexcode/regression_models.py index fa46474..6c7681e 100644 --- a/src/flexcode/regression_models.py +++ b/src/flexcode/regression_models.py @@ -29,7 +29,7 @@ def predict(self, x_new): class NN(FlexCodeRegression): - def __init__(self, max_basis, params): + def __init__(self, max_basis, params, *args, **kwargs): if not SKLEARN_AVAILABLE: raise Exception("NN requires sklearn to be installed") @@ -75,7 +75,7 @@ def predict(self, x_test): class RandomForest(FlexCodeRegression): - def 
__init__(self, max_basis, params): + def __init__(self, max_basis, params, *args, **kwargs): if not SKLEARN_AVAILABLE: raise Exception("RandomForest requires sklearn to be installed") @@ -113,7 +113,7 @@ def predict(self, x_test): class XGBoost(FlexCodeRegression): - def __init__(self, max_basis, params): + def __init__(self, max_basis, params, *args, **kwargs): if not XGBOOST_AVAILABLE: raise Exception("XGBoost requires xgboost to be installed") super(XGBoost, self).__init__(max_basis) @@ -162,7 +162,7 @@ def predict(self, x_test): class Lasso(FlexCodeRegression): - def __init__(self, max_basis, params): + def __init__(self, max_basis, params, *args, **kwargs): if not SKLEARN_AVAILABLE: raise Exception("Lasso requires sklearn to be installed") super(Lasso, self).__init__(max_basis) @@ -189,11 +189,11 @@ def fit(self, x_train, z_basis, weight=None): self.models.fit(x_train, z_basis) def cv_optim(self, x_train, z_basis): - xgb_obj = sklearn.multioutput.MultiOutputRegressor( + lasso_obj = sklearn.multioutput.MultiOutputRegressor( sklearn.linear_model.ElasticNet(), n_jobs=-1 ) clf = sklearn.model_selection.GridSearchCV( - xgb_obj, self.params, cv=5, scoring='neg_mean_squared_error', verbose=2 + lasso_obj, self.params, cv=5, scoring='neg_mean_squared_error', verbose=2 ) clf.fit(x_train, z_basis) @@ -206,3 +206,46 @@ def predict(self, x_test): coefs = self.models.predict(x_test) return coefs + +class CustomModel(FlexCodeRegression): + def __init__(self, max_basis, params, custom_model, *args, **kwargs): + if not SKLEARN_AVAILABLE: + raise Exception("Custom class requires sklearn to be installed") + super(CustomModel, self).__init__(max_basis) + + params_opt, opt_flag = params_dict_optim_decision(params, multi_output=True) + self.params = params_opt + self.base_model = custom_model + self.models = None if opt_flag else sklearn.multioutput.MultiOutputRegressor( + self.base_model(**self.params), n_jobs=-1 + ) + + def fit(self, x_train, z_basis, weight=None): + # Given it's 
a custom class, work would need to be done + # for sample weights - for now this is not implemented. + if weight: + raise NotImplementedError('Weights for custom class not implemented.') + + if self.models is None: + self.cv_optim(x_train, z_basis) + + self.models.fit(x_train, z_basis) + + def cv_optim(self, x_train, z_basis): + custom_obj = sklearn.multioutput.MultiOutputRegressor( + self.base_model(), n_jobs=-1 + ) + clf = sklearn.model_selection.GridSearchCV( + custom_obj, self.params, cv=5, scoring='neg_mean_squared_error', verbose=2 + ) + clf.fit(x_train, z_basis) + + self.params = params_name_format(clf.best_params_, str_rem='estimator__') + self.models = sklearn.multioutput.MultiOutputRegressor( + self.base_model(**self.params), n_jobs=-1 + ) + + def predict(self, x_test): + coefs = self.models.predict(x_test) + return coefs + diff --git a/tests/test_cv_optim.py b/tests/test_cv_optim.py index 08233b9..0c3d2e2 100644 --- a/tests/test_cv_optim.py +++ b/tests/test_cv_optim.py @@ -1,6 +1,7 @@ import flexcode import numpy as np -from flexcode.regression_models import NN, RandomForest, XGBoost +import xgboost as xgb +from flexcode.regression_models import NN, RandomForest, XGBoost, CustomModel def test_coef_predict_same_as_predict_NN(): @@ -88,4 +89,36 @@ def generate_data(n_draws): coefs = model.predict_coefs(x_test) cdes_coefs = coefs.evaluate(z_grid) + assert np.max(np.abs(cdes_predict - cdes_coefs)) <= 1e-4 + + +def test_coef_predict_same_as_predict_custom_model(): + # Generate data p(z | x) = N(x, 1) + def generate_data(n_draws): + x = np.random.normal(0, 1, n_draws) + z = np.random.normal(x, 1, n_draws) + return x, z + + x_train, z_train = generate_data(5000) + x_validation, z_validation = generate_data(5000) + x_test, z_test = generate_data(5000) + + # Parameterize model + custom_model = xgb.XGBRegressor + model = flexcode.FlexCodeModel(CustomModel, max_basis=31, basis_system="cosine", + regression_params={"max_depth": [3, 5, 8], + 'eta': [0.1, 0.2, 0.5]}, 
+ custom_model=custom_model) + + # Fit and tune model + model.fit(x_train, z_train) + model.tune(x_validation, z_validation, + bump_threshold_grid = np.linspace(0, 0.2, 3), + sharpen_grid = np.linspace(0.5, 1.5, 3)) + + cdes_predict, z_grid = model.predict(x_test, n_grid=200) + + coefs = model.predict_coefs(x_test) + cdes_coefs = coefs.evaluate(z_grid) + assert np.max(np.abs(cdes_predict - cdes_coefs)) <= 1e-4 \ No newline at end of file diff --git a/tests/test_examples.py b/tests/test_examples.py index 930bb82..2a7a31a 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -1,6 +1,8 @@ import numpy as np import flexcode -from flexcode.regression_models import NN, RandomForest, XGBoost, Lasso +import xgboost as xgb +from flexcode.regression_models import NN, RandomForest, XGBoost, Lasso, CustomModel + def test_example(): # Generate data p(z | x) = N(x, 1) @@ -58,6 +60,7 @@ def generate_data(n_draws): assert True + def test_coef_predict_same_as_predict(): # Generate data p(z | x) = N(x, 1) def generate_data(n_draws): @@ -86,6 +89,7 @@ def generate_data(n_draws): assert np.max(np.abs(cdes_predict - cdes_coefs)) <= 1e-4 + def test_coef_predict_same_as_predict_rf(): # Generate data p(z | x) = N(x, 1) @@ -172,3 +176,35 @@ def generate_data(n_draws): cdes_coefs = coefs.evaluate(z_grid) assert np.max(np.abs(cdes_predict - cdes_coefs)) <= 0.1 + + +def test_coef_predict_same_as_predict_custom_class(): + # Generate data p(z | x) = N(x, 1) + def generate_data(n_draws): + x = np.random.normal(0, 1, n_draws) + z = np.random.normal(x, 1, n_draws) + return x, z + + x_train, z_train = generate_data(10000) + x_validation, z_validation = generate_data(10000) + x_test, z_test = generate_data(10000) + + # Parameterize model + custom_model = xgb.XGBRegressor + model = flexcode.FlexCodeModel(CustomModel, max_basis=31, basis_system="cosine", + regression_params={"max_depth": 5}, + custom_model=custom_model) + + # Fit and tune model + model.fit(x_train, z_train) + 
model.tune(x_validation, z_validation, + bump_threshold_grid=np.linspace(0, 0.2, 3), + sharpen_grid=np.linspace(0.5, 1.5, 3)) + + cdes_predict, z_grid = model.predict(x_test, n_grid=200) + + coefs = model.predict_coefs(x_test) + cdes_coefs = coefs.evaluate(z_grid) + + assert np.max(np.abs(cdes_predict - cdes_coefs)) <= 1e-4 + From 27b7df274280853b63fa7033cc0a9614d37ea3b7 Mon Sep 17 00:00:00 2001 From: Nic Dalmasso Date: Mon, 8 Jul 2019 15:27:37 -0400 Subject: [PATCH 4/5] Adding vignette on Custom Class --- .gitignore | 1 + vignettes/Custom Class.ipynb | 158 +++++++++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+) create mode 100644 vignettes/Custom Class.ipynb diff --git a/.gitignore b/.gitignore index 70ee039..90cacf2 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ build/* dist/* src/flexcode.egg-info/* .eggs/ +vignettes/.ipynb_checkpoints/ diff --git a/vignettes/Custom Class.ipynb b/vignettes/Custom Class.ipynb new file mode 100644 index 0000000..4ff581a --- /dev/null +++ b/vignettes/Custom Class.ipynb @@ -0,0 +1,158 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook provides an example on how to use a custom class within Flexcode.
\n", + "In order to be compatible, a regression method needs to have a `fit` and `predict` method implemented - i.e. \n", + "`model.fit()` and `model.predict()` need to be the functions used for training and predicting respectively.\n", + "\n", + "We provide here an example with artificial data.
\n", + "We compare the FlexZBoost (Flexcode with builtin XGBoost) with the custom class of Flexcode when passing\n", + "XGBoost Regressor. The two should give basically identical results." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import flexcode\n", + "import numpy as np\n", + "import xgboost as xgb\n", + "from flexcode.regression_models import XGBoost, CustomModel" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Creation" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_data(n_draws):\n", + " x = np.random.normal(0, 1, n_draws)\n", + " z = np.random.normal(x, 1, n_draws)\n", + " return x, z\n", + "\n", + "x_train, z_train = generate_data(5000)\n", + "x_test, z_test = generate_data(5000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## FlexZBoost" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Parameterize model\n", + "model = flexcode.FlexCodeModel(XGBoost, max_basis=31, basis_system=\"cosine\",\n", + " regression_params={'max_depth': 3, 'learning_rate': 0.5, 'objective': 'reg:linear'})\n", + "\n", + "# Fit and tune model\n", + "model.fit(x_train, z_train)\n", + "cdes_predict_xgb, z_grid = model.predict(x_test, n_grid=200)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Custom Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our custom model in this case is going to be XGBRegressor.
\n", + "The only difference from the above is that we are going to use the `CustomModel` class and we are going to pass\n", + "XGBRegressor as `custom_model`.\n", + "After that, everything is exactly as above.
\n", + "\n", + "Parameters can be passed also in the same way as above." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Parameterize model\n", + "my_model = xgb.XGBRegressor\n", + "model_c = flexcode.FlexCodeModel(CustomModel, max_basis=31, basis_system=\"cosine\",\n", + " regression_params={'max_depth': 3, 'learning_rate': 0.5, 'objective': 'reg:linear'},\n", + " custom_model=my_model)\n", + "\n", + "# Fit and tune model\n", + "model_c.fit(x_train, z_train)\n", + "cdes_predict_custom, z_grid = model_c.predict(x_test, n_grid=200)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The two conditional density estimates should be the same across the board.
\n", + "We check the maximum difference in absolute value between the two." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.max(np.abs(cdes_predict_custom - cdes_predict_xgb))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From c8f442115519ee372de109da77a8c9ddf2652a35 Mon Sep 17 00:00:00 2001 From: Nic Dalmasso Date: Mon, 8 Jul 2019 15:54:34 -0400 Subject: [PATCH 5/5] Adding test correction --- tests/test_examples.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_examples.py b/tests/test_examples.py index 2a7a31a..477c11e 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -175,7 +175,7 @@ def generate_data(n_draws): coefs = model.predict_coefs(x_test) cdes_coefs = coefs.evaluate(z_grid) - assert np.max(np.abs(cdes_predict - cdes_coefs)) <= 0.1 + assert np.max(np.abs(cdes_predict - cdes_coefs)) <= 0.5 def test_coef_predict_same_as_predict_custom_class():