Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ tests/__pycache__/*
build/*
dist/*
src/flexcode.egg-info/*
.eggs/
vignettes/.ipynb_checkpoints/
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,11 +120,11 @@ z_validation = z_data[perm[n_train:]]

# Fit the model
model = flexcode.FlexCodeModel(XGBoost, max_basis=40, basis_system='cosine',
regression_params={"num_round":2000})
regression_params={"max_depth": 8})
model.fit(x_train, z_train)
model.tune(x_validation, z_validation)

# Make predictions
cdes, z_grid = model.predict(x_test, n_grid=200)

```
```
16 changes: 9 additions & 7 deletions src/flexcode/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@

class FlexCodeModel(object):
def __init__(self, model, max_basis, basis_system="cosine",
z_min=None, z_max=None, regression_params={}):
z_min=None, z_max=None, regression_params={},
custom_model=None):
"""Initialize FlexCodeModel object

:param model: A FlexCodeRegression object
Expand All @@ -20,20 +21,21 @@ def __init__(self, model, max_basis, basis_system="cosine",
to the maximum of the training values
:param regression_params: A dictionary of tuning parameters
for the regression model

:param custom_model: a sklearn-type model, i.e. with fit and
predict method.
"""
self.max_basis = max_basis
self.best_basis = range(max_basis)
self.basis_system = basis_system
self.model = model(max_basis, regression_params)
self.model = model(max_basis, regression_params, custom_model)

self.z_min = z_min
self.z_max = z_max

self.bump_threshold = None
self.sharpen_alpha = None

def fit(self, x_train, z_train, weight = None):
def fit(self, x_train, z_train, weight=None):
"""Fits basis function regression models.

:param x_train: a numpy matrix of training covariates.
Expand All @@ -58,8 +60,8 @@ def fit(self, x_train, z_train, weight = None):

self.model.fit(x_train, z_basis, weight)

def tune(self, x_validation, z_validation, bump_threshold_grid =
None, sharpen_grid = None, n_grid=1000):
def tune(self, x_validation, z_validation, bump_threshold_grid=None,
sharpen_grid=None, n_grid=1000):
"""Set tuning parameters to minimize CDE loss

Sets best_basis, bump_delta, and sharpen_alpha values attributes
Expand Down Expand Up @@ -149,7 +151,7 @@ def predict(self, x_new, n_grid):
cdes /= self.z_max - self.z_min
return cdes, make_grid(n_grid, self.z_min, self.z_max)

def estimate_error(self, x_test, z_test, n_grid = 1000):
def estimate_error(self, x_test, z_test, n_grid=1000):
"""Estimates CDE loss on test data

:param x_test: A numpy matrix of covariates
Expand Down
96 changes: 93 additions & 3 deletions src/flexcode/regression_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import sklearn.neighbors
import sklearn.multioutput
import sklearn.model_selection
import sklearn.linear_model
SKLEARN_AVAILABLE = True
except ImportError:
SKLEARN_AVAILABLE = False
Expand All @@ -28,7 +29,7 @@ def predict(self, x_new):


class NN(FlexCodeRegression):
def __init__(self, max_basis, params):
def __init__(self, max_basis, params, *args, **kwargs):
if not SKLEARN_AVAILABLE:
raise Exception("NN requires sklearn to be installed")

Expand Down Expand Up @@ -74,7 +75,7 @@ def predict(self, x_test):


class RandomForest(FlexCodeRegression):
def __init__(self, max_basis, params):
def __init__(self, max_basis, params, *args, **kwargs):
if not SKLEARN_AVAILABLE:
raise Exception("RandomForest requires sklearn to be installed")

Expand Down Expand Up @@ -112,7 +113,7 @@ def predict(self, x_test):


class XGBoost(FlexCodeRegression):
def __init__(self, max_basis, params):
def __init__(self, max_basis, params, *args, **kwargs):
if not XGBOOST_AVAILABLE:
raise Exception("XGBoost requires xgboost to be installed")
super(XGBoost, self).__init__(max_basis)
Expand Down Expand Up @@ -159,3 +160,92 @@ def predict(self, x_test):
coefs = self.models.predict(x_test)
return coefs


class Lasso(FlexCodeRegression):
    """Basis-coefficient regression backed by sklearn's ElasticNet.

    One ElasticNet is wrapped in a
    ``sklearn.multioutput.MultiOutputRegressor`` so that every basis
    coefficient gets its own regressor.  ``alpha`` and ``l1_ratio`` both
    default to 1.0 when they are absent from ``params``.
    """

    def __init__(self, max_basis, params, *args, **kwargs):
        """Set up the regressor, or defer its construction to ``cv_optim``.

        :param max_basis: number of basis functions; forwarded to the
            base-class constructor
        :param params: dictionary of ElasticNet tuning parameters.  The
            dictionary is copied and never modified in place.
        :param args: ignored; accepted so that extra positional
            arguments (e.g. a custom model) can be passed uniformly
        :param kwargs: ignored, same reason as ``args``
        :raises Exception: if sklearn could not be imported
        """
        if not SKLEARN_AVAILABLE:
            raise Exception("Lasso requires sklearn to be installed")
        super(Lasso, self).__init__(max_basis)

        # Copy before filling in defaults: the caller's dictionary may be
        # shared (e.g. a mutable default argument), and writing 'alpha' /
        # 'l1_ratio' into it would leak those keys into other models.
        params = dict(params or {})
        params.setdefault("alpha", 1.0)
        params.setdefault("l1_ratio", 1.0)

        # opt_flag presumably signals that params holds candidate grids
        # rather than single values -- confirm against
        # params_dict_optim_decision.  When set, the model is built
        # lazily by cv_optim() on the first call to fit().
        params_opt, opt_flag = params_dict_optim_decision(params, multi_output=True)
        self.params = params_opt
        self.models = None if opt_flag else self._build_model()

    def _build_model(self):
        """Return a multi-output ElasticNet configured with ``self.params``."""
        return sklearn.multioutput.MultiOutputRegressor(
            sklearn.linear_model.ElasticNet(**self.params), n_jobs=-1
        )

    def fit(self, x_train, z_basis, weight=None):
        """Fit the regressors, running the grid search first if needed.

        :param x_train: training covariates
        :param z_basis: basis-expansion targets, one column per basis
            function (assumed from the multi-output wrapper -- confirm)
        :param weight: must be None; sample weights are rejected
        :raises ValueError: if ``weight`` is not None
        """
        if weight is not None:
            raise ValueError('Weights are not supported in the ElasticNet/Lasso '
                             'implementation in sklearn.')

        if self.models is None:
            self.cv_optim(x_train, z_basis)

        self.models.fit(x_train, z_basis)

    def cv_optim(self, x_train, z_basis):
        """Select parameters by 5-fold grid search and rebuild the model.

        ``self.params`` is used as the search grid and is then replaced
        by the best parameter set, with the ``estimator__`` prefix passed
        to ``params_name_format`` for removal.

        :param x_train: training covariates
        :param z_basis: basis-expansion targets
        """
        lasso_obj = sklearn.multioutput.MultiOutputRegressor(
            sklearn.linear_model.ElasticNet(), n_jobs=-1
        )
        clf = sklearn.model_selection.GridSearchCV(
            lasso_obj, self.params, cv=5, scoring='neg_mean_squared_error', verbose=2
        )
        clf.fit(x_train, z_basis)

        self.params = params_name_format(clf.best_params_, str_rem='estimator__')
        self.models = self._build_model()

    def predict(self, x_test):
        """Return the predicted basis coefficients for ``x_test``.

        :param x_test: covariates to predict for
        :returns: output of the fitted multi-output regressor's
            ``predict``
        """
        coefs = self.models.predict(x_test)
        return coefs


class CustomModel(FlexCodeRegression):
    """Basis-coefficient regression using a user-supplied estimator class.

    ``custom_model`` is called with the tuning parameters as keyword
    arguments to create the estimator, which is then wrapped in a
    ``sklearn.multioutput.MultiOutputRegressor``.
    """

    def __init__(self, max_basis, params, custom_model, *args, **kwargs):
        """Set up the regressor, or defer its construction to ``cv_optim``.

        :param max_basis: number of basis functions; forwarded to the
            base-class constructor
        :param params: dictionary of tuning parameters for
            ``custom_model``
        :param custom_model: a callable (typically an sklearn-style
            estimator class with ``fit`` and ``predict``) that returns a
            new estimator when called with keyword parameters
        :param args: ignored
        :param kwargs: ignored
        :raises Exception: if sklearn could not be imported
        :raises TypeError: if ``custom_model`` is not callable
        """
        if not SKLEARN_AVAILABLE:
            raise Exception("Custom class requires sklearn to be installed")
        super(CustomModel, self).__init__(max_basis)

        # Fail early with a readable message instead of the opaque
        # "'NoneType' object is not callable" that would otherwise only
        # surface when the estimator is first instantiated.
        if not callable(custom_model):
            raise TypeError(
                "custom_model must be a callable returning an estimator "
                "(e.g. an sklearn-style class), got %r" % (custom_model,)
            )

        # opt_flag presumably signals that params holds candidate grids
        # rather than single values -- confirm against
        # params_dict_optim_decision.  When set, the model is built
        # lazily by cv_optim() on the first call to fit().
        params_opt, opt_flag = params_dict_optim_decision(params, multi_output=True)
        self.params = params_opt
        self.base_model = custom_model
        self.models = None if opt_flag else self._build_model()

    def _build_model(self):
        """Return a multi-output wrapper around a freshly built estimator."""
        return sklearn.multioutput.MultiOutputRegressor(
            self.base_model(**self.params), n_jobs=-1
        )

    def fit(self, x_train, z_basis, weight=None):
        """Fit the regressors, running the grid search first if needed.

        :param x_train: training covariates
        :param z_basis: basis-expansion targets
        :param weight: must be None; sample weights are not implemented
            for custom models
        :raises NotImplementedError: if ``weight`` is not None
        """
        # Compare against None explicitly: truth-testing a numpy array
        # raises ValueError, and an empty or all-zero weight would
        # otherwise be accepted silently.
        if weight is not None:
            raise NotImplementedError('Weights for custom class not implemented.')

        if self.models is None:
            self.cv_optim(x_train, z_basis)

        self.models.fit(x_train, z_basis)

    def cv_optim(self, x_train, z_basis):
        """Select parameters by 5-fold grid search and rebuild the model.

        ``self.params`` is used as the search grid and is then replaced
        by the best parameter set, with the ``estimator__`` prefix passed
        to ``params_name_format`` for removal.

        :param x_train: training covariates
        :param z_basis: basis-expansion targets
        """
        custom_obj = sklearn.multioutput.MultiOutputRegressor(
            self.base_model(), n_jobs=-1
        )
        clf = sklearn.model_selection.GridSearchCV(
            custom_obj, self.params, cv=5, scoring='neg_mean_squared_error', verbose=2
        )
        clf.fit(x_train, z_basis)

        self.params = params_name_format(clf.best_params_, str_rem='estimator__')
        self.models = self._build_model()

    def predict(self, x_test):
        """Return the predicted basis coefficients for ``x_test``.

        :param x_test: covariates to predict for
        :returns: output of the fitted multi-output regressor's
            ``predict``
        """
        coefs = self.models.predict(x_test)
        return coefs

35 changes: 34 additions & 1 deletion tests/test_cv_optim.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import flexcode
import numpy as np
from flexcode.regression_models import NN, RandomForest, XGBoost
import xgboost as xgb
from flexcode.regression_models import NN, RandomForest, XGBoost, CustomModel


def test_coef_predict_same_as_predict_NN():
Expand Down Expand Up @@ -88,4 +89,36 @@ def generate_data(n_draws):
coefs = model.predict_coefs(x_test)
cdes_coefs = coefs.evaluate(z_grid)

assert np.max(np.abs(cdes_predict - cdes_coefs)) <= 1e-4


def test_coef_predict_same_as_predict_custom_model():
    """Check ``predict`` against densities rebuilt from ``predict_coefs``.

    The model is a CustomModel wrapping xgboost's XGBRegressor and is
    configured with list-valued regression parameters.
    """
    def _draw(count):
        # Simulated data with p(z | x) = N(x, 1) and x ~ N(0, 1).
        covariates = np.random.normal(0, 1, count)
        responses = np.random.normal(covariates, 1, count)
        return covariates, responses

    # Draw order (train, validation, test) matters for the global RNG.
    n_obs = 5000
    train, validation, test = [_draw(n_obs) for _ in range(3)]
    x_train, z_train = train
    x_validation, z_validation = validation
    x_test, z_test = test

    # Candidate values for each tuning parameter.
    candidate_params = {"max_depth": [3, 5, 8],
                        'eta': [0.1, 0.2, 0.5]}
    model = flexcode.FlexCodeModel(
        CustomModel,
        max_basis=31,
        basis_system="cosine",
        regression_params=candidate_params,
        custom_model=xgb.XGBRegressor,
    )

    model.fit(x_train, z_train)
    model.tune(
        x_validation,
        z_validation,
        bump_threshold_grid=np.linspace(0, 0.2, 3),
        sharpen_grid=np.linspace(0.5, 1.5, 3),
    )

    # Direct density estimates versus densities evaluated from the
    # predicted coefficients on the same grid.
    cdes_predict, z_grid = model.predict(x_test, n_grid=200)
    cdes_coefs = model.predict_coefs(x_test).evaluate(z_grid)

    largest_gap = np.max(np.abs(cdes_predict - cdes_coefs))
    assert largest_gap <= 1e-4
67 changes: 66 additions & 1 deletion tests/test_examples.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import numpy as np
import flexcode
from flexcode.regression_models import NN, RandomForest, XGBoost
import xgboost as xgb
from flexcode.regression_models import NN, RandomForest, XGBoost, Lasso, CustomModel


def test_example():
# Generate data p(z | x) = N(x, 1)
Expand Down Expand Up @@ -58,6 +60,7 @@ def generate_data(n_draws):

assert True


def test_coef_predict_same_as_predict():
# Generate data p(z | x) = N(x, 1)
def generate_data(n_draws):
Expand Down Expand Up @@ -86,6 +89,7 @@ def generate_data(n_draws):

assert np.max(np.abs(cdes_predict - cdes_coefs)) <= 1e-4


def test_coef_predict_same_as_predict_rf():

# Generate data p(z | x) = N(x, 1)
Expand Down Expand Up @@ -143,3 +147,64 @@ def generate_data(n_draws):
cdes_coefs = coefs.evaluate(z_grid)

assert np.max(np.abs(cdes_predict - cdes_coefs)) <= 1e-4


def test_coef_predict_same_as_predict_lasso():
    """Check ``predict`` against densities rebuilt from ``predict_coefs``.

    The model uses the Lasso regression backend with ``alpha`` fixed
    at 1.0.
    """
    def _draw(count):
        # Simulated data with p(z | x) = N(x, 1) and x ~ N(0, 1).
        covariates = np.random.normal(0, 1, count)
        responses = np.random.normal(covariates, 1, count)
        return covariates, responses

    # Draw order (train, validation, test) matters for the global RNG.
    n_obs = 10000
    train, validation, test = [_draw(n_obs) for _ in range(3)]
    x_train, z_train = train
    x_validation, z_validation = validation
    x_test, z_test = test

    model = flexcode.FlexCodeModel(
        Lasso,
        max_basis=31,
        basis_system="cosine",
        regression_params={"alpha": 1.0},
    )

    model.fit(x_train, z_train)
    model.tune(
        x_validation,
        z_validation,
        bump_threshold_grid=np.linspace(0, 0.2, 3),
        sharpen_grid=np.linspace(0.5, 1.5, 3),
    )

    # Direct density estimates versus densities evaluated from the
    # predicted coefficients on the same grid.
    cdes_predict, z_grid = model.predict(x_test, n_grid=200)
    cdes_coefs = model.predict_coefs(x_test).evaluate(z_grid)

    # NOTE(review): this tolerance is far looser than the 1e-4 used by
    # the sibling tests -- confirm that 0.5 is intentional.
    largest_gap = np.max(np.abs(cdes_predict - cdes_coefs))
    assert largest_gap <= 0.5


def test_coef_predict_same_as_predict_custom_class():
    """Check ``predict`` against densities rebuilt from ``predict_coefs``.

    The model is a CustomModel wrapping xgboost's XGBRegressor with a
    single fixed ``max_depth``.
    """
    def _draw(count):
        # Simulated data with p(z | x) = N(x, 1) and x ~ N(0, 1).
        covariates = np.random.normal(0, 1, count)
        responses = np.random.normal(covariates, 1, count)
        return covariates, responses

    # Draw order (train, validation, test) matters for the global RNG.
    n_obs = 10000
    train, validation, test = [_draw(n_obs) for _ in range(3)]
    x_train, z_train = train
    x_validation, z_validation = validation
    x_test, z_test = test

    model = flexcode.FlexCodeModel(
        CustomModel,
        max_basis=31,
        basis_system="cosine",
        regression_params={"max_depth": 5},
        custom_model=xgb.XGBRegressor,
    )

    model.fit(x_train, z_train)
    model.tune(
        x_validation,
        z_validation,
        bump_threshold_grid=np.linspace(0, 0.2, 3),
        sharpen_grid=np.linspace(0.5, 1.5, 3),
    )

    # Direct density estimates versus densities evaluated from the
    # predicted coefficients on the same grid.
    cdes_predict, z_grid = model.predict(x_test, n_grid=200)
    cdes_coefs = model.predict_coefs(x_test).evaluate(z_grid)

    largest_gap = np.max(np.abs(cdes_predict - cdes_coefs))
    assert largest_gap <= 1e-4

Loading