In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from datasets import dataset_info, dataset_load

## Data Experiments

Now let's take a look at a particular real-life data problem. In the following example we take a look at the prediction of ozone concentration as a function of other weather-based features. As with all data problems, it behooves us to take a look at all of the information that we have about the dataset.

In [None]:
# Show whatever metadata the `datasets` helper module provides for 'laozone'
# before loading it (presumably a description of the columns -- TODO confirm
# against `datasets.dataset_info`).
dataset_info('laozone')

Now let's take a look at what this dataset looks like.

In [None]:
# Load the LA ozone dataset. The cells below treat `data` as a pandas
# DataFrame with an 'ozone' column followed by the feature columns.
data = dataset_load('laozone')

# Report the size, then show only the first rows through the notebook's rich
# display instead of dumping the whole frame as plain text.
print("Dataset shape: ", data.shape)
data.head()

Alright, we're ready to get started! Now, before we touch anything, we need to follow best practices. When faced with a new dataset, we need to set up some kind of objective comparison. To do this, we need to split our dataset into three parts: **Training** (and, within that, **Validation**) and **Testing** sets.

The best practice here is to take the test data and lock it away somewhere. It is always tempting to tune your algorithms to give the best test performance. However, even if the regression isn't explicitly *trained* on the test data, as practitioners, we could be continually making changes in an effort to get our numbers up.

Instead, we should deep-freeze the test data, and then tune as much as we can via **cross-validation (CV)** on our training data.

In [None]:
from sklearn.model_selection import train_test_split, KFold

# Convert from DataFrame to float arrays.
# `DataFrame.as_matrix()` was removed in pandas 1.0; `to_numpy()` is the
# supported replacement.
y = data['ozone'].to_numpy(dtype=float)

# The nine features are taken by *position* (columns 1..9). This is the same
# slice the plotting cells use for tick labels (`data.keys()[1:10]`), and it
# does not depend on integer-label fallback, which fails on a frame whose
# columns are labelled with strings.
X = data.iloc[:, 1:10].to_numpy(dtype=float)

# Split dataset: 25% is held out as the test set; the seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

print("Training samples: ", len(y_train))
print("Testing samples: ", len(y_test))

Now, before we start attempting to fit models, let's take a bit of care and apply some pre-processing to our dataset. The de-facto pre-processing is *centering and normalization*. Specifically, many flavors of estimators (OLS, RR, etc.) can be thrown off by large differences in scale and variation between the features. We can easily account for this in our estimators by simply normalizing the feature columns and removing averages. Scikit-Learn has some features for this.

In [None]:
from sklearn import preprocessing

# Adding features
# Can you think of any other possible features to include, here?
# What other pre-processing steps might you use?

# NOTE: this cell overwrites X_train/X_test/y_train/y_test in place, so it is
# not idempotent -- re-run the split cell before re-running this one.

# Center and scale features.
# The scaling statistics are estimated on the training set ONLY and then
# applied unchanged to the test set. Scaling the test set with its own
# mean/std would (a) leak information from the held-out data and (b) put the
# two sets on different transforms, so a model fit on one would be evaluated
# on inconsistently scaled inputs.
feature_scaler = preprocessing.StandardScaler().fit(X_train)
X_test = feature_scaler.transform(X_test)
X_train = feature_scaler.transform(X_train)

# Center observations.
# For the same reason the test targets are centered with the *training* mean.
# `mean_test` is kept as a name because later cells add it back to recover
# the original ozone scale; it is the offset that was applied to `y_test`.
mean_train = np.mean(y_train)
mean_test = mean_train

y_train = y_train - mean_train
y_test = y_test - mean_test

Now it is time for us to choose our estimator. What should we choose? 

## Attempt 1: OLS

In [None]:
# Calculate the OLS estimate by solving the normal equations
#   (X^T X) w = X^T y
# No intercept is fit; the features and targets were centered beforehand.
reg_ols = np.linalg.solve(np.dot(X_train.T, X_train), np.dot(X_train.T, y_train))

# Predictions on the (centered) training and test sets
yp = np.dot(X_train, reg_ols)
yp_test = np.dot(X_test, reg_ols)

# Visualize
plt.figure(figsize=(14,7))

# Left panel: predicted vs. true ozone, shifted back to the original scale
plt.subplot(121)
plt.plot(y_train + mean_train,yp + mean_train, '.', label='Training')
# The marker is given once, in the format string ('x' marker, red). Passing
# both '.r' and marker='x' defines the marker twice and triggers a warning.
plt.plot(y_test + mean_test, yp_test + mean_test, 'xr', label='Testing')
plt.plot([0, 40], [0, 40], '-k', linewidth=0.7, label="Perfect")
plt.axis([0, 40, 0, 40])
plt.xlabel('Ozone (true)', fontsize=16)
plt.ylabel('Ozone (predicted)', fontsize=16)
plt.legend(loc=2, fontsize=16)
plt.title('Prediction performance', fontsize=18)

# Plot the learned model (one stem per feature coefficient)
plt.subplot(122)
plt.stem(reg_ols)
plt.title(r'Learned model $\hat{w}$', fontsize=18)
plt.xticks(range(9),data.keys()[1:10], rotation='vertical')

# Print RSS (normalized by the number of samples, i.e. the mean squared error)
rss_train = np.mean((y_train - yp) ** 2)
rss_test = np.mean((y_test - yp_test) ** 2)
print("Normalized RSS (train): %0.2f" % rss_train)
print("Normalized RSS (test): %0.2f" % rss_test)

## Attempt 2: Ridge Regression

In [None]:
from sklearn import linear_model

# Define regression estimator
# The alpha given here is only an initial value: the grid search further down
# sweeps `ridge__alpha` over a log-spaced range.
reg_ridge = linear_model.Ridge(alpha=0.01)

Now, we need to perform some kind of CV to find the best set of parameters for our model. We will do this by constructing a **Pipeline**. A pipeline is a useful way of handling pre-processing on separate data partitions when performing CV. Let's take a look at that.

In [None]:
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline 

# Make a pre-processing + fitting pipeline
# Step 1 standardizes the features, step 2 is the ridge estimator defined
# earlier. The estimator's parameters are addressed through the pipeline as
# `ridge__<param>` (see the parameter grid below).
pipeline_ridge = make_pipeline(preprocessing.StandardScaler(), reg_ridge)

# Scoring metric used to compare fits: the prediction error via the
# sample-normalized RSS, negated so that larger values are better.
def neg_rss(reg, X, y):
    """Return the negated mean squared residual of `reg` on `(X, y)`.

    Parameters
    ----------
    reg : object exposing `predict(X)`
    X : inputs handed to `reg.predict`
    y : observed targets, compared elementwise with the predictions

    Returns
    -------
    The negative of the mean of the squared residuals `y - reg.predict(X)`.
    """
    residual = y - reg.predict(X)
    mean_sq_error = np.mean(residual ** 2)
    return -mean_sq_error

# Define CV splitting
# We can create an iterator which performs a set of randomized
# splits on the dataset into "train" and "validation". We have
# a natural tradeoff between the validation set size and the number of
# splits we should perform.
cv = ShuffleSplit(n_splits=20, test_size=0.05, random_state=0)

# Define parameters to search
# We need to specify the estimator name since we are performing a CV on
# a Pipeline. (e.g. the formatting of `<estimator>__<param>`).
param_grid = [
    {'ridge__alpha': np.logspace(-4, 3, 50)}
]

# Run the CV
# `return_train_score=True` is required here: the plotting cell reads
# `cv_results_['mean_train_score']`, and current scikit-learn releases do not
# compute training scores unless asked to (the lookup would raise KeyError).
cv_ridge = GridSearchCV(pipeline_ridge, param_grid, scoring=neg_rss, cv=cv,
                        return_train_score=True)
cv_ridge.fit(X_train,y_train)

Great! Now let's take a look at the performance of our estimator. Here, for an example of Ridge or Lasso regression, we chart the error over the $\alpha$ parameter that we perform CV against.

In [None]:
# Pull the CV-optimal hyper-parameters out of the search results
ridge_results = cv_ridge.cv_results_
ridge_best = cv_ridge.best_index_
opt_alpha = ridge_results['param_ridge__alpha'][ridge_best]
opt_params = ridge_results['params'][ridge_best]

# Curves to draw. The scorer returns negated errors, so flip the sign back.
tested_alpha = ridge_results['param_ridge__alpha']
train_scores = -ridge_results['mean_train_score']
test_scores = -ridge_results['mean_test_score']

# Error vs. regularization strength, with the CV optimum marked
fig, ax = plt.subplots(figsize=(15,5))
for curve, curve_label in ((train_scores, 'Training (avg.)'),
                           (test_scores, 'Validation (avg.)')):
    ax.plot(tested_alpha, curve, '-', label=curve_label)
ax.set_xlabel(r'Regularization parameter $\alpha$', fontsize=16)
ax.set_ylabel(r'$\frac{1}{N} RSS(y - X w)$', fontsize=16)
ax.axvline(opt_alpha, label=r'$\alpha^*$', color='k', linestyle=':')
ax.set_xscale('log')
ax.set_xlim([1e-4, 1e3])
fig.tight_layout()
ax.legend(loc=2, fontsize=16)

And now, finally, we are ready to take our test data out of deep-freeze. How did we do?

In [None]:
# GridSearchCV fits *clones* of the estimator it is given, so `pipeline_ridge`
# itself has not been fit yet. Configure it with the CV-optimal parameters and
# fit it on the full training set before predicting. Because the pipeline
# holds the `reg_ridge` object itself, this also populates `reg_ridge.coef_`.
pipeline_ridge.set_params(**cv_ridge.best_params_)
pipeline_ridge.fit(X_train, y_train)

# Recompute the training predictions as well; otherwise `yp` would still hold
# the predictions left behind by the OLS cell.
yp = pipeline_ridge.predict(X_train)
yp_test = pipeline_ridge.predict(X_test)

# Visualize
plt.figure(figsize=(14,7))

# Left panel: predicted vs. true ozone, shifted back to the original scale
plt.subplot(121)
plt.plot(y_train + mean_train, yp + mean_train, '.', label='Training (CV-opt)')
# Marker specified once via the format string ('x' marker, red)
plt.plot(y_test + mean_test, yp_test + mean_test, 'xr', label='Testing (CV-opt)')
plt.plot([0, 40], [0, 40], '-k', linewidth=0.7, label="Perfect")
plt.axis([0, 40, 0, 40])
plt.xlabel('Ozone (true)', fontsize=16)
plt.ylabel('Ozone (predicted)', fontsize=16)
plt.legend(loc=2, fontsize=16)
plt.title('Prediction Performance', fontsize=18)

# Plot the learned model
plt.subplot(122)
plt.stem(reg_ridge.coef_)
plt.title(r'Learned Model $\hat{w}$', fontsize=18)
plt.xticks(range(9), data.keys()[1:10], rotation='vertical')

# Print RSS
print("Normalized RSS (Train): %0.2f" % -neg_rss(pipeline_ridge, X_train, y_train))
print("Normalized RSS  (Test): %0.2f" % -neg_rss(pipeline_ridge, X_test, y_test))

## Attempt 3: Lasso

In [None]:
# Make a pre-processing + fitting pipeline
reg_lasso = linear_model.Lasso(alpha=1.0)
pipeline_lasso = make_pipeline(preprocessing.StandardScaler(), reg_lasso)

# The scoring function `neg_rss` and the splitter `cv` from the Ridge section
# are reused as-is, so every model is compared on the same metric and the
# same validation splits. Re-defining them here would only create duplicate
# definitions that silently shadow the earlier ones.

# Define parameters to search
# We need to specify the estimator name since we are performing a CV on
# a Pipeline. (e.g. the formatting of `<estimator>__<param>`).
param_grid = [
    {'lasso__alpha': np.logspace(-4, 3, 50)}
]

# Run the CV
# `return_train_score=True` is required: the plot below reads
# `cv_results_['mean_train_score']`, which current scikit-learn releases only
# provide on request.
cv_lasso = GridSearchCV(pipeline_lasso, param_grid, scoring=neg_rss, cv=cv,
                        return_train_score=True)
cv_lasso.fit(X_train,y_train)

# Record CV optimizing hyper-params
opt_alpha = cv_lasso.cv_results_['param_lasso__alpha'][cv_lasso.best_index_]
opt_params = cv_lasso.cv_results_['params'][cv_lasso.best_index_]

# Visualize
tested_alpha = cv_lasso.cv_results_['param_lasso__alpha']
train_scores = -cv_lasso.cv_results_['mean_train_score']   # Reverse sign
test_scores = -cv_lasso.cv_results_['mean_test_score']     # Reverse sign

plt.figure(figsize=(15,5))
plt.plot(tested_alpha, train_scores, '-', label='Training (avg.)')
plt.plot(tested_alpha, test_scores, '-', label='Validation (avg.)')
plt.xlabel(r'Regularization parameter $\alpha$', fontsize=16)
plt.ylabel(r'$\frac{1}{N} RSS(y - X w)$', fontsize=16)
plt.axvline(opt_alpha, label=r'$\alpha^*$', color='k', linestyle=':')
plt.xscale('log')
plt.xlim([1e-4, 1e3])
plt.tight_layout()
plt.legend(loc=2, fontsize=16)

In [None]:
# Get training predictions
# GridSearchCV fits clones, so configure the original pipeline with the
# CV-optimal parameters and fit it on the full training set. Reading them from
# `cv_lasso.best_params_` avoids relying on the shared `opt_alpha` variable,
# which more than one cell assigns.
pipeline_lasso.set_params(**cv_lasso.best_params_)
pipeline_lasso.fit(X_train,y_train)
yp = pipeline_lasso.predict(X_train)
yp_test = pipeline_lasso.predict(X_test)

# Visualize
plt.figure(figsize=(14, 7))

# Left panel: predicted vs. true ozone, shifted back to the original scale
plt.subplot(121)
plt.plot(y_train + mean_train, yp + mean_train, '.', label='Training (CV-opt)')
# Marker specified once via the format string ('x' marker, red)
plt.plot(y_test + mean_test, yp_test + mean_test, 'xr', label='Testing (CV-opt)')
plt.plot([0, 40], [0, 40], '-k', linewidth=0.7, label="Perfect")
plt.axis([0, 40, 0, 40])
plt.xlabel('Ozone (true)', fontsize=16)
plt.ylabel('Ozone (predicted)', fontsize=16)
plt.legend(loc=2, fontsize=16)
plt.title('Prediction Performance', fontsize=18)

# Plot the learned model (the pipeline holds `reg_lasso` itself, so its
# coefficients are available after the fit above)
plt.subplot(122)
plt.stem(reg_lasso.coef_)
plt.title(r'Learned Model $\hat{w}$', fontsize=18)
plt.xticks(range(9), data.keys()[1:10], rotation='vertical')

# Print RSS
print("Normalized RSS (Train): %0.2f" % -neg_rss(pipeline_lasso, X_train, y_train))
print("Normalized RSS  (Test): %0.2f" % -neg_rss(pipeline_lasso, X_test, y_test))

## Boosting

With boosting methods, over-training becomes a very real possibility. In this case we need to make sure to use our CV in order to stop fine-tuning our boosting approach when we start to have a loss on the validation data.

We also have many possible different metrics to use in this case, not just the RSS. So, it is important to use our CV parameter grid to check many possible values.

In [None]:
from sklearn import ensemble

# Create estimator object
# A fixed seed keeps the fitted ensemble reproducible across kernel restarts.
reg_gb = ensemble.GradientBoostingRegressor(random_state=0)

# Make a pre-processing + fitting pipeline
pipeline_gb = make_pipeline(preprocessing.StandardScaler(), reg_gb)

# Define the parameters to search
# We need to specify the estimator name since we are performing a CV on
# a Pipeline. (e.g. the formatting of `<estimator>__<param>`).
# Loss names: 'squared_error' / 'absolute_error' are the current spellings of
# the former 'ls' / 'lad', which were renamed in scikit-learn 1.0 and removed
# in 1.2.
param_grid = [
    {'gradientboostingregressor__loss': ['squared_error', 'absolute_error'],
     'gradientboostingregressor__learning_rate': np.logspace(-3,0,10),
     'gradientboostingregressor__n_estimators': list(range(50,200,50))}
]

# Run the CV (same scoring function and validation splits as the linear models)
cv_gb = GridSearchCV(pipeline_gb, param_grid, scoring=neg_rss, cv=cv)
cv_gb.fit(X_train,y_train)

In [None]:
# Display results
# Only the last expression of a cell is rendered automatically, so the
# winning parameter set is printed explicitly; the refit pipeline is then
# shown through the normal cell output.
print(cv_gb.best_params_)
cv_gb.best_estimator_

In [None]:
# Get training predictions
# `best_estimator_` has already been refit on the full training set by
# GridSearchCV (refit is on by default), so it can be used directly without
# fitting it a second time.
cvopt_gb = cv_gb.best_estimator_
yp = cvopt_gb.predict(X_train)
yp_test = cvopt_gb.predict(X_test)

# Visualize
plt.figure(figsize=(14, 7))

# Plot prediction performance (shifted back to the original ozone scale)
plt.subplot(121)
plt.plot(y_train + mean_train, yp + mean_train, '.', label='Training (CV-opt)')
# Marker specified once via the format string ('x' marker, red)
plt.plot(y_test + mean_test, yp_test + mean_test, 'xr', label='Testing (CV-opt)')
plt.plot([0, 40], [0, 40], '-k', linewidth=0.7, label="Perfect")
plt.axis([0, 40, 0, 40])
plt.xlabel('Ozone (true)', fontsize=16)
plt.ylabel('Ozone (predicted)', fontsize=16)
plt.legend(loc=2, fontsize=16)
plt.title('Prediction Performance', fontsize=18)

# Plot the learned model: boosted trees have no weight vector, so the
# per-feature importances of the fitted regressor step are shown instead.
plt.subplot(122)
plt.stem(cvopt_gb.named_steps['gradientboostingregressor'].feature_importances_)
plt.title(r'Learned model $\hat{w}$', fontsize=18)
plt.ylabel('Feature importances', fontsize=16)
plt.xticks(range(9),data.keys()[1:10], rotation='vertical')

plt.tight_layout()

# Print RSS
print("Normalized RSS (train): %0.2f" % -neg_rss(cvopt_gb, X_train, y_train))
print("Normalized RSS (test): %0.2f" % -neg_rss(cvopt_gb, X_test, y_test))