In [2]:
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, cross_val_score, KFold, cross_val_score 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import clone

#silence future warning message
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

%matplotlib inline

In [3]:
df = pd.read_csv('sales_train.csv')

In [4]:
df = df.drop(['date'], axis = 1)

In [5]:
X,y = df.drop(['item_cnt_day'],axis=1), df['item_cnt_day']
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.25, random_state=123 )

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler


class XyScaler(BaseEstimator, TransformerMixin):
    """Standardize a training set of data along with a vector of targets."""

    def __init__(self):
        self.X_scaler = StandardScaler()
        self.y_scaler = StandardScaler()
        
    def fit(self, X, y, *args, **kwargs):
        """Fit the scaler to data and a target vector."""
        self.X_scaler.fit(X)
        self.y_scaler.fit(y.reshape(-1, 1))
        return self
    
    def transform(self, X, y, *args, **kwargs):
        """Transform a new set of data and target vector."""
        return (self.X_scaler.transform(X),
                self.y_scaler.transform(y.reshape(-1, 1)).flatten())

    def inverse_transform(self, X, y, *args, **kwargs):
        """Tranform from a scaled representation back to the original scale."""
        return (self.X_scaler.inverse_transform(X),
                self.y_scaler.inverse_transform(y.reshape(-1, 1)).flatten())

In [7]:
mdl1 = Ridge(alpha=0.5).fit(X_train, y_train)
y_pred = mdl1.predict(X_test)
mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

(4.741605106899691, 0.0007221054851097586)

0    1.0
1    1.0
2   -1.0
3    1.0
4    1.0
Name: item_cnt_day, dtype: float64

In [10]:
def cv(X_train, y_train, base_estimator, n_folds, random_seed=154):
    """Estimate the in- and out-of-sample error of a model using cross
    validation.
    
    Parameters
    ----------
    
    X_train: np.array
      Matrix of predictors.
      
    y_train: np.array
      Target array.
      
    base_estimator: sklearn model object.
      The estimator to fit.  Must have fit and predict methods.
      
    n_folds: int
      The number of folds in the cross validation.
      
    random_seed: int
      A seed for the random number generator, for repeatability.
    
    Returns
    -------
      
    train_cv_errors, test_cv_errors: tuple of arrays
      The training and testing errors for each fold of cross validation.
    """
    kf = KFold(n_splits=n_folds, random_state=random_seed)
    train_cv_errors, valid_cv_errors = np.empty(n_folds), np.empty(n_folds)

    for idx, (train_idx, valid_idx) in enumerate(kf.split(X_train)):
        # Split into train and test
        X_cv_train, y_cv_train = X_train[train_idx], y_train[train_idx]
        X_cv_valid, y_cv_valid = X_train[valid_idx], y_train[valid_idx]

        # Standardize data, fit on training set, transform training and test.
        scaler = XyScaler()
        scaler.fit(X_cv_train, y_cv_train)
        X_cv_train_std, y_cv_train_std = scaler.transform(X_cv_train, y_cv_train)
        X_cv_valid_std, y_cv_valid_std = scaler.transform(X_cv_valid, y_cv_valid)

        # Fit ridge regression to training data.
        mdl = clone(base_estimator)
        mdl.fit(X_cv_train_std, y_cv_train_std)

        # Make predictions.
        y_cv_train_pred = mdl.predict(X_cv_train_std)
        y_cv_valid_pred = mdl.predict(X_cv_valid_std)

        # Calculate MSE.
        train_cv_errors[idx] = mean_squared_error(y_cv_train_std, y_cv_train_pred)
        valid_cv_errors[idx] = mean_squared_error(y_cv_valid_std, y_cv_valid_pred)
    return train_cv_errors, valid_cv_errors

In [11]:
train_cv_errors, valid_cv_errors = cv(X_train, y_train, base_estimator=Ridge(alpha=0.5), n_folds=10, random_seed=154)

KeyError: "None of [Int64Index([ 220189,  220190,  220191,  220192,  220193,  220194,  220195,\n             220196,  220197,  220198,\n            ...\n            2201876, 2201877, 2201878, 2201879, 2201880, 2201881, 2201882,\n            2201883, 2201884, 2201885],\n           dtype='int64', length=1981697)] are in the [columns]"

In [12]:
print(f"Training CV error: {train_cv_errors.mean():.2f}")
print(f"Test CV error: {valid_cv_errors.mean():.2f}")

Training CV error: 2681561585988519568771375735381460409997389019581807311170462957507091551663902187555301065257275382741933557816075437014915201525702554777173143341498368.00
Test CV error: 2681561585988519568771375735381460409997389019581807311170462957507091551663902187555301065257275382741933557816075437014915201525702554777173143341498368.00


In [13]:
def train_at_various_alphas(X_train, y_train, model, alphas, n_folds, random_seed, **kwargs):
    """Train a regularized regression model using cross validation at various
    values of alpha.
    
    Parameters
    ----------
    
    X_train: np.array
      Matrix of predictors.
      
    y_train: np.array
      Target array.
      
    model: sklearn model class
      A class in sklearn that can be used to create a regularized regression
      object.  Options are `Ridge` and `Lasso`.
      
    alphas: numpy array
      An array of regularization parameters.
      
    n_folds: int
      Number of cross validation folds.
      
    Returns
    -------
    
    cv_errors_train, cv_errors_valid: tuple of DataFrame
      DataFrames containing the training and validating errors for each value of
      alpha and each cross validation fold.  Each row represents a CV fold, and
      each column a value of alpha.
    """
    cv_errors_train = pd.DataFrame(np.empty(shape=(n_folds, len(alphas))),
                                     columns=alphas)
    cv_errors_valid = pd.DataFrame(np.empty(shape=(n_folds, len(alphas))),
                                        columns=alphas)
    for alpha in alphas:
        train_cv_errors, valid_cv_errors = cv(X_train, y_train, base_estimator=model(alpha=alpha, **kwargs), n_folds=n_folds, random_seed=random_seed)
        cv_errors_train.loc[:,alpha] = train_cv_errors
        cv_errors_valid.loc[:,alpha] = valid_cv_errors
    return cv_errors_train, cv_errors_valid

In [14]:
alphas=np.logspace(-2, 4, num=250)
cv_errors_train, cv_errors_valid = train_at_various_alphas(X_train, y_train, model=Ridge, alphas=alphas, n_folds=10, random_seed=154)

KeyError: "None of [Int64Index([ 220189,  220190,  220191,  220192,  220193,  220194,  220195,\n             220196,  220197,  220198,\n            ...\n            2201876, 2201877, 2201878, 2201879, 2201880, 2201881, 2201882,\n            2201883, 2201884, 2201885],\n           dtype='int64', length=1981697)] are in the [columns]"

In [15]:
train_means, valid_means = cv_errors_train.mean(axis=0), cv_errors_valid.mean(axis=0)

NameError: name 'cv_errors_train' is not defined