In [59]:
from azureml.core import Workspace, Dataset, Datastore
import pandas as pd
import numpy as np
import logging


def get_data():
    subscription_id = 'd2706c67-acfc-4bd3-9067-3ff6ac190bc9'
    resource_group = 'capstone-project'
    workspace_name = 'capstone-project'

    workspace = Workspace(subscription_id, resource_group, workspace_name)

    datastore = Datastore.get(workspace, "workspaceworkingdirectory")
    dataset = Dataset.Tabular.from_delimited_files(path=(datastore, 'Users/hualcosa/nd00333-capstone/data/Walmart Data Analysis and Forcasting.csv'))
    df = dataset.to_pandas_dataframe() 
    return df

In [115]:
def process_data(df):
    '''
    This function formats the dataframe, adding past 8 weeks of sales as lagged features
    and 4 weeks of future sales as the label column.

    Return:
    X. Training data with features + lagged sales
    y. vector with the next 4 weeks of sales
    '''
    
    df_with_windows = []
    for store_num in df.Store.unique():
        store_df = df[df.Store == store_num].copy()
        # making lag features
        for i in range(1, 9):
            store_df[f'Weekly_Sales_t-{i}'] = store_df['Weekly_Sales'].shift(i)
        # making future_time_steps
        for i in range(1,4):
            store_df[f'Weekly_Sales_t+{i}'] = store_df['Weekly_Sales'].shift(-i)

        df_with_windows.append(store_df)

    df_with_windows = pd.concat(df_with_windows).dropna()
    # renaming first future value, to follow the same pattern as the other columns
    df_with_windows.rename(columns={"Weekly_Sales":"Weekly_Sales_t+0"}, inplace=True)
    df_with_windows = df_with_windows[['Store', 'Date', 'Holiday_Flag', 'Temperature',
                                        'Fuel_Price', 'CPI', 'Unemployment', 'Weekly_Sales_t-1',
                                        'Weekly_Sales_t-2', 'Weekly_Sales_t-3', 'Weekly_Sales_t-4',
                                        'Weekly_Sales_t-5', 'Weekly_Sales_t-6', 'Weekly_Sales_t-7',
                                        'Weekly_Sales_t-8', 'Weekly_Sales_t+0','Weekly_Sales_t+1', 'Weekly_Sales_t+2',
                                        'Weekly_Sales_t+3']]

    # separate by store, train_test_split, and then join data again
    x_train, x_val, y_train, y_val = [], [], [], []

    for store_num in df_with_windows.Store.unique():
        store_df = df_with_windows[df_with_windows.Store == store_num].copy()
        # future columns filter
        ftr = store_df.columns.str.match(r'.+t\+\d')
        # making label vector
        y_store = store_df.loc[:, ftr].apply(lambda row: list(row), axis=1).tolist()
        # convert list to numpy array format
        y_store = np.array(y_store)
        # making training data
        X_store = store_df.drop(columns='Date').values
        x_train_store, x_val_store, y_train_store, y_val_store = train_test_split(X_store, y_store, test_size=0.2, shuffle=False, random_state=96)
        
        # appending to final results
        x_train.append(x_train_store)
        x_val.append(x_val_store)
        y_train.append(y_train_store)
        y_val.append(y_val_store)

    x_train = np.concatenate(x_train)
    x_val = np.concatenate(x_val)
    y_train = np.concatenate(y_train)
    y_val = np.concatenate(y_val)

    return x_train, x_val, y_train, y_val


In [116]:
logging.info('Getting data...')
df = get_data()
logging.info('formatting data...')
x_train, x_val, y_train, y_val = process_data(df)

In [117]:
x_train.shape, x_val.shape, y_train.shape, y_val.shape

((4725, 18), (1215, 18), (4725, 4), (1215, 4))

In [120]:
def train_model(x_train, y_train, kwargs):
    '''
    This function receives the processed X, y values and fits a multiple output XGBRegressor to it.

    Returns: the fitted model
    '''
    #Define the estimator
    estimator = xgb.XGBRegressor(
        objective = 'reg:squarederror',
        **kwargs
        )

    # Define the model
    my_model = MultiOutputRegressor(estimator = estimator, n_jobs = -1)
    my_model.fit(x_train, y_train)

    return my_model

In [121]:
from sklearn.metrics import mean_squared_error

In [122]:
params_dict = {
    'max_depth': 3,
    'n_estimators': 100,
    'lambda' : 1,
    'subsample' : 1,
    'colsample_bytree': 1
}
model = train_model(x_train, y_train, params_dict)
y_pred = model.predict(x_val)

y_min, y_max = df['Weekly_Sales'].min(), df['Weekly_Sales'].max()
# computing Normalized RMSE
nrmse = np.sqrt(mean_squared_error(y_val, y_pred))/(y_max - y_min)

In [123]:
nrmse

0.001777523833815344

# Training Xgboost regressor

In [67]:
import xgboost as xgb

In [106]:
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor

kwargs = {
    'max_depth': 3,
    'n_estimators': 100,
    'lambda' : 1,
    'subsample' : 1,
    'colsample_bytree': 1
}
#Define the estimator
estimator = xgb.XGBRegressor(
    objective = 'reg:squarederror',
    **kwargs
    )

# Define the model
my_model = MultiOutputRegressor(estimator = estimator, n_jobs = -1).fit(X, y)

TypeError: fit() got an unexpected keyword argument 'max_depth'

In [102]:
my_model.score(X, y)

0.9968594816576335

In [103]:
xgb.__version__

'1.3.3'