# Write something here!

In [11]:
import os
import warnings
import sys

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn

import logging

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)
mlflow.set_tracking_uri('http://localhost:5000')

# get args
max_depth = 3
n_estimators = 50
data_month = 5
# Define data source
TRAIN_FILE_PATH = "data/train-data/training_data_month_{}.zip".format(data_month)

# Set global random state for stable training
np.random.seed(40)
rand = 40

In [16]:
df=pd.read_csv(TRAIN_FILE_PATH)
df

Unnamed: 0,store_id,product_id,label,quantity,variant_case_price_cents,revenue_cents,order_cnt,product_variant_cnt,days,store_size,...,product_metadata_78,product_metadata_79,product_metadata_80,product_metadata_81,product_metadata_82,product_metadata_83,product_metadata_84,product_metadata_85,product_metadata_86,product_metadata_87
0,10.0,155.0,0.0,0.0,0.0,0.0,0.0,0.0,28.000000,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10.0,156.0,0.0,0.0,0.0,0.0,0.0,0.0,29.500000,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10.0,166.0,0.0,0.0,0.0,0.0,0.0,0.0,31.000000,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10.0,242.0,0.0,0.0,0.0,0.0,0.0,0.0,31.000000,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10.0,300.0,0.0,0.0,0.0,0.0,0.0,0.0,29.666667,4.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463324,2565.0,5456.0,0.0,0.0,0.0,0.0,0.0,0.0,30.000000,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
463325,2565.0,5477.0,0.0,0.0,0.0,0.0,0.0,0.0,30.000000,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
463326,2565.0,5484.0,0.0,0.0,0.0,0.0,0.0,0.0,30.000000,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
463327,2565.0,5532.0,0.0,0.0,0.0,0.0,0.0,0.0,30.000000,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
def get_training_data(url):
    try:
        train_data = pd.read_csv(url)
    except Exception as e:
        logger.exception(
            "Unable to read data from {}. Error: {}".format(url,e)
        )
    return train_data 

def split_data(data, rand):
    '''
        Custom split to avoid data leakage
        return: X_train, X_test, y_train, y_test
    '''
    data.drop(['store_id','product_id'],axis=1,inplace=True)

    df_train_avl = data[data['label']>0]
    df_train_notavl = data[data['label']==0]
    y_aval = df_train_avl.pop('label')
    X_aval = df_train_avl
    y_notaval = df_train_notavl.pop('label')
    X_notaval = df_train_notavl

    X_train_avl, X_test_avl, y_train_avl, y_test_avl = train_test_split(X_aval, y_aval, test_size=0.3, random_state=rand)
    X_train_notavl, X_test_notavl, y_train_notavl, y_test_notavl = train_test_split(X_notaval, y_notaval, test_size=0.3, random_state=rand)

    X_train = pd.concat([X_train_avl,X_train_notavl])
    y_train = pd.concat([y_train_avl,y_train_notavl])
    X_test = pd.concat([X_test_avl,X_test_notavl])
    y_test = pd.concat([y_test_avl,y_test_notavl])

    # Free resources
    del df_train_avl
    del df_train_notavl
    del y_aval
    del X_aval
    del y_notaval
    del X_notaval

    return X_train, X_test, y_train, y_test


def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

In [13]:
def train(max_depth=3, n_estimators=5, data_month=5):
    warnings.filterwarnings("ignore")

    # Read the wine-quality csv file from the URL
    data = get_training_data(url=TRAIN_FILE_PATH)

    # Split the data into training and test sets. (0.75, 0.25) split.
    train_x, test_x,train_y,test_y = split_data(data=data,rand=rand)

    with mlflow.start_run():
        lr = GradientBoostingRegressor(max_depth=max_depth, n_estimators=n_estimators, random_state=rand)
        lr.fit(train_x, train_y)

        predicted_qualities = lr.predict(test_x)

        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        print("GradientBoostingRegressor model (max_depth=%f, n_estimators=%f):" % (max_depth, n_estimators))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        # Model registry does not work with file store
        if tracking_url_type_store != "file":

            # Register the model
            # There are other ways to use the Model Registry, which depends on the use case,
            # please refer to the doc for more information:
            # https://mlflow.org/docs/latest/model-registry.html#api-workflow
            mlflow.sklearn.log_model(lr, "model", registered_model_name="GradientBoostingRegressor")
        else:
            mlflow.sklearn.log_model(lr, "model")

In [15]:
train()

GradientBoostingRegressor model (max_depth=3.000000, n_estimators=5.000000):
  RMSE: 0.741561435125196
  MAE: 0.06841786927410191
  R2: 0.4549214095606271


RestException: INVALID_PARAMETER_VALUE:  Model registry functionality is unavailable; got unsupported URI './mlruns' for model registry data storage. Supported URI schemes are: ['postgresql', 'mysql', 'sqlite', 'mssql']. See https://www.mlflow.org/docs/latest/tracking.html#storage for how to run an MLflow server against one of the supported backend storage locations.