## Prepare dataset

In [1]:
import pandas as pd

# Load the raw market-costs data; the path is relative to the notebook's working directory.
df = pd.read_csv('data/market-costs.csv')
# Show the first rows as the cell output.
df.head()

Unnamed: 0,Promotion Name,Store Kind,Store Sales,Store Cost,Is Recyclable?,Store Area,Grocery Area,Frozen Area,Meat Area,Cost,...,Store Code,Country ISO2,Order Brand,Order,Department,Amenities Score,Gross Weight,Net Weight,Package Weight,Min. Person Yearly Income
0,Dimes Off,Deluxe,8760000.0,4292400.0,yes,2842.23,2037.64,481.98,323.0,602.7575,...,H11go,ZA,Red Wing,Cleaning Supplies,Household,5,28.1997,26.6008,1.599,10000.0
1,Budget Bargains,Supermarket,6360000.0,1971600.0,no,2814.95,2049.72,457.36,304.976672,708.665,...,S04ne,WA,Nationeel,Snack Foods,Snack Foods,0,16.571,14.972,1.599,50000.0
2,Shelf Emptiers,Supermarket,10860000.0,4452600.0,yes,2192.32,1322.21,523.32,348.85,564.2647,...,L05es,CA,Excel,Magazines,Periodicals,1,28.6358,27.1822,1.4536,30000.0
3,Sale Winners,Deluxe,11560000.0,4970800.0,no,2862.3,1872.19,593.93,395.95,519.7574,...,M10da,YU,Hermanos,Vegetables,Produce,5,12.6172,9.71,2.9072,50000.0
4,Weekend Discount,Supermarket,5220000.0,1618200.0,yes,1970.17,1236.07,440.92,293.95,364.1649,...,S03le,WA,Red Wing,Candles,Household,1,15.4081,13.9545,1.4536,30000.0


In [2]:
# Apply feature selection and engineering to the dataset

def wrangle(df):
    """Apply feature selection and engineering to the market-costs frame.

    The input frame is not modified; a new frame is returned.

    Steps
    -----
    1. ``Store CountryISO2``: store code and country code joined with ``_``.
    2. ``Income Level``: ``Min. Person Yearly Income`` bucketed into Low/Middle/High.
    3. ``Price Tier``: ``Gross Weight`` bucketed into three tiers.
    4. ``Order Popularity``: number of rows sharing the same ``Order`` value.
    5. ``Family Expenses``: yearly income divided by household size.
    6. ``Promotion <col>`` interaction strings and ``Promotion Name Length``.
    7. Drop the columns that are not kept as features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns referenced in the steps above.

    Returns
    -------
    pandas.DataFrame
        Engineered frame. The merge in step 4 yields a fresh 0..n-1 index.
    """
    # Work on a copy so the caller's frame is never mutated.
    df = df.copy()

    # Step 1
    df["Store CountryISO2"] = df["Store Code"] + '_' + df["Country ISO2"]

    # Step 2
    # Intervals are right-closed (pd.cut default): (0, 25000], (25000, 50000], (50000, inf).
    income_bins = pd.cut(
        df['Min. Person Yearly Income'],
        bins=[0, 25000, 50000, float('inf')],
        labels=['Low', 'Middle', 'High']
    )
    df['Income Level'] = income_bins.astype("object")

    # Step 3
    # NOTE(review): the labels talk about price but the bins are applied to
    # 'Gross Weight' -- confirm this is the intended source column.
    weight_bins = pd.cut(
        df['Gross Weight'],
        bins=[0, 5, 10, float('inf')],
        labels=['Low Price', 'Medium Price', 'High Price']
    )
    df['Price Tier'] = weight_bins.astype("object")

    # Step 4
    # rename_axis/reset_index(name=...) produces the same two columns on every
    # pandas version, regardless of how value_counts names its result.
    order_popularity = (
        df['Order']
        .value_counts()
        .rename_axis('Order')
        .reset_index(name='Order Popularity')
    )
    df = df.merge(order_popularity, on='Order', how='left')

    # Step 5
    children_mapping = {
        'No': 0,
        'one': 1,
        'two': 2,
        'three': 3,
        'four': 4,
        'five': 5
    }
    # Labels missing from the mapping become NaN and propagate to the result.
    children_count = df['Children'].map(children_mapping)
    # Household size: children + 2 when married, children + 1 otherwise.
    household_size = children_count + 1 + df['Marriage'].eq('Married').astype(int)
    df["Family Expenses"] = df["Min. Person Yearly Income"] / household_size

    # Step 6
    # The interaction features use the original string 'Children' labels,
    # not the numeric counts computed in step 5.
    person_description = ["Gender", "Marriage", "Children", "Degree", "Work"]
    for col in person_description:
        df['Promotion ' + col] = df['Promotion Name'] + ' ' + df[col]

    df['Promotion Name Length'] = df['Promotion Name'].map(len).astype(int)

    # Step 7
    df = df.drop(columns=["Store Sales", "Frozen Area", "Net Weight", "Store Area", 'Is Recyclable?'])

    return df

# Rebinds `df` to the engineered frame, so the raw frame is no longer reachable under this name.
# Re-running this cell without reloading the CSV would feed already-wrangled data back into wrangle().
df = wrangle(df)

In [4]:
# Save the train and test sets as .parquet files for the training job created later
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed random_state makes the split reproducible.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Both splits still include the 'Cost' target column; index=False leaves the DataFrame index out of the files.
train_set.to_parquet("data/train.parquet", index=False)
test_set.to_parquet("data/test.parquet", index=False)

In [5]:
# Separate the features X from the target y, then split both into train and test sets.
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the 'Cost' column.
X, y = df.drop(columns="Cost"), df["Cost"]

# Same test_size and random_state as the split that was written to parquet.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## Modeling

In [6]:
# Set up a preprocessor for the categorical data

from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Every object-dtype column of the feature frame is treated as categorical.
categorical_columns = X.select_dtypes("object").columns

preprocessor = ColumnTransformer(
    transformers=[
        (
            'cat',
            # Categories that were not present during fit are encoded as -1
            # instead of raising at predict time. This matters for the
            # engineered "Promotion ..." combination columns, which can contain
            # values that appear only in the held-out rows.
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            categorical_columns,
        )
    ],
    # NOTE(review): 'drop' is the ColumnTransformer default and is spelled out
    # here to keep the existing behaviour visible: every non-object column
    # (e.g. 'Store Cost', 'Gross Weight', 'Order Popularity', 'Family Expenses')
    # is discarded before the regressor sees the data. Switch to
    # remainder='passthrough' if the numeric features are meant to be used --
    # confirm they contain no NaN first.
    remainder='drop',
)

In [7]:
# Fitting the model
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import ExtraTreesRegressor

# Chain the column preprocessor with an extra-trees regressor so that encoding
# and regression are fitted (and later applied) as one estimator.
model = make_pipeline(
    preprocessor,
    # random_state is fixed for reproducibility; n_jobs=-1 requests all available cores.
    ExtraTreesRegressor(n_estimators=350, max_depth=16, n_jobs=-1, random_state=42)
)

# Fit on the training split only; the fitted pipeline is echoed as the cell output.
model.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cat', OrdinalEncoder(),
                                                  Index(['Promotion Name', 'Store Kind', 'Degree Work', 'Marriage', 'Gender',
       'Children', 'Degree', 'Work', 'Store Code', 'Country ISO2',
       'Order Brand', 'Order', 'Department', 'Store CountryISO2',
       'Income Level', 'Price Tier', 'Promotion Gender', 'Promotion Marriage',
       'Promotion Children', 'Promotion Degree', 'Promotion Work'],
      dtype='object'))])),
                ('extratreesregressor',
                 ExtraTreesRegressor(max_depth=16, n_estimators=350, n_jobs=-1,
                                     random_state=42))])

In [8]:
from sklearn.metrics import mean_squared_error
import numpy as np

def _rmse(actual, predicted):
    """Return the root of the mean squared error between two target vectors."""
    return np.sqrt(mean_squared_error(actual, predicted))


# Predict on both splits so the train and test errors can be compared.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_rmse = _rmse(y_train, y_train_pred)
test_rmse = _rmse(y_test, y_test_pred)

print(f"train: {train_rmse}, test: {test_rmse}")

train: 21.989246883406977, test: 43.86509966666313


## Register the training and test data

In [11]:
# First we must create a cloud client

from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Resolve credentials through DefaultAzureCredential; nothing is hard-coded in the notebook.
credential = DefaultAzureCredential()
# Workspace details are read from a config file (the captured output shows .\config.json being picked up).
ml_client = MLClient.from_config(credential=credential)

Found the config file in: .\config.json


In [12]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

def _register_uri_file(name, path, description):
    """Define a URI_FILE data asset and create or update it through ``ml_client``.

    Returns a ``(definition, registered)`` pair: the local ``Data`` object and
    whatever ``ml_client.data.create_or_update`` returns for it.
    """
    asset = Data(
        name=name,
        path=path,
        type=AssetTypes.URI_FILE,
        description=description
    )
    return asset, ml_client.data.create_or_update(asset)


train_data_name = 'market_train'
test_data_name = 'market_test'

# NOTE(review): the split cell writes the parquet files under "data/", while the
# paths registered here point at "../data/" -- confirm both resolve to the same
# files from the notebook's working directory.
training_data, tr_data = _register_uri_file(
    train_data_name,
    '../data/train.parquet',
    'RAI market train data',
)

test_data, ts_data = _register_uri_file(
    test_data_name,
    '../data/test.parquet',
    'RAI market test data',
)

## Create a compute cluster

In [None]:
from azure.ai.ml.entities import AmlCompute
import time

# Name under which the cluster is created; the training job refers to it via `compute=compute_name`.
compute_name = 'trainingcompute'

# Cluster definition: VM size, instance-count bounds and idle timeout are handed to AmlCompute as-is.
my_compute = AmlCompute(
    name=compute_name,
    size='Standard_DS12_v2',
    min_instances=0,
    max_instances=4,
    # presumably seconds (3600 = one hour) -- confirm against the AmlCompute reference
    idle_time_before_scale_down=3600
)
# Calling .result() on the returned operation makes the cell wait for its outcome,
# which is then shown as the cell output.
ml_client.compute.begin_create_or_update(my_compute).result()

## Create the job

In [None]:
from azure.ai.ml import command, Input, Output

target_column_name = 'Cost'

# Inputs: the latest version of the registered training data asset plus the
# name of the column the training script should treat as the target.
job_inputs = {
    'training_data': Input(type='uri_file', path=f'{train_data_name}@latest'),
    'target_column_name': target_column_name,
}

# Output: a single MLflow model slot, referenced again when the model is registered.
job_outputs = {'model_output': Output(type=AssetTypes.MLFLOW_MODEL)}

# Command line for train.py; the ${{...}} placeholders are passed through
# literally in this string (these are plain strings, not f-strings).
train_command = (
    'python train.py '
    '--training_data ${{inputs.training_data}} '
    '--target_column_name ${{inputs.target_column_name}} '
    '--model_output ${{outputs.model_output}}'
)

# Create the job
job = command(
    description='Trains market cost model',
    experiment_name='market_costs_test',
    compute=compute_name,
    inputs=job_inputs,
    outputs=job_outputs,
    code='../src/',
    environment='azureml://registries/azureml/environments/responsibleai-ubuntu20.04-py38-cpu/versions/37',
    command=train_command,
)

# `job` is rebound to the object returned by the service so that job.name can
# be used below and in the model-registration cell.
job = ml_client.jobs.create_or_update(job)
ml_client.jobs.stream(job.name)

## Register the model

In [16]:
from azure.ai.ml.entities import Model

model_name = 'market_cost_model'

# Register the model.
# The path points at the 'model_output' output of the training job submitted above.
model_path = f'azureml://jobs/{job.name}/outputs/model_output'
# NOTE(review): this rebinds `model`, which earlier held the fitted scikit-learn pipeline;
# after this cell `model.predict(...)` no longer refers to that pipeline.
model = Model(name=model_name,
                path=model_path,
                type=AssetTypes.MLFLOW_MODEL)
registered_model = ml_client.models.create_or_update(model)