In [2]:
import boto3, re, sys, math, json, os, sagemaker, urllib.request
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

from datetime import date, timedelta
from datetime import datetime
import time
from time import gmtime, strftime

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


## Functions

In [3]:
# needed functions

def regression_scores(y_test, preds, transformed=False):
    '''
    Returns and prints evaluation metics for a regression model
    '''
    if transformed:

        mse = mean_squared_error(y_test, preds)
        rmse =  mean_squared_error(y_test, preds, squared=False)
        rmse_exp = mean_squared_error(np.exp(y_test), np.exp(preds), squared=False)
        mae = mean_absolute_error(y_test, preds)
        mae_exp = mean_absolute_error(np.exp(y_test), np.exp(preds))

    else:
        mse = mean_squared_error(y_test, preds)
        rmse =  mean_squared_error(y_test, preds, squared=False)
        rmse_exp = 'N/A'
        mae = mean_absolute_error(y_test, preds)
        mae_exp = 'N/A'
        
    r2 = r2_score(y_test, preds)
    adj_r2 = 'N/A'

    print('MSE: ', mse)
    print('RMSE: ', rmse)
    print('RMSE (retuned to normal scale): ', rmse_exp)
    print('MAE: ', mae)
    print('MAE (retuned to normal scale): ', mae_exp)
    print('R-squared: ', r2)

    return mse, rmse, rmse_exp, mae, mae_exp, r2, adj_r2

def score_table(scores, model_name, y_test, preds, pct_in_range=0, num_skus=0, transformed=False, notes=None):
    '''
    creates a data frame with various scores for each model
    '''
    
    mse, rmse, rmse_exp, mae, mae_exp, r2, adj_r2 = regression_scores(y_test, preds, transformed)
    
    score_list = []
    score_list.extend((mse, rmse, rmse_exp, mae, mae_exp, r2, adj_r2, pct_in_range, num_skus, notes))
    
    scores.loc[model_name] = score_list
    return scores

# Sneaker

## 1 Day Ago

In [5]:
# evaluate model
scores = pd.DataFrame(columns = ['mse', 'rmse', 'rmse_exp', 'mae', 'mae_exp', 'r2', 
                                 'adj_r2', 'pct_in_range', 'num_sneakers', 'notes']) 
yesterday = date.today() - timedelta(days=1)

# preds for T-1
preds1 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-test/dt-{yesterday}/predictions1.csv')
preds2 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-test/dt-{yesterday}/predictions2.csv')
preds = pd.concat([preds1, preds2])

#preds.pred_date = pd.to_datetime(preds.pred_date, format='%Y-%m-%d %H:%M:%S')
# preds = preds[preds['pred_date'].dt.date==date.today()] # replace with yesterday
# preds['id'] = preds['SKU'].astype(str) + " " + preds['SIZE_USM'].astype(str)

# ground truth
# X_test = pd.read_csv('s3://arbit-algo/sagemaker/algo-v1/processing/output/test/X_test.csv')
y_truth = pd.read_csv('s3://historicaldata-sample/ground_truth_1d.csv')
print(len(y_truth))


# dropping missing values
y_truth = y_truth.dropna()

#y_truth['RELEASEDATE'] = pd.to_datetime(y_truth['RELEASEDATE'])
#y_truth['RELEASEYEAR'] = [int(date.year) for date in y_truth.RELEASEDATE]
y_truth['SIZE'] = y_truth['SIZE'].astype(str).apply(lambda x: x.rstrip('.0') if x.endswith('.0') else x)

y_truth['ID'] = y_truth['SKU'].astype(str) + " " + y_truth['SIZE'].astype(str) + " " + y_truth['RELEASEDATE'].astype(str)
y_truth['ID'] = y_truth['ID'].str.upper()

# subset where ground truth exists, log transform to get to scale of model
final = pd.merge(y_truth[['ID', 'PRICE']], preds[['ID', 'predictions', 'prediction_low', 'prediction_high']], on='ID', how='inner')
final['predictions'] = np.log(final.predictions)
final['prediction_low'] = np.log(final.prediction_low)
final['prediction_high'] = np.log(final.prediction_high)
final['PRICE'] = np.log(final.PRICE)
final['price_in_range'] = final.apply(lambda x: 1 if x['prediction_low'] <= x['PRICE'] <= x['prediction_high'] else 0, axis=1)
final['eval_date'] = date.today()

final = final.drop_duplicates().reset_index(drop=True)
# final.to_csv('s3://arbit-algo/sagemaker/algo-v1/output/eval_values.csv', mode='a', header=False, index=False)

pct_in_range = sum(final.price_in_range)/len(final)
num_skus = len(final)

# get and save scores
model='AlgoV1'
print("Yesterday's Predictions vs Yesterday's Actual Prices, Sneaker Model, Evaluation: ") 
score_table(scores, model, final['PRICE'], final['predictions'], pct_in_range=pct_in_range, 
            transformed=True, num_skus=num_skus,
            notes=f'Monitor {str(date.today())} 1 Day')

eval_path = 's3://justin-automation-output/outputs/output/evaluation_v1_sneaker.csv'

scores['eval_date'] = date.today()
#scores.reset_index().to_csv(eval_path, index=False)
scores.reset_index().to_csv(eval_path, mode='a', header=False, index=False) #(not os.path.exists(eval_path)))

7051
Yesterday's Predictions vs Yesterday's Actual Prices, Sneaker Model, Evaluation: 


  result = getattr(ufunc, method)(*inputs, **kwargs)


MSE:  0.017169152018956108
RMSE:  0.13103111088194325
RMSE (retuned to normal scale):  30.375809165175276
MAE:  0.08970123266996316
MAE (retuned to normal scale):  14.889091285451535
R-squared:  0.9366909825004979


In [6]:
display(scores)

Unnamed: 0,mse,rmse,rmse_exp,mae,mae_exp,r2,adj_r2,pct_in_range,num_sneakers,notes,eval_date
AlgoV1,0.017169,0.131031,30.375809,0.089701,14.889091,0.936691,,0.633683,6205,Monitor 2024-03-20 1 Day,2024-03-20


# Sneaker count

In [8]:
ct = pd.read_csv('s3://historicaldata-sample/sale_count.csv')
ct.head()

Unnamed: 0,SKU,SALE_CT,MODEL_NO
0,FV5029-006,11877,4
1,CT8012-170,7227,1
2,AQ9129-170,6960,4
3,HP8739,4799,3
4,FJ9479-100,4689,4


### 1000 Sales or More

In [9]:
sku_list = ct[ct.SALE_CT >= 1000]

# evaluate model
scores = pd.DataFrame(columns = ['mse', 'rmse', 'rmse_exp', 'mae', 'mae_exp', 'r2', 
                                 'adj_r2', 'pct_in_range', 'num_sneakers', 'notes']) 
yesterday = date.today() - timedelta(days=1)

# preds for T-1
preds1 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-test/dt-{yesterday}/predictions1.csv')
preds2 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-test/dt-{yesterday}/predictions2.csv')
preds = pd.concat([preds1, preds2])

#preds.pred_date = pd.to_datetime(preds.pred_date, format='%Y-%m-%d %H:%M:%S')
# preds = preds[preds['pred_date'].dt.date==date.today()] # replace with yesterday
# preds['id'] = preds['SKU'].astype(str) + " " + preds['SIZE_USM'].astype(str)

# ground truth
# X_test = pd.read_csv('s3://arbit-algo/sagemaker/algo-v1/processing/output/test/X_test.csv')
y_truth = pd.read_csv('s3://historicaldata-sample/ground_truth_1d.csv')
y_truth = y_truth[y_truth.SKU.isin(sku_list.SKU)]
print(len(y_truth))


# dropping missing values
y_truth = y_truth.dropna()

#y_truth['RELEASEDATE'] = pd.to_datetime(y_truth['RELEASEDATE'])
#y_truth['RELEASEYEAR'] = [int(date.year) for date in y_truth.RELEASEDATE]
y_truth['SIZE'] = y_truth['SIZE'].astype(str).apply(lambda x: x.rstrip('.0') if x.endswith('.0') else x)

y_truth['ID'] = y_truth['SKU'].astype(str) + " " + y_truth['SIZE'].astype(str) + " " + y_truth['RELEASEDATE'].astype(str)
y_truth['ID'] = y_truth['ID'].str.upper()

# subset where ground truth exists, log transform to get to scale of model
final = pd.merge(y_truth[['ID', 'PRICE']], preds[['ID', 'predictions', 'prediction_low', 'prediction_high']], on='ID', how='inner')
final['predictions'] = np.log(final.predictions)
final['prediction_low'] = np.log(final.prediction_low)
final['prediction_high'] = np.log(final.prediction_high)
final['PRICE'] = np.log(final.PRICE)
final['price_in_range'] = final.apply(lambda x: 1 if x['prediction_low'] <= x['PRICE'] <= x['prediction_high'] else 0, axis=1)
final['eval_date'] = date.today()

final = final.drop_duplicates().reset_index(drop=True)
# final.to_csv('s3://arbit-algo/sagemaker/algo-v1/output/eval_values.csv', mode='a', header=False, index=False)

pct_in_range = sum(final.price_in_range)/len(final)
num_skus = len(final)

# get and save scores
model='AlgoV1'
print("Yesterday's Predictions vs Yesterday's Actual Prices, Sneaker Model, Evaluation: ") 
score_table(scores, model, final['PRICE'], final['predictions'], pct_in_range=pct_in_range, 
            transformed=True, num_skus=num_skus,
            notes=f'Monitor {str(date.today())} 1 Day - 1000 Sales or More in 30 Days')

eval_path = 's3://justin-automation-output/outputs/output/evaluation_v1_sneaker.csv'

#scores.reset_index().to_csv(eval_path, index=False)
scores['eval_date'] = date.today()
scores.reset_index().to_csv(eval_path, mode='a', header=False, index=False) #(not os.path.exists(eval_path)))

display(scores)

1207
Yesterday's Predictions vs Yesterday's Actual Prices, Sneaker Model, Evaluation: 
MSE:  0.00812917501916672
RMSE:  0.09016193775184027
RMSE (retuned to normal scale):  16.70887787171226
MAE:  0.060524146238191474
MAE (retuned to normal scale):  10.393564385553994
R-squared:  0.9597398690230025


  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,mse,rmse,rmse_exp,mae,mae_exp,r2,adj_r2,pct_in_range,num_sneakers,notes,eval_date
AlgoV1,0.008129,0.090162,16.708878,0.060524,10.393564,0.95974,,0.734694,1127,Monitor 2024-03-20 1 Day - 1000 Sales or More ...,2024-03-20


### 100-999 sales in 30 days

In [10]:
sku_list = ct[(ct.SALE_CT >= 100) & (ct.SALE_CT < 1000)]

# evaluate model
scores = pd.DataFrame(columns = ['mse', 'rmse', 'rmse_exp', 'mae', 'mae_exp', 'r2', 
                                 'adj_r2', 'pct_in_range', 'num_sneakers', 'notes']) 
yesterday = date.today() - timedelta(days=1)

# preds for T-1
preds1 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-test/dt-{yesterday}/predictions1.csv')
preds2 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-test/dt-{yesterday}/predictions2.csv')
preds = pd.concat([preds1, preds2])

#preds.pred_date = pd.to_datetime(preds.pred_date, format='%Y-%m-%d %H:%M:%S')
# preds = preds[preds['pred_date'].dt.date==date.today()] # replace with yesterday
# preds['id'] = preds['SKU'].astype(str) + " " + preds['SIZE_USM'].astype(str)

# ground truth
# X_test = pd.read_csv('s3://arbit-algo/sagemaker/algo-v1/processing/output/test/X_test.csv')
y_truth = pd.read_csv('s3://historicaldata-sample/ground_truth_1d.csv')
y_truth = y_truth[y_truth.SKU.isin(sku_list.SKU)]
print(len(y_truth))


# dropping missing values
y_truth = y_truth.dropna()

#y_truth['RELEASEDATE'] = pd.to_datetime(y_truth['RELEASEDATE'])
#y_truth['RELEASEYEAR'] = [int(date.year) for date in y_truth.RELEASEDATE]
y_truth['SIZE'] = y_truth['SIZE'].astype(str).apply(lambda x: x.rstrip('.0') if x.endswith('.0') else x)

y_truth['ID'] = y_truth['SKU'].astype(str) + " " + y_truth['SIZE'].astype(str) + " " + y_truth['RELEASEDATE'].astype(str)
y_truth['ID'] = y_truth['ID'].str.upper()

# subset where ground truth exists, log transform to get to scale of model
final = pd.merge(y_truth[['ID', 'PRICE']], preds[['ID', 'predictions', 'prediction_low', 'prediction_high']], on='ID', how='inner')
final['predictions'] = np.log(final.predictions)
final['prediction_low'] = np.log(final.prediction_low)
final['prediction_high'] = np.log(final.prediction_high)
final['PRICE'] = np.log(final.PRICE)
final['price_in_range'] = final.apply(lambda x: 1 if x['prediction_low'] <= x['PRICE'] <= x['prediction_high'] else 0, axis=1)
final['eval_date'] = date.today()

final = final.drop_duplicates().reset_index(drop=True)
# final.to_csv('s3://arbit-algo/sagemaker/algo-v1/output/eval_values.csv', mode='a', header=False, index=False)

pct_in_range = sum(final.price_in_range)/len(final)
num_skus = len(final)

# get and save scores
model='AlgoV1'
print("Yesterday's Predictions vs Yesterday's Actual Prices, Sneaker Model, Evaluation: ") 
score_table(scores, model, final['PRICE'], final['predictions'], pct_in_range=pct_in_range, 
            transformed=True, num_skus=num_skus,
            notes=f'Monitor {str(date.today())} 1 Day - 100-999 Sales in 30 Days')

eval_path = 's3://justin-automation-output/outputs/output/evaluation_v1_sneaker.csv'

#scores.reset_index().to_csv(eval_path, index=False)
scores['eval_date'] = date.today()
scores.reset_index().to_csv(eval_path, mode='a', header=False, index=False) #(not os.path.exists(eval_path)))

display(scores)

3472
Yesterday's Predictions vs Yesterday's Actual Prices, Sneaker Model, Evaluation: 
MSE:  0.015574589945857522
RMSE:  0.12479819688544191
RMSE (retuned to normal scale):  32.89207130590182
MAE:  0.08566920530734078
MAE (retuned to normal scale):  14.7668135738855
R-squared:  0.9404823153589925


  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,mse,rmse,rmse_exp,mae,mae_exp,r2,adj_r2,pct_in_range,num_sneakers,notes,eval_date
AlgoV1,0.015575,0.124798,32.892071,0.085669,14.766814,0.940482,,0.668652,3190,Monitor 2024-03-20 1 Day - 100-999 Sales in 30...,2024-03-20


### 50-100 Sales

In [15]:
sku_list = ct[(ct.SALE_CT >= 50) & (ct.SALE_CT < 100)]

# evaluate model
scores = pd.DataFrame(columns = ['mse', 'rmse', 'rmse_exp', 'mae', 'mae_exp', 'r2', 
                                 'adj_r2', 'pct_in_range', 'num_sneakers', 'notes'])  ## CHECK THIS
yesterday = date.today() - timedelta(days=1)

# preds for T-1
preds1 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-test/dt-{yesterday}/predictions1.csv')
preds2 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-test/dt-{yesterday}/predictions2.csv')
preds = pd.concat([preds1, preds2])

#preds.pred_date = pd.to_datetime(preds.pred_date, format='%Y-%m-%d %H:%M:%S')
# preds = preds[preds['pred_date'].dt.date==date.today()] # replace with yesterday
# preds['id'] = preds['SKU'].astype(str) + " " + preds['SIZE_USM'].astype(str)

# ground truth
# X_test = pd.read_csv('s3://arbit-algo/sagemaker/algo-v1/processing/output/test/X_test.csv')
y_truth = pd.read_csv('s3://historicaldata-sample/ground_truth_1d.csv')
y_truth = y_truth[y_truth.SKU.isin(sku_list.SKU)]
print(len(y_truth))


# dropping missing values
y_truth = y_truth.dropna()

#y_truth['RELEASEDATE'] = pd.to_datetime(y_truth['RELEASEDATE'])
#y_truth['RELEASEYEAR'] = [int(date.year) for date in y_truth.RELEASEDATE]
y_truth['SIZE'] = y_truth['SIZE'].astype(str).apply(lambda x: x.rstrip('.0') if x.endswith('.0') else x)

y_truth['ID'] = y_truth['SKU'].astype(str) + " " + y_truth['SIZE'].astype(str) + " " + y_truth['RELEASEDATE'].astype(str)
y_truth['ID'] = y_truth['ID'].str.upper()

# subset where ground truth exists, log transform to get to scale of model
final = pd.merge(y_truth[['ID', 'PRICE']], preds[['ID', 'predictions', 'prediction_low', 'prediction_high']], on='ID', how='inner')
final['predictions'] = np.log(final.predictions)
final['prediction_low'] = np.log(final.prediction_low)
final['prediction_high'] = np.log(final.prediction_high)
final['PRICE'] = np.log(final.PRICE)
final['price_in_range'] = final.apply(lambda x: 1 if x['prediction_low'] <= x['PRICE'] <= x['prediction_high'] else 0, axis=1)
final['eval_date'] = date.today()

final = final.drop_duplicates().reset_index(drop=True)
# final.to_csv('s3://arbit-algo/sagemaker/algo-v1/output/eval_values.csv', mode='a', header=False, index=False)

pct_in_range = sum(final.price_in_range)/len(final)
num_skus = len(final)

# get and save scores
model='AlgoV1'
print("Yesterday's Predictions vs Yesterday's Actual Prices, Sneaker Model, Evaluation: ") 
score_table(scores, model, final['PRICE'], final['predictions'], pct_in_range=pct_in_range, 
            num_skus=num_skus, transformed=True, 
            notes=f'Monitor {str(date.today())} 1 Day - 50-100 Sales in 30 Days')

eval_path = 's3://justin-automation-output/outputs/output/evaluation_v1_sneaker.csv'

#scores.reset_index().to_csv(eval_path, index=False)
scores['eval_date'] = date.today()
scores.reset_index().to_csv(eval_path, mode='a', header=False, index=False) #(not os.path.exists(eval_path)))

display(scores)

836
Yesterday's Predictions vs Yesterday's Actual Prices, Sneaker Model, Evaluation: 
MSE:  0.024265959312893538
RMSE:  0.15577534886140854
RMSE (retuned to normal scale):  29.224300574125564
MAE:  0.10867651996147033
MAE (retuned to normal scale):  17.23472661221585
R-squared:  0.912050180048445


Unnamed: 0,mse,rmse,rmse_exp,mae,mae_exp,r2,adj_r2,pct_in_range,num_sneakers,notes,eval_date
AlgoV1,0.024266,0.155775,29.224301,0.108677,17.234727,0.91205,,0.566757,734,Monitor 2024-03-20 1 Day - 50-100 Sales in 30 ...,2024-03-20


### Less than 50 sales

In [16]:
sku_list = ct[(ct.SALE_CT < 50)]

# evaluate model
scores = pd.DataFrame(columns = ['mse', 'rmse', 'rmse_exp', 'mae', 'mae_exp', 'r2', 
                                 'adj_r2', 'pct_in_range', 'num_sneakers', 'notes']) 
yesterday = date.today() - timedelta(days=1)

# preds for T-1
preds1 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-test/dt-{yesterday}/predictions1.csv')
preds2 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-test/dt-{yesterday}/predictions2.csv')
preds = pd.concat([preds1, preds2])

#preds.pred_date = pd.to_datetime(preds.pred_date, format='%Y-%m-%d %H:%M:%S')
# preds = preds[preds['pred_date'].dt.date==date.today()] # replace with yesterday
# preds['id'] = preds['SKU'].astype(str) + " " + preds['SIZE_USM'].astype(str)

# ground truth
# X_test = pd.read_csv('s3://arbit-algo/sagemaker/algo-v1/processing/output/test/X_test.csv')
y_truth = pd.read_csv('s3://historicaldata-sample/ground_truth_1d.csv')
y_truth = y_truth[y_truth.SKU.isin(sku_list.SKU)]
print(len(y_truth))


# dropping missing values
y_truth = y_truth.dropna()

#y_truth['RELEASEDATE'] = pd.to_datetime(y_truth['RELEASEDATE'])
#y_truth['RELEASEYEAR'] = [int(date.year) for date in y_truth.RELEASEDATE]
y_truth['SIZE'] = y_truth['SIZE'].astype(str).apply(lambda x: x.rstrip('.0') if x.endswith('.0') else x)

y_truth['ID'] = y_truth['SKU'].astype(str) + " " + y_truth['SIZE'].astype(str) + " " + y_truth['RELEASEDATE'].astype(str)
y_truth['ID'] = y_truth['ID'].str.upper()

# subset where ground truth exists, log transform to get to scale of model
final = pd.merge(y_truth[['ID', 'PRICE']], preds[['ID', 'predictions', 'prediction_low', 'prediction_high']], on='ID', how='inner')
final['predictions'] = np.log(final.predictions)
final['prediction_low'] = np.log(final.prediction_low)
final['prediction_high'] = np.log(final.prediction_high)
final['PRICE'] = np.log(final.PRICE)
final['price_in_range'] = final.apply(lambda x: 1 if x['prediction_low'] <= x['PRICE'] <= x['prediction_high'] else 0, axis=1)
final['eval_date'] = date.today()

final = final.drop_duplicates().reset_index(drop=True)
# final.to_csv('s3://arbit-algo/sagemaker/algo-v1/output/eval_values.csv', mode='a', header=False, index=False)

pct_in_range = sum(final.price_in_range)/len(final)
num_skus = len(final)

# get and save scores
model='AlgoV1'
print("Yesterday's Predictions vs Yesterday's Actual Prices, Sneaker Model, Evaluation: ") 
score_table(scores, model, final['PRICE'], final['predictions'], pct_in_range=pct_in_range, 
            transformed=True, num_skus=num_skus,
            notes=f'Monitor {str(date.today())} 1 Day - Less than 50 sales in 30 Days')

eval_path = 's3://justin-automation-output/outputs/output/evaluation_v1_sneaker.csv'

#scores.reset_index().to_csv(eval_path, index=False)
scores['eval_date'] = date.today()
scores.reset_index().to_csv(eval_path, mode='a', header=False, index=False) #(not os.path.exists(eval_path)))

display(scores)

1536
Yesterday's Predictions vs Yesterday's Actual Prices, Sneaker Model, Evaluation: 
MSE:  0.02589155283195182
RMSE:  0.16090852317994786
RMSE (retuned to normal scale):  33.981123037549615
MAE:  0.11727218832282778
MAE (retuned to normal scale):  18.125511030889292
R-squared:  0.9230399832414388


Unnamed: 0,mse,rmse,rmse_exp,mae,mae_exp,r2,adj_r2,pct_in_range,num_sneakers,notes,eval_date
AlgoV1,0.025892,0.160909,33.981123,0.117272,18.125511,0.92304,,0.480936,1154,Monitor 2024-03-20 1 Day - Less than 50 sales ...,2024-03-20


# SKU

## 1 Day Ago

In [17]:
# evaluate model
scores = pd.DataFrame(columns = ['mse', 'rmse', 'rmse_exp', 'mae', 'mae_exp', 'r2', 
                                 'adj_r2', 'pct_in_range', 'num_skus', 'notes']) ## CHECK THIS
yesterday = date.today() - timedelta(days=1)

# preds for T-1
preds1 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-sku/dt-{yesterday}/predictions_sku.csv')
preds2 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-sku/dt-{yesterday}/predictions_sku2.csv')
preds = pd.concat([preds1, preds2])
# preds = pd.concat([df_final, df_final2])

# ground truth
# X_test = pd.read_csv('s3://arbit-algo/sagemaker/algo-v1/processing/output/test/X_test.csv')
y_truth = pd.read_csv('s3://historicaldata-sample/ground_truth_sku_1d.csv')

# dropping missing values
y_truth = y_truth.dropna()

# subset where ground truth exists, log transform to get to scale of model
final = pd.merge(y_truth[['SKU', 'RELEASEDATE', 'PRICE']], preds[['SKU', 'RELEASEDATE', 'predictions', 'prediction_low', 'prediction_high']], 
                 on=['SKU', 'RELEASEDATE'], how='inner')
final['predictions'] = np.log(final.predictions)
final['prediction_low'] = np.log(final.prediction_low)
final['prediction_high'] = np.log(final.prediction_high)
final['PRICE'] = np.log(final.PRICE)
final['price_in_range'] = final.apply(lambda x: 1 if x['prediction_low'] <= x['PRICE'] <= x['prediction_high'] else 0, axis=1)
final['eval_date'] = date.today()
final = final.drop_duplicates().reset_index(drop=True)
# final.to_csv('s3://arbit-algo/sagemaker/algo-v1/output/eval_values.csv', mode='a', header=False, index=False)

pct_in_range = sum(final.price_in_range)/len(final)
num_skus = len(final)

# get and save scores
model='AlgoV1'
print("Yesterday's Predictions vs Yesterday's Actual Prices, SKU Model, Evaluation: ") 
score_table(scores, model, final['PRICE'], final['predictions'], pct_in_range=pct_in_range, 
            transformed=True, num_skus=num_skus,
            notes=f'Monitor SKU {str(date.today())} 1 Day')

eval_path = 's3://justin-automation-output/outputs/output/evaluation_v1_sku.csv'

#scores.reset_index().to_csv(eval_path, index=False)
scores['eval_date'] = date.today()
scores.reset_index().to_csv(eval_path, mode='a', header=False, index=False) #(not os.path.exists(eval_path)))
display(scores)

  result = getattr(ufunc, method)(*inputs, **kwargs)


Yesterday's Predictions vs Yesterday's Actual Prices, SKU Model, Evaluation: 
MSE:  0.020589677146025186
RMSE:  0.1434910350719695
RMSE (retuned to normal scale):  30.81647133573274
MAE:  0.10158215930386068
MAE (retuned to normal scale):  15.988244145929965
R-squared:  0.9313955495301339


Unnamed: 0,mse,rmse,rmse_exp,mae,mae_exp,r2,adj_r2,pct_in_range,num_skus,notes,eval_date
AlgoV1,0.02059,0.143491,30.816471,0.101582,15.988244,0.931396,,0.67918,3170,Monitor SKU 2024-03-20 1 Day,2024-03-20


### 1000 or More Sales in 30 Days

In [18]:
sku_list = ct[(ct.SALE_CT >= 1000)]

scores = pd.DataFrame(columns = ['mse', 'rmse', 'rmse_exp', 'mae', 'mae_exp', 'r2', 
                                 'adj_r2', 'pct_in_range', 'num_skus', 'notes']) 
yesterday = date.today() - timedelta(days=1)

# preds for T-1
preds1 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-sku/dt-{yesterday}/predictions_sku.csv')
preds2 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-sku/dt-{yesterday}/predictions_sku2.csv')
preds = pd.concat([preds1, preds2])
# preds = pd.concat([df_final, df_final2])

# ground truth
# X_test = pd.read_csv('s3://arbit-algo/sagemaker/algo-v1/processing/output/test/X_test.csv')
y_truth = pd.read_csv('s3://historicaldata-sample/ground_truth_sku_1d.csv')

# dropping missing values
y_truth = y_truth.dropna()
y_truth = y_truth[y_truth.SKU.isin(sku_list.SKU)]

# subset where ground truth exists, log transform to get to scale of model
final = pd.merge(y_truth[['SKU', 'RELEASEDATE', 'PRICE']], preds[['SKU', 'RELEASEDATE', 'predictions', 'prediction_low', 'prediction_high']], 
                 on=['SKU', 'RELEASEDATE'], how='inner')
final['predictions'] = np.log(final.predictions)
final['prediction_low'] = np.log(final.prediction_low)
final['prediction_high'] = np.log(final.prediction_high)
final['PRICE'] = np.log(final.PRICE)
final['price_in_range'] = final.apply(lambda x: 1 if x['prediction_low'] <= x['PRICE'] <= x['prediction_high'] else 0, axis=1)
final['eval_date'] = date.today()
final = final.drop_duplicates().reset_index(drop=True)
# final.to_csv('s3://arbit-algo/sagemaker/algo-v1/output/eval_values.csv', mode='a', header=False, index=False)

pct_in_range = sum(final.price_in_range)/len(final)
num_skus = len(final)

# get and save scores
model='AlgoV1'
print("Yesterday's Predictions vs Yesterday's Actual Prices, SKU Model, Evaluation: ") 
score_table(scores, model, final['PRICE'], final['predictions'], pct_in_range=pct_in_range, 
            transformed=True, num_skus=num_skus,
            notes=f'Monitor SKU {str(date.today())} 1 Day, 1000 or More Sales in 30 Days')
eval_path = 's3://justin-automation-output/outputs/output/evaluation_v1_sku.csv'

#scores.reset_index().to_csv(eval_path, index=False)
scores['eval_date'] = date.today()
scores.reset_index().to_csv(eval_path, mode='a', header=False, index=False) #(not os.path.exists(eval_path)))
display(scores)

Yesterday's Predictions vs Yesterday's Actual Prices, SKU Model, Evaluation: 
MSE:  0.0023874931973617377
RMSE:  0.048861981103530155
RMSE (retuned to normal scale):  9.30758906636142
MAE:  0.03539117704198486
MAE (retuned to normal scale):  6.084211495882361
R-squared:  0.9865328355488935


Unnamed: 0,mse,rmse,rmse_exp,mae,mae_exp,r2,adj_r2,pct_in_range,num_skus,notes,eval_date
AlgoV1,0.002387,0.048862,9.307589,0.035391,6.084211,0.986533,,0.647059,102,"Monitor SKU 2024-03-20 1 Day, 1000 or More Sal...",2024-03-20


### 100 to 1000 Sales in 30 Days

In [19]:
sku_list = ct[(ct.SALE_CT >= 100) & (ct.SALE_CT < 1000)]

scores = pd.DataFrame(columns = ['mse', 'rmse', 'rmse_exp', 'mae', 'mae_exp', 'r2', 
                                 'adj_r2', 'pct_in_range', 'num_skus', 'notes']) 
yesterday = date.today() - timedelta(days=1)

# preds for T-1
preds1 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-sku/dt-{yesterday}/predictions_sku.csv')
preds2 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-sku/dt-{yesterday}/predictions_sku2.csv')
preds = pd.concat([preds1, preds2])
# preds = pd.concat([df_final, df_final2])

# ground truth
# X_test = pd.read_csv('s3://arbit-algo/sagemaker/algo-v1/processing/output/test/X_test.csv')
y_truth = pd.read_csv('s3://historicaldata-sample/ground_truth_sku_1d.csv')

# dropping missing values
y_truth = y_truth.dropna()
y_truth = y_truth[y_truth.SKU.isin(sku_list.SKU)]

# subset where ground truth exists, log transform to get to scale of model
final = pd.merge(y_truth[['SKU', 'RELEASEDATE', 'PRICE']], preds[['SKU', 'RELEASEDATE', 'predictions', 'prediction_low', 'prediction_high']], 
                 on=['SKU', 'RELEASEDATE'], how='inner')
final['predictions'] = np.log(final.predictions)
final['prediction_low'] = np.log(final.prediction_low)
final['prediction_high'] = np.log(final.prediction_high)
final['PRICE'] = np.log(final.PRICE)
final['price_in_range'] = final.apply(lambda x: 1 if x['prediction_low'] <= x['PRICE'] <= x['prediction_high'] else 0, axis=1)
final['eval_date'] = date.today()
final = final.drop_duplicates().reset_index(drop=True)
# final.to_csv('s3://arbit-algo/sagemaker/algo-v1/output/eval_values.csv', mode='a', header=False, index=False)

pct_in_range = sum(final.price_in_range)/len(final)
num_skus = len(final)

# get and save scores
model='AlgoV1'
print("Yesterday's Predictions vs Yesterday's Actual Prices, SKU Model, Evaluation: ") 
score_table(scores, model, final['PRICE'], final['predictions'], pct_in_range=pct_in_range, 
            transformed=True, num_skus=num_skus,
            notes=f'Monitor SKU {str(date.today())} 1 Day 100-999 Sales in 30 Days')
eval_path = 's3://justin-automation-output/outputs/output/evaluation_v1_sku.csv'

#scores.reset_index().to_csv(eval_path, index=False)
scores['eval_date'] = date.today()
scores.reset_index().to_csv(eval_path, mode='a', header=False, index=False) #(not os.path.exists(eval_path)))
display(scores)

Yesterday's Predictions vs Yesterday's Actual Prices, SKU Model, Evaluation: 
MSE:  0.010450931813822237
RMSE:  0.10222979905009222
RMSE (retuned to normal scale):  20.69827912306345
MAE:  0.07152488831450965
MAE (retuned to normal scale):  12.23073819102388
R-squared:  0.9577090593301514


Unnamed: 0,mse,rmse,rmse_exp,mae,mae_exp,r2,adj_r2,pct_in_range,num_skus,notes,eval_date
AlgoV1,0.010451,0.10223,20.698279,0.071525,12.230738,0.957709,,0.629283,963,Monitor SKU 2024-03-20 1 Day 100-999 Sales in ...,2024-03-20


### 50-100 Sales in 30 Days

In [20]:
sku_list = ct[(ct.SALE_CT >= 50) & (ct.SALE_CT < 100)]

scores = pd.DataFrame(columns = ['mse', 'rmse', 'rmse_exp', 'mae', 'mae_exp', 'r2', 
                                 'adj_r2', 'pct_in_range', 'num_skus', 'notes']) 
yesterday = date.today() - timedelta(days=1)

# preds for T-1
preds1 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-sku/dt-{yesterday}/predictions_sku.csv')
preds2 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-sku/dt-{yesterday}/predictions_sku2.csv')
preds = pd.concat([preds1, preds2])
# preds = pd.concat([df_final, df_final2])

# ground truth
# X_test = pd.read_csv('s3://arbit-algo/sagemaker/algo-v1/processing/output/test/X_test.csv')
y_truth = pd.read_csv('s3://historicaldata-sample/ground_truth_sku_1d.csv')

# dropping missing values
y_truth = y_truth.dropna()
y_truth = y_truth[y_truth.SKU.isin(sku_list.SKU)]

# subset where ground truth exists, log transform to get to scale of model
final = pd.merge(y_truth[['SKU', 'RELEASEDATE', 'PRICE']], preds[['SKU', 'RELEASEDATE', 'predictions', 'prediction_low', 'prediction_high']], 
                 on=['SKU', 'RELEASEDATE'], how='inner')
final['predictions'] = np.log(final.predictions)
final['prediction_low'] = np.log(final.prediction_low)
final['prediction_high'] = np.log(final.prediction_high)
final['PRICE'] = np.log(final.PRICE)
final['price_in_range'] = final.apply(lambda x: 1 if x['prediction_low'] <= x['PRICE'] <= x['prediction_high'] else 0, axis=1)
final['eval_date'] = date.today()
final = final.drop_duplicates().reset_index(drop=True)
# final.to_csv('s3://arbit-algo/sagemaker/algo-v1/output/eval_values.csv', mode='a', header=False, index=False)

pct_in_range = sum(final.price_in_range)/len(final)
num_skus = len(final)

# get and save scores
model='AlgoV1'
print("Yesterday's Predictions vs Yesterday's Actual Prices, SKU Model, Evaluation: ") 
score_table(scores, model, final['PRICE'], final['predictions'], pct_in_range=pct_in_range, 
            transformed=True, num_skus=num_skus,
            notes=f'Monitor SKU {str(date.today())} 1 Day 50-100 Sales in 30 Days')
eval_path = 's3://justin-automation-output/outputs/output/evaluation_v1_sku.csv'

#scores.reset_index().to_csv(eval_path, index=False)
scores['eval_date'] = date.today()
scores.reset_index().to_csv(eval_path, mode='a', header=False, index=False) #(not os.path.exists(eval_path)))
display(scores)

Yesterday's Predictions vs Yesterday's Actual Prices, SKU Model, Evaluation: 
MSE:  0.01911352610173029
RMSE:  0.13825167666878507
RMSE (retuned to normal scale):  26.18602188572893
MAE:  0.10581283869333608
MAE (retuned to normal scale):  16.57171244278464
R-squared:  0.9222175790648568


Unnamed: 0,mse,rmse,rmse_exp,mae,mae_exp,r2,adj_r2,pct_in_range,num_skus,notes,eval_date
AlgoV1,0.019114,0.138252,26.186022,0.105813,16.571712,0.922218,,0.640449,534,Monitor SKU 2024-03-20 1 Day 50-100 Sales in 3...,2024-03-20


### Less than 50 Sales in 30 Days

In [21]:
sku_list = ct[(ct.SALE_CT < 50)]

scores = pd.DataFrame(columns = ['mse', 'rmse', 'rmse_exp', 'mae', 'mae_exp', 'r2', 
                                 'adj_r2', 'pct_in_range', 'num_skus', 'notes']) 
yesterday = date.today() - timedelta(days=1)

# preds for T-1
preds1 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-sku/dt-{yesterday}/predictions_sku.csv')
preds2 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-sku/dt-{yesterday}/predictions_sku2.csv')
preds = pd.concat([preds1, preds2])
# preds = pd.concat([df_final, df_final2])

# ground truth
# X_test = pd.read_csv('s3://arbit-algo/sagemaker/algo-v1/processing/output/test/X_test.csv')
y_truth = pd.read_csv('s3://historicaldata-sample/ground_truth_sku_1d.csv')

# dropping missing values
y_truth = y_truth.dropna()
y_truth = y_truth[y_truth.SKU.isin(sku_list.SKU)]

# subset where ground truth exists, log transform to get to scale of model
final = pd.merge(y_truth[['SKU', 'RELEASEDATE', 'PRICE']], preds[['SKU', 'RELEASEDATE', 'predictions', 'prediction_low', 'prediction_high']], 
                 on=['SKU', 'RELEASEDATE'], how='inner')
final['predictions'] = np.log(final.predictions)
final['prediction_low'] = np.log(final.prediction_low)
final['prediction_high'] = np.log(final.prediction_high)
final['PRICE'] = np.log(final.PRICE)
final['price_in_range'] = final.apply(lambda x: 1 if x['prediction_low'] <= x['PRICE'] <= x['prediction_high'] else 0, axis=1)
final['eval_date'] = date.today()
final = final.drop_duplicates().reset_index(drop=True)
# final.to_csv('s3://arbit-algo/sagemaker/algo-v1/output/eval_values.csv', mode='a', header=False, index=False)

pct_in_range = sum(final.price_in_range)/len(final)
num_skus = len(final)

# get and save scores
model='AlgoV1'
print("Yesterday's Predictions vs Yesterday's Actual Prices, SKU Model, Evaluation: ") 
score_table(scores, model, final['PRICE'], final['predictions'], pct_in_range=pct_in_range, 
            transformed=True, num_skus=num_skus,
            notes=f'Monitor SKU {str(date.today())} 1 Day Less than 50 Sales in 30 Days')
eval_path = 's3://justin-automation-output/outputs/output/evaluation_v1_sku.csv'
#scores.reset_index().to_csv(eval_path, index=False)
scores['eval_date'] = date.today()
scores.reset_index().to_csv(eval_path, mode='a', header=False, index=False) #(not os.path.exists(eval_path)))
display(scores)

Yesterday's Predictions vs Yesterday's Actual Prices, SKU Model, Evaluation: 
MSE:  0.028488148931721295
RMSE:  0.1687843266767424
RMSE (retuned to normal scale):  37.61538131510079
MAE:  0.12286634094579357
MAE (retuned to normal scale):  18.736250189443027
R-squared:  0.9158399159067228


  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,mse,rmse,rmse_exp,mae,mae_exp,r2,adj_r2,pct_in_range,num_skus,notes,eval_date
AlgoV1,0.028488,0.168784,37.615381,0.122866,18.73625,0.91584,,0.725016,1571,Monitor SKU 2024-03-20 1 Day Less than 50 Sale...,2024-03-20
