## Setup

In [2]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
import numpy as np
import pandas as pd

from IPython.display import Image
from IPython.display import display

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout

from sagemaker import get_execution_role
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.transformer import Transformer
from sagemaker.image_uris import retrieve
from sagemaker.utils import name_from_base
from sagemaker.tensorflow import TensorFlow
from sagemaker.tensorflow import TensorFlowModel

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import tarfile
import io
import shutil
from io import StringIO, BytesIO
import joblib

from datetime import date, timedelta
from datetime import datetime
import time
from time import gmtime, strftime

In [3]:
bucket_name = 'justin-automation-output'
prefix = 'outputs'

#s3 = boto3.resource('s3')

In [4]:
# needed functions

def regression_scores(y_test, preds, transformed=False):
    '''
    Returns and prints evaluation metics for a regression model
    '''
    if transformed:

        mse = mean_squared_error(y_test, preds)
        rmse =  mean_squared_error(y_test, preds, squared=False)
        rmse_exp = mean_squared_error(np.exp(y_test), np.exp(preds), squared=False)
        mae = mean_absolute_error(y_test, preds)
        mae_exp = mean_absolute_error(np.exp(y_test), np.exp(preds))

    else:
        mse = mean_squared_error(y_test, preds)
        rmse =  mean_squared_error(y_test, preds, squared=False)
        rmse_exp = 'N/A'
        mae = mean_absolute_error(y_test, preds)
        mae_exp = 'N/A'
        
    r2 = r2_score(y_test, preds)
    adj_r2 = 'N/A'

    print('MSE: ', mse)
    print('RMSE: ', rmse)
    print('RMSE (retuned to normal scale): ', rmse_exp)
    print('MAE: ', mae)
    print('MAE (retuned to normal scale): ', mae_exp)
    print('R-squared: ', r2)

    return mse, rmse, rmse_exp, mae, mae_exp, r2, adj_r2

def score_table(scores, model_name, y_test, preds, transformed=False, notes=None):
    '''
    creates a data frame with various scores for each model
    '''
    
    mse, rmse, rmse_exp, mae, mae_exp, r2, adj_r2 = regression_scores(y_test, preds, transformed)
    
    score_list = []
    score_list.extend((mse, rmse, rmse_exp, mae, mae_exp, r2, adj_r2, notes))
    
    scores.loc[model_name] = score_list
    return scores

In [4]:
!aws s3 cp --recursive s3://arbit-algo/sagemaker/algo-v1/input/inference/ inference/
!aws s3 cp --recursive s3://arbit-algo/sagemaker/algo-v1/input/code1/ code1/

## Inference

### Model 1

In [5]:
get_last_modified = lambda obj: int(obj['LastModified'].strftime('%s'))
s3 = boto3.client('s3')
objs = s3.list_objects_v2(Bucket='arbit-algo', Prefix='sagemaker/algo-v1/output/models/1/')['Contents']
keys_with_model_tar_gz = [item for item in objs if 'model.tar.gz' in item['Key']]
last_added = [obj['Key'] for obj in sorted(keys_with_model_tar_gz, key=get_last_modified, reverse=True)][0]

if os.path.exists('1') and os.path.isdir('1'):
    shutil.rmtree('1')

s3_object = s3.get_object(Bucket='arbit-algo', Key=last_added)

wholefile = s3_object['Body'].read()
fileobj = io.BytesIO(wholefile)
tarf = tarfile.open(fileobj=fileobj)
names = tarf.getnames()
for name in names:
    print(name)

model_files = [names]
tarf.extractall()

model = tf.keras.models.load_model('1')

1
1/keras_metadata.pb
1/saved_model.pb
1/assets
1/variables
1/variables/variables.data-00000-of-00001
1/variables/variables.index


In [6]:
from inference.preprocess import *

training_data = 's3://arbit-algo/sagemaker/algo-v1/processing/input/input_training.csv'

# just to make sure we capture all columns
X_train_preprocessed = pd.read_csv('s3://arbit-algo/sagemaker/algo-v1/processing/output/train/X_train.csv')

df_model = final(training_data) #from preprocess_v1
df_model = df_model[(df_model['SOLD_PRICE'] > 5) & 
                    (df_model['RETAILPRICE']>5) & 
                    (df_model['num_sales']>=20)]
df_model = df_model.dropna().reset_index(drop=True)

X, y = final_preprocess(df_model)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RELEASEDATE'][df.RELEASEDATE.isna()] = df['RELEASEDATE'][~df.RELEASEDATE.isna()].quantile(0.5, interpolation="midpoint")


In [7]:
model_no = 1

model_data = f's3://arbit-algo/sagemaker/algo-v1/processing/input/inference/input_inference{model_no}.csv'

bool_cols = ['IS_COLLAB', 'IS_OG_SE']
cat = ['GENDER', 'SILHOUETTE', 'COLOR', 'BRAND', 'primary_source']
cont = ['SIZE_CM', 'RETAILPRICE', 'num_sales', 'std_sale_px', 'DAYS_SINCE_LAST_SALE', #'avg_sale_px', 
        'DAYS_SINCE_RELEASE', 'source_count', 'lag0', 'lag1', 'lag2', 'lag3', 'lag4']

scaler = StandardScaler().fit(X[cont]) # or X_train
ohe = OneHotEncoder(handle_unknown='ignore').fit(X[cat])

df_new = final(model_data, inference=True)
df_new = string_cleanup(df_new)
df_new = df_new.dropna()#.reset_index(drop=True) # drop any final missing values

X_new, y_new = final_preprocess(df_new)

# drop any final missing & extreme values
# deleting any infinity values (already scaled and logged, so this shouldnt be dropping any actual columns)

print('length of X_new:', len(X_new))
#X_new = X_new.reset_index(drop=True)
mask = X_new.isna()
X_new = X_new[~mask]#.reset_index(drop=True)

# drop corresponing rows in model_data
print('length of df_new:', len(df_new))

df_new = df_new[~mask.any(axis=1)]#.reset_index(drop=True)
print('lengths match?', len(X_new)==len(df_new))

# one hot encode, scale & concat
df_ohe_new = pd.DataFrame(ohe.transform(X_new[cat]).toarray(), 
                          columns=ohe.get_feature_names(X_new[cat].columns), 
                          index=X_new[cat].index)
X_new_cont = pd.DataFrame(scaler.transform(X_new[cont]),columns=X_new[cont].columns,index=X_new[cont].index)
bool_cols_new = pd.get_dummies(X_new[bool_cols], columns=bool_cols, drop_first= True)

X_new_preprocessed = pd.concat([X_new_cont, df_ohe_new], axis=1)
X_new_preprocessed = pd.concat([X_new_preprocessed, bool_cols_new], axis=1)

# Get missing columns in the training test
missing_cols = set(X_train_preprocessed.columns) - set(X_new_preprocessed.columns)
for c in missing_cols:
    X_new_preprocessed[c] = 0

X_new_preprocessed1 = X_new_preprocessed[X_train_preprocessed.columns]

#X_new_preprocessed.to_csv('X_inference1.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RELEASEDATE'][df.RELEASEDATE.isna()] = df['RELEASEDATE'][~df.RELEASEDATE.isna()].quantile(0.5, interpolation="midpoint")


length of X_new: 6785
length of df_new: 6785
lengths match? True


In [8]:
model_no = 2

model_data = f's3://arbit-algo/sagemaker/algo-v1/processing/input/inference/input_inference{model_no}.csv'

bool_cols = ['IS_COLLAB', 'IS_OG_SE']
cat = ['GENDER', 'SILHOUETTE', 'COLOR', 'BRAND', 'primary_source']
cont = ['SIZE_CM', 'RETAILPRICE', 'num_sales', 'std_sale_px', 'DAYS_SINCE_LAST_SALE', #'avg_sale_px', 
        'DAYS_SINCE_RELEASE', 'source_count', 'lag0', 'lag1', 'lag2', 'lag3', 'lag4']

scaler = StandardScaler().fit(X[cont]) # or X_train
ohe = OneHotEncoder(handle_unknown='ignore').fit(X[cat])

df_new = final(model_data, inference=True)
df_new = string_cleanup(df_new)
df_new = df_new.dropna()#.reset_index(drop=True) # drop any final missing values

X_new, y_new = final_preprocess(df_new)

# drop any final missing & extreme values
# deleting any infinity values (already scaled and logged, so this shouldnt be dropping any actual columns)

print('length of X_new:', len(X_new))
#X_new = X_new.reset_index(drop=True)
mask = X_new.isna()
X_new = X_new[~mask]#.reset_index(drop=True)

# drop corresponing rows in model_data
print('length of df_new:', len(df_new))

df_new = df_new[~mask.any(axis=1)]#.reset_index(drop=True)
print('lengths match?', len(X_new)==len(df_new))

# one hot encode, scale & concat
df_ohe_new = pd.DataFrame(ohe.transform(X_new[cat]).toarray(), 
                          columns=ohe.get_feature_names(X_new[cat].columns), 
                          index=X_new[cat].index)
X_new_cont = pd.DataFrame(scaler.transform(X_new[cont]),columns=X_new[cont].columns,index=X_new[cont].index)
bool_cols_new = pd.get_dummies(X_new[bool_cols], columns=bool_cols, drop_first= True)

X_new_preprocessed = pd.concat([X_new_cont, df_ohe_new], axis=1)
X_new_preprocessed = pd.concat([X_new_preprocessed, bool_cols_new], axis=1)

# Get missing columns in the training test
missing_cols = set(X_train_preprocessed.columns) - set(X_new_preprocessed.columns)
for c in missing_cols:
    X_new_preprocessed[c] = 0

X_new_preprocessed2 = X_new_preprocessed[X_train_preprocessed.columns]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RELEASEDATE'][df.RELEASEDATE.isna()] = df['RELEASEDATE'][~df.RELEASEDATE.isna()].quantile(0.5, interpolation="midpoint")


length of X_new: 8552
length of df_new: 8552
lengths match? True


In [9]:
model_no = 3

model_data = f's3://arbit-algo/sagemaker/algo-v1/processing/input/inference/input_inference{model_no}.csv'

bool_cols = ['IS_COLLAB', 'IS_OG_SE']
cat = ['GENDER', 'SILHOUETTE', 'COLOR', 'BRAND', 'primary_source']
cont = ['SIZE_CM', 'RETAILPRICE', 'num_sales', 'std_sale_px', 'DAYS_SINCE_LAST_SALE', #'avg_sale_px', 
        'DAYS_SINCE_RELEASE', 'source_count', 'lag0', 'lag1', 'lag2', 'lag3', 'lag4']

scaler = StandardScaler().fit(X[cont]) # or X_train
ohe = OneHotEncoder(handle_unknown='ignore').fit(X[cat])

df_new = final(model_data, inference=True)
df_new = string_cleanup(df_new)
df_new = df_new.dropna()#.reset_index(drop=True) # drop any final missing values

X_new, y_new = final_preprocess(df_new)

# drop any final missing & extreme values
# deleting any infinity values (already scaled and logged, so this shouldnt be dropping any actual columns)

print('length of X_new:', len(X_new))
#X_new = X_new.reset_index(drop=True)
mask = X_new.isna()
X_new = X_new[~mask]#.reset_index(drop=True)

# drop corresponing rows in model_data
print('length of df_new:', len(df_new))

df_new = df_new[~mask.any(axis=1)]#.reset_index(drop=True)
print('lengths match?', len(X_new)==len(df_new))

# one hot encode, scale & concat
df_ohe_new = pd.DataFrame(ohe.transform(X_new[cat]).toarray(), 
                          columns=ohe.get_feature_names(X_new[cat].columns), 
                          index=X_new[cat].index)
X_new_cont = pd.DataFrame(scaler.transform(X_new[cont]),columns=X_new[cont].columns,index=X_new[cont].index)
bool_cols_new = pd.get_dummies(X_new[bool_cols], columns=bool_cols, drop_first= True)

X_new_preprocessed = pd.concat([X_new_cont, df_ohe_new], axis=1)
X_new_preprocessed = pd.concat([X_new_preprocessed, bool_cols_new], axis=1)

# Get missing columns in the training test
missing_cols = set(X_train_preprocessed.columns) - set(X_new_preprocessed.columns)
for c in missing_cols:
    X_new_preprocessed[c] = 0

X_new_preprocessed3 = X_new_preprocessed[X_train_preprocessed.columns]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RELEASEDATE'][df.RELEASEDATE.isna()] = df['RELEASEDATE'][~df.RELEASEDATE.isna()].quantile(0.5, interpolation="midpoint")


length of X_new: 1100
length of df_new: 1100
lengths match? True


In [10]:
model_no = 4

model_data = f's3://arbit-algo/sagemaker/algo-v1/processing/input/inference/input_inference{model_no}.csv'

bool_cols = ['IS_COLLAB', 'IS_OG_SE']
cat = ['GENDER', 'SILHOUETTE', 'COLOR', 'BRAND', 'primary_source']
cont = ['SIZE_CM', 'RETAILPRICE', 'num_sales', 'std_sale_px', 'DAYS_SINCE_LAST_SALE', #'avg_sale_px', 
        'DAYS_SINCE_RELEASE', 'source_count', 'lag0', 'lag1', 'lag2', 'lag3', 'lag4']

scaler = StandardScaler().fit(X[cont]) # or X_train
ohe = OneHotEncoder(handle_unknown='ignore').fit(X[cat])

df_new = final(model_data, inference=True)
df_new = string_cleanup(df_new)
df_new = df_new.dropna()#.reset_index(drop=True) # drop any final missing values

X_new, y_new = final_preprocess(df_new)

# drop any final missing & extreme values
# deleting any infinity values (already scaled and logged, so this shouldnt be dropping any actual columns)

print('length of X_new:', len(X_new))
#X_new = X_new.reset_index(drop=True)
mask = X_new.isna()
X_new = X_new[~mask]#.reset_index(drop=True)

# drop corresponing rows in model_data
print('length of df_new:', len(df_new))

df_new = df_new[~mask.any(axis=1)]#.reset_index(drop=True)
print('lengths match?', len(X_new)==len(df_new))

# one hot encode, scale & concat
df_ohe_new = pd.DataFrame(ohe.transform(X_new[cat]).toarray(), 
                          columns=ohe.get_feature_names(X_new[cat].columns), 
                          index=X_new[cat].index)
X_new_cont = pd.DataFrame(scaler.transform(X_new[cont]),columns=X_new[cont].columns,index=X_new[cont].index)
bool_cols_new = pd.get_dummies(X_new[bool_cols], columns=bool_cols, drop_first= True)

X_new_preprocessed = pd.concat([X_new_cont, df_ohe_new], axis=1)
X_new_preprocessed = pd.concat([X_new_preprocessed, bool_cols_new], axis=1)

# Get missing columns in the training test
missing_cols = set(X_train_preprocessed.columns) - set(X_new_preprocessed.columns)
for c in missing_cols:
    X_new_preprocessed[c] = 0

X_new_preprocessed4 = X_new_preprocessed[X_train_preprocessed.columns]

  data = final_cleanup(data_filepath, min_sale, inference=inference)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RELEASEDATE'][df.RELEASEDATE.isna()] = df['RELEASEDATE'][~df.RELEASEDATE.isna()].quantile(0.5, interpolation="midpoint")


length of X_new: 26
length of df_new: 26
lengths match? True


In [11]:
# concatenating all preds

prediction = model.predict(X_new_preprocessed1)
df1 = pd.DataFrame(prediction).rename(columns={0: 'predictions'})
product_info1 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/processing/output/product_info_{len(df1)}.csv')
product_info1 = product_info1.reset_index()

df_merge1 = pd.concat([df1, product_info1], axis=1)

prediction = model.predict(X_new_preprocessed2)
df2 = pd.DataFrame(prediction).rename(columns={0: 'predictions'})
product_info2 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/processing/output/product_info_{len(df2)}.csv')
product_info2 = product_info2.reset_index()

df_merge2 = pd.concat([df2, product_info2], axis=1)

prediction = model.predict(X_new_preprocessed3)
df3 = pd.DataFrame(prediction).rename(columns={0: 'predictions'})
product_info3 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/processing/output/product_info_{len(df3)}.csv')
product_info3 = product_info3.reset_index()

df_merge3 = pd.concat([df3, product_info3], axis=1)

prediction = model.predict(X_new_preprocessed4)
df4 = pd.DataFrame(prediction).rename(columns={0: 'predictions'})
product_info4 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/processing/output/product_info_{len(df4)}.csv')
product_info4 = product_info4.reset_index()

df_merge4 = pd.concat([df4, product_info4], axis=1)

# putting it all together
df_merge = pd.concat([df_merge1, df_merge2[1:], df_merge3[1:], df_merge4[1:]], ignore_index=True)
df_merge['predictions'] = [np.exp(float(x)) for x in df_merge['predictions']]

model_info = pd.read_csv('s3://arbit-algo/sagemaker/algo-v1/output/evaluation.csv')
mae = model_info['mae_exp'][(~model_info.notes.isna()) & (model_info['notes'].str.contains('Model 1'))].iloc[-1]
# mae_individual = mae * (df_merge.predictions / np.mean(df_merge.predictions))
mae_individual = mae * (df_merge.std_sale_px / np.mean(df_merge.std_sale_px)) # using sneaker's standard deviation of price

# need to get original unchanged size
df_final = df_merge[['ID', 'SKU', 'SIZE_CM', 'GENDER', 'predictions']]
                     #'num_sales', 'DAYS_SINCE_LAST_SALE', 'DAYS_SINCE_RELEASE', 'std_sale_px', 'avg_sale_px_last_5', 'lag1', 'lag2', 'lag3']] # #'avg_sale_px', 
df_final['prediction_low'] = df_final['predictions'] - mae_individual
df_final['prediction_low'] = df_final['prediction_low'].apply(lambda x: max(0, x))
df_final['prediction_high'] = df_final['predictions'] + mae_individual
df_final['SKU'] = df_final['SKU'].str.upper()
df_final['ID'] = df_final['ID'].str.upper()
df_final = df_final.rename(columns={'SIZE_CM': 'SIZE_VALUE'})

from datetime import date, timedelta
from datetime import datetime
import time
from time import gmtime, strftime

df_final['pred_date'] = datetime.now().strftime('%Y-%m-%d')
df_final.to_csv(f's3://justin-automation-output/outputs/output/predictions-test/dt-{date.today()}/predictions1.csv')
print('Model 1 Preds:')
display(df_final.head())

Model 1 Preds:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['prediction_low'] = df_final['predictions'] - mae_individual
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['prediction_high'] = df_final['predictions'] + mae_individual
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['SKU'] = df_final['SKU'].str.upper()
A value is trying 

Unnamed: 0,ID,SKU,SIZE_VALUE,GENDER,predictions,prediction_low,prediction_high,pred_date
0,1201A019-108 10 2023-05-30,1201A019-108,269.0,men,214.782534,196.009124,233.555943,2023-12-22
1,1201A019-108 10.5 2023-05-30,1201A019-108,274.0,men,221.61251,211.317934,231.907086,2023-12-22
2,1201A019-108 11 2023-05-30,1201A019-108,278.0,men,222.710947,198.570671,246.851223,2023-12-22
3,1201A019-108 11.5 2023-05-30,1201A019-108,282.0,men,221.561792,205.914492,237.209093,2023-12-22
4,1201A019-108 12 2023-05-30,1201A019-108,286.0,men,280.842367,239.875003,321.809732,2023-12-22


### Model 2

In [13]:
get_last_modified = lambda obj: int(obj['LastModified'].strftime('%s'))
s3 = boto3.client('s3')
objs = s3.list_objects_v2(Bucket='arbit-algo', Prefix='sagemaker/algo-v1/output/models/2/')['Contents']
keys_with_model_tar_gz = [item for item in objs if 'model.tar.gz' in item['Key']]
last_added = [obj['Key'] for obj in sorted(keys_with_model_tar_gz, key=get_last_modified, reverse=True)][0]

if os.path.exists('1') and os.path.isdir('1'):
    shutil.rmtree('1')

s3_object = s3.get_object(Bucket='arbit-algo', Key=last_added)

wholefile = s3_object['Body'].read()
fileobj = io.BytesIO(wholefile)
tarf = tarfile.open(fileobj=fileobj)
names = tarf.getnames()
for name in names:
    print(name)

model_files = [names]
tarf.extractall()

model = tf.keras.models.load_model('1')

1
1/saved_model.pb
1/variables
1/variables/variables.index
1/variables/variables.data-00000-of-00001
1/assets
1/keras_metadata.pb


In [42]:
from code1.preprocess import *

training_data = 's3://arbit-algo/sagemaker/algo-v1/processing/input/input_training.csv'

# just to make sure we capture all columns
X_train_preprocessed = pd.read_csv('s3://arbit-algo/sagemaker/algo-v1/processing/output/train/X_train2.csv')

df_model = final(training_data, lag=False) #from preprocess_v1
df_model = df_model[(df_model['SOLD_PRICE'] > 5) & 
                    (df_model['RETAILPRICE']>5) & 
                    (df_model['num_sales']>=1)]
df_model = df_model.dropna().reset_index(drop=True)

X, y = final_preprocess(df_model)

  data = final_cleanup(data_filepath, min_sale, inference=inference, lag=lag)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RELEASEDATE'][df.RELEASEDATE.isna()] = df['RELEASEDATE'][~df.RELEASEDATE.isna()].quantile(0.5, interpolation="midpoint")


In [43]:
model_no = 1

model_data = f's3://arbit-algo/sagemaker/algo-v1/processing/input/inference/input_inference{model_no}.csv'

bool_cols = ['IS_COLLAB', 'IS_OG_SE']
cat = ['GENDER', 'SILHOUETTE', 'COLOR', 'BRAND', 'primary_source']
cont = ['SIZE_CM', 'RETAILPRICE', 'num_sales', 'std_sale_px', 'DAYS_SINCE_LAST_SALE', #'avg_sale_px', 
        'DAYS_SINCE_RELEASE', 'source_count', 'lag0']

scaler = StandardScaler().fit(X[cont]) # or X_train
ohe = OneHotEncoder(handle_unknown='ignore').fit(X[cat])

df_new = final(model_data, inference=True, lag=False)
df_new = string_cleanup(df_new)
df_new = df_new.dropna()#.reset_index(drop=True) # drop any final missing values

X_new, y_new = final_preprocess(df_new)

# drop any final missing & extreme values
# deleting any infinity values (already scaled and logged, so this shouldnt be dropping any actual columns)

print('length of X_new:', len(X_new))
#X_new = X_new.reset_index(drop=True)
mask = X_new.isna()
X_new = X_new[~mask]#.reset_index(drop=True)

# drop corresponing rows in model_data
print('length of df_new:', len(df_new))

df_new = df_new[~mask.any(axis=1)]#.reset_index(drop=True)
print('lengths match?', len(X_new)==len(df_new))

# one hot encode, scale & concat
df_ohe_new = pd.DataFrame(ohe.transform(X_new[cat]).toarray(), 
                          columns=ohe.get_feature_names(X_new[cat].columns), 
                          index=X_new[cat].index)
X_new_cont = pd.DataFrame(scaler.transform(X_new[cont]),columns=X_new[cont].columns,index=X_new[cont].index)
bool_cols_new = pd.get_dummies(X_new[bool_cols], columns=bool_cols, drop_first= True)

X_new_preprocessed = pd.concat([X_new_cont, df_ohe_new], axis=1)
X_new_preprocessed = pd.concat([X_new_preprocessed, bool_cols_new], axis=1)

# Get missing columns in the training test
missing_cols = set(X_train_preprocessed.columns) - set(X_new_preprocessed.columns)
for c in missing_cols:
    X_new_preprocessed[c] = 0

X_new_preprocessed1 = X_new_preprocessed[X_train_preprocessed.columns]

#X_new_preprocessed.to_csv('X_inference1.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RELEASEDATE'][df.RELEASEDATE.isna()] = df['RELEASEDATE'][~df.RELEASEDATE.isna()].quantile(0.5, interpolation="midpoint")


length of X_new: 8392
length of df_new: 8392
lengths match? True


In [44]:
model_no = 2

model_data = f's3://arbit-algo/sagemaker/algo-v1/processing/input/inference/input_inference{model_no}.csv'

bool_cols = ['IS_COLLAB', 'IS_OG_SE']
cat = ['GENDER', 'SILHOUETTE', 'COLOR', 'BRAND', 'primary_source']
cont = ['SIZE_CM', 'RETAILPRICE', 'num_sales', 'std_sale_px', 'DAYS_SINCE_LAST_SALE', #'avg_sale_px', 
        'DAYS_SINCE_RELEASE', 'source_count', 'lag0']

scaler = StandardScaler().fit(X[cont]) # or X_train
ohe = OneHotEncoder(handle_unknown='ignore').fit(X[cat])

df_new = final(model_data, inference=True, lag=False)
df_new = string_cleanup(df_new)
df_new = df_new.dropna()#.reset_index(drop=True) # drop any final missing values

X_new, y_new = final_preprocess(df_new)

# drop any final missing & extreme values
# deleting any infinity values (already scaled and logged, so this shouldnt be dropping any actual columns)

print('length of X_new:', len(X_new))
#X_new = X_new.reset_index(drop=True)
mask = X_new.isna()
X_new = X_new[~mask]#.reset_index(drop=True)

# drop corresponing rows in model_data
print('length of df_new:', len(df_new))

df_new = df_new[~mask.any(axis=1)]#.reset_index(drop=True)
print('lengths match?', len(X_new)==len(df_new))

# one hot encode, scale & concat
df_ohe_new = pd.DataFrame(ohe.transform(X_new[cat]).toarray(), 
                          columns=ohe.get_feature_names(X_new[cat].columns), 
                          index=X_new[cat].index)
X_new_cont = pd.DataFrame(scaler.transform(X_new[cont]),columns=X_new[cont].columns,index=X_new[cont].index)
bool_cols_new = pd.get_dummies(X_new[bool_cols], columns=bool_cols, drop_first= True)

X_new_preprocessed = pd.concat([X_new_cont, df_ohe_new], axis=1)
X_new_preprocessed = pd.concat([X_new_preprocessed, bool_cols_new], axis=1)

# Get missing columns in the training test
missing_cols = set(X_train_preprocessed.columns) - set(X_new_preprocessed.columns)
for c in missing_cols:
    X_new_preprocessed[c] = 0

X_new_preprocessed2 = X_new_preprocessed[X_train_preprocessed.columns]

#X_new_preprocessed.to_csv('X_inference1.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RELEASEDATE'][df.RELEASEDATE.isna()] = df['RELEASEDATE'][~df.RELEASEDATE.isna()].quantile(0.5, interpolation="midpoint")


length of X_new: 19045
length of df_new: 19045
lengths match? True


In [45]:
from code1.preprocess import *

model_no = 3

model_data = f's3://arbit-algo/sagemaker/algo-v1/processing/input/inference/input_inference{model_no}.csv'

bool_cols = ['IS_COLLAB', 'IS_OG_SE']
cat = ['GENDER', 'SILHOUETTE', 'COLOR', 'BRAND', 'primary_source']
cont = ['SIZE_CM', 'RETAILPRICE', 'num_sales', 'std_sale_px', 'DAYS_SINCE_LAST_SALE', #'avg_sale_px', 
        'DAYS_SINCE_RELEASE', 'source_count', 'lag0']

scaler = StandardScaler().fit(X[cont]) # or X_train
ohe = OneHotEncoder(handle_unknown='ignore').fit(X[cat])

df_new = final(model_data, inference=True, lag=False)
df_new = string_cleanup(df_new)
df_new = df_new.dropna()#.reset_index(drop=True) # drop any final missing values

X_new, y_new = final_preprocess(df_new)

# drop any final missing & extreme values
# deleting any infinity values (already scaled and logged, so this shouldnt be dropping any actual columns)

print('length of X_new:', len(X_new))
#X_new = X_new.reset_index(drop=True)
mask = X_new.isna()
X_new = X_new[~mask]#.reset_index(drop=True)

# drop corresponing rows in model_data
print('length of df_new:', len(df_new))

df_new = df_new[~mask.any(axis=1)]#.reset_index(drop=True)
print('lengths match?', len(X_new)==len(df_new))

# one hot encode, scale & concat
df_ohe_new = pd.DataFrame(ohe.transform(X_new[cat]).toarray(), 
                          columns=ohe.get_feature_names(X_new[cat].columns), 
                          index=X_new[cat].index)
X_new_cont = pd.DataFrame(scaler.transform(X_new[cont]),columns=X_new[cont].columns,index=X_new[cont].index)
bool_cols_new = pd.get_dummies(X_new[bool_cols], columns=bool_cols, drop_first= True)

X_new_preprocessed = pd.concat([X_new_cont, df_ohe_new], axis=1)
X_new_preprocessed = pd.concat([X_new_preprocessed, bool_cols_new], axis=1)

# Get missing columns in the training test
missing_cols = set(X_train_preprocessed.columns) - set(X_new_preprocessed.columns)
for c in missing_cols:
    X_new_preprocessed[c] = 0

X_new_preprocessed3 = X_new_preprocessed[X_train_preprocessed.columns]

#X_new_preprocessed.to_csv('X_inference1.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RELEASEDATE'][df.RELEASEDATE.isna()] = df['RELEASEDATE'][~df.RELEASEDATE.isna()].quantile(0.5, interpolation="midpoint")


length of X_new: 23179
length of df_new: 23179
lengths match? True


In [46]:
model_no = 4

model_data = f's3://arbit-algo/sagemaker/algo-v1/processing/input/inference/input_inference{model_no}.csv'

bool_cols = ['IS_COLLAB', 'IS_OG_SE']
cat = ['GENDER', 'SILHOUETTE', 'COLOR', 'BRAND', 'primary_source']
cont = ['SIZE_CM', 'RETAILPRICE', 'num_sales', 'std_sale_px', 'DAYS_SINCE_LAST_SALE', #'avg_sale_px', 
        'DAYS_SINCE_RELEASE', 'source_count', 'lag0']

scaler = StandardScaler().fit(X[cont]) # or X_train
ohe = OneHotEncoder(handle_unknown='ignore').fit(X[cat])

df_new = final(model_data, inference=True, lag=False)
df_new = string_cleanup(df_new)
df_new = df_new.dropna()#.reset_index(drop=True) # drop any final missing values

X_new, y_new = final_preprocess(df_new)

# drop any final missing & extreme values
# deleting any infinity values (already scaled and logged, so this shouldnt be dropping any actual columns)

print('length of X_new:', len(X_new))
#X_new = X_new.reset_index(drop=True)
mask = X_new.isna()
X_new = X_new[~mask]#.reset_index(drop=True)

# drop corresponing rows in model_data
print('length of df_new:', len(df_new))

df_new = df_new[~mask.any(axis=1)]#.reset_index(drop=True)
print('lengths match?', len(X_new)==len(df_new))

# one hot encode, scale & concat
df_ohe_new = pd.DataFrame(ohe.transform(X_new[cat]).toarray(), 
                          columns=ohe.get_feature_names(X_new[cat].columns), 
                          index=X_new[cat].index)
X_new_cont = pd.DataFrame(scaler.transform(X_new[cont]),columns=X_new[cont].columns,index=X_new[cont].index)
bool_cols_new = pd.get_dummies(X_new[bool_cols], columns=bool_cols, drop_first= True)

X_new_preprocessed = pd.concat([X_new_cont, df_ohe_new], axis=1)
X_new_preprocessed = pd.concat([X_new_preprocessed, bool_cols_new], axis=1)

# Get missing columns in the training test
missing_cols = set(X_train_preprocessed.columns) - set(X_new_preprocessed.columns)
for c in missing_cols:
    X_new_preprocessed[c] = 0

X_new_preprocessed4 = X_new_preprocessed[X_train_preprocessed.columns]

#X_new_preprocessed.to_csv('X_inference1.csv', index=False)

  data = final_cleanup(data_filepath, min_sale, inference=inference, lag=lag)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RELEASEDATE'][df.RELEASEDATE.isna()] = df['RELEASEDATE'][~df.RELEASEDATE.isna()].quantile(0.5, interpolation="midpoint")


length of X_new: 40103
length of df_new: 40103
lengths match? True


In [48]:
# concatenating all preds

prediction = model.predict(X_new_preprocessed1)
df1 = pd.DataFrame(prediction).rename(columns={0: 'predictions'})
product_info1 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/processing/output/product_info2_{len(df1)}.csv')
product_info1 = product_info1.reset_index()

df_merge1 = pd.concat([df1, product_info1], axis=1)

prediction = model.predict(X_new_preprocessed2)
df2 = pd.DataFrame(prediction).rename(columns={0: 'predictions'})
product_info2 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/processing/output/product_info2_{len(df2)}.csv')
product_info2 = product_info2.reset_index()

df_merge2 = pd.concat([df2, product_info2], axis=1)

prediction = model.predict(X_new_preprocessed3)
df3 = pd.DataFrame(prediction).rename(columns={0: 'predictions'})
product_info3 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/processing/output/product_info2_{len(df3)}.csv')
product_info3 = product_info3.reset_index()

df_merge3 = pd.concat([df3, product_info3], axis=1)

prediction = model.predict(X_new_preprocessed4)
df4 = pd.DataFrame(prediction).rename(columns={0: 'predictions'})
product_info4 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/processing/output/product_info2_{len(df4)}.csv')
product_info4 = product_info4.reset_index()

df_merge4 = pd.concat([df4, product_info4], axis=1)

# putting it all together
df_merge = pd.concat([df_merge1, df_merge2[1:], df_merge3[1:], df_merge4[1:]], ignore_index=True)
df_merge['predictions'] = [round(np.exp(float(x)), 2) for x in df_merge['predictions']]

mean_std_pct_non_zero = np.mean(df_merge['std_sale_px'][df_merge.std_sale_px>0] / df_merge['predictions'][df_merge.std_sale_px>0])

# if std deviation is 0, replace with mean of std dev as pct of prediction value for all non zero std dev
df_merge['std_sale_px'][df_merge.std_sale_px==0] = df_merge.predictions * mean_std_pct_non_zero

# if less than 5 sales and std dev as % of pred is less than mean, replace with mean
df_merge['std_sale_px'][(df_merge.std_sale_px / df_merge.predictions < mean_std_pct_non_zero) & (df_merge.num_sales < 5)] = \
    df_merge.predictions * mean_std_pct_non_zero

model_info = pd.read_csv('s3://arbit-algo/sagemaker/algo-v1/output/evaluation.csv')
mae = model_info['mae_exp'][(~model_info.notes.isna()) & (model_info['notes'].str.contains('Model 1'))].iloc[-1]
# mae_individual = mae * (df_merge.predictions / np.mean(df_merge.predictions))
mae_individual = mae * (df_merge.std_sale_px / np.mean(df_merge.std_sale_px)) # using sneaker's standard deviation of price

# need to get original unchanged size
df_final2 = df_merge[['ID', 'SKU', 'SIZE_CM', 'GENDER', 'predictions']]
                     #'num_sales', 'DAYS_SINCE_LAST_SALE', 'DAYS_SINCE_RELEASE', 'std_sale_px', 'avg_sale_px_last_5', 'lag1', 'lag2', 'lag3']] # #'avg_sale_px', 
df_final2['prediction_low'] = df_final2['predictions'] - mae_individual
df_final2['prediction_low'] = df_final2['prediction_low'].apply(lambda x: max(0, x))
df_final2['prediction_high'] = df_final2['predictions'] + mae_individual

# round to 2 decimals
df_final2['predictions'] = df_final2['predictions'].apply(lambda x: "%.2f" % x)
df_final2['prediction_low'] = df_final2['prediction_low'].apply(lambda x: "%.2f" % x)
df_final2['prediction_high'] = df_final2['prediction_high'].apply(lambda x: "%.2f" % x)

df_final2['SKU'] = df_final2['SKU'].str.upper()
df_final2['ID'] = df_final2['ID'].str.upper()
df_final2 = df_final2.rename(columns={'SIZE_CM': 'SIZE_VALUE'})

df_final2 = df_final2[~df_final2.ID.isin(list(set(df_final.ID)))].reset_index(drop=True)

from datetime import date, timedelta
from datetime import datetime
import time
from time import gmtime, strftime

df_final2['pred_date'] = datetime.now().strftime('%Y-%m-%d')
df_final2.to_csv(f's3://justin-automation-output/outputs/output/predictions-test/dt-{date.today()}/predictions2.csv')
print('Model 2 Preds:')
display(df_final2.head())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge['std_sale_px'][df_merge.std_sale_px==0] = df_merge.predictions * mean_std_pct_non_zero
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge['std_sale_px'][(df_merge.std_sale_px / df_merge.predictions < mean_std_pct_non_zero) & (df_merge.num_sales < 5)] = \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final2['prediction_low'] = df_final2['predictions'] - mae_individual
A value is trying to be s

Model 2 Preds:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final2['ID'] = df_final2['ID'].str.upper()


Unnamed: 0,ID,SKU,SIZE_VALUE,GENDER,predictions,prediction_low,prediction_high,pred_date
0,1201A019-108 14 2023-05-30,1201A019-108,303.0,men,287.873287,263.156358,312.590216,2023-12-22
1,1201A906-001 12.5 2023-05-31,1201A906-001,290.0,men,163.403779,135.911398,190.896159,2023-12-22
2,1201A906-001 13 2023-05-31,1201A906-001,295.0,men,212.662906,157.396835,267.928977,2023-12-22
3,1201A906-001 14 2023-05-31,1201A906-001,303.0,men,186.744538,149.535551,223.953524,2023-12-22
4,1201A906-001 4 2023-05-31,1201A906-001,219.0,men,139.088985,124.344938,153.833032,2023-12-22


## Monitoring

In [None]:
# evaluate model
scores = pd.DataFrame(columns = ['mse', 'rmse', 'rmse_exp', 'mae', 'mae_exp', 'r2', 'adj_r2', 'notes']) 
yesterday = date.today() - timedelta(days=1)

# preds for T-1
preds1 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-test/dt-{yesterday}/predictions1.csv')
preds2 = pd.read_csv(f's3://arbit-algo/sagemaker/algo-v1/output/predictions-test/dt-{yesterday}/predictions2.csv')
preds = pd.concat([preds1, preds2])
y_truth = pd.read_csv('s3://historicaldata-sample/ground_truth.csv')


# dropping missing values
y_truth = y_truth.dropna()


y_truth['SIZE'] = y_truth['SIZE'].astype(str).apply(lambda x: x.rstrip('.0') if x.endswith('.0') else x)

y_truth['ID'] = y_truth['SKU'].astype(str) + " " + y_truth['SIZE'].astype(str) + " " + y_truth['RELEASEDATE'].astype(str)
y_truth['ID'] = y_truth['ID'].str.upper()

# subset where ground truth exists, log transform to get to scale of model
final = pd.merge(y_truth[['ID', 'PRICE']], preds[['ID', 'predictions', 'prediction_low', 'prediction_high']], on='ID', how='inner')
final['predictions'] = np.log(final.predictions)
final['prediction_low'] = np.log(final.prediction_low)
final['prediction_high'] = np.log(final.prediction_high)
final['PRICE'] = np.log(final.PRICE)
final['price_in_range'] = final.apply(lambda x: 1 if x['prediction_low'] <= x['PRICE'] <= x['prediction_high'] else 0, axis=1)
final['eval_date'] = date.today()

final = final.drop_duplicates().reset_index(drop=True)

# get and save scores
model='AlgoV1'
print("Yesterday's Predictions vs Yesterday's Actual Prices, Sneaker Model, Evaluation: ") 
score_table(scores, model, final['PRICE'], final['predictions'], transformed=True, 
            notes=f'Monitor {str(date.today())}')

eval_path = 's3://justin-automation-output/outputs/output/evaluation.csv'
scores.reset_index().to_csv(eval_path, mode='a', header=False, index=False) #(not os.path.exists(eval_path)))