In [8]:
import pandas as pd
import numpy as np

# Read the raw orders table from its CSV export and preview the first rows
orders_csv_path = "../data/ecommerce/olist_orders_dataset.csv"
orders_df = pd.read_csv(orders_csv_path)
orders_df.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00


In [9]:
# Work only with rows in which every column is populated
df = orders_df.dropna()

# Keep delivered orders only
is_delivered = df['order_status'] == 'delivered'
df = df[is_delivered]

# Keep only rows whose timestamps are strictly increasing:
#   purchase < approval < carrier hand-over < customer delivery
# (the columns are still the raw CSV strings here, so this is a string comparison;
#  it assumes a fixed-width "YYYY-MM-DD HH:MM:SS" layout — TODO confirm)
approved_after_purchase = df['order_purchase_timestamp'] < df['order_approved_at']
shipped_after_approval = df['order_approved_at'] < df['order_delivered_carrier_date']
delivered_after_shipping = df['order_delivered_carrier_date'] < df['order_delivered_customer_date']
df = df[approved_after_purchase & shipped_after_approval & delivered_after_shipping]


def bucketize_actual_delivery_vs_expectation (row):
  """Map a row's `days_between_delivery_expectation` value to a bucket code.

  Returns -1 when the value is <= 0, 1 when it is <= 7, 2 when it is <= 14,
  and 3 for anything else.
  """
  days_early = row['days_between_delivery_expectation']
  if days_early <= 0:
    return -1
  # Upper bounds are inclusive and checked in ascending order
  for bucket, upper_bound in ((1, 7), (2, 14)):
    if days_early <= upper_bound:
      return bucket
  return 3


# Days between purchase and delivery dates
# Parse each timestamp column once instead of re-parsing it for every feature
purchased_at = pd.to_datetime(df['order_purchase_timestamp'])
approved_at = pd.to_datetime(df['order_approved_at'])
delivered_at = pd.to_datetime(df['order_delivered_customer_date'])
estimated_at = pd.to_datetime(df['order_estimated_delivery_date'])

# `df` is a filtered slice at this point; `assign` builds a fresh frame so the new
# columns are not written onto a slice (which raises SettingWithCopyWarning).
df = df.assign(
    # Whole days between purchase and delivery to the customer
    # (column name keeps its historical spelling because downstream cells use it)
    days_between_purhcase_and_delivery=(delivered_at - purchased_at).dt.days,
    # 0 = approved less than one full day after purchase, 1 = approved later
    order_approved_late=np.where((approved_at - purchased_at).dt.days == 0, 0, 1),
    # Whole days the delivery arrived BEFORE the estimated date (<= 0 means on/after it)
    days_between_delivery_expectation=(estimated_at - delivered_at).dt.days,
)

# Bucket codes produced by bucketize_actual_delivery_vs_expectation:
#   -1 = delivered less than one full day before the estimate, or after it
#    1 = delivered 1-7 days before the estimate
#    2 = delivered 8-14 days before the estimate
#    3 = delivered more than 14 days before the estimate
df['actual_delivery_vs_expectation_bucket'] = df.apply(bucketize_actual_delivery_vs_expectation, axis=1)

df = df[['order_id','days_between_purhcase_and_delivery','order_approved_late','actual_delivery_vs_expectation_bucket','order_delivered_carrier_date']]

# NOTE: this rebinds `orders_df` from the raw table to the feature table, so the
# cell cannot be re-run without reloading the CSV first.
orders_df = df
orders_df

Unnamed: 0,order_id,days_between_purhcase_and_delivery,order_approved_late,actual_delivery_vs_expectation_bucket,order_delivered_carrier_date
0,e481f51cbdc54678b7cc49136f2d6af7,8,0,1,2017-10-04 19:55:00
1,53cdb2fc8bc7dce0b6741e2150273451,13,1,1,2018-07-26 14:31:00
2,47770eb9100c2d0c44946d9cf07ec65d,9,0,3,2018-08-08 13:50:00
3,949d5b44dbf5de918fe9c16f97b45f8a,13,0,2,2017-11-22 13:39:59
4,ad21c59c0840e6cb83a9ceb5573f8159,2,0,2,2018-02-14 19:46:34
...,...,...,...,...,...
99435,880675dff2150932f1601e1c07eadeeb,11,0,3,2017-03-01 10:22:52
99437,63943bddc261676b46f01ca7ac2f7bd8,22,0,1,2018-02-07 23:22:42
99438,83c1379a015df1e13d02aae0204711ab,24,0,1,2017-08-28 20:52:26
99439,11c177c8e97725db2631073c19f07b62,17,0,3,2018-01-12 15:35:03


In [11]:
# Load the reviews table from csv file
# Read the raw order reviews from their CSV export
reviews_df = pd.read_csv("../data/ecommerce/olist_order_reviews_dataset.csv")

# Collapse to one row per order: when an order has several reviews, keep the highest score
df_reviews = (
    reviews_df
    .groupby('order_id', as_index=False)
    .agg(review_score=('review_score', 'max'))
)
df_reviews

Unnamed: 0,order_id,review_score
0,00010242fe8c5a6d1ba2dd792cb16214,5
1,00018f77f2f0320c557190d7a144bdd3,4
2,000229ec398224ef6ca0657da4fc703e,5
3,00024acbcdf0a6daa1e931b038114c75,4
4,00042b26cf59d7ce69dfabb4e55b4fd9,5
...,...,...
99436,fffc94f6ce00a00581880bf54a75a037,5
99437,fffcd46ef2263f404302a634eb57f7eb,5
99438,fffce4705a9662cd70adb13d4a31832d,5
99439,fffe18544ffabc95dfada21779c9644f,5


In [None]:
# Alias the prepared frames under the names used when saving the feature groups.
orders_data = orders_df
# NOTE(review): `items_df` is not created by any cell visible in this notebook;
# this line fails on a fresh kernel unless the items preparation cell is restored.
items_data = items_df
reviews_data = df_reviews

In [3]:
!pip3 uninstall hsfs -y
#!pip3 install 'git+https://github.com/logicalclocks/feature-store-api@master#egg=hsfs[python]&subdirectory=python'
!pip3 install 'git+https://github.com/moritzmeister/feature-store-api@py39#egg=hsfs[python]&subdirectory=python'

Collecting hsfs[python]
  Cloning https://github.com/moritzmeister/feature-store-api (to revision py39) to /tmp/pip-install-l0ik6o87/hsfs_e9c7fa8acd2041bca4f4d92069770630
  Running command git clone -q https://github.com/moritzmeister/feature-store-api /tmp/pip-install-l0ik6o87/hsfs_e9c7fa8acd2041bca4f4d92069770630
  Running command git checkout -b py39 --track origin/py39
  Switched to a new branch 'py39'
  Branch 'py39' set up to track remote branch 'py39' from 'origin'.
  Resolved https://github.com/moritzmeister/feature-store-api to commit e76f7939d259e939fc20b7a2ea67f7e2f3b62026
Collecting confluent-kafka==1.8.2
  Downloading confluent_kafka-1.8.2-cp39-cp39-manylinux2010_x86_64.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 4.1 MB/s eta 0:00:01
Building wheels for collected packages: hsfs
  Building wheel for hsfs (setup.py) ... [?25ldone
[?25h  Created wheel for hsfs: filename=hsfs-2.6.0.dev1-py3-none-any.whl size=168423 sha256=429b91218636c2fc53b70aa9ee994be11a

In [4]:
import sys
# Show which Python interpreter the kernel runs on
print(sys.version)

3.9.7 (default, Sep 16 2021, 13:09:58) 
[GCC 7.5.0]


In [6]:
# Show the installed pyspark package metadata (version, install location)
!pip show pyspark

Name: pyspark
Version: 3.2.0
Summary: Apache Spark Python API
Home-page: https://github.com/apache/spark/tree/master/python
Author: Spark Developers
Author-email: dev@spark.apache.org
License: http://www.apache.org/licenses/LICENSE-2.0
Location: /home/jdowling/.local/lib/python3.9/site-packages
Requires: py4j
Required-by: 


In [13]:
import hsfs

# Settings for the feature store connection; the API key is read from a local file
hsfs_settings = dict(
    host="35.195.111.34",
    project="rec",
    secrets_store="local",
    api_key_file="./api-key.txt",
    engine="python",
)

connection = hsfs.connection(**hsfs_settings)
fs = connection.get_feature_store()



Connected. Call `.close()` to terminate connection gracefully.


In [14]:
# Alias the prepared frames under the names used when saving the feature groups.
orders_data = orders_df
# NOTE(review): `items_df` is not created by any cell visible in this notebook;
# this line fails on a fresh kernel unless the items preparation cell is restored.
items_data = items_df
reviews_data = df_reviews

# Define the "orders" feature group (version 3), keyed by order_id, with online serving enabled
orders_fg = fs.create_feature_group(
                        name="orders",
                        primary_key=["order_id"],
                        version=3,
                        description="Order details",
                        online_enabled=True
)

# Upload the order features into the feature group
orders_fg.save(orders_data)

Feature Group created successfully, explore it at https://35.195.111.34:443/p/122/fs/70/fg/25
Launching offline feature group backfill job...
Backfill Job started successfully, you can follow the progress at https://35.195.111.34/p/122/jobs/named/orders_3_offline_fg_backfill/executions


<hsfs.core.job.Job at 0x7f7c81e2b910>

In [None]:
# Define the "items" feature group (version 2), keyed by order_id, and upload its data
items_fg = fs.create_feature_group(
    name="items",
    version=2,
    description="Order Item details",
    primary_key=["order_id"],
    online_enabled=True,
)

items_fg.save(items_data)

In [None]:
# Define the "reviews" feature group (version 2), keyed by order_id, and upload its data
reviews_fg = fs.create_feature_group(
    name="reviews",
    version=2,
    description="Review details",
    primary_key=["order_id"],
    online_enabled=True,
)

reviews_fg.save(reviews_data)

In [None]:
# Join the carrier hand-over date (orders) with the shipping limit date (items)
orders_selection = orders_fg.select(['order_id','order_delivered_carrier_date'])
items_selection = items_fg.select(['max_shipping_limit_date'])
query = orders_selection.join(items_selection)
derived_df = query.read()

In [None]:
# Whole days between the shipping limit and the carrier hand-over (positive = handed over after the limit)
carrier_handover_at = pd.to_datetime(derived_df['order_delivered_carrier_date'])
shipping_limit_at = pd.to_datetime(derived_df['max_shipping_limit_date'])
derived_df['days_between_delivered_carrier_and_shipping_limit'] = (carrier_handover_at - shipping_limit_at).dt.days
derived_df

In [None]:
# Flag orders handed to the carrier at least one whole day after the shipping limit (1 = late, 0 = not late)
handed_over_after_limit = derived_df['days_between_delivered_carrier_and_shipping_limit'] > 0
derived_df['seller_shipped_late'] = np.where(handed_over_after_limit, 1, 0)
derived_df

In [None]:
# The helper columns were only needed to derive `seller_shipped_late`; remove them
helper_columns = [
    'order_delivered_carrier_date',
    'max_shipping_limit_date',
    'days_between_delivered_carrier_and_shipping_limit',
]
derived_df = derived_df.drop(columns=helper_columns)
derived_df

In [None]:
# Number of orders flagged as shipped late
derived_df['seller_shipped_late'].eq(1).sum()

In [None]:
# Number of orders not flagged as shipped late
derived_df['seller_shipped_late'].eq(0).sum()

In [None]:
# Define the "orders_late_shipped" feature group (version 1), keyed by order_id, and upload its data
derived_fg = fs.create_feature_group(
    name="orders_late_shipped",
    version=1,
    description="Orders shippped late",
    primary_key=["order_id"],
    online_enabled=True,
)

derived_fg.save(derived_df)

In [5]:
# Re-fetch handles to the feature groups stored in the feature store.
# NOTE(review): every group is fetched at version=1, while the cells above create
# "orders" at version 3 and "items"/"reviews" at version 2 — confirm that version 1
# is really the data the training set should be built from.
orders_fg = fs.get_feature_group(name="orders", version=1)
items_fg = fs.get_feature_group(name="items", version=1)
reviews_fg = fs.get_feature_group(name="reviews", version=1)
derived_fg = fs.get_feature_group(name="orders_late_shipped", version=1)


In [6]:
# Feature selections contributed by each joined feature group
items_features = items_fg.select(['is_multiItems_order', 'total_order_price', 'total_order_freight'])
reviews_features = reviews_fg.select(['review_score'])
late_shipping_features = derived_fg.select(['seller_shipped_late'])

# All order features joined with the item, review and late-shipping features
query2 = (
    orders_fg.select_all()
    .join(items_features)
    .join(reviews_features)
    .join(late_shipping_features)
)
training_df = query2.read()
training_df



Unnamed: 0,order_id,days_between_purhcase_and_delivery,order_approved_late,actual_delivery_vs_expectation_bucket,order_delivered_carrier_date,is_multiitems_order,total_order_price,total_order_freight,review_score,seller_shipped_late
0,00010242fe8c5a6d1ba2dd792cb16214,7,0,2,2017-09-19 18:34:16,0,58.90,13.29,5,0
1,00018f77f2f0320c557190d7a144bdd3,16,0,1,2017-05-04 14:35:00,0,239.90,19.93,4,1
2,000229ec398224ef6ca0657da4fc703e,7,0,2,2018-01-16 12:36:48,0,199.00,17.87,5,0
3,00024acbcdf0a6daa1e931b038114c75,6,0,1,2018-08-10 13:28:00,0,12.99,12.79,4,0
4,00042b26cf59d7ce69dfabb4e55b4fd9,25,0,3,2017-02-16 09:46:09,0,199.90,18.14,5,1
...,...,...,...,...,...,...,...,...,...,...
93825,fffc94f6ce00a00581880bf54a75a037,17,1,1,2018-04-25 12:09:00,0,299.99,43.41,5,0
93826,fffcd46ef2263f404302a634eb57f7eb,9,1,2,2018-07-17 08:05:00,0,350.00,36.53,5,0
93827,fffce4705a9662cd70adb13d4a31832d,4,1,2,2017-10-26 15:13:14,0,99.90,16.95,5,0
93828,fffe18544ffabc95dfada21779c9644f,1,0,2,2017-08-15 19:02:53,0,55.99,8.72,5,0


In [9]:
# Register the joined query as a feature view with `review_score` as the label.
# Feature transformations are currently disabled; the earlier experiment was:
#   trans_fn = hsfs.transformations.minmaxscalar
#   transformations={"myfeature": trans_fn}
fv = fs.create_feature_view(
    name="order_reviews",
    version=2,
    description="Dataset to train the order review model",
    query=query2,
    label=["review_score"],
)

Feature view created successfully, explore it at https://35.195.111.34:443/p/122/fs/70/fv/order_reviews/version/2


In [None]:
# Materialise version 2 of the training dataset as CSV with an 80/20 train/test split,
# waiting for the job to finish before returning.
# (previously tried and left disabled: train_split = "train")
train_ds = fv.create_training_dataset(
    version=2,
    description='reviews dataset',
    start_time="2017-01-01 00:00",
    data_format='csv',
    coalesce=True,
    splits={'train': 80, 'test': 20},
    write_options={'wait_for_job': True},
)

In [15]:
# Fetch the handle of the existing "order_reviews" feature view (version 2)
fv = fs.get_feature_view(name="order_reviews", version=2)
# Disabled experiment: reading a specific training dataset version
#training_data = fv.get_training_dataset(version=3)
#training_data

In [11]:
# Look up the feature views registered under "order_reviews"
# NOTE(review): `fvs` is not used by any later cell visible in this notebook.
fvs = fs.get_feature_views("order_reviews")

In [16]:
!pip3 uninstall hsml -y
!pip3 install 'git+https://github.com/logicalclocks/machine-learning-api@main#egg=hsml&subdirectory=python'

Collecting hsml
  Cloning https://github.com/logicalclocks/machine-learning-api (to revision main) to /tmp/pip-install-fktr4rdy/hsml_a4c38f8fd45f4e4e8339719073c212f5
  Running command git clone -q https://github.com/logicalclocks/machine-learning-api /tmp/pip-install-fktr4rdy/hsml_a4c38f8fd45f4e4e8339719073c212f5
  Resolved https://github.com/logicalclocks/machine-learning-api to commit 478f493bbc3009939ce0a557329add9cbd923194
Building wheels for collected packages: hsml
  Building wheel for hsml (setup.py) ... [?25ldone
[?25h  Created wheel for hsml: filename=hsml-2.6.0.dev1-py3-none-any.whl size=89971 sha256=23d2ade52c96163c9c1b8368ef8037f5838bd97b565dacf9b87bed7ce77242bb
  Stored in directory: /tmp/pip-ephem-wheel-cache-lcibcjbl/wheels/88/bc/8e/19063dd38b55ab47312e325972be1b7df2458a7926321af1f9
Successfully built hsml
Installing collected packages: hsml
Successfully installed hsml-2.6.0.dev1


In [17]:
# Read the training data through the feature view; the call is unpacked into a
# dataset version and a DataFrame.
# NOTE: this rebinds `df`, which earlier held the intermediate orders frame.
td_version, df  = fv.get_training_dataset()
df



Unnamed: 0,order_id,days_between_purhcase_and_delivery,order_approved_late,actual_delivery_vs_expectation_bucket,order_delivered_carrier_date,is_multiitems_order,total_order_price,total_order_freight,review_score,seller_shipped_late
0,00010242fe8c5a6d1ba2dd792cb16214,7,0,2,2017-09-19 18:34:16,0,58.90,13.29,5,0
1,00018f77f2f0320c557190d7a144bdd3,16,0,1,2017-05-04 14:35:00,0,239.90,19.93,4,1
2,000229ec398224ef6ca0657da4fc703e,7,0,2,2018-01-16 12:36:48,0,199.00,17.87,5,0
3,00024acbcdf0a6daa1e931b038114c75,6,0,1,2018-08-10 13:28:00,0,12.99,12.79,4,0
4,00042b26cf59d7ce69dfabb4e55b4fd9,25,0,3,2017-02-16 09:46:09,0,199.90,18.14,5,1
...,...,...,...,...,...,...,...,...,...,...
93825,fffc94f6ce00a00581880bf54a75a037,17,1,1,2018-04-25 12:09:00,0,299.99,43.41,5,0
93826,fffcd46ef2263f404302a634eb57f7eb,9,1,2,2018-07-17 08:05:00,0,350.00,36.53,5,0
93827,fffce4705a9662cd70adb13d4a31832d,4,1,2,2017-10-26 15:13:14,0,99.90,16.95,5,0
93828,fffe18544ffabc95dfada21779c9644f,1,0,2,2017-08-15 19:02:53,0,55.99,8.72,5,0


In [26]:
from sklearn.model_selection import train_test_split

def data_split(data):
  """Split `data` into training and validation feature/label pairs.

  One tenth of the rows is first set aside (random_state=42) and is NOT returned.
  The remaining rows are split 90/10 (random_state=7) into training and validation
  parts after removing the label `review_score` and the columns `order_id` and
  `order_delivered_carrier_date` from the features.

  Returns:
    [(X_train, y_train), (X_valid, y_valid)]
  """
  # The second element (the set-aside rows) is deliberately discarded
  training_data, _ = train_test_split(data, test_size=0.1, random_state=42)

  excluded_columns = ['review_score', 'order_id', 'order_delivered_carrier_date']
  features = training_data.drop(excluded_columns, axis=1)
  labels = training_data['review_score']

  X_train, X_valid, y_train, y_valid = train_test_split(
      features, labels, test_size=0.1, random_state=7)
  return [(X_train, y_train), (X_valid, y_valid)]

# Despite its name, `df2` is the list [(X_train, y_train), (X_valid, y_valid)] returned by data_split
df2 = data_split(df)

In [27]:
# Reuse the split computed in the previous cell (formerly recomputed via data_split(training_data))
data_pair = df2 #data_split(training_data)

# First pair holds the training split; element 0 is the feature frame
train_data_X = data_pair[0][0]
train_data_X

Unnamed: 0,days_between_purhcase_and_delivery,order_approved_late,actual_delivery_vs_expectation_bucket,is_multiitems_order,total_order_price,total_order_freight,seller_shipped_late
76378,11,0,3,0,35.00,16.05,0
73272,7,1,3,0,169.90,13.53,0
76656,2,0,2,0,139.90,23.56,0
7919,11,0,1,0,99.00,15.44,0
32862,15,0,1,0,120.00,34.20,0
...,...,...,...,...,...,...,...
47871,7,0,2,0,34.99,8.72,0
33729,8,0,3,1,181.90,27.98,0
7721,22,0,3,0,228.00,26.09,0
11857,18,0,2,0,259.99,16.57,0


In [22]:
# Labels (review_score) of the training split
train_data_Y = data_pair[0][1]
train_data_Y

76378    3
73272    5
76656    4
7919     4
32862    4
        ..
47871    5
33729    5
7721     3
11857    5
15212    4
Name: review_score, Length: 76002, dtype: int64

In [23]:
# Features of the second pair returned by data_split — that is the validation split
# (X_valid), even though it is named "test" here.
test_data_X = data_pair[1][0]
test_data_X

Unnamed: 0,days_between_purhcase_and_delivery,order_approved_late,actual_delivery_vs_expectation_bucket,order_delivered_carrier_date,is_multiitems_order,total_order_price,total_order_freight,seller_shipped_late
14495,17,0,-1,2018-08-02 08:34:00,1,25.00,30.46,1
72178,7,0,2,2018-06-04 12:02:00,0,35.00,8.88,0
83704,16,0,1,2018-05-21 14:31:00,0,12.90,18.23,0
32530,20,1,1,2018-02-08 23:47:11,0,24.99,14.10,0
77875,8,0,2,2017-05-11 13:55:39,0,110.00,15.52,0
...,...,...,...,...,...,...,...,...
23243,8,1,2,2017-10-05 17:58:50,0,42.00,12.23,0
83048,4,0,3,2017-06-19 20:02:46,0,249.99,19.00,0
68255,31,0,1,2018-03-20 20:18:52,0,39.00,18.23,0
66112,8,1,-1,2018-08-15 15:10:00,0,179.90,23.01,0


In [24]:
# Labels of the second pair returned by data_split — the validation split (y_valid)
test_data_Y = data_pair[1][1]
test_data_Y

14495    1
72178    4
83704    4
32530    5
77875    4
        ..
23243    4
83048    3
68255    3
66112    1
92907    5
Name: review_score, Length: 8445, dtype: int64

In [39]:
import xgboost
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import time
import hsml
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

def model_performance(X_train, y_train, eval_set):
  """Fit a 1000-tree XGBRegressor and return its per-iteration evaluation history.

  Args:
    X_train: training features.
    y_train: training labels.
    eval_set: list of (X, y) pairs evaluated with RMSE after every boosting round.

  Returns:
    The dictionary produced by `evals_result()` of the fitted model.
  """
  hyperparams = dict(
    colsample_bytree=1.0,
    learning_rate=0.02,
    max_depth=5,
    min_child_weight=10,
    subsample=0.5,
  )

  regressor = XGBRegressor(n_estimators=1000, objective='reg:squarederror', **hyperparams)
  regressor.fit(X_train, y_train, eval_metric='rmse', eval_set=eval_set, verbose=False)
  return regressor.evals_result()

def test_model(xgb_model: XGBRegressor, test_data_X, test_data_Y):
  yhat = xgb_model.predict(test_data_X)

    
# Hyper-parameters of the final model
param_dict = {
    'colsample_bytree': 1.0,
    'learning_rate': 0.02,
    'max_depth': 5,
    'min_child_weight': 10,
    'subsample': 0.5,
}

# data_pair is [(X_train, y_train), (X_valid, y_valid)]; the second pair is used as "test" data here
train_data_X, train_data_Y = data_pair[0]
test_data_X, test_data_Y = data_pair[1]

# Final model: 200 trees fitted on the training split
xgb_model_final = XGBRegressor(objective='reg:squarederror', n_estimators=200, **param_dict)
xgb_model_final.fit(train_data_X, train_data_Y, verbose=False)

# Evaluation history comes from a separate model fitted inside model_performance
metrics = model_performance(train_data_X, train_data_Y, data_pair)
test_model(xgb_model_final, test_data_X, test_data_Y)

metrics

{'validation_0': OrderedDict([('rmse',
               [3.795387,
                3.726762,
                3.659553,
                3.593956,
                3.529881,
                3.467237,
                3.405847,
                3.345748,
                3.287074,
                3.229755,
                3.173834,
                3.119045,
                3.065573,
                3.013261,
                2.962018,
                2.912178,
                2.863294,
                2.815583,
                2.769081,
                2.723594,
                2.679212,
                2.635885,
                2.593415,
                2.552048,
                2.511737,
                2.472218,
                2.433864,
                2.396434,
                2.359911,
                2.32431,
                2.289535,
                2.255698,
                2.22266,
                2.190471,
                2.159096,
                2.128561,
                2.098848,
 

In [31]:
import os
import joblib
MODEL_DIR="model"
REVIEW_MODEL_PKL = MODEL_DIR + "/review.pkl"

# The previous guard (`if ! os.exists_dir(...)`) was not valid Python and referenced a
# function that does not exist in `os`. makedirs with exist_ok=True creates the
# directory when missing and is a no-op when it is already there.
os.makedirs(MODEL_DIR, exist_ok=True)

# Serialise the fitted final model; joblib.dump returns the list of written file names
joblib.dump(xgb_model_final, REVIEW_MODEL_PKL)


['model/review.pkl']

In [38]:
import hsml
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Describe the model's inputs and outputs from the training features and labels
input_schema = Schema(train_data_X)
output_schema = Schema(train_data_Y)

# SECURITY: the API key must never be written into the notebook. It is read from the
# same local key file the feature store connection uses. The key that used to be
# hard-coded here has been exposed and should be revoked and regenerated.
conn = hsml.connection(
    host="35.195.111.34",
    project="rec",
    hostname_verification=True,
    api_key_file="./api-key.txt"
    )
mr = conn.get_model_registry()

# The call previously read `mr.sklearn.(...)`, which is a syntax error; the registry
# entry is created with `create_model`.
# NOTE(review): `metrics` is the nested per-iteration dictionary returned by
# model_performance — confirm the registry accepts that shape, or pass a flat
# {name: number} dictionary instead.
sk_model = mr.sklearn.create_model("review_score",
                                   metrics=metrics,
                                   input_example=train_data_X,
                                   model_schema=ModelSchema(input_schema=input_schema, output_schema=output_schema))

# Upload the contents of the model directory to the registry
sk_model.save(MODEL_DIR)

Connected. Call `.close()` to terminate connection gracefully.


  0%|          | 0/6 [00:00<?, ?it/s]

Exported model review_score with version 1


Model(name: 'review_score', version: 1)

22/05/31 16:36:02 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 3302861 ms exceeds timeout 120000 ms
22/05/31 16:36:02 WARN SparkContext: Killing executors is not supported by current scheduler.


In [None]:
# `train_final_model` is not defined anywhere in this notebook, so the old call raised
# NameError on a fresh kernel. The fitted final model lives in `xgb_model_final`.
model = xgb_model_final

In [None]:
# Predict the review score for one randomly drawn order.
# `df_merged` is not defined in this notebook; the training frame is `df`. The sample
# must also carry exactly the feature columns the model was fitted on, so
# `order_delivered_carrier_date` is dropped together with the label and the order id.
test_sample = df.drop(['review_score', 'order_id', 'order_delivered_carrier_date'], axis=1).sample()
predicted_review_score = model.predict(test_sample)
print("PREDICTED REVIEW SCORE [1-5]: ",predicted_review_score)
