In [1]:
import pandas as pd
import numpy as np
from hops import hdfs

path=hdfs.project_path() + "Jupyter/fsbook/data/ecommerce/"
#path = "../data/ecommerce/"


In [None]:
orders_df = pd.read_csv(path + "olist_orders_dataset.csv")
orders_df.head()

In [None]:
df = orders_df.dropna()

# Only keep  delivered orders
df = df[df['order_status'] == 'delivered']

# Drop rows where order_purchase_timestamp <= order_approved_at <= order_delivered_carrier_date <= order_delivered_customer_date
df = df[~((df['order_purchase_timestamp'] >= df['order_approved_at']) | (df['order_approved_at'] >= df['order_delivered_carrier_date']) | (df['order_delivered_carrier_date'] >= df['order_delivered_customer_date']))]


def bucketize_actual_delivery_vs_expectation (row):
  if row['days_between_delivery_expectation'] <= 0 :
    return -1
  elif row['days_between_delivery_expectation'] <= 7 :
    return 1
  elif row['days_between_delivery_expectation'] <= 14 :
    return 2
  else:
    return 3


# Days between purchase and delivery dates
df['days_between_purhcase_and_delivery'] = (pd.to_datetime(df['order_delivered_customer_date']) - pd.to_datetime(df['order_purchase_timestamp'])).dt.days

# if the order was approved late or on time (0=on time, 1=late)
df['order_approved_late']=np.where((pd.to_datetime(df['order_approved_at']) - pd.to_datetime(df['order_purchase_timestamp'])).dt.days == 0, 0, 1)

# Actual delivery vs. Expected delivery: 1=Delivered before expected date, 2= Delivered one week later than expected date, 3= Delivered two weeks later than expected date, 4= Delivered more than two weeks later than expected date
df['days_between_delivery_expectation']=(pd.to_datetime(df['order_estimated_delivery_date']) - pd.to_datetime(df['order_delivered_customer_date'])).dt.days
df['actual_delivery_vs_expectation_bucket'] = df.apply (lambda row: bucketize_actual_delivery_vs_expectation(row), axis=1)

df = df[['order_id','days_between_purhcase_and_delivery','order_approved_late','actual_delivery_vs_expectation_bucket','order_delivered_carrier_date']]

orders_df = df 
orders_df

In [None]:
items_df = pd.read_csv(path + "olist_order_items_dataset.csv")

items_df = items_df[['order_id','shipping_limit_date','price','freight_value']].dropna()

# Price and Freight Value must be non-negative
df = items_df[(items_df['price']>=0) & (items_df['freight_value']>=0)]

df1 = df.groupby('order_id').agg(total_order_price=('price', 'sum'), 
                                 total_order_freight=('freight_value','sum'),
                                 max_shipping_limit_date=('shipping_limit_date','max')).reset_index()

# Extract 1 feature: is_multiItems_order -- If the order has multiple items or not (1 or 0)
df2 = df.groupby('order_id').agg(cnt=('price', 'count')).reset_index()
df2['is_multiItems_order'] = np.where(df2['cnt'] > 1, 1, 0)

items_df = df1.merge(df2, how="inner", on='order_id')[['order_id','is_multiItems_order','total_order_price','total_order_freight','max_shipping_limit_date']]

items_df

In [None]:
# Load the reviews table from csv file
reviews_df = pd.read_csv(path + "olist_order_reviews_dataset.csv")

# Drop extra reviews if an order has multiple order review scores
df_reviews = reviews_df.groupby('order_id', as_index= False).agg(review_score=('review_score', 'max'))
df_reviews

In [None]:
orders_data = orders_df
items_data = items_df
reviews_data = df_reviews

In [None]:
!pip3 uninstall hsfs -y
!pip3 install 'git+https://github.com/logicalclocks/feature-store-api@master#egg=hsfs[python]&subdirectory=python'
#!pip3 install 'git+https://github.com/moritzmeister/feature-store-api@py39#egg=hsfs[python]&subdirectory=python'

In [None]:
import sys
print(sys.version)

In [None]:
!pip show pyspark

In [2]:
import hsfs

import hsfs
connection = hsfs.connection()
fs = connection.get_feature_store()

# connection = hsfs.connection(
#     host="35.195.111.34",
#     project="rec",
#     secrets_store="local",
#     api_key_file="./api-key.txt",
#     engine="python"
# )
# fs = connection.get_feature_store()



Connected. Call `.close()` to terminate connection gracefully.




In [None]:
orders_data = orders_df
items_data = items_df
reviews_data = df_reviews

orders_fg = fs.create_feature_group(
                        name="orders",
                        primary_key=["order_id"],
                        version=1,
                        description="Order details",
                        online_enabled=True
)

orders_fg.save(orders_data)

In [None]:
items_fg = fs.create_feature_group(
                        name="items",
                        primary_key=["order_id"],
                        version=1,
                        description="Order Item details",
                        online_enabled=True
)

items_fg.save(items_data)

In [None]:
reviews_fg = fs.create_feature_group(
                        name="reviews",
                        primary_key=["order_id"],
                        version=1,
                        description="Review details",
                        online_enabled=True
)

reviews_fg.save(reviews_data)

In [None]:
query = orders_fg.select(['order_id','order_delivered_carrier_date']).join(items_fg.select(['max_shipping_limit_date']))
derived_df = query.read()

In [None]:
derived_df['days_between_delivered_carrier_and_shipping_limit'] = (pd.to_datetime(derived_df['order_delivered_carrier_date']) - pd.to_datetime(derived_df['max_shipping_limit_date'])).dt.days
derived_df

In [None]:
derived_df['seller_shipped_late'] = \
    np.where(derived_df['days_between_delivered_carrier_and_shipping_limit'] > 0 , 1 ,0) 
derived_df

In [None]:
derived_df = derived_df.drop(columns=['order_delivered_carrier_date', \
                                  'max_shipping_limit_date','days_between_delivered_carrier_and_shipping_limit'])
derived_df

In [None]:
(derived_df['seller_shipped_late'] == 1).sum()

In [None]:
(derived_df['seller_shipped_late'] == 0).sum()

In [None]:
derived_fg = fs.create_feature_group(
                        name="orders_late_shipped",
                        primary_key=["order_id"],
                        version=1,
                        description="Orders shippped late",
                        online_enabled=True
)

derived_fg.save(derived_df)

In [None]:
orders_fg = fs.get_feature_group(name="orders", version=1)
items_fg = fs.get_feature_group(name="items", version=1)
reviews_fg = fs.get_feature_group(name="reviews", version=1)
derived_fg = fs.get_feature_group(name="orders_late_shipped", version=1)

In [None]:
query2 = orders_fg.select_all().join(items_fg.select(
    ['is_multiItems_order', 'total_order_price', 'total_order_freight']))\
    .join(reviews_fg.select(['review_score'])).join(derived_fg.select(['seller_shipped_late']))
training_df = query2.read()
training_df

In [None]:
fv = fs.create_feature_view(name="order_reviews",
                            description="Dataset to train the order review model",
                            version = 1,
#s                            transformations= {"myfeature" : trans_fn},
                            label = ["review_score"],
                            query = query2)

In [None]:
train_ds = fv.create_training_dataset(
    version = 1,
    description = 'reviews dataset',
    data_format = 'csv',
    coalesce = True,
    splits = {'train': 80, 'test': 20},
    write_options = {'wait_for_job': True}
)

In [3]:
fv = fs.get_feature_view(name="order_reviews", version=1)

In [None]:
fvs = fs.get_feature_views("order_reviews")

In [4]:
!pip3 uninstall hsml -y
!pip3 install 'git+https://github.com/logicalclocks/machine-learning-api@main#egg=hsml&subdirectory=python'

Found existing installation: hsml 2.6.0.dev1
Uninstalling hsml-2.6.0.dev1:
  Successfully uninstalled hsml-2.6.0.dev1
Collecting hsml
  Cloning https://github.com/logicalclocks/machine-learning-api (to revision main) to /tmp/pip-install-3tw9wwn5/hsml_8739e32696d6480cabe792ddd1064117
  Running command git clone -q https://github.com/logicalclocks/machine-learning-api /tmp/pip-install-3tw9wwn5/hsml_8739e32696d6480cabe792ddd1064117
  Resolved https://github.com/logicalclocks/machine-learning-api to commit 478f493bbc3009939ce0a557329add9cbd923194
Building wheels for collected packages: hsml
  Building wheel for hsml (setup.py) ... [?25ldone
[?25h  Created wheel for hsml: filename=hsml-2.6.0.dev1-py3-none-any.whl size=89971 sha256=527954b33eb041a22925188b83b2022e87a0fc2757cf64a98c3c87575ce77bdf
  Stored in directory: /tmp/pip-ephem-wheel-cache-v9tbbfal/wheels/3a/61/12/497a51bcae572b082c531cbd259a645a1bc7b63d63539aecbe
Successfully built hsml
Installing collected packages: hsml
Successfull

In [5]:
td_version, df  = fv.get_training_dataset()
df

2022-06-02 05:39:05,027 INFO: USE `demo_fs_meb10000_featurestore`
2022-06-02 05:39:05,774 INFO: SELECT `fg3`.`order_id` `order_id`, `fg3`.`days_between_purhcase_and_delivery` `days_between_purhcase_and_delivery`, `fg3`.`order_approved_late` `order_approved_late`, `fg3`.`actual_delivery_vs_expectation_bucket` `actual_delivery_vs_expectation_bucket`, `fg3`.`order_delivered_carrier_date` `order_delivered_carrier_date`, `fg0`.`is_multiitems_order` `is_multiitems_order`, `fg0`.`total_order_price` `total_order_price`, `fg0`.`total_order_freight` `total_order_freight`, `fg1`.`review_score` `review_score`, `fg2`.`seller_shipped_late` `seller_shipped_late`
FROM `demo_fs_meb10000_featurestore`.`orders_1` `fg3`
INNER JOIN `demo_fs_meb10000_featurestore`.`items_1` `fg0` ON `fg3`.`order_id` = `fg0`.`order_id`
INNER JOIN `demo_fs_meb10000_featurestore`.`reviews_1` `fg1` ON `fg3`.`order_id` = `fg1`.`order_id`
INNER JOIN `demo_fs_meb10000_featurestore`.`orders_late_shipped_1` `fg2` ON `fg3`.`order_id`



Unnamed: 0,order_id,days_between_purhcase_and_delivery,order_approved_late,actual_delivery_vs_expectation_bucket,order_delivered_carrier_date,is_multiitems_order,total_order_price,total_order_freight,review_score,seller_shipped_late
0,00010242fe8c5a6d1ba2dd792cb16214,7,0,2,2017-09-19 18:34:16,0,58.90,13.29,5,0
1,00018f77f2f0320c557190d7a144bdd3,16,0,1,2017-05-04 14:35:00,0,239.90,19.93,4,1
2,000229ec398224ef6ca0657da4fc703e,7,0,2,2018-01-16 12:36:48,0,199.00,17.87,5,0
3,00024acbcdf0a6daa1e931b038114c75,6,0,1,2018-08-10 13:28:00,0,12.99,12.79,4,0
4,00042b26cf59d7ce69dfabb4e55b4fd9,25,0,3,2017-02-16 09:46:09,0,199.90,18.14,5,1
...,...,...,...,...,...,...,...,...,...,...
93825,fffc94f6ce00a00581880bf54a75a037,17,1,1,2018-04-25 12:09:00,0,299.99,43.41,5,0
93826,fffcd46ef2263f404302a634eb57f7eb,9,1,2,2018-07-17 08:05:00,0,350.00,36.53,5,0
93827,fffce4705a9662cd70adb13d4a31832d,4,1,2,2017-10-26 15:13:14,0,99.90,16.95,5,0
93828,fffe18544ffabc95dfada21779c9644f,1,0,2,2017-08-15 19:02:53,0,55.99,8.72,5,0


In [6]:
from sklearn.model_selection import train_test_split

def data_split(data):
  training_data, testing_data = train_test_split(data, test_size=0.1, random_state=42)
  X_train, X_valid, y_train, y_valid = train_test_split(training_data.drop(
      ['review_score', 'order_id', 'order_delivered_carrier_date'], axis=1), 
    training_data['review_score'], test_size=0.1, random_state=7)
  data_pair = [(X_train, y_train), (X_valid, y_valid)]
  return data_pair

df2 = data_split(df)



In [7]:
data_pair = df2 #data_split(training_data)

train_data_X = data_pair[0][0]
train_data_X

Unnamed: 0,days_between_purhcase_and_delivery,order_approved_late,actual_delivery_vs_expectation_bucket,is_multiitems_order,total_order_price,total_order_freight,seller_shipped_late
76378,11,0,3,0,35.00,16.05,0
73272,7,1,3,0,169.90,13.53,0
76656,2,0,2,0,139.90,23.56,0
7919,11,0,1,0,99.00,15.44,0
32862,15,0,1,0,120.00,34.20,0
...,...,...,...,...,...,...,...
47871,7,0,2,0,34.99,8.72,0
33729,8,0,3,1,181.90,27.98,0
7721,22,0,3,0,228.00,26.09,0
11857,18,0,2,0,259.99,16.57,0


In [8]:
train_data_Y = data_pair[0][1]
train_data_Y

76378    3
73272    5
76656    4
7919     4
32862    4
        ..
47871    5
33729    5
7721     3
11857    5
15212    4
Name: review_score, Length: 76002, dtype: int64

In [9]:
test_data_X = data_pair[1][0]
test_data_X

Unnamed: 0,days_between_purhcase_and_delivery,order_approved_late,actual_delivery_vs_expectation_bucket,is_multiitems_order,total_order_price,total_order_freight,seller_shipped_late
14495,17,0,-1,1,25.00,30.46,1
72178,7,0,2,0,35.00,8.88,0
83704,16,0,1,0,12.90,18.23,0
32530,20,1,1,0,24.99,14.10,0
77875,8,0,2,0,110.00,15.52,0
...,...,...,...,...,...,...,...
23243,8,1,2,0,42.00,12.23,0
83048,4,0,3,0,249.99,19.00,0
68255,31,0,1,0,39.00,18.23,0
66112,8,1,-1,0,179.90,23.01,0


In [10]:
test_data_Y = data_pair[1][1]
test_data_Y

14495    1
72178    4
83704    4
32530    5
77875    4
        ..
23243    4
83048    3
68255    3
66112    1
92907    5
Name: review_score, Length: 8445, dtype: int64

In [12]:
import xgboost
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
#from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import matplotlib.pyplot as plt
import time
import hsml
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# def model_performance(X_train, y_train, eval_set):
#   param_dict = {
#     'colsample_bytree' : 1.0,
#     'learning_rate': 0.02,
#     'max_depth': 5,
#     'min_child_weight': 10,
#     'subsample' : 0.5
#   }

#   xgb_model = XGBRegressor(n_estimators=1000, objective='reg:squarederror', 
#                            colsample_bytree = param_dict['colsample_bytree'], 
#                            learning_rate=param_dict['learning_rate'], 
#                            max_depth=param_dict['max_depth'], min_child_weight=param_dict['min_child_weight'], 
#                            subsample=param_dict['subsample'])

#   xgb_model.fit(X_train, y_train, eval_metric='rmse', eval_set=eval_set, verbose=False)
#   return xgb_model.evals_result()

    
train_data_X = data_pair[0][0]
train_data_Y = data_pair[0][1]

test_data_X = data_pair[1][0]
test_data_Y = data_pair[1][1]


scaler = MinMaxScaler()
X_train = scaler.fit_transform(train_data_X)
#X_test = scaler.transform(X_test)

model = GradientBoostingClassifier(n_estimators=20, 
                                       learning_rate=0.1, 
                                       max_features=2, 
                                       max_depth=2, 
                                       random_state=0)

model.fit(X_train,train_data_Y)




# print(classification_report(y_val,gradient_booster.predict(X_val)))

# def test_model(xgb_model: XGBRegressor, test_data_X, test_data_Y):
#   yhat = xgb_model.predict(test_data_X)


#xgb_model_final = XGBRegressor(objective='reg:squarederror', n_estimators=200, colsample_bytree = param_dict['colsample_bytree'], learning_rate=param_dict['learning_rate'], max_depth=param_dict['max_depth'], min_child_weight=param_dict['min_child_weight'], subsample=param_dict['subsample'])
# xgb_model_final.fit(train_data_X, train_data_Y,verbose=True)

# metrics = model_performance(train_data_X,train_data_Y, data_pair)
# test_model(xgb_model_final, test_data_X, test_data_Y)


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=2,
                           max_features=2, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=20,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=0, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [13]:
import os
import joblib
MODEL_DIR="model"
REVIEW_MODEL_PKL = MODEL_DIR + "/review.pkl"
MINMAX_TRANSFORMER_PKL = MODEL_DIR + "/minmax-transformer.pkl"

if os.path.isdir(MODEL_DIR) == False :
    os.mkdir(MODEL_DIR)
joblib.dump(model, REVIEW_MODEL_PKL)
joblib.dump(scaler, MINMAX_TRANSFORMER_PKL)


['model/minmax-transformer.pkl']

In [14]:
import hsml
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

input_schema = Schema(train_data_X)
output_schema = Schema(train_data_Y)

conn = hsml.connection()

# conn = hsml.connection(
#     host="35.195.111.34",
#     project="rec",
#     hostname_verification=True,
# #    api_key_file="./api-key.txt"
#     api_key_value="ooeY4kuI1WMhA8Xh.m0yNQw70N2lSn2zOLwL1ta5eZeDmHqJH4YUKRM8rANqAPGihMpjd8TRMt69GVAjb"
#     )
mr = conn.get_model_registry()

sk_model = mr.sklearn.create_model("review_score", 
                       metrics={ "accuracy" : 0.5},
                       input_example=train_data_X,
                       model_schema=ModelSchema(input_schema=input_schema, output_schema=output_schema))

sk_model.save(MODEL_DIR)

Connected. Call `.close()` to terminate connection gracefully.


  0%|          | 0/6 [00:00<?, ?it/s]

Exported model review_score with version 2


Model(name: 'review_score', version: 2)

In [15]:
test_sample = test_data_X
predicted_review_score = model.predict(test_sample)
print("PREDICTED REVIEW SCORE [1-5]: ",predicted_review_score)


PREDICTED REVIEW SCORE [1-5]:  [1 5 5 ... 5 1 5]
