# Imports

In [1]:
import joblib

import numpy   as np
import pandas  as pd

from lightgbm       import LGBMRegressor
from feature_engine import creation

from geopy.distance import geodesic

from sklearn.metrics           import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection   import cross_validate
from sklearn.feature_selection import SelectFromModel

## Functions

In [2]:
random_state = 42
np.random.seed(random_state)

In [3]:
def jupyter_settings():
    
    pd.options.display.max_columns = None
    pd.options.display.max_rows = 500

jupyter_settings()

In [4]:
def get_cross_validate_metrics(dict_cv_result):

    print(f"""
    R2 test mean: {dict_cv_result['test_r2'].mean()}; 
    R2 test std: {dict_cv_result['test_r2'].std()}

    MAE test mean: {dict_cv_result['test_mae'].mean()}; 
    MAE test std: {dict_cv_result['test_mae'].std()}
    
    RMSE test mean: {dict_cv_result['test_rmse'].mean()}; 
    RMSE test std: {dict_cv_result['test_rmse'].std()}
    """)

In [5]:
def get_metrics(y_true, y_pred):

    r2 = r2_score(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mae = mean_absolute_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)

    print(f"R2: {r2}; RMSE: {rmse}; MAE: {mae}; MAPE: {mape}")

# Loading Datasets

In [6]:
df_train = pd.read_parquet('../data/processed/df_train_from_2.2_exploratory_data_analysis.parquet')
df_oot = pd.read_parquet('../data/processed/df_oot_from_2.2_exploratory_data_analysis.parquet')

In [7]:
df_train.head()

Unnamed: 0,order_purchase_timestamp,order_delivered_customer_date,order_estimated_delivery_date,customer_zip_code_prefix,customer_city,customer_state,customer_lat,customer_lng,seller_geolocation_city_count,seller_geolocation_city_nunique,seller_geolocation_city_size,seller_geolocation_city_mode,seller_geolocation_state_count,seller_geolocation_state_nunique,seller_geolocation_state_size,seller_geolocation_state_mode,distance_customer_seller_count,distance_customer_seller_min,distance_customer_seller_max,distance_customer_seller_mean,distance_customer_seller_median,distance_customer_seller_q25,distance_customer_seller_q75,distance_customer_seller_range,payment_sequential_count,payment_sequential_min,payment_sequential_max,payment_sequential_mean,payment_sequential_median,payment_sequential_q25,payment_sequential_q75,payment_sequential_range,payment_type_count,payment_type_nunique,payment_type_size,payment_type_mode,payment_installments_count,payment_installments_min,payment_installments_max,payment_installments_mean,payment_installments_median,payment_installments_q25,payment_installments_q75,payment_installments_range,payment_value_count,payment_value_min,payment_value_max,payment_value_mean,payment_value_median,payment_value_q25,payment_value_q75,payment_value_range,product_id_count,seller_id_count,price_count,price_min,price_max,price_mean,price_median,price_q25,price_q75,price_range,freight_value_count,freight_value_min,freight_value_max,freight_value_mean,freight_value_median,freight_value_q25,freight_value_q75,freight_value_range,product_category_name_count,product_category_name_nunique,product_category_name_size,product_category_name_mode,product_name_lenght_count,product_name_lenght_min,product_name_lenght_max,product_name_lenght_mean,product_name_lenght_median,product_name_lenght_q25,product_name_lenght_q75,product_name_lenght_range,product_description_lenght_count,product_description_lenght_min,product_description_lenght_max,product_description_lenght_mean,product_description_lenght_median,product_description_lenght_q25,product_description_lenght_q75,product_description_lenght_range,product_photos_qty_count,product_photos_qty_min,product_photos_qty_max,product_photos_qty_mean,product_photos_qty_median,product_photos_qty_q25,product_photos_qty_q75,product_photos_qty_range,product_weight_g_count,product_weight_g_min,product_weight_g_max,product_weight_g_mean,product_weight_g_median,product_weight_g_q25,product_weight_g_q75,product_weight_g_range,product_length_cm_count,product_length_cm_min,product_length_cm_max,product_length_cm_mean,product_length_cm_median,product_length_cm_q25,product_length_cm_q75,product_length_cm_range,product_height_cm_count,product_height_cm_min,product_height_cm_max,product_height_cm_mean,product_height_cm_median,product_height_cm_q25,product_height_cm_q75,product_height_cm_range,product_width_cm_count,product_width_cm_min,product_width_cm_max,product_width_cm_mean,product_width_cm_median,product_width_cm_q25,product_width_cm_q75,product_width_cm_range,order_purchase_until_approved_in_seconds,order_purchase_until_approved_in_minutes,order_purchase_until_approved_in_hours,order_purchase_until_approved_in_days,order_purchase_timestamp_second,order_purchase_timestamp_minute,order_purchase_timestamp_hour,order_purchase_timestamp_day,order_purchase_timestamp_month,order_approved_at_second,order_approved_at_minute,order_approved_at_hour,order_approved_at_day,order_approved_at_month,delivered_in_days,estimated_delivery_in_days,time
0,2017-10-02 10:56:33,2017-10-10 21:25:13,2017-10-18,3149,sao paulo,SP,-23.574809,-46.587471,1,1,1,maua,1,1,1,SP,1,18.051106,18.051106,18.051106,18.051106,18.051106,18.051106,0.0,3.0,1.0,3.0,2.0,2.0,1.5,2.5,2.0,3.0,2.0,3.0,voucher,3.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,2.0,18.59,12.903333,18.12,10.06,18.355,16.59,1.0,1.0,1.0,29.99,29.99,29.99,29.99,29.99,29.99,0.0,1.0,8.72,8.72,8.72,8.72,8.72,8.72,0.0,1.0,1.0,1.0,utilidades_domesticas,1.0,40.0,40.0,40.0,40.0,40.0,40.0,0.0,1.0,268.0,268.0,268.0,268.0,268.0,268.0,0.0,1.0,4.0,4.0,4.0,4.0,4.0,4.0,0.0,1.0,500.0,500.0,500.0,500.0,500.0,500.0,0.0,1.0,19.0,19.0,19.0,19.0,19.0,19.0,0.0,1.0,8.0,8.0,8.0,8.0,8.0,8.0,0.0,1.0,13.0,13.0,13.0,13.0,13.0,13.0,0.0,642.0,10.7,0.178333,0.0,33,56,10,2,10,15.0,7.0,11.0,2.0,10.0,8.0,15.0,2017-10-01
3,2017-11-18 19:28:06,2017-12-02 00:28:42,2017-12-15,59296,sao goncalo do amarante,RN,-5.767733,-35.275467,1,1,1,belo horizonte,1,1,1,MG,1,1816.652139,1816.652139,1816.652139,1816.652139,1816.652139,1816.652139,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,credit_card,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,72.2,72.2,72.2,72.2,72.2,72.2,0.0,1.0,1.0,1.0,45.0,45.0,45.0,45.0,45.0,45.0,0.0,1.0,27.2,27.2,27.2,27.2,27.2,27.2,0.0,1.0,1.0,1.0,pet_shop,1.0,59.0,59.0,59.0,59.0,59.0,59.0,0.0,1.0,468.0,468.0,468.0,468.0,468.0,468.0,0.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,0.0,1.0,450.0,450.0,450.0,450.0,450.0,450.0,0.0,1.0,30.0,30.0,30.0,30.0,30.0,30.0,0.0,1.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,1.0,20.0,20.0,20.0,20.0,20.0,20.0,0.0,1073.0,17.883333,0.298056,0.0,6,28,19,18,11,59.0,45.0,19.0,18.0,11.0,13.0,26.0,2017-11-01
4,2018-02-13 21:18:39,2018-02-16 18:17:02,2018-02-26,9195,santo andre,SP,-23.675037,-46.524784,1,1,1,mogi das cruzes,1,1,1,SP,1,30.189028,30.189028,30.189028,30.189028,30.189028,30.189028,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,credit_card,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,28.62,28.62,28.62,28.62,28.62,28.62,0.0,1.0,1.0,1.0,19.9,19.9,19.9,19.9,19.9,19.9,0.0,1.0,8.72,8.72,8.72,8.72,8.72,8.72,0.0,1.0,1.0,1.0,papelaria,1.0,38.0,38.0,38.0,38.0,38.0,38.0,0.0,1.0,316.0,316.0,316.0,316.0,316.0,316.0,0.0,1.0,4.0,4.0,4.0,4.0,4.0,4.0,0.0,1.0,250.0,250.0,250.0,250.0,250.0,250.0,0.0,1.0,51.0,51.0,51.0,51.0,51.0,51.0,0.0,1.0,15.0,15.0,15.0,15.0,15.0,15.0,0.0,1.0,15.0,15.0,15.0,15.0,15.0,15.0,0.0,3710.0,61.833333,1.030556,0.0,39,18,21,13,2,29.0,20.0,22.0,13.0,2.0,2.0,12.0,2018-02-01
5,2017-07-09 21:57:05,2017-07-26 10:57:55,2017-08-01,86320,congonhinhas,PR,-23.548581,-50.55066,1,1,1,guarulhos,1,1,1,SP,1,412.633095,412.633095,412.633095,412.633095,412.633095,412.633095,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,credit_card,1.0,6.0,6.0,6.0,6.0,6.0,6.0,0.0,1.0,175.26,175.26,175.26,175.26,175.26,175.26,0.0,1.0,1.0,1.0,147.9,147.9,147.9,147.9,147.9,147.9,0.0,1.0,27.36,27.36,27.36,27.36,27.36,27.36,0.0,1.0,1.0,1.0,automotivo,1.0,49.0,49.0,49.0,49.0,49.0,49.0,0.0,1.0,608.0,608.0,608.0,608.0,608.0,608.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,7150.0,7150.0,7150.0,7150.0,7150.0,7150.0,0.0,1.0,65.0,65.0,65.0,65.0,65.0,65.0,0.0,1.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,1.0,65.0,65.0,65.0,65.0,65.0,65.0,0.0,788.0,13.133333,0.218889,0.0,5,57,21,9,7,13.0,10.0,22.0,9.0,7.0,16.0,22.0,2017-07-01
7,2017-05-16 13:10:30,2017-05-26 12:55:51,2017-06-07,26525,nilopolis,RJ,-22.800936,-43.424861,1,1,1,atibaia,1,1,1,SP,1,322.233743,322.233743,322.233743,322.233743,322.233743,322.233743,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,credit_card,1.0,3.0,3.0,3.0,3.0,3.0,3.0,0.0,1.0,75.16,75.16,75.16,75.16,75.16,75.16,0.0,1.0,1.0,1.0,59.99,59.99,59.99,59.99,59.99,59.99,0.0,1.0,15.17,15.17,15.17,15.17,15.17,15.17,0.0,1.0,1.0,1.0,automotivo,1.0,59.0,59.0,59.0,59.0,59.0,59.0,0.0,1.0,956.0,956.0,956.0,956.0,956.0,956.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,50.0,50.0,50.0,50.0,50.0,50.0,0.0,1.0,16.0,16.0,16.0,16.0,16.0,16.0,0.0,1.0,16.0,16.0,16.0,16.0,16.0,16.0,0.0,1.0,17.0,17.0,17.0,17.0,17.0,17.0,0.0,701.0,11.683333,0.194722,0.0,30,10,13,16,5,11.0,22.0,13.0,16.0,5.0,9.0,21.0,2017-05-01


In [8]:
df_oot.head()

Unnamed: 0,order_purchase_timestamp,order_delivered_customer_date,order_estimated_delivery_date,customer_zip_code_prefix,customer_city,customer_state,customer_lat,customer_lng,seller_geolocation_city_count,seller_geolocation_city_nunique,seller_geolocation_city_size,seller_geolocation_city_mode,seller_geolocation_state_count,seller_geolocation_state_nunique,seller_geolocation_state_size,seller_geolocation_state_mode,distance_customer_seller_count,distance_customer_seller_min,distance_customer_seller_max,distance_customer_seller_mean,distance_customer_seller_median,distance_customer_seller_q25,distance_customer_seller_q75,distance_customer_seller_range,payment_sequential_count,payment_sequential_min,payment_sequential_max,payment_sequential_mean,payment_sequential_median,payment_sequential_q25,payment_sequential_q75,payment_sequential_range,payment_type_count,payment_type_nunique,payment_type_size,payment_type_mode,payment_installments_count,payment_installments_min,payment_installments_max,payment_installments_mean,payment_installments_median,payment_installments_q25,payment_installments_q75,payment_installments_range,payment_value_count,payment_value_min,payment_value_max,payment_value_mean,payment_value_median,payment_value_q25,payment_value_q75,payment_value_range,product_id_count,seller_id_count,price_count,price_min,price_max,price_mean,price_median,price_q25,price_q75,price_range,freight_value_count,freight_value_min,freight_value_max,freight_value_mean,freight_value_median,freight_value_q25,freight_value_q75,freight_value_range,product_category_name_count,product_category_name_nunique,product_category_name_size,product_category_name_mode,product_name_lenght_count,product_name_lenght_min,product_name_lenght_max,product_name_lenght_mean,product_name_lenght_median,product_name_lenght_q25,product_name_lenght_q75,product_name_lenght_range,product_description_lenght_count,product_description_lenght_min,product_description_lenght_max,product_description_lenght_mean,product_description_lenght_median,product_description_lenght_q25,product_description_lenght_q75,product_description_lenght_range,product_photos_qty_count,product_photos_qty_min,product_photos_qty_max,product_photos_qty_mean,product_photos_qty_median,product_photos_qty_q25,product_photos_qty_q75,product_photos_qty_range,product_weight_g_count,product_weight_g_min,product_weight_g_max,product_weight_g_mean,product_weight_g_median,product_weight_g_q25,product_weight_g_q75,product_weight_g_range,product_length_cm_count,product_length_cm_min,product_length_cm_max,product_length_cm_mean,product_length_cm_median,product_length_cm_q25,product_length_cm_q75,product_length_cm_range,product_height_cm_count,product_height_cm_min,product_height_cm_max,product_height_cm_mean,product_height_cm_median,product_height_cm_q25,product_height_cm_q75,product_height_cm_range,product_width_cm_count,product_width_cm_min,product_width_cm_max,product_width_cm_mean,product_width_cm_median,product_width_cm_q25,product_width_cm_q75,product_width_cm_range,order_purchase_until_approved_in_seconds,order_purchase_until_approved_in_minutes,order_purchase_until_approved_in_hours,order_purchase_until_approved_in_days,order_purchase_timestamp_second,order_purchase_timestamp_minute,order_purchase_timestamp_hour,order_purchase_timestamp_day,order_purchase_timestamp_month,order_approved_at_second,order_approved_at_minute,order_approved_at_hour,order_approved_at_day,order_approved_at_month,delivered_in_days,estimated_delivery_in_days,time
1,2018-07-24 20:41:37,2018-08-07 15:27:45,2018-08-13,47813,barreiras,BA,-12.16986,-44.988369,1,1,1,belo horizonte,1,1,1,MG,1,852.256379,852.256379,852.256379,852.256379,852.256379,852.256379,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,boleto,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,141.46,141.46,141.46,141.46,141.46,141.46,0.0,1.0,1.0,1.0,118.7,118.7,118.7,118.7,118.7,118.7,0.0,1.0,22.76,22.76,22.76,22.76,22.76,22.76,0.0,1.0,1.0,1.0,perfumaria,1.0,29.0,29.0,29.0,29.0,29.0,29.0,0.0,1.0,178.0,178.0,178.0,178.0,178.0,178.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,400.0,400.0,400.0,400.0,400.0,400.0,0.0,1.0,19.0,19.0,19.0,19.0,19.0,19.0,0.0,1.0,13.0,13.0,13.0,13.0,13.0,13.0,0.0,1.0,19.0,19.0,19.0,19.0,19.0,19.0,0.0,110570.0,1842.833333,30.713889,1.0,37,41,20,24,7,27.0,24.0,3.0,26.0,7.0,12.0,17.0,2018-07-01
2,2018-08-08 08:38:49,2018-08-17 18:06:29,2018-09-04,75265,vianopolis,GO,-16.746337,-48.514624,1,1,1,guariba,1,1,1,SP,1,511.820721,511.820721,511.820721,511.820721,511.820721,511.820721,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,credit_card,1.0,3.0,3.0,3.0,3.0,3.0,3.0,0.0,1.0,179.12,179.12,179.12,179.12,179.12,179.12,0.0,1.0,1.0,1.0,159.9,159.9,159.9,159.9,159.9,159.9,0.0,1.0,19.22,19.22,19.22,19.22,19.22,19.22,0.0,1.0,1.0,1.0,automotivo,1.0,46.0,46.0,46.0,46.0,46.0,46.0,0.0,1.0,232.0,232.0,232.0,232.0,232.0,232.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,420.0,420.0,420.0,420.0,420.0,420.0,0.0,1.0,24.0,24.0,24.0,24.0,24.0,24.0,0.0,1.0,19.0,19.0,19.0,19.0,19.0,19.0,0.0,1.0,21.0,21.0,21.0,21.0,21.0,21.0,0.0,994.0,16.566667,0.276111,0.0,49,38,8,8,8,23.0,55.0,8.0,8.0,8.0,9.0,26.0,2018-08-01
12,2018-06-07 10:06:19,2018-06-19 12:05:52,2018-07-18,35400,ouro preto,MG,-20.385237,-43.505513,1,1,1,ribeirao preto,1,1,1,SP,1,452.800022,452.800022,452.800022,452.800022,452.800022,452.800022,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,boleto,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,50.13,50.13,50.13,50.13,50.13,50.13,0.0,1.0,1.0,1.0,31.9,31.9,31.9,31.9,31.9,31.9,0.0,1.0,18.23,18.23,18.23,18.23,18.23,18.23,0.0,1.0,1.0,1.0,perfumaria,1.0,59.0,59.0,59.0,59.0,59.0,59.0,0.0,1.0,685.0,685.0,685.0,685.0,685.0,685.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,450.0,450.0,450.0,450.0,450.0,450.0,0.0,1.0,16.0,16.0,16.0,16.0,16.0,16.0,0.0,1.0,17.0,17.0,17.0,17.0,17.0,17.0,0.0,1.0,16.0,16.0,16.0,16.0,16.0,16.0,0.0,148013.0,2466.883333,41.114722,1.0,19,6,10,7,6,12.0,13.0,3.0,9.0,6.0,10.0,38.0,2018-06-01
13,2018-07-25 17:44:10,2018-07-30 15:52:25,2018-08-08,4812,sao paulo,SP,-23.711919,-46.687252,1,1,1,ibitinga,1,1,1,SP,1,308.090858,308.090858,308.090858,308.090858,308.090858,308.090858,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,credit_card,1.0,3.0,3.0,3.0,3.0,3.0,3.0,0.0,1.0,32.7,32.7,32.7,32.7,32.7,32.7,0.0,1.0,1.0,1.0,19.9,19.9,19.9,19.9,19.9,19.9,0.0,1.0,12.8,12.8,12.8,12.8,12.8,12.8,0.0,1.0,1.0,1.0,cama_mesa_banho,1.0,52.0,52.0,52.0,52.0,52.0,52.0,0.0,1.0,155.0,155.0,155.0,155.0,155.0,155.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,200.0,200.0,200.0,200.0,200.0,200.0,0.0,1.0,16.0,16.0,16.0,16.0,16.0,16.0,0.0,1.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,1.0,16.0,16.0,16.0,16.0,16.0,16.0,0.0,664.0,11.066667,0.184444,0.0,10,44,17,25,7,14.0,55.0,17.0,25.0,7.0,4.0,13.0,2018-07-01
15,2018-06-07 19:03:12,2018-06-21 15:34:32,2018-07-04,74820,goiania,GO,-16.712157,-49.248251,1,1,1,brasília,1,1,1,DF,1,161.466018,161.466018,161.466018,161.466018,161.466018,161.466018,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,credit_card,1.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,1.0,146.45,146.45,146.45,146.45,146.45,146.45,0.0,1.0,1.0,1.0,132.4,132.4,132.4,132.4,132.4,132.4,0.0,1.0,14.05,14.05,14.05,14.05,14.05,14.05,0.0,1.0,1.0,1.0,perfumaria,1.0,39.0,39.0,39.0,39.0,39.0,39.0,0.0,1.0,991.0,991.0,991.0,991.0,991.0,991.0,0.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,0.0,1.0,150.0,150.0,150.0,150.0,150.0,150.0,0.0,1.0,20.0,20.0,20.0,20.0,20.0,20.0,0.0,1.0,20.0,20.0,20.0,20.0,20.0,20.0,0.0,1.0,20.0,20.0,20.0,20.0,20.0,20.0,0.0,448070.0,7467.833333,124.463889,5.0,12,3,19,7,6,2.0,31.0,23.0,12.0,6.0,8.0,21.0,2018-06-01


## Data Dimensions

In [9]:
print('Number of Rows: {}'.format(df_train.shape[0]))
print('Number of Cols: {}'.format(df_train.shape[1]))

Number of Rows: 77853
Number of Cols: 147


In [10]:
print('Number of Rows: {}'.format(df_oot.shape[0]))
print('Number of Cols: {}'.format(df_oot.shape[1]))

Number of Rows: 18603
Number of Cols: 147


## Splitting into X and y

In [11]:
list_columns = [
    'order_purchase_timestamp', 'order_delivered_customer_date', 
    'order_estimated_delivery_date', 'delivered_in_days', 'estimated_delivery_in_days', 'time']

X_train = df_train.drop(columns=list_columns)
X_oot = df_oot.drop(columns=list_columns)

In [12]:
target = 'delivered_in_days'
list_columns = ['delivered_in_days', 'estimated_delivery_in_days']

y_train = df_train.loc[:, list_columns]
y_oot = df_oot.loc[:, list_columns]

# Machine Learning Modeling

## Training

In [13]:
target = 'delivered_in_days'

In [14]:
list_selected_features = [
    'distance_customer_seller_min',
    'product_height_cm_min',
    'distance_customer_seller_q25',
    'freight_value_min',
    'customer_lat',
    'order_approved_at_second',
    'customer_lng',
    'order_purchase_timestamp_month',
    'product_length_cm_min',
    'product_weight_g_min',
    'payment_value_q75',
    'payment_value_max',
    'payment_value_mean',
    'order_approved_at_month']

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'bagging_freq': 1,
    'random_state': random_state,
    'n_estimators': 832, 
    'learning_rate': 0.006365688565352602, 
    'num_leaves': 867, 
    'subsample': 0.6922606184521125, 
    'colsample_bytree': 0.4826899205085819, 
    'min_data_in_leaf': 52}

dict_cv_results = cross_validate(
    LGBMRegressor(**params), X=X_train[list_selected_features], y=y_train[target], 
    scoring={'r2': 'r2', 'mae': 'neg_mean_absolute_error', 'rmse': 'neg_root_mean_squared_error'}, 
    cv=3, n_jobs=-1)

get_cross_validate_metrics(dict_cv_results)


    R2 test mean: 0.3008610346273051; 
    R2 test std: 0.005168349210662973

    MAE test mean: -5.123517471052414; 
    MAE test std: 0.030781855582665323
    
    RMSE test mean: -8.347833620063584; 
    RMSE test std: 0.19346030702225256
    


In [15]:
lgbm = LGBMRegressor(**params)
lgbm.fit(X_train[list_selected_features], y_train[target])

y_train['pred'] = lgbm.predict(X_train[list_selected_features])

y_oot['pred'] = lgbm.predict(X_oot[list_selected_features])
get_metrics(y_oot[target], y_oot['pred'])

R2: 0.010447901186684594; RMSE: 5.776548950980762; MAE: 4.355468761199317; MAPE: 114987946674777.47


In [16]:
joblib.dump(lgbm, '../models/machine_learning_model_from_5.0_api_development.joblib')

['../models/machine_learning_model_from_5.0_api_development.joblib']

# API

In [None]:
list_selected_features = [
    'distance_customer_seller_min',
    'customer_lat',
    'customer_lng',
    'product_height_cm_min',
    'distance_customer_seller_q25',
    'freight_value_min',
    'order_approved_at_second',
    'order_purchase_timestamp_month',
    'product_length_cm_min',
    'product_weight_g_min',
    'payment_value_q75',
    'payment_value_max',
    'payment_value_mean',
    'order_approved_at_month']

def get_distance_in_km(lat, lng, lat1, lng1):

    tuple_lat_lng = (lat, lng)
    tuple_lat1_lng1 = (lat1, lng1)
    
    try:
        dist = geodesic(tuple_lat_lng, tuple_lat1_lng1).km

    except:
        dist = np.nan

    return dist

In [22]:
df_customers = pd.read_csv('../data/raw/olist_customers_dataset.csv')
df_geolocation = pd.read_csv('../data/raw/olist_geolocation_dataset.csv')
df_items = pd.read_csv('../data/raw/olist_order_items_dataset.csv')
df_payments = pd.read_csv('../data/raw/olist_order_payments_dataset.csv')
df_orders = pd.read_csv('../data/raw/olist_orders_dataset.csv')
df_products = pd.read_csv('../data/raw/olist_products_dataset.csv')
df_sellers = pd.read_csv('../data/raw/olist_sellers_dataset.csv')

In [35]:
df_aux = df_customers.loc[df_customers['customer_id'] == '9ef432eb6251297304e76186b10a928d', :]

df_customers_order = df_orders.merge(df_aux, how='right', on='customer_id')
df_customers_order = df_customers_order.merge(
    df_geolocation.drop_duplicates(subset='geolocation_zip_code_prefix'), # there's some duplicated zip code 
    how='left', 
    left_on='customer_zip_code_prefix', 
    right_on='geolocation_zip_code_prefix')

df_aux = df_customers_order.loc[:, ['customer_id', 'geolocation_lat', 'geolocation_lng']]

In [36]:
df_aux

Unnamed: 0,customer_id,geolocation_lat,geolocation_lng
0,9ef432eb6251297304e76186b10a928d,-23.574809,-46.587471


In [18]:
from flask import Flask
 
app = Flask(__name__)
 
app.config['MYSQL_HOST'] = 'localhost'
app.config['MYSQL_USER'] = 'root'
app.config['MYSQL_PASSWORD'] = ''
app.config['MYSQL_DB'] = 'flask'
 
mysql = MySQL(app)
 
@app.route('/form')
def form():
    return render_template('form.html')
 
@app.route('/login', methods = ['POST', 'GET'])
def login():
    if request.method == 'GET':
        return "Login via the login Form"
     
    if request.method == 'POST':
        name = request.form['name']
        age = request.form['age']
        cursor = mysql.connection.cursor()
        cursor.execute(''' INSERT INTO info_table VALUES(%s,%s)''',(name,age))
        mysql.connection.commit()
        cursor.close()
        return f"Done!!"
 
app.run(host='localhost', port=5000)

NameError: name 'MySQL' is not defined

In [16]:
import json
import joblib


list_func_num_var = [
    'min', 
    'max', 
    'mean',
    [
        'q25', lambda i: np.quantile(i, 0.25)
    ],
    [
        'q75', lambda i: np.quantile(i, 0.75)
    ]
]


def get_prediction(df):

    list_selected_features = [
        'distance_customer_seller_min',
        'product_height_cm_min',
        'distance_customer_seller_q25',
        'freight_value_min',
        'customer_lat',
        'order_approved_at_second',
        'customer_lng',
        'order_purchase_timestamp_month',
        'product_length_cm_min',
        'product_weight_g_min',
        'payment_value_q75',
        'payment_value_max',
        'payment_value_mean',
        'order_approved_at_month']
    
    model = joblib.load('../models/machine_learning_model_from_5.0_api_development.joblib')

    df.loc[:, 'pred'] = model.predict(df[list_selected_features])

    return df


def get_distance_in_km(lat, lng, lat1, lng1):

    tuple_lat_lng = (lat, lng)
    tuple_lat1_lng1 = (lat1, lng1)
    
    try:
        dist = geodesic(tuple_lat_lng, tuple_lat1_lng1).km

    except:
        dist = np.nan

    return dist


def get_date_prediction(df):

    df['pred_int'] = df['pred'].round().astype(int)
    
    df['date_pred'] = df.apply(lambda i: i['order_purchase_timestamp'] + pd.DateOffset(day=i['pred_int']), axis=1)

    df['date_pred_max'] = df['date_pred'] + pd.DateOffset(days=4)

    df['date_pred_min'] = df['date_pred'] - pd.DateOffset(days=4)
    
    return df


def get_result(df):

    df = get_prediction(df)

    df = get_date_prediction(df)

    pred_max = df['date_pred_max'].dt.strftime('%Y-%m-%d')[0]
    pred_min = df['date_pred_min'].dt.strftime('%Y-%m-%d')[0]

    json_result = {'date_min': pred_min, 'date_max': pred_max}
    
    return json.dumps(json_result)

In [17]:
df = pd.read_json('../src/api/schema.json')

df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'], format='%Y-%m-%d %H:%M:%S')
df['time'] = pd.to_datetime(df['time'], format='%Y-%m-%d')

get_result(df)

ValueError: If using all scalar values, you must pass an index

## Production Test

In [1]:
import json
import requests

In [3]:
with open('../api/app/schema.json') as f:
    json_data = json.load(f)

In [4]:
url = 'http://127.0.0.1:8000/predict'

response = requests.post(url, json=json_data)

print(response.text)

{"date_min": "2018-07-06", "date_max": "2018-07-14"}
