# Modelo MLR para cálculo de precio de envío

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.dates as mpdates
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LinearRegression as LR
import statsmodels.api as sm


In [4]:

dataset=pd.read_csv('data\master_join.csv')
X=dataset.iloc[:,3:].values
y=dataset.iloc[:,2:3].values
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=0)
regresion=LR()
regresion.fit(X_train, y_train)
y_pred_MLR=regresion.predict(X_test)
X_opt=X_test
regresion_OLS = sm.OLS(endog = y_test, exog = X_opt.tolist()).fit()
regresion_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.81
Model:,OLS,Adj. R-squared (uncentered):,0.81
Method:,Least Squares,F-statistic:,19170.0
Date:,"Wed, 27 Jul 2022",Prob (F-statistic):,0.0
Time:,00:38:16,Log-Likelihood:,-86275.0
No. Observations:,22527,AIC:,172600.0
Df Residuals:,22522,BIC:,172600.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0019,2.51e-05,74.077,0.000,0.002,0.002
x2,0.1039,0.005,20.473,0.000,0.094,0.114
x3,0.1667,0.006,27.218,0.000,0.155,0.179
x4,0.1410,0.007,19.951,0.000,0.127,0.155
x5,1.2668,0.013,100.740,0.000,1.242,1.291

0,1,2,3
Omnibus:,23927.111,Durbin-Watson:,1.974
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5854446.457
Skew:,4.907,Prob(JB):,0.0
Kurtosis:,81.364,Cond. No.,735.0


# Modelo MLR para cálculo de tiempo de envío

In [5]:
from math import radians, sin, cos, asin, sqrt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor


In [15]:

def haversine_distance(lat1, lon1, lat2, lon2):

    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) * 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) * 2
    return a



In [13]:

customers = pd.read_csv('data/olist_customers_dataset.csv')
geolocation =  pd.read_csv('data/olist_geolocation_dataset.csv')
order_items =  pd.read_csv('data/olist_order_items_dataset.csv')
orders =  pd.read_csv('data/olist_orders_dataset.csv')
products = pd.read_csv('data/olist_products_dataset.csv')
sellers = pd.read_csv('data/olist_sellers_dataset.csv')

order_items = pd.merge(order_items, sellers[['seller_id', 'seller_zip_code_prefix']], left_on='seller_id', right_on='seller_id').drop(['order_item_id','shipping_limit_date'], axis=1)
merge_df = pd.merge(order_items, orders, left_on='order_id', right_on='order_id', how='left')
merge_df = pd.merge(merge_df, customers[['customer_id', 'customer_zip_code_prefix']], how='left',left_on='customer_id',right_on='customer_id')
geo = geolocation.groupby('geolocation_zip_code_prefix').mean().reset_index()


merge_df = pd.merge(merge_df, geo[['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng']], how='left', 
                    left_on='seller_zip_code_prefix', right_on='geolocation_zip_code_prefix').rename(columns={'geolocation_lat': 'seller_lat', 
                                                                                                              'geolocation_lng': 'seller_lon'})
merge_df = pd.merge(merge_df, geo[['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng']], how='inner', 
                    left_on='customer_zip_code_prefix', right_on='geolocation_zip_code_prefix').rename(columns={'geolocation_lat': 'customer_lat', 
                                                                                                              'geolocation_lng': 'customer_lon'})
merge_df = pd.merge(merge_df, products[['product_id','product_category_name','product_photos_qty','product_weight_g','product_length_cm','product_height_cm','product_width_cm']], 
                   left_on='product_id', right_on='product_id', how='left')
                   
merge_df['product_volume_cm3']=merge_df.product_length_cm * merge_df.product_height_cm * merge_df.product_width_cm
merge_df.order_delivered_customer_date = pd.to_datetime(merge_df.order_delivered_customer_date)
merge_df.order_delivered_carrier_date = pd.to_datetime(merge_df.order_delivered_carrier_date)
merge_df.order_purchase_timestamp = pd.to_datetime(merge_df.order_purchase_timestamp)
merge_df.order_estimated_delivery_date = pd.to_datetime(merge_df.order_estimated_delivery_date)
merge_df['purchase_month']=merge_df.order_purchase_timestamp.dt.month
merge_df['purchase_day_of_week']=merge_df.order_purchase_timestamp.dt.day_of_week
merge_df['actual_delivery_time']=(merge_df.order_delivered_customer_date-merge_df.order_purchase_timestamp).dt.days
merge_df['carrier_delivery_time']=(merge_df.order_delivered_carrier_date-merge_df.order_purchase_timestamp).dt.days
merge_df['estimated_delivery_time']=(merge_df.order_estimated_delivery_date-merge_df.order_purchase_timestamp).dt.days


In [16]:

merge_df['distance'] = merge_df.apply(
    lambda row: haversine_distance(
        row['seller_lat'],
        row['seller_lon'],
        row['customer_lat'],
        row['customer_lon'],
    ),
    axis=1,
)


In [17]:

merge_df=merge_df.drop(['order_status','product_length_cm','product_height_cm', 'order_delivered_carrier_date', 
                        'product_width_cm', 'order_id', 'product_id', 'order_purchase_timestamp', 'order_delivered_customer_date', 
                        'product_category_name', 'seller_id', 'customer_zip_code_prefix', 'seller_zip_code_prefix', 'customer_id', 'order_estimated_delivery_date', 
                        'geolocation_zip_code_prefix_x', 'geolocation_zip_code_prefix_y', 'order_approved_at', 'product_photos_qty', 'seller_lat', 
                        'seller_lon', 'customer_lat', 'customer_lon'], axis=1, errors='ignore')
merge_df = merge_df.dropna()

mask = np.zeros_like(merge_df[merge_df.columns[::-1]].corr(), dtype=np.bool)
mask[np.triu_indices_from(mask)] = True


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  mask = np.zeros_like(merge_df[merge_df.columns[::-1]].corr(), dtype=np.bool)


In [21]:
merge_df.to_csv('data\merge_df.csv')

In [22]:
merge_df = pd.read_csv('data/merge_df.csv')

In [40]:

target = 'estimated_delivery_time'
features = ['freight_value', 
            'product_volume_cm3', 
            'product_weight_g',
            'carrier_delivery_time',
            'distance'
           ]



X = merge_df[features]
y = merge_df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 14)
regressor = RandomForestRegressor(n_estimators=200, random_state=0)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

In [41]:
regressor.score(X_train, y_train)

0.9327056044186326

In [27]:
round(regressor.score(X_train, y_train), 2)

0.93