In [2]:

# Imports
import pandas as pd, numpy as np, json
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                             roc_auc_score, roc_curve, mean_absolute_error,
                             mean_squared_error, r2_score)
from imblearn.over_sampling import SMOTE
from math import radians, sin, cos, sqrt, atan2
from xgboost import XGBClassifier
import xgboost as xgb



In [3]:

# Load + basic prep

df = pd.read_csv("olist_final_dataset.csv")
for c in ['order_purchase_timestamp','order_delivered_customer_date','order_estimated_delivery_date']:
    df[c] = pd.to_datetime(df[c], errors='coerce')

# targets
df['delivery_delay'] = (df['order_delivered_customer_date'] - df['order_estimated_delivery_date']).dt.days
df['delay_days']     = df['delivery_delay'].apply(lambda x: x if x and x>0 else 0)
df['is_delayed']     = (df['delay_days'] > 0).astype(int)



In [4]:

#  Geo features

def haversine(lat1, lon1, lat2, lon2):
    R=6371.0
    dlat=radians(lat2-lat1); dlon=radians(lon2-lon1)
    a = sin(dlat/2)**2 + cos(radians(lat1))*cos(radians(lat2))*sin(dlon/2)**2
    return R * 2 * atan2(sqrt(a), sqrt(1-a))

df['distance_km'] = df.apply(lambda r: haversine(r['customer_lat'], r['customer_lng'],
                                                 r['seller_lat'], r['seller_lng']), axis=1)

# state ‚Üí deterministic ID mapping used for BOTH training and app
UF_LIST = ["AC","AL","AM","AP","BA","CE","DF","ES","GO","MA","MG","MS","MT","PA","PB","PE",
           "PI","PR","RJ","RN","RO","RR","RS","SC","SE","SP","TO"]
UF2ID = {uf:i for i,uf in enumerate(UF_LIST)}
df['customer_state_id'] = df['customer_state'].map(UF2ID).fillna(-1).astype(int)
df['seller_state_id']   = df['seller_state'].map(UF2ID).fillna(-1).astype(int)

# time
df['purchase_month']     = df['order_purchase_timestamp'].dt.month
df['purchase_dayofweek'] = df['order_purchase_timestamp'].dt.dayofweek
df['is_weekend']         = (df['purchase_dayofweek'] >= 5).astype(int)

# basic cleaning
df = df.dropna(subset=['freight_value','product_weight_g','customer_lat','seller_lat'])
df = df.fillna(df.median(numeric_only=True))



In [5]:

# CLASSIFIER (no ZIPs)

features_cls = [
    'freight_value','product_weight_g','product_length_cm','product_height_cm','product_width_cm',
    'customer_state_id','seller_state_id',
    'customer_lat','customer_lng','seller_lat','seller_lng','distance_km',
    'purchase_month','purchase_dayofweek','is_weekend'
]
X = df[features_cls]
y = df['is_delayed']

Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print("Train delay counts before SMOTE:\n", ytr.value_counts())

sm = SMOTE(random_state=42)
Xtr_res, ytr_res = sm.fit_resample(Xtr, ytr)
print("After SMOTE:\n", ytr_res.value_counts())

xgb_cls = XGBClassifier(
    n_estimators=400, max_depth=10, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8, scale_pos_weight=10,
    eval_metric='auc', random_state=42, n_jobs=-1
)
xgb_cls.fit(Xtr_res, ytr_res)

yproba = xgb_cls.predict_proba(Xte)[:,1]
thr = 0.35
ypred = (yproba > thr).astype(int)

print("Accuracy:", accuracy_score(yte, ypred))
print("ROC-AUC:", roc_auc_score(yte, yproba))
print(classification_report(yte, ypred))

# save classifier and feature order
xgb_cls.save_model("xgb_classifier_model.json")
with open("features_cls.json","w") as f: json.dump(features_cls, f)
with open("uf2id.json","w") as f: json.dump(UF2ID, f)



Train delay counts before SMOTE:
 is_delayed
0    83887
1     5775
Name: count, dtype: int64
After SMOTE:
 is_delayed
0    83887
1    83887
Name: count, dtype: int64
Accuracy: 0.7813615274803711
ROC-AUC: 0.7563370339981076
              precision    recall  f1-score   support

           0       0.97      0.79      0.87     20972
           1       0.16      0.59      0.26      1444

    accuracy                           0.78     22416
   macro avg       0.56      0.69      0.56     22416
weighted avg       0.91      0.78      0.83     22416



In [6]:

# REGRESSOR (only delayed)

df['purchase_hour'] = df['order_purchase_timestamp'].dt.hour
df['delivery_gap']  = (df['order_estimated_delivery_date'] - df['order_purchase_timestamp']).dt.days
df['month_sin']     = np.sin(2*np.pi*df['purchase_month']/12)
df['month_cos']     = np.cos(2*np.pi*df['purchase_month']/12)
df['dayofweek_sin'] = np.sin(2*np.pi*df['purchase_dayofweek']/7)
df['dayofweek_cos'] = np.cos(2*np.pi*df['purchase_dayofweek']/7)
df['region_pair']   = (df['customer_state_id'].astype(str) + "_" + df['seller_state_id'].astype(str)).astype('category').cat.codes

df['product_volume_cm3']   = df['product_length_cm']*df['product_height_cm']*df['product_width_cm']
df['density_g_per_cm3']    = df['product_weight_g']/(df['product_volume_cm3']+1e-9)
df['distance_per_kg']      = df['distance_km']/(df['product_weight_g']+1e-9)
df['freight_per_weight']   = df['freight_value']/(df['product_weight_g']+1e-9)
df['distance_weight_interaction'] = df['distance_km']*df['product_weight_g']
df['freight_per_day']      = df['freight_value']/(df['delivery_gap']+1.0)
df['weekend_gap']          = df['is_weekend']*df['delivery_gap']

delayed_df = df[df['is_delayed']==1].copy()
delayed_df = delayed_df[
    (delayed_df['delay_days'] > 0) &
    (delayed_df['delay_days'] <= 40)
]

delayed_df['delay_days_log'] = np.log1p(delayed_df['delay_days'])

features_reg = [
    'freight_value','product_weight_g','product_length_cm','product_height_cm','product_width_cm',
    'product_volume_cm3','density_g_per_cm3',
    'customer_state_id','seller_state_id','customer_lat','customer_lng','seller_lat','seller_lng',
    'distance_km','distance_per_kg','freight_per_weight','region_pair',
    'purchase_month','purchase_dayofweek','purchase_hour','is_weekend',
    'delivery_gap','freight_per_day','weekend_gap',
    'dayofweek_sin','dayofweek_cos','month_sin','month_cos'
]
Xr = delayed_df[features_reg]; yr = delayed_df['delay_days_log']

Xr_tr, Xr_te, yr_tr, yr_te = train_test_split(Xr, yr, test_size=0.2, random_state=42)
dtr = xgb.DMatrix(Xr_tr, label=yr_tr); dte = xgb.DMatrix(Xr_te, label=yr_te)

params = {
    "max_depth": 8, "eta": 0.05, "subsample": 0.8, "colsample_bytree": 0.8,
    "objective": "reg:quantileerror", "quantile_alpha": 0.5,
    "eval_metric": "mae", "seed": 42
}
xgb_q = xgb.train(params, dtr, num_boost_round=400)

yp_log = xgb_q.predict(dte)
yp     = np.expm1(yp_log); yt = np.expm1(yr_te)
print("Reg MAE:", mean_absolute_error(yt, yp))

xgb_q.save_model("xgb_quantile_model.json")
with open("features_reg.json","w") as f: json.dump(features_reg, f)

print("‚úÖ Saved: xgb_classifier_model.json, xgb_quantile_model.json, features_cls.json, features_reg.json, uf2id.json")


Reg MAE: 5.08101552266341
‚úÖ Saved: xgb_classifier_model.json, xgb_quantile_model.json, features_cls.json, features_reg.json, uf2id.json


In [7]:
!pip install streamlit pyngrok


Collecting streamlit
  Downloading streamlit-1.52.1-py3-none-any.whl.metadata (9.8 kB)
Collecting pyngrok
  Downloading pyngrok-7.5.0-py3-none-any.whl.metadata (8.1 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.52.1-py3-none-any.whl (9.0 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m9.0/9.0 MB[0m [31m73.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.5.0-py3-none-any.whl (24 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.9/6.9 MB[0m [31m66.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyngrok, pydeck, streamlit
Successfully installed pydeck-0.9.1 pyngrok-7.5.0 streamlit-1.52.1


In [8]:
from pyngrok import ngrok
ngrok.set_auth_token("2weZY34ksUUkKcjjkisw0RRCrn1_6ppvH8NZKCvB3Gq5sdaTx")




In [9]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import json
import xgboost as xgb
from xgboost import XGBClassifier
from math import radians, sin, cos, sqrt, atan2
import pydeck as pdk

# Load Models & Metadata

st.set_page_config(page_title="Delivery Delay Predictor", layout="wide")

clf = XGBClassifier()
clf.load_model("xgb_classifier_model.json")

quant_model = xgb.Booster()
quant_model.load_model("xgb_quantile_model.json")

features_cls = json.load(open("features_cls.json"))
features_reg = json.load(open("features_reg.json"))
UF2ID = json.load(open("uf2id.json"))

# Brazil State Centroids

BRAZIL_STATE_CENTROIDS = {
    "AC": (-8.77, -70.55), "AL": (-9.62, -36.82), "AM": (-3.47, -65.10),
    "AP": (1.41, -51.77), "BA": (-12.96, -38.51), "CE": (-3.71, -38.54),
    "DF": (-15.83, -47.86), "ES": (-19.19, -40.34), "GO": (-16.64, -49.31),
    "MA": (-2.55, -44.30), "MG": (-19.92, -43.94), "MS": (-20.51, -54.54),
    "MT": (-12.64, -55.42), "PA": (-1.41, -48.50), "PB": (-7.12, -34.86),
    "PE": (-8.05, -34.90), "PI": (-8.28, -43.68), "PR": (-25.42, -49.27),
    "RJ": (-22.90, -43.20), "RN": (-5.81, -35.21), "RO": (-11.22, -62.80),
    "RR": (1.89, -61.22), "RS": (-30.03, -51.23), "SC": (-27.59, -48.55),
    "SE": (-10.91, -37.07), "SP": (-23.55, -46.63), "TO": (-10.25, -48.25)
}

# Haversine Distance

def haversine(lat1, lon1, lat2, lon2):
    R = 6371.0
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat/2)**2 + cos(radians(lat1))*cos(radians(lat2))*sin(dlon/2)**2
    return R * 2 * atan2(sqrt(a), sqrt(1 - a))

# Product Catalog Defaults
PRODUCT_CATALOG = {
    "Laptop": {"weight": 1800, "L": 35, "H": 3, "W": 25},
    "Smartphone": {"weight": 200, "L": 15, "H": 2, "W": 8},
    "Microwave": {"weight": 14000, "L": 45, "H": 28, "W": 35},
    "Refrigerator (Mini)": {"weight": 18000, "L": 50, "H": 60, "W": 50},
    "Shoes (Box)": {"weight": 900, "L": 30, "H": 12, "W": 20},
    "T-Shirt (Packed)": {"weight": 300, "L": 25, "H": 3, "W": 20},
    "Chair (Foldable)": {"weight": 5000, "L": 80, "H": 100, "W": 50},
    "Television (32\")": {"weight": 6000, "L": 80, "H": 50, "W": 12},
    "Custom (Enter Manually)": None
}

# Dynamic Map Zoom Function

def get_dynamic_zoom(distance_km):
    if distance_km < 300:
        return 8
    elif distance_km < 700:
        return 7
    elif distance_km < 1500:
        return 6
    elif distance_km < 2500:
        return 5
    else:
        return 4

# UI TITLE
st.title("üì¶ E-Commerce Delivery Delay Predictor")
st.write("Predict lateness, delay days, and expected arrival date.")

# TWO COLUMN LAYOUT ‚Äî LEFT: INPUTS | RIGHT: MAP

left, right = st.columns([1.1, 1])

# ================= LEFT SIDE INPUTS
with left:
    st.subheader("Order Details")

    order_date = st.date_input("Order Date", pd.to_datetime("today"))
    freight_value = st.number_input("Freight Value (BRL)", 0.0, 1000.0, 50.0)

    product_choice = st.selectbox("Select Product Type", list(PRODUCT_CATALOG.keys()))

    if product_choice != "Custom (Enter Manually)":
        preset = PRODUCT_CATALOG[product_choice]
        product_weight = st.number_input("Product Weight (g)", 1, 50000, preset["weight"])
        product_length = st.number_input("Length (cm)", 1, 200, preset["L"])
        product_height = st.number_input("Height (cm)", 1, 200, preset["H"])
        product_width = st.number_input("Width (cm)", 1, 200, preset["W"])
    else:
        st.write("### Enter Custom Dimensions")
        product_weight = st.number_input("Product Weight (g)", 1, 50000, 1000)
        product_length = st.number_input("Length (cm)", 1, 200, 20)
        product_height = st.number_input("Height (cm)", 1, 200, 20)
        product_width = st.number_input("Width (cm)", 1, 200, 20)

    customer_state = st.selectbox("Customer State (UF)", list(UF2ID.keys()))
    seller_state = st.selectbox("Seller State (UF)", list(UF2ID.keys()))

# ================= RIGHT SIDE MAP
with right:
    st.subheader("üó∫Ô∏è Route Map (Seller ‚Üí Customer)")

    # Coordinates
    customer_lat, customer_lng = BRAZIL_STATE_CENTROIDS[customer_state]
    seller_lat, seller_lng = BRAZIL_STATE_CENTROIDS[seller_state]

    # Distance for dynamic zoom
    distance_km = haversine(customer_lat, customer_lng, seller_lat, seller_lng)
    zoom_level = get_dynamic_zoom(distance_km)

    map_points = pd.DataFrame([
        {"name": "Seller", "lat": seller_lat, "lon": seller_lng},
        {"name": "Customer", "lat": customer_lat, "lon": customer_lng},
    ])

    route_line = pd.DataFrame([{
        "lat": [seller_lat, customer_lat],
        "lon": [seller_lng, customer_lng]
    }])

    deck = pdk.Deck(
        layers=[
            pdk.Layer(
                "ScatterplotLayer",
                data=map_points,
                get_position='[lon, lat]',
                get_color='[0,120,255]',
                get_radius=35000,
            ),
            pdk.Layer(
                "PathLayer",
                data=route_line,
                get_path='[ [lon[0], lat[0]], [lon[1], lat[1]] ]',
                get_color='[255,0,0]',
                width_scale=10,
                width_min_pixels=4
            ),
        ],
        initial_view_state=pdk.ViewState(
            latitude=(seller_lat + customer_lat)/2,
            longitude=(seller_lng + customer_lng)/2,
            zoom=zoom_level,
        ),
        map_provider="carto",
        map_style="light"
    )

    st.pydeck_chart(deck, use_container_width=True, height=500)

# Auto Time Features
order_date = pd.to_datetime(order_date)
purchase_month = order_date.month
purchase_dow = order_date.weekday()
is_weekend = 1 if purchase_dow >= 5 else 0

# Classifier Input Vector
cls_row = pd.DataFrame([{
    'freight_value': freight_value,
    'product_weight_g': product_weight,
    'product_length_cm': product_length,
    'product_height_cm': product_height,
    'product_width_cm': product_width,
    'customer_state_id': UF2ID[customer_state],
    'seller_state_id': UF2ID[seller_state],
    'customer_lat': customer_lat,
    'customer_lng': customer_lng,
    'seller_lat': seller_lat,
    'seller_lng': seller_lng,
    'distance_km': distance_km,
    'purchase_month': purchase_month,
    'purchase_dayofweek': purchase_dow,
    'is_weekend': is_weekend
}])[features_cls]

# Predict Delay
thr = 0.35
delay_prob = clf.predict_proba(cls_row)[0][1]
delayed_flag = int(delay_prob > thr)

st.subheader("Prediction")

if delayed_flag:
    st.error("üö® Likely Delayed")
else:
    st.success("‚úÖ Expected On-Time")

# Delivery Timeline
default_gap = 8
estimated_delivery = order_date + pd.Timedelta(days=default_gap)

if delayed_flag:
    volume_cm3 = product_length * product_height * product_width
    dens = product_weight / (volume_cm3 + 1e-9)
    dpkg = distance_km / (product_weight + 1e-9)
    fpw = freight_value / (product_weight + 1e-9)

    reg_row = pd.DataFrame([{
        'freight_value': freight_value,
        'product_weight_g': product_weight,
        'product_length_cm': product_length,
        'product_height_cm': product_height,
        'product_width_cm': product_width,
        'product_volume_cm3': volume_cm3,
        'density_g_per_cm3': dens,
        'customer_state_id': UF2ID[customer_state],
        'seller_state_id': UF2ID[seller_state],
        'customer_lat': customer_lat,
        'customer_lng': customer_lng,
        'seller_lat': seller_lat,
        'seller_lng': seller_lng,
        'distance_km': distance_km,
        'distance_per_kg': dpkg,
        'freight_per_weight': fpw,
        'region_pair': 0,
        'purchase_month': purchase_month,
        'purchase_dayofweek': purchase_dow,
        'purchase_hour': 12,
        'is_weekend': is_weekend,
        'delivery_gap': default_gap,
        'freight_per_day': freight_value / (default_gap + 1),
        'weekend_gap': default_gap * is_weekend,
        'dayofweek_sin': np.sin(2*np.pi*purchase_dow/7),
        'dayofweek_cos': np.cos(2*np.pi*purchase_dow/7),
        'month_sin': np.sin(2*np.pi*purchase_month/12),
        'month_cos': np.cos(2*np.pi*purchase_month/12)
    }])[features_reg]

    dmat = xgb.DMatrix(reg_row)
    pred_log = quant_model.predict(dmat)[0]
    delay_days = np.expm1(pred_log)

    predicted_arrival = estimated_delivery + pd.Timedelta(days=delay_days)

    st.subheader("üìÖ Delivery Timeline")
    st.write(f"**ETA (Promised):** {estimated_delivery.date()}")
    st.write(f"**Predicted Arrival:** {predicted_arrival.date()}")
    st.write(f"**Expected Days Late:** {delay_days:.1f}")

else:
    st.subheader("üìÖ Delivery Timeline")
    st.write(f"**ETA (Promised):** {estimated_delivery.date()}")
    st.write(f"**Predicted Arrival:** {estimated_delivery.date()}")
    st.write("No delay expected.")


Writing app.py


In [10]:
!streamlit run app.py &>/dev/null &


In [12]:
from pyngrok import ngrok
ngrok.kill()

In [14]:
public_url = ngrok.connect(8501)
public_url


<NgrokTunnel: "https://8560a2997b12.ngrok-free.app" -> "http://localhost:8501">