In [1]:
BASE_PATH = ".."
RAW_PATH = f"{BASE_PATH}/data/raw"
CLEAN_PATH = f"{BASE_PATH}/data/processed"

In [2]:
import pandas as pd

orders = pd.read_csv(f"{CLEAN_PATH}/orders_cleaned.csv")
reviews = pd.read_csv(f"{CLEAN_PATH}/reviews_cleaned.csv")

In [3]:
date_cols_orders = [
    "order_delivered_customer_date",
    "order_estimated_delivery_date"
]

for col in date_cols_orders:
    orders[col] = pd.to_datetime(orders[col], errors="coerce")

date_cols_reviews = [
    "review_creation_date",
    "review_answer_timestamp"
]

for col in date_cols_reviews:
    reviews[col] = pd.to_datetime(reviews[col], errors="coerce")

reviews["review_score"] = pd.to_numeric(
    reviews["review_score"], errors="coerce"
)

In [4]:
df = orders.merge(
    reviews,
    on="order_id",
    how="inner"
)

In [5]:
df.shape

(98410, 15)

In [6]:
df.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,is_delivered,review_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,True,a54f0611adc9ed256b57ede6b6eb5114,4,,"Não testei o produto ainda, mas ele veio corre...",2017-10-11,2017-10-12 03:43:48
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,True,8d5266042046a06655c8db133d120ba5,4,Muito boa a loja,Muito bom o produto.,2018-08-08,2018-08-08 18:37:50
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,True,e73b67b67587f7644d5bd1a52deb1b01,5,,,2018-08-18,2018-08-22 19:07:58
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15,True,359d03e676b3c069f62cadba8dd3f6e8,5,,O produto foi exatamente o que eu esperava e e...,2017-12-03,2017-12-05 19:21:58
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26,True,e50934924e227544ba8246aeb3770dd4,5,,,2018-02-17,2018-02-18 13:02:51


In [9]:
df["delay_days"] = (
    df["order_delivered_customer_date"]
    - df["order_estimated_delivery_date"]
).dt.days
df["delay_days"].describe()

count    95607.000000
mean       -11.910802
std         10.109845
min       -147.000000
25%        -17.000000
50%        -12.000000
75%         -7.000000
max        188.000000
Name: delay_days, dtype: float64

In [11]:
df["delivery_status"] = "on_time"

df.loc[df["delay_days"] > 1, "delivery_status"] = "late"

df["delivery_status"].value_counts(normalize=True)

delivery_status
on_time    0.943644
late       0.056356
Name: proportion, dtype: float64

In [12]:
def review_category(score):
    if score <= 2:
        return "bad"
    elif score == 3:
        return "neutral"
    else:
        return "good"

df["review_category"] = df["review_score"].apply(review_category)

In [13]:
df["review_category"].value_counts(normalize=True)

review_category
good       0.771436
bad        0.146286
neutral    0.082278
Name: proportion, dtype: float64

In [14]:
final_cols = [
    "order_id",
    "review_score",
    "review_category",
    "delay_days",
    "delivery_status"
]

df_engineered = df[final_cols]

In [17]:
import os

df_engineered.to_csv(
    f"{CLEAN_PATH}/orders_reviews_engineered.csv",
    index=False
)

- This notebook performs feature engineering to create business-relevant variables related to delivery performance and customer review outcomes. The engineered dataset is used for visualization and insight generation in subsequent analysis.