In [None]:
# !pip install -r ../requirements.txt

In [1]:
import pandas as pd

# Directory (relative to this notebook) that holds the raw CSV exports.
base_path = "../data/"

# Read the three source tables in one pass; the order of the file names
# matches the order of the names on the left-hand side.
orders, items, customers = (
    pd.read_csv(base_path + csv_name)
    for csv_name in (
        "olist_orders.csv",
        "olist_order_items.csv",
        "olist_customers.csv",
    )
)

# print("orders:", orders.shape)
# print("items:", items.shape)
# print("customers:", customers.shape)

# orders.head()

In [2]:
# import sys
# !{sys.executable} -m pip install ydata-profiling


In [3]:
from ydata_profiling import ProfileReport

# Map a short label to each loaded table; the label is reused for the
# progress message, the report title and the output file name.
dfs = dict(orders=orders, items=items, customers=customers)

# Build one exploratory profiling report per table and write it as an HTML
# file using a relative path (i.e. into the current working directory).
for name, df in dfs.items():
    print(f"Generating report for: {name}")
    report_title = f"{name.capitalize()} EDA Report"
    report_path = f"{name}_eda_report.html"
    profile = ProfileReport(df, title=report_title, explorative=True)
    profile.to_file(report_path)


Generating report for: orders




Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                            | 0/8 [00:00<?, ?it/s][A
 12%|██████████▌                                                                         | 1/8 [00:04<00:33,  4.73s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:04<00:00,  1.62it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Generating report for: items




Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                            | 0/7 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:02<00:00,  2.69it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Generating report for: customers




Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  1.79it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Quick look at the `orders` table: print the dtype of every column.
# The commented-out lines are alternative diagnostics (shape, null counts,
# distinct counts, summary statistics) that can be enabled as needed.
# print("=== ORDERS ===")
# print(orders.shape)
print(orders.dtypes)
# print(orders.isnull().sum())
# print(orders.nunique())
# print(orders.describe())
# print(orders.describe(include='object'))


order_id                         object
customer_id                      object
order_status                     object
order_purchase_timestamp         object
order_approved_at                object
order_delivered_carrier_date     object
order_delivered_customer_date    object
order_estimated_delivery_date    object
dtype: object


In [5]:
# Parse the timestamp columns used below into datetime64 so that date
# arithmetic and max() work on real timestamps.
for date_col in (
    "order_purchase_timestamp",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
):
    orders[date_col] = pd.to_datetime(orders[date_col])

# One row per customer_id: number of distinct orders and the most recent
# purchase timestamp.
# NOTE(review): the displayed sample shows total_orders == 1 for every row;
# confirm that customer_id really identifies a returning customer.
customer_orders = (
    orders
    .groupby("customer_id")
    .agg(
        total_orders=("order_id", "nunique"),
        last_order_date=("order_purchase_timestamp", "max"),
    )
    .reset_index()
)

customer_orders.head()


Unnamed: 0,customer_id,total_orders,last_order_date
0,00012a2ce6f8dcda20d059ce98491703,1,2017-11-14 16:08:26
1,000161a058600d5901f007fab4c27140,1,2017-07-16 09:40:32
2,0001fd6190edaaf884bcaf3d49edf079,1,2017-02-28 11:06:43
3,0002414f95344307404f0ace7a26f1d5,1,2017-08-16 13:09:20
4,000379cdec625522490c315e70c7a9fb,1,2018-04-02 13:42:17


In [6]:
# Delivery delay in days: actual delivery date minus estimated delivery date
# (negative when the order arrived before the estimate).
orders["delivery_delay_days"] = (
    orders["order_delivered_customer_date"]
    .sub(orders["order_estimated_delivery_date"])
    .dt.days
)


In [7]:
# Mean delivery delay per customer_id, returned as a two-column frame
# (customer_id, avg_delivery_delay).
avg_delay = (
    orders
    .groupby("customer_id")["delivery_delay_days"]
    .mean()
    .rename("avg_delivery_delay")
    .reset_index()
)


In [11]:
# Reference date: the most recent purchase timestamp in the orders table.
reference_date = orders["order_purchase_timestamp"].max()

# Days elapsed between each customer's last purchase and the reference date.
elapsed = reference_date - customer_orders["last_order_date"]
customer_orders["days_since_last_order"] = elapsed.dt.days

# Churn label: 1 when strictly more than 90 days have passed since the last
# order, otherwise 0.
customer_orders["churned"] = (
    customer_orders["days_since_last_order"].gt(90).astype(int)
)


In [12]:
# Per-order totals from the line items: summed price, number of item rows
# and number of distinct products.
order_prices = items.groupby("order_id").agg(
    order_value=("price", "sum"),
    num_items=("product_id", "count"),
    unique_products=("product_id", "nunique")
).reset_index()

# Attach the per-order totals to `orders` (left join, so orders without any
# item rows are kept with NaN totals).
# Columns left behind by a previous run of this cell are dropped first:
# merging twice would otherwise create `_x`/`_y` suffixed duplicates and the
# aggregation below would fail with a KeyError on "order_value".
order_value_cols = [col for col in order_prices.columns if col != "order_id"]
orders = (
    orders
    .drop(columns=order_value_cols, errors="ignore")
    .merge(order_prices, on="order_id", how="left")
)

# Customer-level averages of order value, item count and distinct products.
order_agg = orders.groupby("customer_id").agg(
    avg_order_value=("order_value", "mean"),
    avg_num_items=("num_items", "mean"),
    avg_unique_products=("unique_products", "mean")
).reset_index()


In [13]:
# Days from purchase to delivery at the customer, per order.
orders["actual_delivery_days"] = (
    orders["order_delivered_customer_date"]
    .sub(orders["order_purchase_timestamp"])
    .dt.days
)

# Mean purchase-to-delivery time per customer_id.
delivery_agg = (
    orders
    .groupby("customer_id")["actual_delivery_days"]
    .mean()
    .rename("avg_actual_delivery_days")
    .reset_index()
)


In [14]:
# Assemble the customer-level feature table: start from customer_orders and
# left-join every aggregate on customer_id so no customer row is dropped.
features = (
    customer_orders
    .merge(avg_delay, on="customer_id", how="left")
    .merge(order_agg, on="customer_id", how="left")
    .merge(delivery_agg, on="customer_id", how="left")
)

features.head()


Unnamed: 0,customer_id,total_orders,last_order_date,days_since_last_order,churned,avg_delivery_delay,avg_order_value,avg_num_items,avg_unique_products,avg_actual_delivery_days
0,00012a2ce6f8dcda20d059ce98491703,1,2017-11-14 16:08:26,337,1,-6.0,89.8,1.0,1.0,13.0
1,000161a058600d5901f007fab4c27140,1,2017-07-16 09:40:32,458,1,-10.0,54.9,1.0,1.0,9.0
2,0001fd6190edaaf884bcaf3d49edf079,1,2017-02-28 11:06:43,596,1,-16.0,179.99,1.0,1.0,5.0
3,0002414f95344307404f0ace7a26f1d5,1,2017-08-16 13:09:20,427,1,-1.0,149.9,1.0,1.0,28.0
4,000379cdec625522490c315e70c7a9fb,1,2018-04-02 13:42:17,198,1,-5.0,93.0,1.0,1.0,11.0


In [None]:
from sklearn.model_selection import train_test_split

# Target: the binary churn label.
y = features["churned"]

# Predictors: everything except the identifier, the raw last-order timestamp,
# the recency value the label is computed from, and the label itself.
non_feature_cols = ["customer_id", "last_order_date", "days_since_last_order", "churned"]
X = features.drop(columns=non_feature_cols)

# 80/20 split, stratified on the label, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
