## 1. Problem Definition

이 프로젝트의 목표는 이커머스 플랫폼에서 고객의 이탈(churn) 여부를 예측하는 것이다.

예측된 churn 고객을 조기에 식별함으로써,
리텐션 전략을 설계하고 고객 생애 가치를 극대화하는 데 목적이 있다.


In [None]:
# !pip install -r ../requirements.txt

In [1]:
import pandas as pd

base_path = "../data/"  

orders = pd.read_csv(base_path + "olist_orders.csv")
items = pd.read_csv(base_path + "olist_order_items.csv")
customers = pd.read_csv(base_path + "olist_customers.csv")

# print("orders:", orders.shape)
# print("items:", items.shape)
# print("customers:", customers.shape)

# orders.head()

In [None]:
# import sys
# !{sys.executable} -m pip install ydata-profiling


In [None]:
from ydata_profiling import ProfileReport

dfs = {
    "orders": orders,
    "items": items,
    "customers": customers
}

for name, df in dfs.items():
    print(f"Generating report for: {name}")
    profile = ProfileReport(df, title=f"{name.capitalize()} EDA Report", explorative=True)
    profile.to_file(f"{name}_eda_report.html")


In [None]:
# print("=== ORDERS ===")
# print(orders.shape)
print(orders.dtypes)
# print(orders.isnull().sum())
# print(orders.nunique())
# print(orders.describe())
# print(orders.describe(include='object'))


In [2]:
# Convert columns to datetime format
orders["order_purchase_timestamp"] = pd.to_datetime(orders["order_purchase_timestamp"])
orders["order_delivered_customer_date"] = pd.to_datetime(orders["order_delivered_customer_date"])
orders["order_estimated_delivery_date"] = pd.to_datetime(orders["order_estimated_delivery_date"])

In [3]:
# Calculate total number of orders and last order date per customer
customer_orders = orders.groupby("customer_id").agg(
    total_orders=("order_id", "nunique"),
    last_order_date=("order_purchase_timestamp", "max")
).reset_index()

customer_orders.head()

Unnamed: 0,customer_id,total_orders,last_order_date
0,00012a2ce6f8dcda20d059ce98491703,1,2017-11-14 16:08:26
1,000161a058600d5901f007fab4c27140,1,2017-07-16 09:40:32
2,0001fd6190edaaf884bcaf3d49edf079,1,2017-02-28 11:06:43
3,0002414f95344307404f0ace7a26f1d5,1,2017-08-16 13:09:20
4,000379cdec625522490c315e70c7a9fb,1,2018-04-02 13:42:17


In [5]:
# Delivery delay = actual delivery date - estimated delivery date
orders["delivery_delay_days"] = (orders["order_delivered_customer_date"] - orders["order_estimated_delivery_date"]).dt.days

# 고객별 평균 배송 지연 일수 계산
avg_delay = orders.groupby("customer_id").agg(
    avg_delivery_delay=("delivery_delay_days", "mean")
).reset_index()

avg_delay.head()

Unnamed: 0,customer_id,avg_delivery_delay
0,00012a2ce6f8dcda20d059ce98491703,-6.0
1,000161a058600d5901f007fab4c27140,-10.0
2,0001fd6190edaaf884bcaf3d49edf079,-16.0
3,0002414f95344307404f0ace7a26f1d5,-1.0
4,000379cdec625522490c315e70c7a9fb,-5.0


In [6]:
# 평균 배송 지연일이 양수인 고객만 추출 (예상보다 늦게 받은 경우)
delayed_customers = avg_delay[avg_delay["avg_delivery_delay"] > 0]

# 평균 지연일이 높은 순으로 정렬
delayed_customers_sorted = delayed_customers.sort_values(by="avg_delivery_delay", ascending=False)

# 상위 10명 확인
delayed_customers_sorted.head(10)


Unnamed: 0,customer_id,avg_delivery_delay
82210,d306426abe5fca15e54b645e4462dc7b,188.0
45446,75683a92331068e2d281b11a7866ba44,181.0
79201,cb2caaaead400c97350c37a3fc536867,175.0
39321,65b14237885b3972ebec28c0f7dd2220,167.0
60971,9cf2c3fa2632cee748e1a59ca9d09b21,166.0
46543,7815125148cfa1e8c7fee1ff7974f16c,165.0
96540,f85e9ec0719b16dc4dd0edd438793553,162.0
10320,1a8a4a30dc296976717f44e7801fdeef,161.0
74427,beeda72b31be3b8a38b5c2b77d7705c4,161.0
50265,8199345f57c6d1cbe9701f92481beb8d,159.0


In [7]:
# 배송 지연 여부 그룹화: Early vs Delayed
avg_delay["delay_group"] = avg_delay["avg_delivery_delay"].apply(
    lambda x: "Early" if x < 0 else "Delayed"
)

group_summary = avg_delay.groupby("delay_group").agg(
    avg_delay_days=("avg_delivery_delay", "mean"),
    customer_count=("customer_id", "count")
).reset_index()

print(group_summary)


  delay_group  avg_delay_days  customer_count
0     Delayed        8.867254           10792
1       Early      -13.708423           88649


In [9]:
# 기준 날짜 설정 (예: 가장 마지막 주문 날짜 기준)
reference_date = orders["order_purchase_timestamp"].max()

# 각 고객의 마지막 구매일로부터 얼마나 시간이 지났는지 계산
customer_orders["days_since_last_order"] = (reference_date - customer_orders["last_order_date"]).dt.days

# churn 라벨 정의 (예: 90일 이상이면 churn)
customer_orders["churned"] = customer_orders["days_since_last_order"] > 90
customer_orders["churned"] = customer_orders["churned"].astype(int)


In [10]:
# 주문별 총 구매 금액 및 아이템 수 계산
order_prices = items.groupby("order_id").agg(
    order_value=("price", "sum"),
    num_items=("product_id", "count"),
    unique_products=("product_id", "nunique")
).reset_index()

# 주문 정보에 금액/아이템 정보 병합
orders = orders.merge(order_prices, on="order_id", how="left")

In [13]:
# 고객별 평균 주문 금액, 평균 구매 아이템 수, 제품 다양성 계산
order_agg = orders.groupby("customer_id").agg(
    avg_order_value=("order_value", "mean"),
    avg_num_items=("num_items", "mean"),
    avg_unique_products=("unique_products", "mean")
).reset_index()

In [15]:
# 고객별 평균 실제 배송 소요일 계산 (배송일 - 주문일)
orders["actual_delivery_days"] = (orders["order_delivered_customer_date"] - orders["order_purchase_timestamp"]).dt.days

delivery_agg = orders.groupby("customer_id").agg(
    avg_actual_delivery_days=("actual_delivery_days", "mean")
).reset_index()


## 2. Data Preprocessing & Feature Engineering

- 날짜 → datetime 변환  
- 주문 / 배송 정보 기반 feature 생성  
- 고객별 집계 (총 주문 수, 평균 주문 금액, 평균 배송 지연 등)  
- churn 라벨 정의: 마지막 구매 이후 90일 이상 경과 → churned = 1

최종적으로 고객 단위 feature 테이블 `features`를 생성하였다.


In [16]:
# 고객 단위 모든 feature 와 라벨 통합
# churn prediction을 위한 최종 feature table 생성
features = customer_orders.merge(avg_delay, on="customer_id", how="left")
features = features.merge(order_agg, on="customer_id", how="left")
features = features.merge(delivery_agg, on="customer_id", how="left")

features.head()


Unnamed: 0,customer_id,total_orders,last_order_date,days_since_last_order,churned,avg_delivery_delay,delay_group,avg_order_value,avg_num_items,avg_unique_products,avg_actual_delivery_days
0,00012a2ce6f8dcda20d059ce98491703,1,2017-11-14 16:08:26,337,1,-6.0,Early,89.8,1.0,1.0,13.0
1,000161a058600d5901f007fab4c27140,1,2017-07-16 09:40:32,458,1,-10.0,Early,54.9,1.0,1.0,9.0
2,0001fd6190edaaf884bcaf3d49edf079,1,2017-02-28 11:06:43,596,1,-16.0,Early,179.99,1.0,1.0,5.0
3,0002414f95344307404f0ace7a26f1d5,1,2017-08-16 13:09:20,427,1,-1.0,Early,149.9,1.0,1.0,28.0
4,000379cdec625522490c315e70c7a9fb,1,2018-04-02 13:42:17,198,1,-5.0,Early,93.0,1.0,1.0,11.0


In [30]:
X = features.drop(columns=["customer_id", "last_order_date", "days_since_last_order", "delay_group", "churned"])
y = features["churned"]

# 평균값으로 채우기
X = X.fillna(X.mean())

## 3.Train-Test Split

- `X` = 입력 변수 (churn 예측용 feature들)
- `y` = 타겟 라벨 (`churned`)

결측치는 평균으로 채운 뒤, 전체 데이터셋을 8:2 비율로 분할하였다.


In [31]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## 4. Modeling

세 가지 모델을 적용해 churn 예측 성능을 비교하였다:

1. XGBoost
2. Logistic Regression
3. Random Forest

평가지표:
- Accuracy
- Recall (특히 churn = 1)
- AUC


In [27]:
# %pip install xgboost


In [32]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score

# 1. 모델 정의 및 학습
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)

# 2. 예측
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]  # AUC용 확률값

# 3. 평가
print(classification_report(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_proba))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.73      0.25      0.37      1958
           1       0.92      0.99      0.96     17931

    accuracy                           0.92     19889
   macro avg       0.83      0.62      0.66     19889
weighted avg       0.90      0.92      0.90     19889

AUC: 0.8004566961913757


In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# 모델 정의 및 학습
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train, y_train)

# 예측
y_pred_logreg = logreg.predict(X_test)
y_proba_logreg = logreg.predict_proba(X_test)[:, 1]

# 평가
print("🔹 Logistic Regression")
print(classification_report(y_test, y_pred_logreg))
print("AUC:", roc_auc_score(y_test, y_proba_logreg))


🔹 Logistic Regression
              precision    recall  f1-score   support

           0       0.99      0.10      0.18      1958
           1       0.91      1.00      0.95     17931

    accuracy                           0.91     19889
   macro avg       0.95      0.55      0.56     19889
weighted avg       0.92      0.91      0.88     19889

AUC: 0.7651036497927107


In [34]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

print("🔹 Random Forest Classifier")
print(classification_report(y_test, y_pred_rf))
print("AUC:", roc_auc_score(y_test, y_proba_rf))


🔹 Random Forest Classifier
              precision    recall  f1-score   support

           0       0.43      0.29      0.35      1958
           1       0.93      0.96      0.94     17931

    accuracy                           0.89     19889
   macro avg       0.68      0.62      0.64     19889
weighted avg       0.88      0.89      0.88     19889

AUC: 0.7399519204504795


In [35]:
# import pandas as pd

# base_path = "../data/"  

# orders = pd.read_csv(base_path + "olist_orders.csv")
# items = pd.read_csv(base_path + "olist_order_items.csv")
# customers = pd.read_csv(base_path + "olist_customers.csv")

# # Convert columns to datetime format
# orders["order_purchase_timestamp"] = pd.to_datetime(orders["order_purchase_timestamp"])
# orders["order_delivered_customer_date"] = pd.to_datetime(orders["order_delivered_customer_date"])
# orders["order_estimated_delivery_date"] = pd.to_datetime(orders["order_estimated_delivery_date"])

# # Calculate total number of orders and last order date per customer
# customer_orders = orders.groupby("customer_id").agg(
#     total_orders=("order_id", "nunique"),
#     last_order_date=("order_purchase_timestamp", "max")
# ).reset_index()

# customer_orders.head()

# # Delivery delay = actual delivery date - estimated delivery date
# orders["delivery_delay_days"] = (orders["order_delivered_customer_date"] - orders["order_estimated_delivery_date"]).dt.days

# # 고객별 평균 배송 지연 일수 계산
# avg_delay = orders.groupby("customer_id").agg(
#     avg_delivery_delay=("delivery_delay_days", "mean")
# ).reset_index()

# avg_delay.head()

# # 평균 배송 지연일이 양수인 고객만 추출 (예상보다 늦게 받은 경우)
# delayed_customers = avg_delay[avg_delay["avg_delivery_delay"] > 0]

# # 평균 지연일이 높은 순으로 정렬
# delayed_customers_sorted = delayed_customers.sort_values(by="avg_delivery_delay", ascending=False)

# # 상위 10명 확인
# delayed_customers_sorted.head(10)

# # 배송 지연 여부 그룹화: Early vs Delayed
# avg_delay["delay_group"] = avg_delay["avg_delivery_delay"].apply(
#     lambda x: "Early" if x < 0 else "Delayed"
# )

# group_summary = avg_delay.groupby("delay_group").agg(
#     avg_delay_days=("avg_delivery_delay", "mean"),
#     customer_count=("customer_id", "count")
# ).reset_index()

# print(group_summary)

# # 기준 날짜 설정 (예: 가장 마지막 주문 날짜 기준)
# reference_date = orders["order_purchase_timestamp"].max()

# # 각 고객의 마지막 구매일로부터 얼마나 시간이 지났는지 계산
# customer_orders["days_since_last_order"] = (reference_date - customer_orders["last_order_date"]).dt.days

# # churn 라벨 정의 (예: 90일 이상이면 churn)
# customer_orders["churned"] = customer_orders["days_since_last_order"] > 90
# customer_orders["churned"] = customer_orders["churned"].astype(int)

# # 주문별 총 구매 금액 및 아이템 수 계산
# order_prices = items.groupby("order_id").agg(
#     order_value=("price", "sum"),
#     num_items=("product_id", "count"),
#     unique_products=("product_id", "nunique")
# ).reset_index()

# # 주문 정보에 금액/아이템 정보 병합
# orders = orders.merge(order_prices, on="order_id", how="left")

# # 고객별 평균 주문 금액, 평균 구매 아이템 수, 제품 다양성 계산
# order_agg = orders.groupby("customer_id").agg(
#     avg_order_value=("order_value", "mean"),
#     avg_num_items=("num_items", "mean"),
#     avg_unique_products=("unique_products", "mean")
# ).reset_index()

# # 고객별 평균 실제 배송 소요일 계산 (배송일 - 주문일)
# orders["actual_delivery_days"] = (orders["order_delivered_customer_date"] - orders["order_purchase_timestamp"]).dt.days

# delivery_agg = orders.groupby("customer_id").agg(
#     avg_actual_delivery_days=("actual_delivery_days", "mean")
# ).reset_index()

# # 고객 단위 모든 feature 와 라벨 통합
# # churn prediction을 위한 최종 feature table 생성
# features = customer_orders.merge(avg_delay, on="customer_id", how="left")
# features = features.merge(order_agg, on="customer_id", how="left")
# features = features.merge(delivery_agg, on="customer_id", how="left")

# features.head()

# X = features.drop(columns=["customer_id", "last_order_date", "days_since_last_order", "delay_group", "churned"])
# y = features["churned"]

# # 평균값으로 채우기
# X = X.fillna(X.mean())

# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y
# )

# from xgboost import XGBClassifier
# from sklearn.metrics import classification_report, roc_auc_score

# # 1. 모델 정의 및 학습
# model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
# model.fit(X_train, y_train)

# # 2. 예측
# y_pred = model.predict(X_test)
# y_proba = model.predict_proba(X_test)[:, 1]  # AUC용 확률값

# # 3. 평가
# print(classification_report(y_test, y_pred))
# print("AUC:", roc_auc_score(y_test, y_proba))

# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import classification_report, roc_auc_score

# # 모델 정의 및 학습
# logreg = LogisticRegression(max_iter=1000, random_state=42)
# logreg.fit(X_train, y_train)

# # 예측
# y_pred_logreg = logreg.predict(X_test)
# y_proba_logreg = logreg.predict_proba(X_test)[:, 1]

# # 평가
# print("🔹 Logistic Regression")
# print(classification_report(y_test, y_pred_logreg))
# print("AUC:", roc_auc_score(y_test, y_proba_logreg))

# from sklearn.ensemble import RandomForestClassifier

# rf = RandomForestClassifier(n_estimators=100, random_state=42)
# rf.fit(X_train, y_train)

# y_pred_rf = rf.predict(X_test)
# y_proba_rf = rf.predict_proba(X_test)[:, 1]

# print("🔹 Random Forest Classifier")
# print(classification_report(y_test, y_pred_rf))
# print("AUC:", roc_auc_score(y_test, y_proba_rf))
