In [2]:
import pandas as pd
import numpy as np
import random
import os

# Output location for the simulated dataset; the directory is created on demand.
DATA_DIR = "projects/merchant-churn-prediction/data"
DATA_PATH = f"{DATA_DIR}/merchant_data.csv"
os.makedirs(DATA_DIR, exist_ok=True)

# Simulation parameters
SEED = 42  # fixed seed so that re-running the cell regenerates the same CSV
n_merchants = 500

industries = ["Food", "Retail", "Service", "Electronics", "Fashion"]
regions = ["East", "West", "North", "South"]

COLUMNS = [
    "merchant_id", "register_days", "industry", "region",
    "gmv_last_30d", "gmv_prev_30d", "orders_last_30d",
    "active_days_last_30d", "complaints_last_30d",
    "promo_participation", "churn",
]


def generate_merchant_data(n_merchants, seed=SEED):
    """Simulate one row of activity metrics per merchant.

    Parameters
    ----------
    n_merchants : int
        Number of merchants (rows) to generate.
    seed : int, optional
        Seed for the local random generator. The same seed always yields
        the same frame; the global NumPy / ``random`` state is not touched.

    Returns
    -------
    pandas.DataFrame
        One row per merchant with the columns listed in ``COLUMNS``.
        ``churn`` is 1 when the merchant was active on fewer than 5 days or
        its GMV fell below half of the previous 30-day GMV, otherwise 0.
    """
    rng = np.random.default_rng(seed)
    records = []
    for i in range(n_merchants):
        register_days = int(rng.integers(30, 1000))
        industry = industries[int(rng.integers(len(industries)))]
        region = regions[int(rng.integers(len(regions)))]
        gmv_prev_30d = int(rng.integers(1000, 50000))
        # GMV cannot be negative, so clamp the perturbed value at zero.
        gmv_last_30d = max(0, gmv_prev_30d + int(rng.integers(-15000, 15000)))
        # Orders = GMV divided by a random average order value in [50, 500).
        orders_last_30d = gmv_last_30d // int(rng.integers(50, 500))
        # endpoint=True: a merchant may be active on all 30 days of the window
        # (an exclusive upper bound of 30 would cap the value at 29).
        active_days_last_30d = int(rng.integers(0, 30, endpoint=True))
        complaints_last_30d = int(rng.integers(0, 5))
        promo_participation = int(rng.integers(0, 5))

        # Churn rule (simple simulation)
        churn = int(active_days_last_30d < 5 or gmv_last_30d < gmv_prev_30d * 0.5)

        records.append({
            "merchant_id": f"M{i+1:04d}",
            "register_days": register_days,
            "industry": industry,
            "region": region,
            "gmv_last_30d": gmv_last_30d,
            "gmv_prev_30d": gmv_prev_30d,
            "orders_last_30d": orders_last_30d,
            "active_days_last_30d": active_days_last_30d,
            "complaints_last_30d": complaints_last_30d,
            "promo_participation": promo_participation,
            "churn": churn,
        })
    return pd.DataFrame(records, columns=COLUMNS)


df = generate_merchant_data(n_merchants)

# Save to CSV
df.to_csv(DATA_PATH, index=False)

print(f"✅ 模拟数据已生成：{DATA_PATH}")
print(df.head())

✅ 模拟数据已生成：projects/merchant-churn-prediction/data/merchant_data.csv
  merchant_id  register_days     industry region  gmv_last_30d  gmv_prev_30d  \
0       M0001            841       Retail  North         28372         28784   
1       M0002            724  Electronics  North         12394         21922   
2       M0003            594      Fashion  South         30675         21658   
3       M0004            833       Retail   East         37282         42591   
4       M0005            756      Fashion   West         27598         24954   

   orders_last_30d  active_days_last_30d  complaints_last_30d  \
0               65                    23                    4   
1               44                    14                    0   
2              166                     5                    1   
3              146                     2                    4   
4              192                    22                    4   

   promo_participation  churn  
0                    0      

In [9]:
# 商户流失预测 (Merchant Churn Prediction)
# 作者: jing
# 目标: 预测未来30天可能流失的商户，支持运营提前干预

# =========================
# 1. 导入依赖库
# =========================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

# =========================
# 2. Load data
# =========================
# TODO: replace with your own data path
df = pd.read_csv("projects/merchant-churn-prediction/data/merchant_data.csv")
print(df.shape)
# Only the last expression of a cell is displayed, so mid-cell inspections
# must be printed explicitly or their result is silently discarded.
print(df.head())

# =========================
# 3. Data cleaning
# =========================
# Missing-value check
print(df.isnull().sum())

# Missing-value handling (example): fill per column type. A blanket
# fillna(0) would write the integer 0 into string columns and leave them
# with mixed types.
fill_values = {
    col: 0 if pd.api.types.is_numeric_dtype(df[col]) else "Unknown"
    for col in df.columns
}
df = df.fillna(value=fill_values)

# Outlier handling (optional)
# TODO: handle outliers according to business logic

# =========================
# 4. Feature engineering
# =========================
# Examples: order volume over the last 30 days, GMV growth rate, etc.
# TODO: build features according to the business definition
# The +1 in the denominator guards against division by zero.
df['gmv_growth_30d'] = (df['gmv_last_30d'] - df['gmv_prev_30d']) / (df['gmv_prev_30d'] + 1)

# Label: 1 = churned, 0 = retained
# TODO: generate the label according to business rules
# Keep the label shipped with the dataset; only fall back to the
# "no active day" rule when the file has no `churn` column. Overwriting an
# existing label here would silently replace the target being modelled.
if 'churn' not in df.columns:
    df['churn'] = np.where(df['active_days_last_30d'] == 0, 1, 0)
# NOTE(review): the fallback rule is a pure function of
# `active_days_last_30d`; if that column stays in the feature matrix the
# models can recover the label exactly (target leakage) - confirm which
# features are legitimately available at prediction time.

# =========================
# 5. EDA (exploratory data analysis)
# =========================
# Class balance of the target: one bar per churn value.
churn_ax = sns.countplot(data=df, x='churn')
churn_ax.set_title("Churn Distribution")
plt.show()

# Relationship between features and churn
# TODO: draw box plots, distribution plots, etc.

# =========================
# 6. Train / test split
# =========================
# `merchant_id` is a row identifier, not a predictor, and `industry` /
# `region` hold strings. StandardScaler and the models below need a purely
# numeric matrix, so drop the identifier and one-hot encode the categoricals.
ID_COLS = ['merchant_id']
CATEGORICAL_COLS = ['industry', 'region']

X = pd.get_dummies(
    df.drop(columns=['churn'] + ID_COLS),
    columns=CATEGORICAL_COLS,
    dtype=int,
)
y = df['churn']

# Stratify so both splits keep the same churn ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardisation: fit on the training split only, then apply the same
# transform to the test split to avoid leaking test statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# =========================
# 7. Model training and evaluation
# =========================
# Each model is fitted, used for hard predictions, and scored by ROC AUC on
# the predicted probability of the positive class (column 1).

# Logistic Regression (trained on the standardised features)
lr = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
lr_churn_proba = lr.predict_proba(X_test_scaled)[:, 1]
print("Logistic Regression AUC:", roc_auc_score(y_test, lr_churn_proba))

# Random Forest (trained on the unscaled features)
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
rf_churn_proba = rf.predict_proba(X_test)[:, 1]
print("Random Forest AUC:", roc_auc_score(y_test, rf_churn_proba))

# XGBoost (trained on the unscaled features)
xgb = XGBClassifier(n_estimators=200, random_state=42, eval_metric='logloss').fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
xgb_churn_proba = xgb.predict_proba(X_test)[:, 1]
print("XGBoost AUC:", roc_auc_score(y_test, xgb_churn_proba))

# =========================
# 8. Feature importance analysis (using XGBoost as the example)
# =========================
# Pair each importance score with its feature name, then plot the ten largest.
importances = pd.Series(xgb.feature_importances_, index=X.columns)
top_features = importances.sort_values(ascending=False).iloc[:10]
importance_ax = top_features.plot.barh()
importance_ax.set_title("Top 10 Feature Importances")
plt.show()

# =========================
# 9. Conclusions and next steps
# =========================
# TODO: summarise model performance, key features and business recommendations

XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/opt/homebrew/lib/python3.9/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <89AD948E-E564-3266-867D-7AF89D6488F0> /opt/homebrew/lib/python3.9/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/lib/libomp.dylib' (no such file), '/opt/homebrew/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS@rpath/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/lib/libomp.dylib' (no such file), '/opt/homebrew/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/lib/libomp.dylib' (no such file)"]
