# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report

# Load the raw sheet from the Excel workbook.
file_path = '/mnt/data/数据字段说明.xlsx'
sheet_name = 'PSY_HKUST_CAUSAL_DATA_202409'
data = pd.read_excel(file_path, sheet_name=sheet_name)

# Preprocessing: keep only fully populated rows, then derive 0/1 indicators.
# NOTE(review): dropna() without `subset` discards a row when ANY column is
# missing, including columns that are never read below.
data = data.dropna()
# Outcome flag: 1 where the conversion column equals '是', else 0.
data['is_converted'] = data['办理字段'].eq('是').astype(int)
# Treatment flag: 1 where the marketing column equals '有营销干预', else 0.
data['is_marketing'] = data['营销字段'].eq('有营销干预').astype(int)

# Feature selection. The original note marks these column names as
# placeholders to be replaced with the real fields.
features = ['用户属性1', '用户属性2', '交互频次', '消费金额']
X = data[features]
y = data['is_converted']
treatment = data['is_marketing']

# 80/20 train/test split with a fixed seed. Features, outcome and treatment
# are passed together so the three stay row-aligned in both splits.
X_train, X_test, y_train, y_test, treat_train, treat_test = train_test_split(
    X,
    y,
    treatment,
    test_size=0.2,
    random_state=42,
)

# Propensity-score model: logistic regression for P(treatment = 1 | X),
# fitted on the training split and scored on the held-out split.
ps_model = LogisticRegression()
ps_model.fit(X_train, treat_train)
ps_score = ps_model.predict_proba(X_test)[:, 1]

# Propensity scores split by arm. These are scores, not outcomes: the gap
# between their means describes covariate imbalance, not a causal effect.
# The names are kept so any later code that reads them still works.
treated = ps_score[treat_test == 1]
control = ps_score[treat_test == 0]

# Effect estimate on the observed outcome using normalized (Hajek)
# inverse-propensity weighting:
#   ATE = sum(T*Y/e) / sum(T/e)  -  sum((1-T)*Y/(1-e)) / sum((1-T)/(1-e))
# Scores are clipped away from 0 and 1 so that no single row receives an
# unbounded weight.
treat_flags = np.asarray(treat_test, dtype=float)
outcomes = np.asarray(y_test, dtype=float)
ps_trimmed = np.clip(ps_score, 0.01, 0.99)
w_treated = treat_flags / ps_trimmed
w_control = (1.0 - treat_flags) / (1.0 - ps_trimmed)

if w_treated.sum() > 0 and w_control.sum() > 0:
    ate = (
        np.sum(w_treated * outcomes) / w_treated.sum()
        - np.sum(w_control * outcomes) / w_control.sum()
    )
else:
    # One arm is absent from the test split, so the contrast is undefined.
    ate = float('nan')
print("平均因果效应 (ATE):", ate)

# Outcome classifier: gradient-boosted trees fitted on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
clf = GradientBoostingClassifier().fit(X_train, y_train)

# Prediction and evaluation on the held-out split.
# y_pred holds probabilities (column index 1 of predict_proba), not labels.
test_proba = clf.predict_proba(X_test)
y_pred = test_proba[:, 1]
auc = roc_auc_score(y_true=y_test, y_score=y_pred)
print("AUC:", auc)

# The report needs hard labels, so probabilities are thresholded at 0.5.
hard_labels = y_pred > 0.5
print(classification_report(y_test, hard_labels))

# Lift visualization (Lift Curve)
def _lift_by_score_bin(y_true, y_scores, n_bins=10):
    """Return the lift of each score-quantile bin, indexed by bin code.

    Lift is the mean of ``y_true`` inside a bin divided by the overall mean
    of ``y_true``. Bin code 0 holds the lowest scores.
    """
    # Convert to plain arrays so the two inputs are paired by position; two
    # Series with different indexes would otherwise be aligned by label.
    frame = pd.DataFrame({
        'true': np.asarray(y_true),
        'scores': np.asarray(y_scores),
    })
    # duplicates='drop' merges identical quantile edges instead of raising,
    # which happens when many rows share the same score.
    frame['bin'] = pd.qcut(
        frame['scores'], n_bins, labels=False, duplicates='drop'
    )
    return frame.groupby('bin')['true'].mean() / frame['true'].mean()


def plot_lift_curve(y_true, y_scores, title="Lift Curve"):
    """Plot lift per score decile, from lowest-scored to highest-scored bin.

    Args:
        y_true: Observed binary outcomes, one per row.
        y_scores: Predicted scores, in the same row order as ``y_true``.
        title: Title shown above the plot.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    lift = _lift_by_score_bin(y_true, y_scores)

    plt.figure(figsize=(10, 6))
    # The x positions come from the bins that actually exist, so the plot
    # still works when tied scores leave fewer than 10 bins.
    plt.plot(lift.index + 1, lift.to_numpy(), marker='o')
    plt.title(title)
    plt.xlabel("Decile (Predicted Probability)")
    plt.ylabel("Lift")
    plt.grid()
    plt.show()

plot_lift_curve(y_test, y_pred)

# Propensity-score distribution per arm. Overlaying the two densities shows
# how much the treated and control groups overlap in estimated propensity.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
sns.kdeplot(ps_score[treat_test == 1], label="干预组", fill=True)
sns.kdeplot(ps_score[treat_test == 0], label="对照组", fill=True)
plt.title("倾向分数分布")
plt.xlabel("倾向分数")
plt.ylabel("密度")
plt.legend()
plt.show()

# Uplift in predicted conversion probability: the mean classifier score in
# each arm of the test split, and the gap between the two means.
# NOTE(review): y_pred is a model score, not an observed outcome. If the
# treatment flag is not among the model's features, this gap may reflect
# covariate differences between arms rather than an intervention effect.
# Confirm the intended interpretation.
control_mean = y_pred[treat_test == 0].mean()
treated_mean = y_pred[treat_test == 1].mean()
treatment_effect = treated_mean - control_mean

arm_labels = ["对照组", "干预组"]
plt.bar(arm_labels, [control_mean, treated_mean])
plt.title(f"干预对办理概率的提升度 (提升度: {treatment_effect:.2%})")
plt.ylabel("办理概率")
plt.show()