In [1]:
import os
import sys

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt


# Theme. Applied first so the explicit rcParams overrides below are set after
# the style sheet and cannot be overwritten by it.
plt.style.use('ggplot')

# Chinese text rendering: matplotlib walks this list and uses the first family
# that is installed. 'Hiragino Sans GB' stays first so rendering on macOS is
# unchanged; the remaining entries are fallbacks for other platforms.
plt.rcParams['font.sans-serif'] = [
    'Hiragino Sans GB',     # macOS
    'PingFang SC',          # macOS
    'Microsoft YaHei',      # Windows
    'SimHei',               # Windows
    'Noto Sans CJK SC',     # Linux
    'WenQuanYi Micro Hei',  # Linux
    'Arial Unicode MS',
    'DejaVu Sans',          # matplotlib's bundled font (Latin-only last resort)
]
# Draw minus signs with the ASCII hyphen instead of U+2212, which many CJK
# fonts do not contain.
plt.rcParams['axes.unicode_minus'] = False

In [2]:
# Seed the global NumPy random state so that "Restart Kernel -> Run All"
# reproduces the same synthetic samples (and the same downstream draws made
# through np.random).
SEED = 42
np.random.seed(SEED)

# Synthetic metric values: treatment ~ N(100, 20) with 2001 observations,
# control ~ N(90, 30) with 1998 observations. Negative draws are floored at 0.
treat = np.clip(np.random.normal(100, 20, 2001), 0, None)
control = np.clip(np.random.normal(90, 30, 1998), 0, None)

In [3]:
def delta_method(treat, control, z=1.96):
    """Delta-method confidence interval for the relative lift of treat over control.

    The point estimate is ``mean(treat) / mean(control) - 1``. Its variance is
    approximated by the first-order delta method for a ratio of two
    independent sample means::

        var_t / n_t / mean_c**2  +  var_c / n_c * mean_t**2 / mean_c**4

    where ``var_*`` are the unbiased (``ddof=1``) sample variances.

    Parameters
    ----------
    treat, control : array-like
        One-dimensional samples of the metric for each group.
    z : float, default 1.96
        Normal critical value used for the interval half-width
        (1.96 corresponds to a two-sided interval of roughly 95%).

    Returns
    -------
    dict
        Keys: ``n_treat``, ``n_control``, ``treat_mean``, ``control_mean``,
        ``increase_rate``, ``treat_var``, ``control_var``,
        ``increase_rate_var``, ``ci_low``, ``ci_high``.

    Raises
    ------
    ValueError
        If either group has fewer than two observations (the sample variance
        is undefined) or the control mean is zero (the ratio is undefined).
    """
    treat = np.asarray(treat)
    control = np.asarray(control)

    n_treat = len(treat)
    n_control = len(control)
    if n_treat < 2 or n_control < 2:
        raise ValueError("treat and control must each contain at least 2 observations")

    mean_treat = np.mean(treat)
    mean_control = np.mean(control)
    if mean_control == 0:
        raise ValueError("control mean is zero; the relative lift is undefined")

    var_treat = np.var(treat, ddof=1)
    var_control = np.var(control, ddof=1)

    increase_rate = mean_treat / mean_control - 1
    increase_rate_var = (
        (var_treat / n_treat) / (mean_control ** 2)
        + (var_control / n_control) * (mean_treat ** 2) / (mean_control ** 4)
    )
    # Compute the half-width once; it is shared by both interval bounds.
    half_width = z * np.sqrt(increase_rate_var)

    return {
        'n_treat': n_treat,
        'n_control': n_control,
        'treat_mean': mean_treat,
        'control_mean': mean_control,
        'increase_rate': increase_rate,
        'treat_var': var_treat,
        'control_var': var_control,
        'increase_rate_var': increase_rate_var,
        'ci_low': increase_rate - half_width,
        'ci_high': increase_rate + half_width
    }

In [10]:
def bootstrap(treat, control, n_bootstrap=1000, alpha=0.05, seed=None):
    """Percentile-bootstrap confidence interval for the relative lift of treat over control.

    Each replicate resamples both groups independently, with replacement and
    at their original sizes, and records
    ``(mean(treat*) - mean(control*)) / mean(control*)``.

    Parameters
    ----------
    treat, control : array-like
        One-dimensional samples of the metric for each group.
    n_bootstrap : int, default 1000
        Number of bootstrap replicates.
    alpha : float, default 0.05
        The interval spans the ``alpha/2`` and ``1 - alpha/2`` quantiles of
        the replicates (the 2.5th and 97.5th percentiles by default).
    seed : int or None, default None
        When given, resampling uses a private ``np.random.RandomState(seed)``,
        so the result is reproducible and the global NumPy random state is
        left untouched. When ``None``, draws come from the global
        ``np.random`` state.

    Returns
    -------
    dict
        ``n_treat``, ``n_control``: group sizes.
        ``treat_mean``, ``control_mean``: means of the original samples.
        ``increase_rate``: mean of the bootstrap replicates.
        ``increase_rate_var``: ``ddof=1`` variance of the replicates.
        ``ci_low``, ``ci_high``: percentile interval bounds.

    Raises
    ------
    ValueError
        If either sample is empty, ``n_bootstrap`` is smaller than 1, or
        ``alpha`` is not strictly between 0 and 1.
    """
    treat = np.asarray(treat)
    control = np.asarray(control)
    n_treat = len(treat)
    n_control = len(control)

    if n_treat == 0 or n_control == 0:
        raise ValueError("treat and control must both be non-empty")
    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be a positive integer")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")

    # The np.random module and RandomState expose the same `choice` method, so
    # either can serve as the source of draws.
    rng = np.random if seed is None else np.random.RandomState(seed)

    boot_treat_means = np.empty(n_bootstrap)
    boot_control_means = np.empty(n_bootstrap)
    # Treat and control draws are interleaved per replicate; changing this
    # order would change the numbers produced for a given random state.
    for i in range(n_bootstrap):
        boot_treat_means[i] = np.mean(rng.choice(treat, size=n_treat, replace=True))
        boot_control_means[i] = np.mean(rng.choice(control, size=n_control, replace=True))

    # Relative lift of each replicate.
    boot_diffs = (boot_treat_means - boot_control_means) / boot_control_means

    lower_pct = 100 * alpha / 2
    upper_pct = 100 - lower_pct

    return {
        'n_treat': n_treat,
        'n_control': n_control,
        'treat_mean': np.mean(treat),
        'control_mean': np.mean(control),
        'increase_rate': np.mean(boot_diffs),
        'increase_rate_var': np.var(boot_diffs, ddof=1),
        'ci_low': np.percentile(boot_diffs, lower_pct),
        'ci_high': np.percentile(boot_diffs, upper_pct)
    }

In [8]:
# Delta-method point estimate and confidence interval for the relative lift
# of `treat` over `control`.
delta_method(treat, control)

{'n_treat': 2001,
 'n_control': 1998,
 'treat_mean': np.float64(100.31980862401721),
 'control_mean': np.float64(90.00003803521572),
 'increase_rate': np.float64(0.11466406919476535),
 'treat_var': np.float64(427.6122874590181),
 'control_var': np.float64(877.87668532801),
 'increase_rate_var': np.float64(9.377961961768836e-05),
 'ci_low': np.float64(0.09568345311616264),
 'ci_high': np.float64(0.13364468527336806)}

In [9]:
# Bootstrap estimate of the same relative lift, for comparison with the
# delta-method result.
# NOTE(review): the output recorded below contains an 'increase_rate2' key that
# the bootstrap() definition in this notebook does not return, so it appears to
# come from an earlier revision of the function -- re-run this cell to refresh it.
bootstrap(treat, control)

{'n_treat': 2001,
 'n_control': 1998,
 'treat_mean': np.float64(100.31980862401721),
 'control_mean': np.float64(90.00003803521572),
 'increase_rate': np.float64(0.11466406919476535),
 'increase_rate2': np.float64(0.11466702034662611),
 'increase_rate_var': np.float64(9.666187871803547e-05),
 'ci_low': np.float64(0.09544539567638048),
 'ci_high': np.float64(0.13402685972193015)}