In [1]:
%cd /Users/yantong/Desktop/Tencent/Q2

/Users/yantong/Desktop/Tencent/Q2


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.stats.proportion as sp
import matplotlib.pyplot as plt
import scipy.stats


In [3]:
simu_data=pd.read_csv("Q2.csv")
simu_data = simu_data.drop(["Unnamed: 0"],axis=1)
#simu_data.head()

## I. 缺失值处理

In [4]:
simu_data.info()  # we guess there might be replicated null values in id. And fortunately, we found them. There are 2 teams with NA id
# simu_data[simu_data.id.isnull() == True]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29972 entries, 0 to 29971
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         29942 non-null  float64
 1   year       29972 non-null  int64  
 2   Dept       29972 non-null  object 
 3   treatment  29972 non-null  int64  
 4   profit     29972 non-null  float64
 5   size       29972 non-null  float64
 6   wageindex  29972 non-null  float64
dtypes: float64(4), int64(2), object(1)
memory usage: 1.6+ MB


In [5]:
simu_data.describe()
# team id in (110003, 950445), we use 110001 and 110002 to distinguish them

Unnamed: 0,id,year,treatment,profit,size,wageindex
count,29942.0,29972.0,29972.0,29972.0,29972.0,29972.0
mean,519037.800147,2009.955192,0.014547,9828.606516,229.986832,0.977189
std,241691.891157,4.303138,0.119732,8741.118342,161.716154,0.334164
min,110003.0,2003.0,0.0,1002.0,51.0,-0.415817
25%,360360.0,2006.0,0.0,3735.0,121.0,0.74668
50%,450110.0,2010.0,0.0,6964.0,180.0,0.969626
75%,730640.0,2014.0,0.0,13179.25,290.0,1.199992
max,950445.0,2017.0,1.0,124765.0,6368.0,2.695041


In [6]:
simu_data.loc[14610:14624,"id"] = 110001
simu_data.loc[17759:17773,"id"] =110002

## II. Feature Engineering 

### 1. Meature profit per person. (avg_profit)

In [7]:
simu_data["avg_profit"] = simu_data["profit"]/simu_data["size"]

### 2. Categorize the treatment group and control group.  (group_type)
是否在当年实行改革动作，“1”为实施了改革动作，为实验组;“0” 为未实施改革动作，为控制组。当年 treatment 变量为 1，后续为 0，仍是实验组。

In [8]:
treatment_id = simu_data[simu_data.loc[:,'treatment'] == 1].id

In [9]:
simu_data['group_type'] = simu_data.apply(lambda x: 1 if x.id in treatment_id.tolist() else 0, axis=1)

In [10]:
start_year = simu_data.groupby('id').apply(lambda x: x[x.year==x.year.min()])[["id","year"]]
end_year = simu_data.groupby('id').apply(lambda x: x[x.year==x.year.max()])[["id","year"]]
treatment_year = simu_data.groupby('id').apply(lambda x: x[x.treatment==1])[["id","year"]]

In [11]:
start_year.index = range(len(start_year))
start_year.rename(columns={'year':'start_year'}, inplace = True)
end_year.index = range(len(end_year))
end_year.rename(columns={'year':'end_year'}, inplace = True)
treatment_year.index = range(len(treatment_year))
treatment_year.rename(columns={'year':'treatment_year'}, inplace = True)

In [12]:
simu_data=pd.merge(simu_data,start_year,how='left',on='id')
simu_data=pd.merge(simu_data,end_year,how='left',on='id')
simu_data=pd.merge(simu_data,treatment_year,how='left',on='id')

### 3. Categorize the before treatment group and after treatment group. (adj_treatment)

目的：为所有实验组(改革团队)找到与其最为相似的控制组(未改革团队)。在为所有实验组(改革团队)匹配与其最为相似的控制组(未改革团队)是，只使用实验组在实施改革策略前的数据，防止带来改革这一行为的影响

In [13]:
simu_data['adj_treatment'] = simu_data.apply(lambda x: 1 if x.year >= x.treatment_year else 0, axis=1)

In [14]:
simu_data = simu_data.drop(["treatment_year"],axis=1)

In [15]:
control = simu_data[simu_data.group_type == 0]
treatment_before = simu_data[(simu_data.adj_treatment == 0)&(simu_data.group_type == 1)]

## III. 对实验组匹配最相似的对应组

#### 基本思路：对离散数据进行独热编码处理，在全部变量都归一化之后计算实验组对每一个对照组的欧式距离。距离值最小的作为该实验组对应的对照组。

1. **长表转宽表**

对每一个团队来说，每一年都有指标数据(profit,size,wageindex,avg_profit)。原始表格是以团队id和年份来识别指标数据的。这里对数据进行重塑，将年份代入指标维度中。衡量每个团队的指标变成profit_2003,...,profit_profit_2017,size_2003,...,size_2017...之所以用2003作为起始值2017作为时间截止值是因为这是所有团队指标数据最早记录于2003年最晚记录于2017年。

In [16]:
nearest = pd.concat([control,treatment_before],axis = 0)

In [17]:
df1=pd.pivot_table(nearest,index=["id","Dept","start_year","end_year","group_type"],
                  columns=[ "year"], values=["profit","size","wageindex","avg_profit"])

In [18]:
wide = df1.copy(deep=True)
col = []
for i in wide.columns:
    i = list(i)
    i[1] = str(i[1])
    col.append("_".join(i))

In [19]:
wide.columns = col

In [20]:
wide = wide.reset_index()

2. **缺失值处理**
    1. **缺失值分布**：缺失值主要分布在部分团队早期(eg.2013)的profit size wageindex avg_profit中，及实验组团队在后期的profit size wageindex avg_profit中，还有部分已经关闭的团队（或者没有数据）在后期的指标中    
    2. **缺失原因** 
        1. 部分团队早期数据缺失：这些团队创立时间较晚，没有早期数据。
        2. 实验组团队后期数据缺失：为保证相似分析不受实验影响，实验过后的数据都不予考虑
        3. 已关闭的团队在后期数据缺失：这些团队在后期公司可能选择暂时关闭，所以不存在这些运营数据
    3.  **缺失值处理**：由于需要通过余弦计算相似度，需要保证没有缺失值。为了以尽可能大的程度捕获信息同时不引入过多预测带来的偏差，这里不做缺失值填充，而是保留各个团队的创建年份,关闭年份，各类指标的初始值，并计算指标平均增长率。
        1. 创建年份 start_year （前面代码已整理出，直接使用）
        2. 关闭年份 end_year （前面代码已整理出，直接使用）         
        3. 指标初始值 profit_start size_start wageindex_start avg_profit_start
        4. 指标平均年增长率 profit_gr size_gr wageindex_gr avg_profit_gr

指标初始值 profit_start size_start wageindex_start avg_profit_start

In [21]:
profit_start = []
s = 'profit_'
for i in range(wide.shape[0]):
    p_s = wide.loc[i,f'{s}{wide.loc[1,"start_year"]}']
    profit_start.append(p_s)
wide['profit_start'] = profit_start

In [22]:
size_start = []
s = 'size_'
for i in range(wide.shape[0]):
    p_s = wide.loc[i,f'{s}{wide.loc[1,"start_year"]}']
    size_start.append(p_s)
wide['size_start'] = size_start

In [23]:
wageindex_start = []
s = 'wageindex_'
for i in range(wide.shape[0]):
    p_s = wide.loc[i,f'{s}{wide.loc[1,"start_year"]}']
    wageindex_start.append(p_s)
wide['wageindex_start'] = wageindex_start

In [24]:
avg_profit_start = []
s = 'avg_profit_'
for i in range(wide.shape[0]):
    p_s = wide.loc[i,f'{s}{wide.loc[1,"start_year"]}']
    avg_profit_start.append(p_s)
wide['avg_profit_start'] = avg_profit_start

指标平均年增长率 profit_gr size_gr wageindex_gr avg_profit_gr

In [25]:
profit_index_name = ["profit_2003","profit_2004","profit_2005","profit_2006","profit_2007",
                     "profit_2008","profit_2009","profit_2010","profit_2011","profit_2012",
                     "profit_2013","profit_2014","profit_2015","profit_2016","profit_2017"]
size_index_name = ["size_2003","size_2004","size_2005","size_2006","size_2007",
                     "size_2008","size_2009","size_2010","size_2011","size_2012",
                     "size_2013","size_2014","size_2015","size_2016","size_2017"]
wageindex_index_name = ["wageindex_2003","wageindex_2004","wageindex_2005","wageindex_2006","wageindex_2007",
                     "wageindex_2008","wageindex_2009","wageindex_2010","wageindex_2011","wageindex_2012",
                     "wageindex_2013","wageindex_2014","wageindex_2015","wageindex_2016","wageindex_2017"]
avg_profit_index_name = ["avg_profit_2003","avg_profit_2004","avg_profit_2005","avg_profit_2006","avg_profit_2007",
                     "avg_profit_2008","avg_profit_2009","avg_profit_2010","avg_profit_2011","avg_profit_2012",
                     "avg_profit_2013","avg_profit_2014","avg_profit_2015","avg_profit_2016","avg_profit_2017"]

In [26]:
growth_rate = []
for i in range(wide.shape[0]):
    a = wide.loc[i,profit_index_name[1:15]]
    b = wide.loc[i,profit_index_name[0:14]]
    c = [(a[j] - b[j])/b[j] for j in range(len(a))]
    c = np.nanmean(c)
    growth_rate.append(c)
wide['profit_gr'] = growth_rate

In [27]:

growth_rate = []
for i in range(wide.shape[0]):
    a = wide.loc[i,size_index_name[1:15]]
    b = wide.loc[i,size_index_name[0:14]]
    c = [(a[j] - b[j])/b[j] for j in range(len(a))]
    c = np.nanmean(c)
    growth_rate.append(c)
wide['size_gr'] = growth_rate


In [28]:

growth_rate = []
for i in range(wide.shape[0]):
    a = wide.loc[i,wageindex_index_name[1:15]]
    b = wide.loc[i,wageindex_index_name[0:14]]
    c = [(a[j] - b[j])/b[j] for j in range(len(a))]
    c = np.nanmean(c)
    growth_rate.append(c)
wide['wageindex_gr'] = growth_rate


In [29]:
growth_rate = []
for i in range(wide.shape[0]):
    a = wide.iloc[i,6:20]
    b = wide.iloc[i,5:19]
    c = [(a[j] - b[j])/b[j] for j in range(len(a))]
    c = np.nanmean(c)
    growth_rate.append(c)
wide['avg_profit_gr'] = growth_rate


In [30]:
wide_keep = wide
#wide = wide_keep

In [31]:
wide = wide.drop(size_index_name,axis=1)
wide = wide.drop(profit_index_name,axis=1)
wide = wide.drop(wageindex_index_name,axis=1)
wide = wide.drop(avg_profit_index_name,axis=1)


In [32]:
dummies = pd.get_dummies( wide.loc[:, 'Dept'] ) 
wide = pd.concat( [wide, dummies], axis = 1 )
wide = wide.drop(['Dept'],axis = 1)
    

In [33]:
#wide_keep = wide
#wide = wide_keep


3. **数据对一化处理**

In [34]:
cols = wide.columns.values.tolist()[1:]
for i in cols:
    wide[i] = (wide[i] - np.nanmin(wide[i]))/(np.nanmax(wide[i])-np.nanmin(wide[i]))

In [35]:
treatment_norm = wide[wide.group_type == 1]
control_norm = wide[wide.group_type == 0]
treatment_norm_keep = treatment_norm
control_norm_keep = control_norm
treatment_norm = treatment_norm.drop('group_type',axis=1)
control_norm = control_norm.drop('group_type',axis=1)

In [36]:
treatment_norm = treatment_norm.drop('id',axis=1)
control_norm = control_norm.drop('id',axis=1)

4. **对每个实验组团队匹配最相似的对照组**

In [37]:
distance = pd.DataFrame(columns=range(len(control_norm)),index = range(len(treatment_norm)))
for i in range(len(treatment_norm)):
    for j in range(len(control_norm)):
        distance.iloc[i,j] = np.sqrt(np.sum(np.square(treatment_norm.iloc[i,]-control_norm.iloc[j,])))

In [38]:
similar_control = []
for i in range(len(treatment_norm)):
    dist_i = distance.iloc[i,].tolist()
    max_index = dist_i.index(max(dist_i))
    similar_control.append(max_index)

In [39]:
control_norm_2 = wide_keep[wide_keep.group_type == 0]

In [40]:
control_sim_wide_1 = control_norm_2.loc[similar_control,]

In [41]:
control_sim_wide_1 = control_sim_wide_1.drop(size_index_name,axis=1)
control_sim_wide_1 = control_sim_wide_1.drop(profit_index_name,axis=1)
control_sim_wide_1 = control_sim_wide_1.drop(wageindex_index_name,axis=1)
control_sim_wide_1 = control_sim_wide_1.drop(avg_profit_index_name,axis=1)

## IV. 分析改革效果

**基本思路**
1. 从团队总体带来的利益profit，团队人数size，人均带来的利益avg_profit，和薪酬指标wage_index来分析改革是否带来好的效果。
2. 即使使用最相似分析对每个实验组找到与之最相近的对照组，我们也无法保证实验组和对照组除了自变量（是否改革）外不受其他因素的干扰。这里我采取了保守型策略：从初期指标值与指标平均增长率两个角度上衡量四大指标：
    1. 在分析改革效果中，我们只分析实验组在改革后的数据，初始年份为改革年份，所以基本上都会比对照组晚几年。因此实验组初期各指标值（初期总利益，初期总人数，初期人均利益，初期薪酬指标）皆会高于对照组的。这里用假设检验验证这一想法。结果表明确实如此。
    2. 根据以上假设，我们认为实验组有较高的初期指标。若实验组同时还保持较高的增长率，比如实验组在较高总收益的基准下还能保持较好的利益增长，那么表明这一改革能有效带来团队整体效率的提升。

### 1. 获取改革后的实验组宽数据 treatment_after_change_wide

In [42]:
# treatment_after = simu_data.treatment_year

In [43]:
treatment_after = simu_data[simu_data.adj_treatment == 1]

In [44]:
treatment_after_change = pd.merge(treatment_after,treatment_year,on="id",how = "right")

In [45]:
col = treatment_after_change.columns.tolist()[:12]
col[9] = "start_year_1"
col.append("start_year")
treatment_after_change.columns = col

In [46]:
# treatment_after_change

In [47]:
df1 = pd.pivot_table(treatment_after_change,index=["id","Dept","start_year","end_year","group_type"],
                     columns=[ "year"], values=["profit","size","wageindex","avg_profit"])
wide = df1.copy(deep=True)
col = []
for i in wide.columns:
    i = list(i)
    i[1] = str(i[1])
    col.append("_".join(i))
wide.columns = col
wide = wide.reset_index()

profit_start = []
s = 'profit_'
for i in range(wide.shape[0]):
    p_s = wide.loc[i,f'{s}{wide.loc[i,"start_year"]}']
    profit_start.append(p_s)
wide['profit_start'] = profit_start
size_start = []
s = 'size_'
for i in range(wide.shape[0]):
    p_s = wide.loc[i,f'{s}{wide.loc[i,"start_year"]}']
    size_start.append(p_s)
wide['size_start'] = size_start
wageindex_start = []
s = 'wageindex_'
for i in range(wide.shape[0]):
    p_s = wide.loc[i,f'{s}{wide.loc[i,"start_year"]}']
    wageindex_start.append(p_s)
wide['wageindex_start'] = wageindex_start
avg_profit_start = []
s = 'avg_profit_'
for i in range(wide.shape[0]):
    p_s = wide.loc[i,f'{s}{wide.loc[i,"start_year"]}']
    avg_profit_start.append(p_s)
wide['avg_profit_start'] = avg_profit_start

In [48]:

profit_index_name = ["profit_2005","profit_2006","profit_2007",
                     "profit_2008","profit_2009","profit_2010","profit_2011","profit_2012",
                     "profit_2013","profit_2014","profit_2015","profit_2016","profit_2017"]
size_index_name = ["size_2005","size_2006","size_2007",
                     "size_2008","size_2009","size_2010","size_2011","size_2012",
                     "size_2013","size_2014","size_2015","size_2016","size_2017"]
wageindex_index_name = ["wageindex_2005","wageindex_2006","wageindex_2007",
                     "wageindex_2008","wageindex_2009","wageindex_2010","wageindex_2011","wageindex_2012",
                     "wageindex_2013","wageindex_2014","wageindex_2015","wageindex_2016","wageindex_2017"]
avg_profit_index_name = ["avg_profit_2005","avg_profit_2006","avg_profit_2007",
                     "avg_profit_2008","avg_profit_2009","avg_profit_2010","avg_profit_2011","avg_profit_2012",
                     "avg_profit_2013","avg_profit_2014","avg_profit_2015","avg_profit_2016","avg_profit_2017"]
growth_rate = []
for i in range(wide.shape[0]):
    a = wide.loc[i,profit_index_name[1:15]]
    b = wide.loc[i,profit_index_name[0:14]]
    c = [(a[j] - b[j])/b[j] for j in range(len(a))]
    c = np.nanmean(c)
    growth_rate.append(c)
wide['profit_gr'] = growth_rate

growth_rate = []
for i in range(wide.shape[0]):
    a = wide.loc[i,profit_index_name[1:15]]
    b = wide.loc[i,profit_index_name[0:14]]
    c = [(a[j] - b[j])/b[j] for j in range(len(a))]
    c = np.nanmean(c)
    growth_rate.append(c)
wide['profit_gr'] = growth_rate

growth_rate = []
for i in range(wide.shape[0]):
    a = wide.loc[i,size_index_name[1:15]]
    b = wide.loc[i,size_index_name[0:14]]
    c = [(a[j] - b[j])/b[j] for j in range(len(a))]
    c = np.nanmean(c)
    growth_rate.append(c)
wide['size_gr'] = growth_rate

growth_rate = []
for i in range(wide.shape[0]):
    a = wide.loc[i,wageindex_index_name[1:15]]
    b = wide.loc[i,wageindex_index_name[0:14]]
    c = [(a[j] - b[j])/b[j] for j in range(len(a))]
    c = np.nanmean(c)
    growth_rate.append(c)
wide['wageindex_gr'] = growth_rate

growth_rate = []
for i in range(wide.shape[0]):
    a = wide.iloc[i,6:20]
    b = wide.iloc[i,5:19]
    c = [(a[j] - b[j])/b[j] for j in range(len(a))]
    c = np.nanmean(c)
    growth_rate.append(c)
wide['avg_profit_gr'] = growth_rate

In [49]:
keep_treatment_after_change_wide = wide
treatment_after_change_wide = wide

In [50]:
treatment_after_change_wide = treatment_after_change_wide.drop(size_index_name,axis=1)
treatment_after_change_wide = treatment_after_change_wide.drop(profit_index_name,axis=1)
treatment_after_change_wide = treatment_after_change_wide.drop(wageindex_index_name,axis=1)
treatment_after_change_wide = treatment_after_change_wide.drop(avg_profit_index_name,axis=1)

### 2. 获取对应对照组宽数据 control_sim_wide_1
已在前面整理为control_sim_wide_1

In [51]:
#control_sim_wide_1
#treatment_after_change_wide

### 3. 整理指标数据

In [52]:
profit_start_t = treatment_after_change_wide.profit_start
profit_start_c = control_sim_wide_1.profit_start
size_start_t = treatment_after_change_wide.size_start
size_start_c = control_sim_wide_1.size_start
wageindex_start_t = treatment_after_change_wide.wageindex_start
wageindex_start_c = control_sim_wide_1.wageindex_start
avg_profit_start_t = treatment_after_change_wide.avg_profit_start
avg_profit_start_c = control_sim_wide_1.avg_profit_start
profit_gr_t = treatment_after_change_wide.profit_gr
profit_gr_c = control_sim_wide_1.profit_gr
size_gr_t = treatment_after_change_wide.size_gr
size_gr_c = control_sim_wide_1.size_gr
wageindex_gr_t = treatment_after_change_wide.wageindex_gr
wageindex_gr_c = control_sim_wide_1.wageindex_gr
avg_profit_gr_t = treatment_after_change_wide.avg_profit_gr
avg_profit_gr_c = control_sim_wide_1.avg_profit_gr

In [53]:
profit_start_t_mean=profit_start_t.mean()
profit_start_c_mean=profit_start_c.mean()
size_start_t_mean=size_start_t.mean()
size_start_c_mean=size_start_c.mean()
wageindex_start_t_mean=wageindex_start_t.mean()
wageindex_start_c_mean=wageindex_start_c.mean()
avg_profit_start_t_mean=avg_profit_start_t.mean()
avg_profit_start_c_mean=avg_profit_start_c.mean()
profit_gr_t_mean=profit_gr_t.mean()
profit_gr_c_mean=profit_gr_c.mean()
size_gr_t_mean=size_gr_t.mean()
size_gr_c_mean=size_gr_c.mean()
wageindex_gr_t_mean=wageindex_gr_t.mean()
wageindex_gr_c_mean=wageindex_gr_c.mean()
avg_profit_gr_t_mean = avg_profit_gr_t.mean()
avg_profit_gr_c_mean = avg_profit_gr_c.mean()

### 4. 假设检验

In [54]:
def CI(t_mean,c_mean,t_std,c_std):
    t_ci=1.960
    t_n = 436
    c_n = 436
    #计算标准误差
    se=np.sqrt( np.square(t_std)/t_n + np.square(c_std)/c_n )
    sample_mean=t_mean - c_mean
    #置信区间上限
    a=round((sample_mean - t_ci * se),3)
    #置信区间下限
    b=round((sample_mean + t_ci * se),3)
    return (a,b)


#### 1. 验证实验组初期指标皆高于对照组

    1. profit_start 团队初期总盈利 (实验组团队在改革后第一年总盈利或对照组团队于2003年/创建第一年总盈利)
    
       记实验组的profit_start为 p_11，对照组的profit_start为 p_12。
       
       对实验组而言，profit_start衡量的是在改革之后第一年的盈利，对对照组而言，由于没有改革，profit_start衡量的是团队第一年的盈利。我们希望实验组在改革之后有盈利上的提升。所以我们目标是拒绝两个团队盈利相当的假设。
      
    零假设    H_0： p_11 = p_12
    备择假设   H_1 ： p_11 > p_12

In [55]:
t_mean=profit_start_t.mean()
c_mean=profit_start_c.mean()
#样本标准差
t_std=profit_start_t.std()
c_std=profit_start_c.std()
print(f'实验组的均值为{round(t_mean,3)},标准差为{round(t_std,3)}, 对照组的均值为{round(c_mean,3)},标准差为{round(c_std,3)}.')
print('两个平均值差值的置信区间，95置信水平 CI=[%f,%f]' % CI(t_mean,c_mean,t_std,c_std))

实验组的均值为7637.108,标准差为5529.708, 对照组的均值为5922.516,标准差为4586.234.
两个平均值差值的置信区间，95置信水平 CI=[1040.242000,2388.941000]


In [56]:
t, pval = scipy.stats.ttest_ind(profit_start_t,profit_start_c)
print(t,pval)

4.983470701107065 7.53553885285607e-07


    根据t检验拒绝原假设，认为实验组改革后第一年的利益是大于对照组的。

    2. size_start 团队初期总人数   (实验组团队在改革后第一年总人数或对照组团队于2003年/创建第一年总人数)
    
        记实验组的size_start为 p_21，对照组的size_start为 p_22。
        
        对实验组而言，size_start衡量的是在改革之后第一年团队的人数，对对照组而言，size_start衡量的是团队第一年的总人数。通常来说时间越晚，做的项目越多越庞大，团队人数应该会上升。所以我们目标是拒绝两个团队人数相当的假设。

    零假设 H_0： p_21 = p_22
    
    备择假设 H_1 ： p_21 > p_22

In [57]:
t_mean=size_start_t.mean()
c_mean=size_start_c.mean()
#样本标准差
t_std=size_start_t.std()
c_std=size_start_c.std()
print(f'实验组的均值为{round(t_mean,3)},标准差为{round(t_std,3)}, 对照组的均值为{round(c_mean,3)},标准差为{round(c_std,3)}.')
print('两个平均值差值的置信区间，95置信水平 CI=[%f,%f]' % CI(t_mean,c_mean,t_std,c_std))

实验组的均值为203.462,标准差为104.862, 对照组的均值为167.454,标准差为78.314.
两个平均值差值的置信区间，95置信水平 CI=[23.723000,48.293000]


In [58]:
t, pval = scipy.stats.ttest_ind(size_start_t,size_start_c)
print(t,pval)

5.744791977128681 1.2720861929519377e-08


    根据t检验拒绝原假设，认为实验组改革后第一年的团队人数多于对照组。

    3. wageindex_start 团队初期薪酬指标   (实验组团队在改革后第一年薪酬指标或对照组团队于2003年/创建第一年薪酬指标)
    
    记实验组的wageindex_start为 p_31，对照组的wageindex_start为 p_32。
    
    对实验组而言，wageindex_start衡量的是在改革之后第一年薪酬指标，对对照组而言，size_start衡量的是团队第一年的薪酬指标。通常来说时间越晚，薪酬会增多。所以我们目标是拒绝两个团队人数相当的假设。
    
    零假设 H_0： p_31 = p_32
    
    备择假设 H_1 ： p_31 > p_32

In [59]:
t_mean=wageindex_start_t.mean()
c_mean=wageindex_start_c.mean()
#样本标准差
t_std=wageindex_start_t.std()
c_std=wageindex_start_c.std()
print(f'实验组的均值为{round(t_mean,3)},标准差为{round(t_std,3)}, 对照组的均值为{round(c_mean,3)},标准差为{round(c_std,3)}.')
print('两个平均值差值的置信区间，95置信水平 CI=[%f,%f]' % CI(t_mean,c_mean,t_std,c_std))

实验组的均值为0.942,标准差为0.302, 对照组的均值为0.585,标准差为0.513.
两个平均值差值的置信区间，95置信水平 CI=[0.301000,0.413000]


In [60]:
t, pval = scipy.stats.ttest_ind(wageindex_start_t,wageindex_start_c)
print(t,pval)

12.537933054663652 2.870327801639024e-33


    根据t检验拒绝原假设，认为实验组改革后第一年的薪酬远多于对照组。


    4. avg_profit_start    团队初期人均盈利  (实验组团队在改革后第一年人均盈利或对照组团队于2003年/创建第一年人均盈利)
    
    记实验组的avg_profit_start为 p_41，对照组的avg_profit_start为 p_42。
    
    对实验组而言，avg_profit_start衡量的是在改革之后第一年人均盈利，对对照组而言，size_start衡量的是团队第一年的人均盈利。通常来说时间越晚，人均盈利理应上升，否则团队应该进行内部整改。所以我们目标是拒绝两个团队人数相当的假设。

    
    零假设 H_0： p_41 = p_42
    
    备择假设 H_1 ： p_41 > p_42

In [61]:
t_mean=avg_profit_start_t.mean()
c_mean=avg_profit_start_c.mean()
#样本标准差
t_std=avg_profit_start_t.std()
c_std=avg_profit_start_c.std()
print(f'实验组的均值为{round(t_mean,3)},标准差为{round(t_std,3)}, 对照组的均值为{round(c_mean,3)},标准差为{round(c_std,3)}.')
print('两个平均值差值的置信区间，95置信水平 CI=[%f,%f]' % CI(t_mean,c_mean,t_std,c_std))

实验组的均值为35.751,标准差为13.712, 对照组的均值为30.998,标准差为13.153.
两个平均值差值的置信区间，95置信水平 CI=[2.970000,6.537000]


In [62]:
t, pval = scipy.stats.ttest_ind(avg_profit_start_t,avg_profit_start_c)
print(t,pval)


5.223852561743809 2.1941855681069663e-07


    根据t检验拒绝原假设，认为实验组改革后第一年的人均盈利高于对照组。

#### 2. 验证实验组指标增长率皆高于对照组

    1. profit_gr  平均盈利增长率 （实验组团队在改革后平均盈利增长率或对照组团队历年以来平均盈利增长率）
    
    记实验组的profit_gr为 p_51，对照组的profit_gr为 p_52。
    
    根据第一步对期初盈利的分析，我们认为实验组的盈利已普遍高于对照组。若改革有效，团队整体盈利能力应该得到提升，总盈利指标应该逐年上升，而上升幅度要比对照组高。
    
    零假设 H_0： p_51 = p_52
    
    备择假设 H_1 ： p_51 > p_52

In [63]:
t_mean=profit_gr_t.mean()
c_mean=profit_gr_c.mean()
#样本标准差
t_std=profit_gr_t.std()
c_std=profit_gr_c.std()
print(f'实验组的均值为{round(t_mean,3)},标准差为{round(t_std,3)}, 对照组的均值为{round(c_mean,3)},标准差为{round(c_std,3)}.')
print('两个平均值差值的置信区间，95置信水平 CI=[%f,%f]' % CI(t_mean,c_mean,t_std,c_std))

实验组的均值为0.043,标准差为0.148, 对照组的均值为0.004,标准差为0.018.
两个平均值差值的置信区间，95置信水平 CI=[0.025000,0.053000]


In [64]:
t, pval = scipy.stats.ttest_ind(profit_gr_t,profit_gr_c)
print(t,pval)

5.500733182853414 4.974398455468453e-08


    根据t检验拒绝原假设，认为实验组改革后平均盈利增长率高于对照组。
    两组的平均总利益增长率皆大于0（实验组为4.3%，对照组为0.4%）。若该公司不对团队采取改革措施，总收益也处于上升趋势，但幅度不高，不如改革后的实验组团队。原因可能在于该改革策略带来员工效率提升。

    2. size_gr  团队历年来平均人数增长率 （实验组团队在改革后人数增长率或对照组团队历年以来平均人数增长率）
    
    记实验组的size_gr为 p_61，对照组的size_gr为 p_62。
    
    根据第一步对期初团队人数的分析，我们认为实验组的人数普遍高于对照组。团队人数上升代表该团队对团队项目有更高需求，需要更多的员工共同完成。若团队人数逐年增长，说明团队整体水平上升，负责项目需投入更多人力资源。所以我们目标是拒绝两个团队人数相当的假设来验证改革政策使得团队愈加庞大。
    
    零假设 H_0： p_61 = p_62
    
    备择假设 H_1 ： p_61 > p_62

In [65]:
t_mean=size_gr_t.mean()
c_mean=size_gr_c.mean()
#样本标准差
t_std=size_gr_t.std()
c_std=size_gr_c.std()
print(f'实验组的均值为{round(t_mean,3)},标准差为{round(t_std,3)}, 对照组的均值为{round(c_mean,3)},标准差为{round(c_std,3)}.')
print('两个平均值差值的置信区间，95置信水平 CI=[%f,%f]' % CI(t_mean,c_mean,t_std,c_std))

实验组的均值为0.002,标准差为0.037, 对照组的均值为-0.004,标准差为0.018.
两个平均值差值的置信区间，95置信水平 CI=[0.002000,0.010000]


In [66]:
t, pval = scipy.stats.ttest_ind(size_gr_t,size_gr_c)
print(t,pval)

2.88667781043815 0.0039897059342363555


    根据t检验拒绝原假设，认为实验组改革后需要更多人力投入。
    
    实验组的人数增长率大于0（0.2%），而对照组增长率为负值（-0.4%）。根据对照组表现，若该公司不对团队采取改革措施，公司人数会总体呈现下降趋势，但幅度不大。若采取改革，团队会接手更多项目，投入更多人力。


    3. wageindex_gr  团队历年来平均薪酬指标增长率 （实验组团队在改革以来平均薪酬指标增长率或对照组团队历年以来平均薪酬指标增长率）
    
    记实验组的wageindex_gr为 p_71，对照组的wageindex_gr为 p_72。
    
    根据第一步对期初薪酬的分析，我们认为实验组的薪酬普遍高于对照组。薪酬指标上升有两个可能原因，一是项目需要团队高薪聘请更多的专业人才，二是团队绩效突出带来的薪酬激励。无论是那种都表示团队水平表现好。所以我们目标是拒绝两个团队薪酬增长相当的假设来验证改革政策使得团队有更高薪酬指标（若t值为正数）。
   
    
    零假设 H_0： p_71 = p_72
    
    备择假设 H_1 ： p_71 > p_72

In [67]:
t_mean=wageindex_gr_t.mean()
c_mean=wageindex_gr_c.mean()
#样本标准差
t_std=wageindex_gr_t.std()
c_std=wageindex_gr_c.std()
print(f'实验组的均值为{round(t_mean,3)},标准差为{round(t_std,3)}, 对照组的均值为{round(c_mean,3)},标准差为{round(c_std,3)}.')
print('两个平均值差值的置信区间，95置信水平 CI=[%f,%f]' % CI(t_mean,c_mean,t_std,c_std))

实验组的均值为0.135,标准差为0.387, 对照组的均值为4.065,标准差为5.498.
两个平均值差值的置信区间，95置信水平 CI=[-4.447000,-3.412000]


In [68]:
t, pval = scipy.stats.ttest_ind(wageindex_gr_t,wageindex_gr_c)
print(t,pval)

-14.885225937544242 8.219265576419597e-45


    根据t检验拒绝原假设。但t值为负值，这就意味着对照组的薪酬指标增长要普遍高于实验组。实验组的薪酬增长率为13.5%，而对照组增长率有406.5%。但这一数据不能说明实验组绩效不突出，因为在上一组假设检验中验证了实验组起始薪酬要高于对照组，基数较高增长率低是难免的。两组均值皆为正数，说明该公司在持续走高，且给员工更多的薪酬。


    4. avg_profit_start   团队历年来平均人均带来盈利增长率 (实验组团队在改革以来人均带来盈利增长率或对照组团队历年以来平均人均带来盈利增长率)
    
    记实验组的avg_profit_start为 p_81，对照组的avg_profit_start为 p_82。
    
    根据第一步对期初人均带来的收益的分析，我们认为实验组普遍高于对照组的。类似profit_gr，若改革有效，团队整体盈利能力应该得到提升，人均盈利指标应该逐年上升，而且上升幅度要比对照组高。
   
    
    零假设 H_0： p_81 >= p_82
    
    备择假设 H_1 ： p_81 < p_82

In [69]:
t_mean=avg_profit_gr_t.mean()
c_mean=avg_profit_gr_c.mean()
#样本标准差
t_std=avg_profit_gr_t.std()
c_std=avg_profit_gr_c.std()
print(f'实验组的均值为{round(t_mean,3)},标准差为{round(t_std,3)}, 对照组的均值为{round(c_mean,3)},标准差为{round(c_std,3)}.')
print('两个平均值差值的置信区间，95置信水平 CI=[%f,%f]' % CI(t_mean,c_mean,t_std,c_std))

实验组的均值为1.84,标准差为6.107, 对照组的均值为0.01,标准差为0.029.
两个平均值差值的置信区间，95置信水平 CI=[1.256000,2.403000]


In [70]:
t, pval = scipy.stats.ttest_ind(avg_profit_gr_t,avg_profit_gr_c)
print(t,pval)

6.254893480125795 6.23134077596045e-10


    根据t检验拒绝原假设，认为实验组改革后人均带来的收益增高。实验组的人均收益增长率为6.11%。相比之下，对照组略显逊色，人均收益增长率接近0。根据实验组表现，若公司采取改革行动，团队会带来更高收益。