In [458]:
import pandas as pd
import numpy as np
import warnings

# NOTE(review): this silences every warning category for the whole session, so any
# pandas deprecation or chained-assignment warnings that later cells may raise are
# never shown — consider narrowing the filter once those cells are cleaned up.
warnings.filterwarnings('ignore')

In [459]:
# Sample years, kept as strings because they are spliced into the input file names.
YEAR = [str(year) for year in range(2016, 2021)]

# CSR

In [460]:
# Load one CSR score workbook per sample year and record which stock codes
# appear in each year.
csr = {}
col = {}
for i in range(len(YEAR)):
    # NOTE(review): dropna() runs before the descriptive columns are removed, so a row
    # with a blank in any of those columns is discarded as well — confirm this is intended.
    csr[i] = pd.read_excel('./社会责任评分/{}社会责任评分.xls'.format(YEAR[i])).dropna().drop(
        columns=['评分等级', '相关链接', '所属地区', '行业类别'])
    # Strip the last three characters of every code
    # (presumably an exchange suffix — TODO confirm against the raw workbook).
    csr[i]['代码'] = csr[i]['代码'].str[:-3]
    col[i] = set(csr[i]['代码'])

# Codes present in every year. Intersecting over all collected sets keeps this
# correct for any number of entries in YEAR instead of exactly five.
common_index = list(set.intersection(*col.values()))
# print(csr[1])

In [461]:
# Restrict each year's CSR table to the firms present in every year, order the rows
# by stock code, and keep only the code, total score and year columns.
processed_csr = {
    idx: (
        csr[idx]
        .loc[csr[idx]['代码'].isin(common_index)]
        .sort_values('代码')
        .loc[:, ['代码', '总评分', '截止年份']]
    )
    for idx in range(len(YEAR))
}

In [462]:
# Stack the yearly CSR tables into one long panel. A single pd.concat replaces the
# chained DataFrame.append calls: append is deprecated since pandas 1.4 and removed
# in 2.0, and concat follows however many years YEAR lists.
panel_csr = pd.concat([processed_csr[i] for i in range(len(YEAR))])
# NOTE(review): writing legacy .xls relies on the xlwt writer, which newer pandas
# releases no longer support — confirm the pinned pandas version or move to .xlsx.
panel_csr.to_excel('./社会责任评分/panel_csr.xls', index=False)

# 经营绩效

In [463]:
# Build the yearly performance / ownership / industry table from the annual
# stock summary workbooks (one workbook per entry in YEAR).
perform = {}
proc_perform = {}
for i in range(len(YEAR)):
    perform[i] = pd.read_excel('./年股票综合数据/YRESSTK_{}.xls'.format(YEAR[i]), dtype={"股票代码_Stkcd": str})
    # Explicit copy: the derived columns below are written onto this frame, and writing
    # onto a bare column selection is chained assignment (SettingWithCopyWarning).
    proc_perform[i] = perform[i][
        ["股票代码_Stkcd", "公司国有股_Comstateshr", "总股数_Fullshr", "证监会行业门类代码_Csrciccd1", "截止日期_Enddt",
         "每股收益(摊薄)(元/股)_EPS", "净资产收益率(摊薄)_ROE"]].copy()
    proc_perform[i]["Year"] = proc_perform[i]["截止日期_Enddt"].apply(lambda x: x.year)
    # Weighted mix of EPS and ROE with weights 0 and 1, i.e. ROE wherever EPS is present
    # (a missing EPS still propagates NaN through the zero-weighted term).
    proc_perform[i]["Performance"] = proc_perform[i]["每股收益(摊薄)(元/股)_EPS"] * 0 + proc_perform[i][
        "净资产收益率(摊薄)_ROE"] * 1
    # Share of state-owned shares in total shares.
    proc_perform[i]["股权性质"] = proc_perform[i]["公司国有股_Comstateshr"] / proc_perform[i]["总股数_Fullshr"]
    # String flag: "1" when the CSRC sector code contains "C", otherwise "0".
    proc_perform[i]["行业"] = proc_perform[i]["证监会行业门类代码_Csrciccd1"].apply(
        lambda s: "1" if "C" in str(s) else "0")
    # Only the stock code and the derived columns are kept.
    proc_perform[i] = proc_perform[i].drop(
        columns=["公司国有股_Comstateshr", "总股数_Fullshr", "证监会行业门类代码_Csrciccd1", "截止日期_Enddt",
                 "每股收益(摊薄)(元/股)_EPS", "净资产收益率(摊薄)_ROE"])

In [464]:
# Stack the yearly performance tables into one long panel. pd.concat replaces the
# chained DataFrame.append calls (deprecated since pandas 1.4, removed in 2.0) and
# follows however many years YEAR lists.
panel_perform = pd.concat([proc_perform[i] for i in range(len(YEAR))])
# NOTE(review): writing legacy .xls relies on the xlwt writer, which newer pandas
# releases no longer support — confirm the pinned pandas version or move to .xlsx.
panel_perform.to_excel('./年股票综合数据/panel_perform.xls', index=False)

# 控制变量

In [465]:
# Build the yearly control-variable table (firm size and leverage) from the
# balance-sheet workbooks (one workbook per entry in YEAR).
control = {}
proc_control = {}
for i in range(len(YEAR)):
    control[i] = pd.read_excel('./资产负债表/BS_ALL_{}.xls'.format(YEAR[i]),
                               dtype={"A股股票代码_A_StkCd": str}).dropna()
    # Explicit copy: the derived columns below are written onto this frame, and writing
    # onto a bare column selection is chained assignment (SettingWithCopyWarning).
    proc_control[i] = control[i][
        ['A股股票代码_A_StkCd', '截止日期_EndDt', '资产总计(元)_TotAss', '负债合计(元)_TotLiab']].copy()
    # Size: natural log of total assets.
    proc_control[i]['规模'] = np.log(proc_control[i]['资产总计(元)_TotAss'])
    # Leverage: total liabilities divided by total assets.
    proc_control[i]['负债水平'] = proc_control[i]['负债合计(元)_TotLiab'] / proc_control[i]['资产总计(元)_TotAss']
    proc_control[i]['年份'] = proc_control[i]["截止日期_EndDt"].apply(lambda x: x.year)
    # Keep the (renamed) stock code plus the derived columns only.
    proc_control[i] = proc_control[i].rename(columns={"A股股票代码_A_StkCd": "股票代码"}).drop(
        columns=['资产总计(元)_TotAss', '负债合计(元)_TotLiab', '截止日期_EndDt'])

In [466]:
# Stack the yearly control-variable tables into one long panel. pd.concat replaces
# the chained DataFrame.append calls (deprecated since pandas 1.4, removed in 2.0)
# and follows however many years YEAR lists.
panel_control = pd.concat([proc_control[i] for i in range(len(YEAR))])
# NOTE(review): writing legacy .xls relies on the xlwt writer, which newer pandas
# releases no longer support — confirm the pinned pandas version or move to .xlsx.
panel_control.to_excel('./资产负债表/panel_control.xls', index=False)

## 资产负债率

In [467]:
# Load the FI_T1 workbook, drop incomplete rows, derive the integer year from the
# `Accper` string (everything before its last six characters) and rename the key
# column to the shared name used for merging.
debt = (
    pd.read_excel('./资产负债率/FI_T1.xls', dtype={"Stkcd": str})
    .dropna()
    .assign(**{"年份": lambda frame: frame["Accper"].str[:-6].astype(int)})
    .drop(columns=["Accper", "ShortName", "Typrep"])
    .rename(columns={"Stkcd": "代码"})
)
print(debt)

           代码  F010101A  F010201A  F011201A  F011401A    年份
0      000002  1.243616  0.437819  0.805367  0.806787  2016
1      000002  1.200857  0.495029  0.839813  0.840999  2017
2      000002  1.154342  0.485571  0.845856  0.848727  2018
3      000002  1.130738  0.425873  0.843590  0.846275  2019
4      000002  1.174494  0.413910  0.812835  0.815581  2020
...       ...       ...       ...       ...       ...   ...
22502  900957  1.404382  1.404382  0.661961  0.666012  2016
22503  900957  1.925383  1.925383  0.584235  0.588172  2017
22504  900957  2.100740  2.100740  0.551612  0.555223  2018
22505  900957  1.964754  1.964754  0.531432  0.534854  2019
22506  900957  2.991174  2.991174  0.505506  0.508647  2020

[22507 rows x 6 columns]


## 企业规模

In [468]:
# Load the PT_LCMAINFIN workbook, drop incomplete rows, derive the integer year from
# the `EndDate` string and compute firm size as the natural log of total assets.
size = (
    pd.read_excel('./控制变量/PT_LCMAINFIN.xls', dtype={"Symbol": str})
    .dropna()
    .assign(**{
        "年份": lambda frame: frame["EndDate"].str[:-6].astype(int),
        "企业规模": lambda frame: np.log(frame["TotalAssets"]),
    })
    .drop(columns=["EndDate"])
    .rename(columns={"Symbol": "代码"})
)
print(size)

           代码   TotalAssets  TotalLiability    年份       企业规模
0      000002  8.306742e+11    6.689976e+11  2016  27.445504
1      000002  1.165347e+12    9.786730e+11  2017  27.784040
2      000002  1.528579e+12    1.292959e+12  2018  28.055360
3      000002  1.729929e+12    1.459350e+12  2019  28.179102
4      000002  1.869177e+12    1.519333e+12  2020  28.256519
...       ...           ...             ...   ...        ...
17884  900957  1.165033e+09    7.712067e+08  2016  20.876016
17885  900957  1.019428e+09    5.955854e+08  2017  20.742508
17886  900957  1.007002e+09    5.554740e+08  2018  20.730243
17887  900957  1.017715e+09    5.408456e+08  2019  20.740825
17888  900957  1.008562e+09    5.098344e+08  2020  20.731791

[17889 rows x 5 columns]


## 国有股持股比例

In [469]:
# Load the HLD_Capstru workbook, drop incomplete rows, derive the integer year from
# the `Reptdt` string and compute the state-share ratio as Nshrstt / Nshrttl.
state = (
    pd.read_excel('./控制变量/HLD_Capstru.xls', dtype={"Stkcd": str})
    .dropna()
    .assign(**{
        "年份": lambda frame: frame["Reptdt"].str[:-6].astype(int),
        "国有股持股比例": lambda frame: frame["Nshrstt"] / frame["Nshrttl"],
    })
    .drop(columns=["Reptdt", "Nshrstt", "Nshrttl"])
    .rename(columns={"Stkcd": "代码"})
)
print(state)

           代码    年份  国有股持股比例
0      000002  2016      0.0
1      000002  2017      0.0
2      000002  2018      0.0
3      000002  2019      0.0
4      000002  2020      0.0
...       ...   ...      ...
16939  900957  2016      0.0
16940  900957  2017      0.0
16941  900957  2018      0.0
16942  900957  2019      0.0
16943  900957  2020      0.0

[16944 rows x 3 columns]


## 托宾-Q

In [470]:
# Load the FI_T10 workbook, drop incomplete rows, derive the integer year from the
# `Accper` string and rename the key column to the shared name used for merging.
tobin = (
    pd.read_excel('./相对价值指标/FI_T10.xls', dtype={"Stkcd": str})
    .dropna()
    .assign(**{"年份": lambda frame: frame["Accper"].str[:-6].astype(int)})
    .drop(columns=["Accper", "ShortName"])
    .rename(columns={"Stkcd": "代码"})
)
print(tobin)

           代码  F100901A    年份
0      000002  1.069010  2016
1      000002  1.117963  2017
2      000002  1.015736  2018
3      000002  1.046272  2019
4      000002  0.992654  2020
...       ...       ...   ...
16432  900957  2.057613  2016
16433  900957  1.923619  2017
16434  900957  1.536094  2018
16435  900957  1.454232  2019
16436  900957  1.338059  2020

[16437 rows x 3 columns]


In [471]:
# tobin = {}
# proc_tobin = {}
# for i in range(len(YEAR)):
#     tobin[i] = pd.read_excel('./相对价值指标/FI_T10_{}.xls'.format(YEAR[i]), dtype={"Stkcd": str}).dropna()
#     proc_tobin[i] = tobin[i][["Stkcd", "Accper", "F100902A"]]
#     proc_tobin[i]["Year"] = proc_tobin[i]["Accper"].apply(lambda x: x[0:-6])
#     proc_tobin[i] = proc_tobin[i].drop(columns=["Accper"])

In [472]:
# panel_tobin = proc_tobin[0].append(proc_tobin[1]).append(proc_tobin[2]).append(proc_tobin[3]).append(
#     proc_tobin[4])
# # print(proc_tobin[1])
# panel_tobin.to_excel('./相对价值指标/panel_tobin.xls', index=False)

## TOP10

In [473]:
# Load the EN_EquityNatureAll workbook, drop incomplete rows, derive the integer year
# from the `EndDate` string and keep the remaining columns under the shared key name.
top10 = (
    pd.read_excel('./股权性质/EN_EquityNatureAll.xls', dtype={"Symbol": str})
    .dropna()
    .assign(**{"年份": lambda frame: frame["EndDate"].str[:-6].astype(int)})
    .drop(columns=["EndDate", "ShortName", "EquityNature"])
    .rename(columns={"Symbol": "代码"})
)
print(top10)

           代码  TopTenHoldersRate    年份
0      000002              57.40  2016
1      000002              68.53  2017
2      000002              71.70  2018
3      000002              64.24  2019
4      000002              58.98  2020
...       ...                ...   ...
16938  900957              48.17  2016
16939  900957              48.43  2017
16940  900957              48.02  2018
16941  900957              48.17  2019
16942  900957              48.30  2020

[16543 rows x 3 columns]


In [474]:
# 试图篡改BP
new_bp = pd.read_excel('./盈利能力/FI_T5.xls', dtype={"Stkcd": str}).dropna()
new_bp["年份"] = new_bp["Accper"].apply(lambda x: x[0:-6]).astype(int)
new_bp = new_bp.drop(columns=["Accper", "ShortName", "Typrep"]).rename(columns={"Stkcd": "代码"})
print(new_bp)

           代码  F050201B  F050501B  F050502B  F051201B    年份
0      000002  0.034129  0.175352  0.190279  0.100603  2016
1      000002  0.031929  0.199323  0.213626  0.102409  2017
2      000002  0.032234  0.209117  0.233355  0.113374  2018
3      000002  0.031869  0.203754  0.217825  0.109815  2019
4      000002  0.031724  0.169499  0.191154  0.101030  2020
...       ...       ...       ...       ...       ...   ...
22502  900957  0.002594  0.007674  0.007513  0.040471  2016
22503  900957  0.029444  0.070818  0.073418  0.068340  2017
22504  900957  0.027493  0.061314  0.063254  0.062421  2018
22505  900957  0.024900  0.053141  0.054591  0.057661  2019
22506  900957  0.021673  0.043829  0.044811  0.052138  2020

[17767 rows x 6 columns]


# 汇总数据

In [475]:
# 合并各年的数据
temp = {}
for i in range(len(YEAR)):
    temp[i] = pd.merge(
        processed_csr[i], proc_perform[i], left_on="代码", right_on="股票代码_Stkcd", how='inner')
    temp[i] = temp[i].drop(columns=['股票代码_Stkcd', '截止年份']).rename(columns={'总评分': 'CSR'})

    temp[i] = pd.merge(
        temp[i], proc_control[i], left_on="代码", right_on="股票代码", how='inner')
    temp[i] = temp[i].drop(columns=['股票代码', 'Year'])

    # temp[i] = pd.merge(
    #     temp[i], proc_tobin[i], left_on="代码", right_on="Stkcd", how='inner')
    # temp[i] = temp[i].drop(columns=['Stkcd', 'Year'])

In [476]:
# Stack the merged yearly tables into one long panel. pd.concat replaces the chained
# DataFrame.append calls (deprecated since pandas 1.4, removed in 2.0) and follows
# however many years YEAR lists.
panel_raw_data = pd.concat([temp[i] for i in range(len(YEAR))])
# Attach each pooled indicator table on (stock code, year). Inner joins keep only
# firm-years present in every table; the order matches the original sequence of
# merges, so the resulting column order is unchanged.
for extra in (top10, new_bp, tobin, debt, size, state):
    panel_raw_data = pd.merge(panel_raw_data, extra, on=["代码", "年份"], how='inner')
print(panel_raw_data)

           代码    CSR  Performance      股权性质 行业         规模      负债水平    年份  \
0      000002  76.52       8.2292  0.000000  0  27.351214  0.810123  2016   
1      000006  63.26       0.2853  0.000000  0  23.359293  0.681010  2016   
2      000008  21.29       3.0131  0.010889  1  22.692695  0.182457  2016   
3      000009  60.48       4.6359  0.000000  0  23.693515  0.626512  2016   
4      000010  15.97       0.9987  0.018431  0  22.266480  0.528967  2016   
...       ...    ...          ...       ... ..        ...       ...   ...   
14301  603991   3.33      -9.3343  0.000000  1  20.206846  0.357558  2020   
14302  603993  19.34       3.9868  0.000000  0  25.544613  0.605590  2020   
14303  603997   7.29      -7.4037  0.000000  1  23.579522  0.754537  2020   
14304  603998  27.31       5.2548  0.000000  1  21.448076  0.408192  2020   
14305  603999  16.20       3.2931  0.000000  0  21.472300  0.167235  2020   

       TopTenHoldersRate  F050201B  ...  F051201B  F100901A  F010101A  \
0 

In [477]:
# Keep only firms observed in every sample year (balanced panel). The per-firm
# count is computed once, and the target is len(YEAR) rather than a literal 5 so
# the filter stays correct if the list of sample years changes.
obs_per_firm = panel_raw_data.groupby(by='代码')['年份'].count()
balance_index = obs_per_firm[obs_per_firm == len(YEAR)].index
balance_data = panel_raw_data[panel_raw_data['代码'].isin(balance_index)]

至此得到平衡面板数据balance_data

下一步去除：
1. 若数据缺失或有异常，则予以剔除
2. 上市时间小于5年的公司：为了避免IPO效应带来的影响，剔除2016年及之后上市的公司样本
3. 所有“金融类”公司，鉴于金融行业具有普遍高负债等行业特性，剔除所有金融类公司
4. 连续亏损的公司所存在的问题可能对论文研究造成一定影响，故剔除所有ST、SST、PT、\*ST、S\*ST公司

## 删除2016年及之后上市的公司

In [478]:
# Listing dates: derive the first-listing year and keep only the rows whose
# listing year is later than 2015.
data_year = pd.read_excel('./上市时间/Year.xls', dtype={'股票代码': str})
first_listed = pd.to_datetime(data_year['首次上市日期'])
data_year['上市年份'] = first_listed.map(lambda stamp: stamp.year)
data_year = data_year.loc[data_year['上市年份'] > 2015]

In [479]:
# One row per recently listed firm with its earliest listing year.
# A single groupby replaces the per-code loop that grew the frame with
# DataFrame.append (quadratic, and removed in pandas 2.0). It also no longer rebinds
# the global name `temp`, which the merge cells use as a dict of yearly tables.
data_year_ID_unique = data_year['股票代码'].unique()
# sort=False keeps the codes in order of first appearance, matching unique().
firm_TTM = data_year.groupby('股票代码', sort=False, as_index=False)['上市年份'].min()

In [480]:
# Remove every firm that appears in the recent-listing table from the balanced panel.
new_firm_ID = firm_TTM['股票代码']
recently_listed = balance_data['代码'].isin(new_firm_ID)
final_data = balance_data.loc[~recently_listed]

In [481]:
# 将代码、行业性质、股权性质转化为int类型，便于Stata识别
final_data['IND'] = final_data['行业'].astype(int)
# 改正performance,负债水平,股权性质
final_data = final_data.drop(columns=['行业']).rename(
    columns={'代码': 'Stk_cd', 'F050501B': 'BP', '国有股持股比例': 'SOE', '企业规模': 'SIZE', 'F011201A': 'LEV',
             '年份': 'YEAR', 'F100901A': 'TOBINQ', 'TopTenHoldersRate': 'TOP10'})
final_data.to_excel('./final_data.xls', index=False)
# print(final_data)

## 检查是否有异常值或缺失值

In [482]:
# 检查是否平衡
imbalance_index = final_data.groupby(by='Stk_cd')['YEAR'].count()[
    final_data.groupby(by='Stk_cd')['YEAR'].count() != 5].index
print(imbalance_index)
# 检查是否有缺失值
if final_data.isnull().values.any():
    print("有缺失值")
else:
    print("无缺失值")

# final_data[final_data.isnull().values==True]

Index([], dtype='object', name='Stk_cd')
无缺失值


In [483]:
# Print the final panel as plain text for a visual sanity check of the columns.
print(final_data)

       Stk_cd    CSR  Performance      股权性质         规模      负债水平  YEAR  TOP10  \
0      000002  76.52       8.2292  0.000000  27.351214  0.810123  2016  57.40   
2      000008  21.29       3.0131  0.010889  22.692695  0.182457  2016  40.42   
5      000011  38.97      -0.0955  0.663507  22.617843  0.692242  2016  66.55   
6      000012  72.45       3.1158  0.000000  23.552240  0.513330  2016  32.13   
7      000014  28.67      -1.0270  0.000000  21.498907  0.655892  2016  52.79   
...       ...    ...          ...       ...        ...       ...   ...    ...   
14298  603988  28.11      22.7413  0.000000  21.002087  0.426588  2020  67.87   
14299  603989  25.51      10.5181  0.000000  22.078823  0.334527  2020  74.30   
14303  603997   7.29      -7.4037  0.000000  23.579522  0.754537  2020  85.63   
14304  603998  27.31       5.2548  0.000000  21.448076  0.408192  2020  51.93   
14305  603999  16.20       3.2931  0.000000  21.472300  0.167235  2020  64.62   

       F050201B        BP  