In [391]:
import pandas as pd
import numpy as np
import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides any deprecation or chained-assignment
# warnings that later cells may raise — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [392]:
# Sample years, kept as strings because they are formatted into the input file names.
YEAR = [str(year) for year in range(2016, 2021)]

# CSR

In [393]:
# Load one CSR score sheet per year and record which stock codes appear in each.
csr = {}
col = {}
for i, year in enumerate(YEAR):
    # dropna() runs before the unused columns are removed, so a blank in any column
    # (including the four dropped below) discards the whole row.
    csr[i] = pd.read_excel('./社会责任评分/{}社会责任评分.xls'.format(year)).dropna().drop(
        columns=['评分等级', '相关链接', '所属地区', '行业类别'])
    # Strip the last three characters of each code
    # (presumably an exchange suffix such as ".SZ" — TODO confirm against the raw sheet).
    csr[i]['代码'] = csr[i]['代码'].str[:-3]
    col[i] = set(csr[i]['代码'])

# Codes that are rated in every loaded year. Intersecting all sets at once keeps this
# correct if YEAR is extended or shortened, instead of hard-coding five operands.
common_index = list(set.intersection(*col.values()))

In [394]:
# Restrict each year's CSR sheet to the firms present in all years, ordered by stock
# code, and keep only the code, total score and reporting-year columns.
processed_csr = {}
for i in range(len(YEAR)):
    in_every_year = csr[i]['代码'].isin(common_index)
    yearly_sorted = csr[i].loc[in_every_year].sort_values('代码')
    processed_csr[i] = yearly_sorted[['代码', '总评分', '截止年份']]

In [395]:
# Stack the yearly CSR frames into one long panel and save it.
# DataFrame.append was removed in pandas 2.0; pd.concat yields the same rows in the
# same order (original index labels are kept) and follows the length of YEAR.
panel_csr = pd.concat([processed_csr[i] for i in range(len(YEAR))])
# NOTE(review): writing the legacy .xls format requires an engine that recent pandas
# releases may no longer provide — confirm in the target environment.
panel_csr.to_excel('./社会责任评分/panel_csr.xls', index=False)

# 经营绩效

In [396]:
# Build the yearly performance, ownership and industry variables from the annual
# stock summary sheets.
perform = {}
proc_perform = {}
for i in range(len(YEAR)):
    perform[i] = pd.read_excel('./年股票综合数据/YRESSTK_{}.xls'.format(YEAR[i]), dtype={"股票代码_Stkcd": str})
    sheet = perform[i]
    # Assemble a fresh frame rather than adding columns to a column slice of
    # perform[i]; that pattern triggers pandas' chained-assignment warning.
    proc_perform[i] = pd.DataFrame({
        "股票代码_Stkcd": sheet["股票代码_Stkcd"],
        # Calendar year of the reporting date.
        "Year": pd.to_datetime(sheet["截止日期_Enddt"]).dt.year,
        # Composite performance score: 99% diluted EPS + 1% diluted ROE.
        "Performance": sheet["每股收益(摊薄)(元/股)_EPS"] * 0.99 + sheet["净资产收益率(摊薄)_ROE"] * 0.01,
        # State-owned shares as a fraction of total shares.
        "股权性质": sheet["公司国有股_Comstateshr"] / sheet["总股数_Fullshr"],
        # "1" when the CSRC industry category code contains "C", otherwise "0".
        "行业": sheet["证监会行业门类代码_Csrciccd1"].apply(
            lambda s: "1" if "C" in str(s) else "0"),
    })

In [397]:
# Stack the yearly performance frames into one long panel and save it.
# DataFrame.append was removed in pandas 2.0; pd.concat yields the same rows in the
# same order (original index labels are kept) and follows the length of YEAR.
panel_perform = pd.concat([proc_perform[i] for i in range(len(YEAR))])
# NOTE(review): writing the legacy .xls format requires an engine that recent pandas
# releases may no longer provide — confirm in the target environment.
panel_perform.to_excel('./年股票综合数据/panel_perform.xls', index=False)

# 控制变量

In [398]:
# Build the yearly control variables (firm size and leverage) from the balance sheets.
control = {}
proc_control = {}
for i in range(len(YEAR)):
    # NOTE(review): dropna() is applied to the full sheet, so a blank in ANY column —
    # not only the four used below — removes the row. Confirm this is intended.
    control[i] = pd.read_excel('./资产负债表/BS_ALL_{}.xls'.format(YEAR[i]),
                               dtype={"A股股票代码_A_StkCd": str}).dropna()
    sheet = control[i]
    total_assets = sheet['资产总计(元)_TotAss']
    # Assemble a fresh frame rather than adding columns to a column slice of
    # control[i]; that pattern triggers pandas' chained-assignment warning.
    proc_control[i] = pd.DataFrame({
        '股票代码': sheet['A股股票代码_A_StkCd'],
        # Size: natural log of total assets.
        '规模': np.log(total_assets),
        # Leverage: total liabilities divided by total assets.
        '负债水平': sheet['负债合计(元)_TotLiab'] / total_assets,
        # Calendar year of the reporting date.
        '年份': pd.to_datetime(sheet['截止日期_EndDt']).dt.year,
    })

In [399]:
# Stack the yearly control-variable frames into one long panel and save it.
# DataFrame.append was removed in pandas 2.0; pd.concat yields the same rows in the
# same order (original index labels are kept) and follows the length of YEAR.
panel_control = pd.concat([proc_control[i] for i in range(len(YEAR))])
# NOTE(review): writing the legacy .xls format requires an engine that recent pandas
# releases may no longer provide — confirm in the target environment.
panel_control.to_excel('./资产负债表/panel_control.xls', index=False)

# 汇总数据

In [400]:
# Merge the three data sources year by year, matching rows on the stock code.
temp = {}
for i in range(len(YEAR)):
    # CSR scores joined with performance variables; only codes present in both survive.
    csr_perf = processed_csr[i].merge(
        proc_perform[i], how='inner', left_on="代码", right_on="股票代码_Stkcd")
    csr_perf = csr_perf.drop(columns=['股票代码_Stkcd', '截止年份']).rename(columns={'总评分': 'CSR'})

    # Add the balance-sheet controls, then drop the duplicate key and the redundant
    # year column that came from the performance sheet.
    with_controls = csr_perf.merge(
        proc_control[i], how='inner', left_on="代码", right_on="股票代码")
    temp[i] = with_controls.drop(columns=['股票代码', 'Year'])

In [401]:
# Stack the merged yearly frames into the raw (possibly unbalanced) panel.
# DataFrame.append was removed in pandas 2.0; pd.concat yields the same rows in the
# same order (original index labels are kept) and follows the length of YEAR.
panel_raw_data = pd.concat([temp[i] for i in range(len(YEAR))])
print(panel_raw_data)

          代码    CSR  Performance  股权性质 行业         规模      负债水平    年份
0     000002  76.52     0.824792   0.0  0  27.351214  0.810123  2016
1     000004  21.30     0.330957   0.0  1  19.759923  0.552979  2016
2     000005  16.33     0.015196   0.0  0  21.647456  0.462232  2016
3     000006  63.26     0.012753   0.0  0  23.359293  0.681010  2016
4     000007  10.14     0.124470   0.0  0  20.069592  0.195184  2016
...      ...    ...          ...   ... ..        ...       ...   ...
3192  603991   3.33    -0.568543   0.0  1  20.206846  0.357558  2020
3193  603993  19.34     0.109168   0.0  0  25.544613  0.605590  2020
3194  603997   7.29    -0.361137   0.0  1  23.579522  0.754537  2020
3195  603998  27.31     0.191148   0.0  1  21.448076  0.408192  2020
3196  603999  16.20     0.131931   0.0  0  21.472300  0.167235  2020

[15572 rows x 8 columns]


In [402]:
# Keep only firms observed in every sample year so that the panel is balanced.
# The per-firm row count is computed once and compared with the number of years in
# YEAR rather than a hard-coded 5.
obs_per_firm = panel_raw_data.groupby(by='代码')['年份'].count()
balance_index = obs_per_firm[obs_per_firm == len(YEAR)].index
balance_data = panel_raw_data[panel_raw_data['代码'].isin(balance_index)]

至此得到平衡面板数据balance_data

下一步去除：
1. 若数据缺失或有异常，则予以剔除
2. 上市时间小于5年的公司：为了避免IPO效应带来的影响，剔除2016年及之后上市的公司样本
3. 所有“金融类”公司，鉴于金融行业具有普遍高负债等行业特性，剔除所有金融类公司
4. 连续亏损的公司所存在的问题可能对论文研究造成一定影响，故剔除所有ST、SST、PT、\*ST、S\*ST公司

## 删除2016年及之后上市的公司

In [403]:
# Load the listing-date table, derive the listing year from the first listing date,
# and keep only the records whose listing year is later than 2015.
data_year = pd.read_excel('./上市时间/Year.xls', dtype={'股票代码': str})
listing_dates = pd.to_datetime(data_year['首次上市日期'])
data_year['上市年份'] = listing_dates.map(lambda ts: ts.year)
data_year = data_year[data_year['上市年份'].gt(2015)]

In [404]:
# One row per recently listed firm: its stock code and earliest listing year.
data_year_ID_unique = data_year['股票代码'].unique()
# A single groupby replaces the row-by-row DataFrame.append loop (append was removed
# in pandas 2.0 and rebuilding the frame per row is quadratic). sort=False keeps the
# codes in order of first appearance, matching the order of unique().
# The loop also overwrote the global `temp` dict of merged yearly frames with a
# scalar; no such variable is touched here.
firm_TTM = (
    data_year.groupby('股票代码', sort=False)['上市年份']
    .min()
    .reset_index()
)

In [405]:
# Exclude the recently listed firms from the balanced panel.
new_firm_ID = firm_TTM['股票代码']
is_new_listing = balance_data['代码'].isin(new_firm_ID)
final_data = balance_data.loc[~is_new_listing]

In [406]:
# Cast the industry flag to int so that Stata reads it as numeric, then switch to the
# short variable names used in the regressions. Only the industry flag is cast: the
# stock code and the ownership ratio are left as they are.
# assign() returns a new frame, so nothing is written onto the filtered slice of
# balance_data; the new IND column is appended last, as before.
final_data = (
    final_data
    .assign(IND=final_data['行业'].astype(int))
    .drop(columns=['行业'])
    .rename(columns={'代码': 'Stk_cd', 'Performance': 'BP', '股权性质': 'SOE', '规模': 'SIZE',
                     '负债水平': 'LEV', '年份': 'YEAR'})
)
# NOTE(review): writing the legacy .xls format requires an engine that recent pandas
# releases may no longer provide — confirm in the target environment.
final_data.to_excel('./final_data.xls', index=False)

## 检查是否有异常值或缺失值

In [407]:
# Balance check: list every firm whose number of rows differs from the number of
# sample years (an empty index means the panel is balanced). The count is computed
# once and compared with len(YEAR) rather than a hard-coded 5.
rows_per_firm = final_data.groupby(by='Stk_cd')['YEAR'].count()
imbalance_index = rows_per_firm[rows_per_firm != len(YEAR)].index
print(imbalance_index)
# Missing-value check over every cell of the final panel.
if final_data.isnull().values.any():
    print("有缺失值")
else:
    print("无缺失值")

Index([], dtype='object', name='Stk_cd')
无缺失值


In [408]:
# Show the final panel (pandas truncates the printed frame to its head and tail).
print(final_data)

      Stk_cd    CSR        BP  SOE       SIZE       LEV  YEAR  IND
0     000002  76.52  0.824792  0.0  27.351214  0.810123  2016    0
1     000004  21.30  0.330957  0.0  19.759923  0.552979  2016    1
2     000005  16.33  0.015196  0.0  21.647456  0.462232  2016    0
3     000006  63.26  0.012753  0.0  23.359293  0.681010  2016    0
4     000007  10.14  0.124470  0.0  20.069592  0.195184  2016    0
...      ...    ...       ...  ...        ...       ...   ...  ...
3189  603988  28.11  0.950113  0.0  21.002087  0.426588  2020    1
3190  603989  25.51  0.778381  0.0  22.078823  0.334527  2020    1
3194  603997   7.29 -0.361137  0.0  23.579522  0.754537  2020    1
3195  603998  27.31  0.191148  0.0  21.448076  0.408192  2020    1
3196  603999  16.20  0.131931  0.0  21.472300  0.167235  2020    0

[11700 rows x 8 columns]
