In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error,median_absolute_error,explained_variance_score
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV,KFold,StratifiedKFold,RandomizedSearchCV #交叉验证
from sklearn.preprocessing import StandardScaler #特征标准化
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay #部分依赖图
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split  # 划分训练集、验证集、测试集
from sklearn.svm import SVR #支持向量机
from sklearn.neural_network import MLPRegressor #神经网络
from sklearn.tree import DecisionTreeRegressor #决策树
from sklearn.svm import SVR #支持向量机
from pathlib import Path
# from xgboost.sklearn import XGBRegressor

In [None]:
TARGET_FOLDER = '参考文献/1/20240618102625WU_FILE_1'


def locate_project_root(target_folder=TARGET_FOLDER):
    """Return the nearest directory, starting at the working directory, that contains ``target_folder``.

    The resolved current working directory is checked first, then each of its
    parents in order from closest to farthest.

    Args:
        target_folder: Relative path that must exist directly under the root
            being searched for.

    Returns:
        pathlib.Path: The first directory under which ``target_folder`` exists.

    Raises:
        FileNotFoundError: If neither the working directory nor any of its
            parents contains ``target_folder``.
    """
    start = Path.cwd().resolve()
    search_order = (start, *start.parents)
    root = next((folder for folder in search_order if (folder / target_folder).exists()), None)
    if root is None:
        raise FileNotFoundError(f'未能在 {start} 及其父目录中定位 {target_folder}')
    return root

# Anchor every path on the detected project root so the notebook does not
# depend on the directory it happens to be launched from.
PROJECT_ROOT = locate_project_root()
DATA_DIR = PROJECT_ROOT.joinpath(TARGET_FOLDER, '数据', '数据-python')
OUTPUT_DIR = PROJECT_ROOT.joinpath('output')
TABLE_DIR, FIG_DIR, ML_DIR = (OUTPUT_DIR.joinpath(leaf) for leaf in ('tables', 'figures', 'ml'))

# Create the output folders up front; existing folders are left untouched.
for path in (TABLE_DIR, FIG_DIR, ML_DIR):
    path.mkdir(parents=True, exist_ok=True)

print(f'PROJECT_ROOT: {PROJECT_ROOT}')



In [None]:
###### Data import
# read_csv already returns a DataFrame, so it is used directly without an
# extra pd.DataFrame(...) wrap. The first row of the file supplies the column names.
data = pd.read_csv(DATA_DIR / 'datafeiguoyou.csv', header=0)
print(data.head(3))
print(data.shape)

In [None]:
###### Data preprocessing
x = data.iloc[:, 6:]  # features: every column from position 6 onward
# Alternative feature set kept for reference: x = data.iloc[:, 6:38]
y = data.iloc[:, 2]  # target: dividend payout ratio
# Alternative target kept for reference: y = data.iloc[:, 1]  (whether a dividend was paid)

FIRST_YEAR = 2006  # first sample year; the scaler is fitted on this year only
N_YEARS = 17       # consecutive years covered: 2006 through 2022

# Fit the standardiser on the first year only and reuse it for every year, so
# all yearly feature matrices are expressed on the same scale.
sc = StandardScaler()
sc.fit(x.loc[data['year'] == FIRST_YEAR])

# x_train[k] / y_train[k] hold the scaled features and the raw target for year
# FIRST_YEAR + k. Built directly as lists instead of exec-generated placeholders.
x_train = []
y_train = []
for year in range(FIRST_YEAR, FIRST_YEAR + N_YEARS):
    in_year = data['year'] == year
    # transform() returns a bare array, so the column names are re-attached.
    x_train.append(pd.DataFrame(sc.transform(x.loc[in_year]), columns=x.columns))
    y_train.append(y.loc[in_year])

# Kept as standalone names because other cells read the first-year frame directly.
x_train1 = x_train[0]
y_train1 = y_train[0]

In [None]:
# Chinese display labels: 34 firm-level variables followed by industry dummy labels.
# NOTE(review): 'ind4' appears twice in the industry part. This looks like a typo,
# but the intended label cannot be determined from this notebook, so the list is
# reproduced exactly as it was. Confirm against the dataset's column order.
names_chinese = [
    '管理费用率',              # administrative expense ratio
    '管理层持股比例',          # management shareholding ratio
    '独立董事比例',            # share of independent directors
    '董事会女性比例',          # share of women on the board
    '董事长持股比例',          # chairman's shareholding ratio
    '董事长年龄',              # chairman's age
    '董事长任期',              # chairman's tenure
    '董事长薪酬',              # chairman's compensation
    '股权激励虚拟变量',        # equity-incentive dummy
    '其他应收款资产比',        # other receivables to assets
    '股权集中度',              # ownership concentration
    '股权制衡度',              # ownership balance
    '中小股东持股比例',        # minority shareholders' holding ratio
    '机构投资者持股比例',      # institutional investors' holding ratio
    '控股股东股权质押比例',    # controlling shareholder's share-pledge ratio
    '财务报告质量',            # financial reporting quality
    '留存收益资产比',          # retained earnings to assets
    '自由现金流',              # free cash flow
    '税收规避程度',            # degree of tax avoidance
    '实际税率',                # effective tax rate
    '纳税波动率',              # tax payment volatility
    '融资约束程度',            # degree of financing constraints
    '再融资动机',              # refinancing motive
    '投资者情绪',              # investor sentiment
    '上一期股利水平',          # prior-period dividend level
    '资产收益率',              # return on assets
    '每股经营活动现金流量',    # operating cash flow per share
    '托宾Q',                   # Tobin's Q
    '账面市值比',              # book-to-market ratio
    '资产负债率',              # debt-to-asset ratio
    '销售增长率',              # sales growth rate
    '公司规模',                # firm size
    '分析师跟踪人数',          # number of analysts following
    '公司所在省份市场化程度',  # marketization level of the firm's province
] + [
    f'ind{code}'
    for code in (
        1, 2, 3, 4, 4, 5, 7, 8, 11, 12, 15, 16, 17, 18, 19, 20, 21,
        22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        33, 34, 35, 37, 38, 39, 40, 41, 42,
    )
]

In [None]:
# Standardise the full feature matrix with the scaler `sc`; transform() returns
# a bare array, so the original column names are re-attached.
x_test = pd.DataFrame(sc.transform(x), columns=x.columns)

# Feature names in column order, taken from the first-year training frame.
names = x_train1.columns.tolist()

In [None]:
# Rolling one-year-ahead evaluation of gradient boosting:
# fit on year i, then score the fitted model on year i + 1.
result = []   # in-sample R^2, one entry per fitted year
result1 = []  # out-of-sample R^2 on the following year
result2 = []  # out-of-sample mean squared error
result3 = []  # out-of-sample mean absolute error
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score
# First row is the feature names; each later row is one fitted year's importances.
result_gbr = [names]
# The last year has no following year to test on, so it is never used for fitting.
for i in range(len(x_train) - 1):
    j = i + 1
    model_gbr = GradientBoostingRegressor(n_estimators=5000, max_depth=6, subsample=0.8,
                                          learning_rate=0.001, random_state=0)
    model_gbr.fit(x_train[i], y_train[i])
    result_gbr.append(model_gbr.feature_importances_.tolist())
    result.append(model_gbr.score(x_train[i], y_train[i]))
    # Predict the following year once and reuse the predictions for every metric;
    # r2_score on them is the same quantity that model.score() reports.
    pred_gbr = model_gbr.predict(x_train[j])
    result1.append(r2_score(y_train[j], pred_gbr))
    result2.append(mean_squared_error(y_train[j], pred_gbr))
    result3.append(mean_absolute_error(y_train[j], pred_gbr))
    result4.append(median_absolute_error(y_train[j], pred_gbr))
    result5.append(explained_variance_score(y_train[j], pred_gbr))
# Report each metric averaged over all fitted years.
print('样本内R方=','%.4f'%np.mean(result))
print('样本外R方=','%.4f'%np.mean(result1))
print('EVS=','%.4f'%np.mean(result5))
print('MSE=','%.4f'%np.mean(result2))
print('MAE=','%.4f'%np.mean(result3))
print('MedAE=','%.4f'%np.mean(result4))

In [None]:
# Export the gradient-boosting importance table as tab-separated text: the first
# row is the feature names, each following row is one fitted year's importances.
# No extra header line is written, because the feature-name row already labels
# the columns; a 'name/gender/status/age' header would not describe them.
# The context manager closes the file even if a write fails part-way.
with open(TABLE_DIR / 'data-gbr-非国有1.xls', 'w') as output:
    for row in result_gbr:
        # write() only accepts text, so every cell goes through str(); each cell,
        # including the last one in a row, is followed by a tab.
        output.write(''.join(str(cell) + '\t' for cell in row))
        output.write('\n')

In [None]:
# Rolling one-year-ahead evaluation of the random forest:
# fit on year i, then score the fitted model on year i + 1.
result = []   # in-sample R^2, one entry per fitted year
result1 = []  # out-of-sample R^2 on the following year
result2 = []  # out-of-sample mean squared error
result3 = []  # out-of-sample mean absolute error
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score
# First row is the feature names; each later row is one fitted year's importances.
result_forest = [names]
# The last year has no following year to test on, so it is never used for fitting.
for i in range(len(x_train) - 1):
    j = i + 1
    model_forest = RandomForestRegressor(n_estimators=5000, max_features=19, random_state=0, n_jobs=-1)
    model_forest.fit(x_train[i], y_train[i])
    result_forest.append(model_forest.feature_importances_.tolist())
    result.append(model_forest.score(x_train[i], y_train[i]))
    # Predict the following year once and reuse the predictions for every metric;
    # r2_score on them is the same quantity that model.score() reports.
    pred_forest = model_forest.predict(x_train[j])
    result1.append(r2_score(y_train[j], pred_forest))
    result2.append(mean_squared_error(y_train[j], pred_forest))
    result3.append(mean_absolute_error(y_train[j], pred_forest))
    result4.append(median_absolute_error(y_train[j], pred_forest))
    result5.append(explained_variance_score(y_train[j], pred_forest))
# Report each metric averaged over all fitted years.
print('样本内R方=','%.4f'%np.mean(result))
print('样本外R方=','%.4f'%np.mean(result1))
print('EVS=','%.4f'%np.mean(result5))
print('MSE=','%.4f'%np.mean(result2))
print('MAE=','%.4f'%np.mean(result3))
print('MedAE=','%.4f'%np.mean(result4))

In [None]:
# Export the random-forest importance table as tab-separated text: the first
# row is the feature names, each following row is one fitted year's importances.
# The context manager closes the file even if a write fails part-way.
with open(TABLE_DIR / 'data-forest-非国有1.xls', 'w') as output:
    for row in result_forest:
        # write() only accepts text, so every cell goes through str(); each cell,
        # including the last one in a row, is followed by a tab.
        output.write(''.join(str(cell) + '\t' for cell in row))
        output.write('\n')