In [1]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay  # partial dependence plots
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error, explained_variance_score
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, RandomizedSearchCV  # cross-validation
from sklearn.model_selection import train_test_split  # train / validation / test splitting
from sklearn.neural_network import MLPRegressor  # neural network
from sklearn.preprocessing import StandardScaler  # feature standardisation
from sklearn.svm import SVR  # support vector machine
from sklearn.tree import DecisionTreeRegressor  # decision tree
# import xgboost as xgb
# from xgboost.sklearn import XGBRegressor

warnings.filterwarnings("ignore")


In [None]:
TARGET_FOLDER = '参考文献/1/20240618102625WU_FILE_1'


def locate_project_root(target_folder=TARGET_FOLDER):
    """Find the nearest directory, walking upward from the CWD, that contains ``target_folder``.

    The search starts at the resolved current working directory and then
    visits each parent in turn up to the filesystem root.

    Parameters
    ----------
    target_folder : str or path-like
        Path, relative to the candidate root, that must exist for a directory
        to be accepted. Defaults to ``TARGET_FOLDER``.

    Returns
    -------
    pathlib.Path
        The first directory (closest to the CWD) under which ``target_folder`` exists.

    Raises
    ------
    FileNotFoundError
        If neither the CWD nor any of its ancestors contains ``target_folder``.
    """
    start = Path.cwd().resolve()
    probe = start
    while True:
        if (probe / target_folder).exists():
            return probe
        parent = probe.parent
        if parent == probe:
            # Reached the filesystem root without a match.
            break
        probe = parent
    raise FileNotFoundError(f'未能在 {start} 及其父目录中定位 {target_folder}')

# Anchor every path used by the notebook on the detected project root.
PROJECT_ROOT = locate_project_root()
DATA_DIR = PROJECT_ROOT.joinpath(TARGET_FOLDER, '数据', '数据-python')
OUTPUT_DIR = PROJECT_ROOT / 'output'
TABLE_DIR, FIG_DIR, ML_DIR = (OUTPUT_DIR / sub for sub in ('tables', 'figures', 'ml'))

# Create the output folders up front; existing folders are left as they are.
for out_dir in (TABLE_DIR, FIG_DIR, ML_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)
print(f'PROJECT_ROOT: {PROJECT_ROOT}')


In [2]:
###### Data import
# Read data-guliup.csv from DATA_DIR (first row is the header), then show
# the first three rows followed by the (rows, columns) shape.
data = pd.DataFrame(pd.read_csv(DATA_DIR / 'data-guliup.csv', header=0))
print(data.head(3), data.shape, sep='\n')

   Stkcd  year  Dividend_ratio1  Dividend_ratio2  Dividend_ratio3  Dividend  \
0      2  2013         0.298716         0.942424         5.105866         1   
1      2  2017         0.354175         0.852556         2.897618         1   
2      4  2012         0.000000         0.000000         0.000000         0   

   Managefee_ratio  Manageshare  Indep_ratio   Bgender  ...  ind32  ind33  \
0         0.022174       0.2884     0.363636  0.181818  ...      0      0   
1         0.036500       0.1090     0.363636  0.090909  ...      0      0   
2         0.175151       0.0027     0.333333  0.111111  ...      0      0   

   ind34  ind35  ind37  ind38  ind39  ind40  ind41  ind42  
0      0      0      0      1      0      0      0      0  
1      0      0      0      1      0      0      0      0  
2      0      0      0      0      0      0      0      0  

[3 rows x 77 columns]
(12185, 77)


In [3]:
###### Data preprocessing
# Features: every column from position 6 onwards.
# Target: column 2 (`Dividend_ratio1` in the printed header), the dividend payout ratio.
x = data.iloc[:, 6:]
y = data.iloc[:, 2]  # dividend payout ratio
# Alternative target (whether a dividend was paid).
# NOTE(review): position 1 is `year` in the printed header; the `Dividend`
# indicator sits at position 5, so confirm the index before enabling this line.
# y = data.iloc[:, 1]

# Single-year (2007) subset of features and target.
x_train1 = x.loc[data['year'] == 2007]
y_train1 = y.loc[data['year'] == 2007]

# Plain random 70/30 split over all years pooled together. No `stratify`
# argument is passed, so the split is NOT stratified on y.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30,
                                                    random_state=0)

# Fit the scaler on the training rows only and reuse those parameters for the
# test rows, so both sets go through exactly the same transformation.
sc = StandardScaler()
sc.fit(x_train)

# `transform` returns a bare ndarray. Rebuild the frames with the ORIGINAL row
# labels so x_train / x_test stay index-aligned with y_train / y_test
# (a default RangeIndex would silently mismatch rows in any pandas operation
# that aligns on the index, e.g. concat or arithmetic with y).
x_train = pd.DataFrame(sc.transform(x_train), columns=x.columns, index=x_train.index)
x_test = pd.DataFrame(sc.transform(x_test), columns=x.columns, index=x_test.index)
names = list(x_train.columns)


In [5]:
# Gradient boosting: 5000 stages of depth-7 trees, each stage fitted on a 60%
# row subsample, learning rate 0.01, fixed seed.
model_gbr = GradientBoostingRegressor(
    n_estimators=5000,
    max_depth=7,
    subsample=0.6,
    learning_rate=0.01,
    random_state=0,
)
model_gbr.fit(x_train, y_train)
a = model_gbr.feature_importances_.tolist()

# R^2 on the training rows (in-sample) and on the held-out rows (out-of-sample).
r2 = model_gbr.score(x_train, y_train)
r2a = model_gbr.score(x_test, y_test)

# Remaining error metrics, all computed on the held-out predictions.
pred_gbr = model_gbr.predict(x_test)
evs_predict = explained_variance_score(y_test, pred_gbr)
mse_predict = mean_squared_error(y_test, pred_gbr)
mae_predict = mean_absolute_error(y_test, pred_gbr)
median_predict = median_absolute_error(y_test, pred_gbr)

for metric_label, metric_value in (
    ('样本内R方=', r2),
    ('样本外R方=', r2a),
    ('EVS=', evs_predict),
    ('MSE=', mse_predict),
    ('MAE=', mae_predict),
    ('MedAE=', median_predict),
):
    print(metric_label, '%.4f' % metric_value)

样本内R方= 0.9945
样本外R方= 0.2500
EVS= 0.2501
MSE= 0.0784
MAE= 0.1586
MedAE= 0.0913


样本内R方= 0.998
样本外R方= 0.863
EVS= 0.863
MSE= 0.013
MAE= 0.052
MedAE= 0.022

In [6]:
# `feature_importances_` is a property that is recomputed from the fitted
# trees on every access, so read it once instead of once per loop iteration.
gbr_importances = model_gbr.feature_importances_

# Column positions ordered from least to most important.
sorted_index = gbr_importances.argsort()
print(sorted_index)

# First the feature names in ascending order of importance ...
for feature_pos in sorted_index:
    print(x.columns[feature_pos])
# ... then the matching importance values in the same order.
for feature_pos in sorted_index:
    print(gbr_importances[feature_pos])

[70 35 59 68 69 66 63 42 65 64 38 37 48 67 39 62 60 54 45 55 36 22 53  8
 49 43 58 61 41 30 52 44 47 40 50 56 51 57 46 15  2  3  4 33 28 27 11  6
  7  0 23  5 26 13  1 14 34 21 32 17 20 18 10 12 31  9 29 19 16 25 24]
ind42
ind1
ind30
ind40
ind41
ind38
ind34
ind11
ind37
ind35
ind4
ind3
ind19
ind39
ind5
ind33
ind31
ind25
ind16
ind26
ind2
Refinance
ind24
Equity
ind20
ind12
ind29
ind32
ind8
Soe
ind23
ind15
ind18
ind7
ind21
ind27
ind22
ind28
ind17
Pledge
Indep_ratio
Bgender
Bshare
Analyst_num
BM
Tobinq
Top1
Btenure
Bsalary
Managefee_ratio
Sentiment
Bage
Cashflow
Minorityrate
Manageshare
Institution
Market_idx
Constraint
Lnsize
Freecash2
Tax_volatility
Tax_avoid
Tunneling
Sharebalance
Growth
Da_abs
Lev
Tax_ratio
Retainedearn_ratio
ROA
Dividend_lag
1.482121759822773e-05
2.1582853810530758e-05
0.0001466854239751314
0.0002973099010444559
0.00034504961810911467
0.0004156508150592195
0.00043680901766259586
0.00045743052232556864
0.00047575675050521644
0.00047718118608610786
0.00047947810404561883

In [7]:
# Importances in the model's feature order, one value per line. The property
# is recomputed from the fitted trees on every access, so it is read a single
# time here rather than once per feature.
for importance in model_gbr.feature_importances_:
    print(importance)

0.01908925461750205
0.0219685030203203
0.009358530635097605
0.010096763708947163
0.011831892696029635
0.019848742088442367
0.017651065286232875
0.017677317408886607
0.001564633396647034
0.03368596383644073
0.028465599939579027
0.016466276298822132
0.028984013175932834
0.02159196237451765
0.023531505066165824
0.00804360932837071
0.0555322858970877
0.0258093811662618
0.027285406672337625
0.04858622668468011
0.02697387456426025
0.024655485670000955
0.0014230327641462522
0.019754302435052665
0.15838896176342854
0.09627505324780856
0.02103086023929369
0.014063732720316583
0.013714865249981414
0.033726656796492005
0.001847798597815548
0.031151333526340152
0.02500139818364497
0.01195150533318679
0.024647753949730376
2.1582853810530758e-05
0.0013444402813422096
0.0005219010629648981
0.00047947810404561883
0.0007967346817662843
0.0026619970112959825
0.0017579188189651867
0.00045743052232556864
0.001609219232554837
0.0020461178168401707
0.0011423708794063342
0.004122160492888558
0.00263510701118

In [8]:
# Random forest: 500 trees, 11 candidate features per split, fixed seed,
# n_jobs=-1 to use every available processor.
model_forest = RandomForestRegressor(
    n_estimators=500,
    max_features=11,
    random_state=0,
    n_jobs=-1,
)
model_forest.fit(x_train, y_train)
a = model_forest.feature_importances_.tolist()

# R^2 on the training rows (in-sample) and on the held-out rows (out-of-sample).
r2 = model_forest.score(x_train, y_train)
r2a = model_forest.score(x_test, y_test)

# Remaining error metrics, all computed on the held-out predictions.
pred_forest = model_forest.predict(x_test)
evs_predict = explained_variance_score(y_test, pred_forest)
mse_predict = mean_squared_error(y_test, pred_forest)
mae_predict = mean_absolute_error(y_test, pred_forest)
median_predict = median_absolute_error(y_test, pred_forest)

for metric_label, metric_value in (
    ('样本内R方=', r2),
    ('样本外R方=', r2a),
    ('EVS=', evs_predict),
    ('MSE=', mse_predict),
    ('MAE=', mae_predict),
    ('MedAE=', median_predict),
):
    print(metric_label, '%.4f' % metric_value)

样本内R方= 0.9032
样本外R方= 0.2723
EVS= 0.2723
MSE= 0.0760
MAE= 0.1568
MedAE= 0.0970


样本内R方= 0.970
样本外R方= 0.784
EVS= 0.784
MSE= 0.021
MAE= 0.071
MedAE= 0.038

In [9]:
# `feature_importances_` is a property that is recomputed from the fitted
# trees on every access, so read it once instead of once per loop iteration.
forest_importances = model_forest.feature_importances_

# Column positions ordered from least to most important.
sorted_index = forest_importances.argsort()
print(sorted_index)

# First the feature names in ascending order of importance ...
for feature_pos in sorted_index:
    print(x.columns[feature_pos])
# ... then the matching importance values in the same order.
for feature_pos in sorted_index:
    print(forest_importances[feature_pos])

[70 35 42 59 68 69 38 64 37 67 48 39 60 62 66 65 63 55 54 43 49 53 50 45
 44 56 52 41 61 47 36  8 58 22 51 40 46 30 57 15  2  3  4 33  7 23  6  0
 21 11 27  1 28  5 32 26 14 34 13 17 12  9 10 31 20 29 18 19 16 25 24]
ind42
ind1
ind11
ind30
ind40
ind41
ind4
ind35
ind3
ind39
ind19
ind5
ind31
ind33
ind38
ind37
ind34
ind26
ind25
ind12
ind20
ind24
ind21
ind16
ind15
ind27
ind23
ind8
ind32
ind18
ind2
Equity
ind29
Refinance
ind22
ind7
ind17
Soe
ind28
Pledge
Indep_ratio
Bgender
Bshare
Analyst_num
Bsalary
Sentiment
Btenure
Managefee_ratio
Constraint
Top1
Tobinq
Manageshare
BM
Bage
Lnsize
Cashflow
Institution
Market_idx
Minorityrate
Freecash2
Sharebalance
Da_abs
Tunneling
Growth
Tax_volatility
Lev
Tax_avoid
Tax_ratio
Retainedearn_ratio
ROA
Dividend_lag
3.0363139851417702e-05
5.713401992590449e-05
0.00015303547480166645
0.00020736506694431006
0.00030735422770479043
0.0003351163744113915
0.0004144844598819663
0.0004990691048450076
0.0005491068323530895
0.0006310055230017192
0.0006574197571960151
0.

In [10]:
# Importances in the model's feature order, one value per line. The property
# is recomputed from the fitted trees on every access, so it is read a single
# time here rather than once per feature.
for importance in model_forest.feature_importances_:
    print(importance)

0.0204259049014422
0.0209090559721594
0.010133375002670591
0.012111367121061516
0.013166117469063604
0.021367010009965347
0.020283488224185863
0.017324121686263592
0.00199482559343847
0.02663624352860935
0.02903447128332785
0.020772177912708092
0.026020419203740378
0.02492045882650508
0.023821501356509983
0.0073954629352263376
0.058655434044791226
0.0257532747026778
0.03919315794615738
0.04152638860379209
0.03163332931057872
0.020737528593826472
0.0022350861169994182
0.019534939003799465
0.14965268109237975
0.07418707891079561
0.023790261312517567
0.020826251387409105
0.021342614052960306
0.03390146617909385
0.002964204609100145
0.0314257012682384
0.023199950190298472
0.01384267789905867
0.024186811312020197
5.713401992590449e-05
0.0019847596523377476
0.0005491068323530895
0.0004144844598819663
0.0006994067065043072
0.0027732860267735615
0.0018399930819310792
0.00015303547480166645
0.0013289801179367689
0.0016270025859899
0.001576685613440624
0.0029144663205572865
0.0018686606990958993

In [11]:
###### Data import
# Read data-gulidown.csv from DATA_DIR (first row is the header), then show
# the first three rows followed by the (rows, columns) shape.
# Note: this rebinds `data`, replacing the frame loaded earlier in the notebook.
data = pd.DataFrame(pd.read_csv(DATA_DIR / 'data-gulidown.csv', header=0))
print(data.head(3), data.shape, sep='\n')

   Stkcd  year  Dividend_ratio1  Dividend_ratio2  Dividend_ratio3  Dividend  \
0      2  2006         0.304220         1.351294         0.971503         1   
1      2  2014         0.350498         1.085495         3.597122         1   
2      2  2015         0.439151         1.301688         2.947196         1   

   Managefee_ratio  Manageshare  Indep_ratio   Bgender  ...  ind32  ind33  \
0         0.047734       0.0292     0.363636  0.181818  ...      0      0   
1         0.026659       0.1815     0.363636  0.090909  ...      0      0   
2         0.024266       0.1912     0.363636  0.090909  ...      0      0   

   ind34  ind35  ind37  ind38  ind39  ind40  ind41  ind42  
0      0      0      0      1      0      0      0      0  
1      0      0      0      1      0      0      0      0  
2      0      0      0      1      0      0      0      0  

[3 rows x 77 columns]
(19284, 77)


In [12]:
###### Data preprocessing
# Features: every column from position 6 onwards.
# Target: column 2 (`Dividend_ratio1` in the printed header), the dividend payout ratio.
x = data.iloc[:, 6:]
y = data.iloc[:, 2]  # dividend payout ratio
# Alternative target (whether a dividend was paid).
# NOTE(review): position 1 is `year` in the printed header; the `Dividend`
# indicator sits at position 5, so confirm the index before enabling this line.
# y = data.iloc[:, 1]

# Single-year (2006) subset of features and target.
x_train1 = x.loc[data['year'] == 2006]
y_train1 = y.loc[data['year'] == 2006]

# Plain random 70/30 split over all years pooled together. No `stratify`
# argument is passed, so the split is NOT stratified on y.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30,
                                                    random_state=0)

# Fit the scaler on the training rows only and reuse those parameters for the
# test rows, so both sets go through exactly the same transformation.
sc = StandardScaler()
sc.fit(x_train)

# `transform` returns a bare ndarray. Rebuild the frames with the ORIGINAL row
# labels so x_train / x_test stay index-aligned with y_train / y_test
# (a default RangeIndex would silently mismatch rows in any pandas operation
# that aligns on the index, e.g. concat or arithmetic with y).
x_train = pd.DataFrame(sc.transform(x_train), columns=x.columns, index=x_train.index)
x_test = pd.DataFrame(sc.transform(x_test), columns=x.columns, index=x_test.index)
names = list(x_train.columns)


In [13]:
# Gradient boosting: 5000 stages of depth-7 trees, each stage fitted on a 60%
# row subsample, learning rate 0.01, fixed seed.
model_gbr = GradientBoostingRegressor(
    n_estimators=5000,
    max_depth=7,
    subsample=0.6,
    learning_rate=0.01,
    random_state=0,
)
model_gbr.fit(x_train, y_train)
a = model_gbr.feature_importances_.tolist()

# R^2 on the training rows (in-sample) and on the held-out rows (out-of-sample).
r2 = model_gbr.score(x_train, y_train)
r2a = model_gbr.score(x_test, y_test)

# Remaining error metrics, all computed on the held-out predictions.
pred_gbr = model_gbr.predict(x_test)
evs_predict = explained_variance_score(y_test, pred_gbr)
mse_predict = mean_squared_error(y_test, pred_gbr)
mae_predict = mean_absolute_error(y_test, pred_gbr)
median_predict = median_absolute_error(y_test, pred_gbr)

for metric_label, metric_value in (
    ('样本内R方=', r2),
    ('样本外R方=', r2a),
    ('EVS=', evs_predict),
    ('MSE=', mse_predict),
    ('MAE=', mae_predict),
    ('MedAE=', median_predict),
):
    print(metric_label, '%.4f' % metric_value)

样本内R方= 0.9821
样本外R方= 0.3404
EVS= 0.3411
MSE= 0.0706
MAE= 0.1504
MedAE= 0.0805


In [14]:
# `feature_importances_` is a property that is recomputed from the fitted
# trees on every access, so read it once instead of once per loop iteration.
gbr_importances = model_gbr.feature_importances_

# Column positions ordered from least to most important.
sorted_index = gbr_importances.argsort()
print(sorted_index)

# First the feature names in ascending order of importance ...
for feature_pos in sorted_index:
    print(x.columns[feature_pos])
# ... then the matching importance values in the same order.
for feature_pos in sorted_index:
    print(gbr_importances[feature_pos])

[42 62 70 64 59 38 35 50 63 48 56 44 61 36 39 69 49 66 22 67  8 37 65 55
 30 58 41 54 52 45 53 40 43 51 68 47 57 60 46  2  3 28 15 27 33  5  4 34
  7  0 32  1 26 11 12 10 13 21 23 14  6 17 18 29 20  9 31 19 16 25 24]
ind11
ind33
ind42
ind35
ind30
ind4
ind1
ind21
ind34
ind19
ind27
ind15
ind32
ind2
ind5
ind41
ind20
ind38
Refinance
ind39
Equity
ind3
ind37
ind26
Soe
ind29
ind8
ind25
ind23
ind16
ind24
ind7
ind12
ind22
ind40
ind18
ind28
ind31
ind17
Indep_ratio
Bgender
BM
Pledge
Tobinq
Analyst_num
Bage
Bshare
Market_idx
Bsalary
Managefee_ratio
Lnsize
Manageshare
Cashflow
Top1
Sharebalance
Tunneling
Minorityrate
Constraint
Sentiment
Institution
Btenure
Freecash2
Tax_avoid
Lev
Tax_volatility
Da_abs
Growth
Tax_ratio
Retainedearn_ratio
ROA
Dividend_lag
0.00013359930978889704
0.00027091670838454206
0.00037788955224046154
0.0004268976700036459
0.00044997662941320465
0.0004858953290106912
0.0005061427286224794
0.000639523405590516
0.0007297960146862466
0.0008101702170113336
0.0008647207964359074
0.0

In [15]:
# Random forest: 500 trees, 11 candidate features per split, fixed seed,
# n_jobs=-1 to use every available processor.
model_forest = RandomForestRegressor(
    n_estimators=500,
    max_features=11,
    random_state=0,
    n_jobs=-1,
)
model_forest.fit(x_train, y_train)
a = model_forest.feature_importances_.tolist()

# R^2 on the training rows (in-sample) and on the held-out rows (out-of-sample).
r2 = model_forest.score(x_train, y_train)
r2a = model_forest.score(x_test, y_test)

# Remaining error metrics, all computed on the held-out predictions.
pred_forest = model_forest.predict(x_test)
evs_predict = explained_variance_score(y_test, pred_forest)
mse_predict = mean_squared_error(y_test, pred_forest)
mae_predict = mean_absolute_error(y_test, pred_forest)
median_predict = median_absolute_error(y_test, pred_forest)

for metric_label, metric_value in (
    ('样本内R方=', r2),
    ('样本外R方=', r2a),
    ('EVS=', evs_predict),
    ('MSE=', mse_predict),
    ('MAE=', mae_predict),
    ('MedAE=', median_predict),
):
    print(metric_label, '%.4f' % metric_value)

样本内R方= 0.9094
样本外R方= 0.3576
EVS= 0.3585
MSE= 0.0687
MAE= 0.1472
MedAE= 0.0777


In [16]:
# `feature_importances_` is a property that is recomputed from the fitted
# trees on every access, so read it once instead of once per loop iteration.
forest_importances = model_forest.feature_importances_

# Column positions ordered from least to most important.
sorted_index = forest_importances.argsort()
print(sorted_index)

# First the feature names in ascending order of importance ...
for feature_pos in sorted_index:
    print(x.columns[feature_pos])
# ... then the matching importance values in the same order.
for feature_pos in sorted_index:
    print(forest_importances[feature_pos])

[35 70 42 64 38 59 62 44 50 69 56 67 61 39 66 36 49 43 37 48 68 47 63 52
 55 53 54 40 22 65 51 60  8 58 41 45 46 30 57  2 15  3 33  4  7 28 27  5
 23  1 34  0 32 12 21 10  6 14 11  9 26 13 17 29 31 20 19 18 16 25 24]
ind1
ind42
ind11
ind35
ind4
ind30
ind33
ind15
ind21
ind41
ind27
ind39
ind32
ind5
ind38
ind2
ind20
ind12
ind3
ind19
ind40
ind18
ind34
ind23
ind26
ind24
ind25
ind7
Refinance
ind37
ind22
ind31
Equity
ind29
ind8
ind16
ind17
Soe
ind28
Indep_ratio
Pledge
Bgender
Analyst_num
Bshare
Bsalary
BM
Tobinq
Bage
Sentiment
Manageshare
Market_idx
Managefee_ratio
Lnsize
Sharebalance
Constraint
Tunneling
Btenure
Institution
Top1
Da_abs
Cashflow
Minorityrate
Freecash2
Lev
Growth
Tax_volatility
Tax_ratio
Tax_avoid
Retainedearn_ratio
ROA
Dividend_lag
0.0001990792960746416
0.00022312305897232562
0.00025372092070086855
0.00043476618301558647
0.00044994214324900635
0.0004956693172687767
0.0005141545601996007
0.0005779000028431567
0.0005825563645743118
0.0005987893606141304
0.0006430157423995254
0.