In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error,median_absolute_error,explained_variance_score
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV,KFold,StratifiedKFold,RandomizedSearchCV #交叉验证
from sklearn.preprocessing import StandardScaler #特征标准化
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay #部分依赖图
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split  # 划分训练集、验证集、测试集
from sklearn.svm import SVR #支持向量机
from sklearn.neural_network import MLPRegressor #神经网络
from sklearn.tree import DecisionTreeRegressor #决策树
from sklearn.svm import SVR #支持向量机
from pathlib import Path
# from xgboost.sklearn import XGBRegressor

In [None]:
TARGET_FOLDER = '参考文献/1/20240618102625WU_FILE_1'


def locate_project_root(target_folder=TARGET_FOLDER):
    """Find the closest directory that contains ``target_folder``.

    The search starts at the resolved current working directory and then
    walks upwards through each of its ancestors, nearest first.

    Parameters
    ----------
    target_folder : str or path-like
        Path, relative to a candidate root, whose existence marks the root.

    Returns
    -------
    pathlib.Path
        The first directory in the search order that contains ``target_folder``.

    Raises
    ------
    FileNotFoundError
        If neither the working directory nor any ancestor contains it.
    """
    current = Path.cwd().resolve()
    search_order = (current, *current.parents)
    match = next(
        (folder for folder in search_order if (folder / target_folder).exists()),
        None,
    )
    if match is None:
        raise FileNotFoundError(f'未能在 {current} 及其父目录中定位 {target_folder}')
    return match

# Resolve every input/output location relative to the detected project root.
PROJECT_ROOT = locate_project_root()
DATA_DIR = PROJECT_ROOT.joinpath(TARGET_FOLDER, '数据', '数据-python')
OUTPUT_DIR = PROJECT_ROOT.joinpath('output')
TABLE_DIR, FIG_DIR, ML_DIR = (OUTPUT_DIR / sub for sub in ('tables', 'figures', 'ml'))

# Create the output tree up front so later cells can write without checking.
for path in (TABLE_DIR, FIG_DIR, ML_DIR):
    path.mkdir(parents=True, exist_ok=True)

print(f'PROJECT_ROOT: {PROJECT_ROOT}')


In [None]:
###### Load data
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
data = pd.read_csv(DATA_DIR / 'data.csv', header=0)
print(data.head(3))
print(data.shape)

In [None]:
###### Preprocessing: baseline feature set (no theory-based proxy variables)
x = data.iloc[:, 30:]
y = data.iloc[:, 2]  # dividend payout ratio
# y = data.iloc[:, 1]  # whether a dividend was paid

# The scaler is fitted on the 2006 panel only and then reused for every year.
sc = StandardScaler()
sc.fit(x.loc[data['year'] == 2006])

# One standardized feature frame and one target series per year, 2006..2022.
# x_train[k] / y_train[k] hold year 2006 + k (17 panels in total).
x_train = []
y_train = []
for year in range(2006, 2023):
    in_year = data['year'] == year
    x_train.append(pd.DataFrame(sc.transform(x.loc[in_year]), columns=x.columns))
    y_train.append(y.loc[in_year])

# Keep the 2006 panel available under its historical names.
x_train1 = x_train[0]
y_train1 = y_train[0]

names = list(x_train1.columns)

In [None]:
# Lasso: rolling one-year-ahead evaluation.
# Year i is used for fitting (alpha tuned by 5-fold CV); year i+1 is the hold-out.
result = []   # in-sample R^2 (year i)
result1 = []  # out-of-sample R^2 (year i+1)
result2 = []  # out-of-sample MSE
result3 = []  # out-of-sample MAE
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score

# The search settings do not change between years, so build them once.
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100, 1000]}
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

for i in range(16):  # 17 yearly panels -> 16 (train, test) pairs
    j = i + 1
    # Only six candidates exist, so evaluate them exhaustively and in a fixed
    # order. RandomizedSearchCV with its default n_iter=10 would visit the same
    # six values anyway, but in an unseeded random order.
    model_lasso = GridSearchCV(Lasso(), param_grid=param_grid, cv=kfold)
    model_lasso.fit(x_train[i], y_train[i])
    # print(model_lasso.best_params_)
    result.append(model_lasso.score(x_train[i], y_train[i]))
    result1.append(model_lasso.score(x_train[j], y_train[j]))
    pred_lasso = model_lasso.predict(x_train[j])
    for bucket, metric in (
        (result2, mean_squared_error),
        (result3, mean_absolute_error),
        (result4, median_absolute_error),
        (result5, explained_variance_score),
    ):
        bucket.append(metric(y_train[j], pred_lasso))

# Averages over the 16 rolling windows.
for label, scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(label, '%.4f' % np.mean(scores))

样本内R方= 0.090
样本外R方= 0.082
EVS= 0.086
MSE= 0.088
MAE= 0.199
MedAE= 0.158

In [None]:
# Decision tree: rolling one-year-ahead evaluation (fit on year i, test on year i+1).
# The random-forest importance table is deliberately not touched here; the
# former `result_forest` reset was copy-pasted and never used by this cell.
result = []   # in-sample R^2 (year i)
result1 = []  # out-of-sample R^2 (year i+1)
result2 = []  # out-of-sample MSE
result3 = []  # out-of-sample MAE
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score

for i in range(16):  # 17 yearly panels -> 16 (train, test) pairs
    j = i + 1
    model_tree = DecisionTreeRegressor(
        max_depth=3, max_features=5, random_state=0, splitter='random'
    )
    model_tree.fit(x_train[i], y_train[i])
    result.append(model_tree.score(x_train[i], y_train[i]))
    result1.append(model_tree.score(x_train[j], y_train[j]))
    pred_tree = model_tree.predict(x_train[j])
    for bucket, metric in (
        (result2, mean_squared_error),
        (result3, mean_absolute_error),
        (result4, median_absolute_error),
        (result5, explained_variance_score),
    ):
        bucket.append(metric(y_train[j], pred_tree))

# Averages over the 16 rolling windows.
for label, scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(label, '%.4f' % np.mean(scores))

样本内R方= 0.045
样本外R方= 0.026
EVS= 0.030
MSE= 0.093
MAE= 0.207
MedAE= 0.171

In [None]:
# RBF-kernel SVR with fixed hyper-parameters: rolling one-year-ahead evaluation
# (fit on year i, test on year i+1).
# The former `result_forest` reset was copy-pasted from the random-forest cell
# and never used here, so it has been dropped.
result = []   # in-sample R^2 (year i)
result1 = []  # out-of-sample R^2 (year i+1)
result2 = []  # out-of-sample MSE
result3 = []  # out-of-sample MAE
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score

for i in range(16):  # 17 yearly panels -> 16 (train, test) pairs
    j = i + 1
    model_svm = SVR(kernel='rbf', C=1, gamma=0.01)
    model_svm.fit(x_train[i], y_train[i])
    result.append(model_svm.score(x_train[i], y_train[i]))
    result1.append(model_svm.score(x_train[j], y_train[j]))
    pred_svm = model_svm.predict(x_train[j])
    for bucket, metric in (
        (result2, mean_squared_error),
        (result3, mean_absolute_error),
        (result4, median_absolute_error),
        (result5, explained_variance_score),
    ):
        bucket.append(metric(y_train[j], pred_svm))

# Averages over the 16 rolling windows.
for label, scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(label, '%.4f' % np.mean(scores))

样本内R方= 0.087
样本外R方= 0.072
EVS= 0.102
MSE= 0.089
MAE= 0.185
MedAE= 0.126

In [None]:
# Gradient boosting: rolling one-year-ahead evaluation (fit on year i, test on
# year i+1). Per-year feature importances are collected in result_gbr, whose
# first row is the list of feature names.
result, result1, result2 = [], [], []   # in-sample R^2, out-of-sample R^2, MSE
result3, result4, result5 = [], [], []  # MAE, median absolute error, explained variance
result_gbr = [names]

for i in range(16):
    j = i + 1
    model_gbr = GradientBoostingRegressor(
        n_estimators=3000,
        max_depth=5,
        subsample=0.7,
        learning_rate=0.001,
        random_state=0,
    )
    model_gbr.fit(x_train[i], y_train[i])
    result_gbr.append(model_gbr.feature_importances_.tolist())

    result.append(model_gbr.score(x_train[i], y_train[i]))
    result1.append(model_gbr.score(x_train[j], y_train[j]))

    pred_gbr = model_gbr.predict(x_train[j])
    for bucket, metric in (
        (result2, mean_squared_error),
        (result3, mean_absolute_error),
        (result4, median_absolute_error),
        (result5, explained_variance_score),
    ):
        bucket.append(metric(y_train[j], pred_gbr))

# Averages over the rolling windows.
for label, scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# Random forest: rolling one-year-ahead evaluation (fit on year i, test on
# year i+1). Per-year feature importances are collected in result_forest, whose
# first row is the list of feature names.
result, result1, result2 = [], [], []   # in-sample R^2, out-of-sample R^2, MSE
result3, result4, result5 = [], [], []  # MAE, median absolute error, explained variance
result_forest = [names]

for i in range(16):
    j = i + 1
    model_forest = RandomForestRegressor(
        n_estimators=5000, max_features=19, random_state=0, n_jobs=-1
    )
    model_forest.fit(x_train[i], y_train[i])
    result_forest.append(model_forest.feature_importances_.tolist())

    result.append(model_forest.score(x_train[i], y_train[i]))
    result1.append(model_forest.score(x_train[j], y_train[j]))

    pred_forest = model_forest.predict(x_train[j])
    for bucket, metric in (
        (result2, mean_squared_error),
        (result3, mean_absolute_error),
        (result4, median_absolute_error),
        (result5, explained_variance_score),
    ):
        bucket.append(metric(y_train[j], pred_forest))

# Averages over the rolling windows.
for label, scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(label, '%.4f' % np.mean(scores))

样本内R方= 0.878
样本外R方= 0.093
EVS= 0.103
MSE= 0.086
MAE= 0.189
MedAE= 0.133

In [None]:
# OLS benchmark restricted to the first 11 feature columns.
# The slices live in their own list so the shared x_train panels keep every
# column; slicing x_train in place would silently shrink the inputs of any
# model cell that is re-run after this one.
x_train_ols = [frame.iloc[:, :11] for frame in x_train]

result = []   # in-sample R^2 (year i)
result1 = []  # out-of-sample R^2 (year i+1)
result2 = []  # out-of-sample MSE
result3 = []  # out-of-sample MAE
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score

for i in range(16):  # 17 yearly panels -> 16 (train, test) pairs
    j = i + 1
    lr = LinearRegression()
    lr.fit(x_train_ols[i], y_train[i])
    result.append(lr.score(x_train_ols[i], y_train[i]))
    result1.append(lr.score(x_train_ols[j], y_train[j]))
    pred_ols = lr.predict(x_train_ols[j])
    for bucket, metric in (
        (result2, mean_squared_error),
        (result3, mean_absolute_error),
        (result4, median_absolute_error),
        (result5, explained_variance_score),
    ):
        bucket.append(metric(y_train[j], pred_ols))

# Averages over the 16 rolling windows.
for label, scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(label, '%.4f' % np.mean(scores))
# print(result)
# print(result1)
# print(result2)
# print(result3)
# print(result4)
# print(result5)

In [None]:
###### Preprocessing: add the type-I agency problem proxies
x1 = data.iloc[:, 6:16]
x2 = data.iloc[:, 30:]
x = pd.concat([x1, x2], axis=1)

y = data.iloc[:, 2]  # dividend payout ratio
# y = data.iloc[:, 1]  # whether a dividend was paid

# The scaler is fitted on the 2006 panel only and then reused for every year.
sc = StandardScaler()
sc.fit(x.loc[data['year'] == 2006])

# One standardized feature frame and one target series per year, 2006..2022.
# x_train[k] / y_train[k] hold year 2006 + k (17 panels in total).
x_train = []
y_train = []
for year in range(2006, 2023):
    in_year = data['year'] == year
    x_train.append(pd.DataFrame(sc.transform(x.loc[in_year]), columns=x.columns))
    y_train.append(y.loc[in_year])

# Keep the 2006 panel available under its historical names.
x_train1 = x_train[0]
y_train1 = y_train[0]

In [None]:
# Lasso: rolling one-year-ahead evaluation.
# Year i is used for fitting (alpha tuned by 5-fold CV); year i+1 is the hold-out.
result = []   # in-sample R^2 (year i)
result1 = []  # out-of-sample R^2 (year i+1)
result2 = []  # out-of-sample MSE
result3 = []  # out-of-sample MAE
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score

# The search settings do not change between years, so build them once.
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100, 1000]}
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

for i in range(16):  # 17 yearly panels -> 16 (train, test) pairs
    j = i + 1
    # Only six candidates exist, so evaluate them exhaustively and in a fixed
    # order. RandomizedSearchCV with its default n_iter=10 would visit the same
    # six values anyway, but in an unseeded random order.
    model_lasso = GridSearchCV(Lasso(), param_grid=param_grid, cv=kfold)
    model_lasso.fit(x_train[i], y_train[i])
    # print(model_lasso.best_params_)
    result.append(model_lasso.score(x_train[i], y_train[i]))
    result1.append(model_lasso.score(x_train[j], y_train[j]))
    pred_lasso = model_lasso.predict(x_train[j])
    for bucket, metric in (
        (result2, mean_squared_error),
        (result3, mean_absolute_error),
        (result4, median_absolute_error),
        (result5, explained_variance_score),
    ):
        bucket.append(metric(y_train[j], pred_lasso))

# Averages over the 16 rolling windows.
for label, scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# Decision tree: rolling one-year-ahead evaluation (fit on year i, test on year i+1).
# The random-forest importance table is deliberately not touched here; the
# former `result_forest` reset was copy-pasted and never used by this cell.
result = []   # in-sample R^2 (year i)
result1 = []  # out-of-sample R^2 (year i+1)
result2 = []  # out-of-sample MSE
result3 = []  # out-of-sample MAE
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score

for i in range(16):  # 17 yearly panels -> 16 (train, test) pairs
    j = i + 1
    model_tree = DecisionTreeRegressor(
        max_depth=3, max_features=6, random_state=0, splitter='random'
    )
    model_tree.fit(x_train[i], y_train[i])
    result.append(model_tree.score(x_train[i], y_train[i]))
    result1.append(model_tree.score(x_train[j], y_train[j]))
    pred_tree = model_tree.predict(x_train[j])
    for bucket, metric in (
        (result2, mean_squared_error),
        (result3, mean_absolute_error),
        (result4, median_absolute_error),
        (result5, explained_variance_score),
    ):
        bucket.append(metric(y_train[j], pred_tree))

# Averages over the 16 rolling windows.
for label, scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# RBF-kernel SVR with fixed hyper-parameters: rolling one-year-ahead evaluation
# (fit on year i, test on year i+1).
# The former `result_forest` reset was copy-pasted from the random-forest cell
# and never used here, so it has been dropped.
result = []   # in-sample R^2 (year i)
result1 = []  # out-of-sample R^2 (year i+1)
result2 = []  # out-of-sample MSE
result3 = []  # out-of-sample MAE
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score

for i in range(16):  # 17 yearly panels -> 16 (train, test) pairs
    j = i + 1
    model_svm = SVR(kernel='rbf', C=1, gamma=0.01)
    model_svm.fit(x_train[i], y_train[i])
    result.append(model_svm.score(x_train[i], y_train[i]))
    result1.append(model_svm.score(x_train[j], y_train[j]))
    pred_svm = model_svm.predict(x_train[j])
    for bucket, metric in (
        (result2, mean_squared_error),
        (result3, mean_absolute_error),
        (result4, median_absolute_error),
        (result5, explained_variance_score),
    ):
        bucket.append(metric(y_train[j], pred_svm))

# Averages over the 16 rolling windows.
for label, scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# Gradient boosting: rolling one-year-ahead evaluation (fit on year i, test on
# year i+1). Per-year feature importances are collected in result_gbr.
result = []   # in-sample R^2 (year i)
result1 = []  # out-of-sample R^2 (year i+1)
result2 = []  # out-of-sample MSE
result3 = []  # out-of-sample MAE
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score

# The header row must name the columns of the feature matrix used in THIS
# section. The global `names` is only set by the baseline preprocessing cell,
# so using it here would give a header shorter than the importance rows.
result_gbr = [list(x_train[0].columns)]

for i in range(16):  # 17 yearly panels -> 16 (train, test) pairs
    j = i + 1
    model_gbr = GradientBoostingRegressor(
        n_estimators=3000,
        max_depth=4,
        subsample=0.7,
        learning_rate=0.001,
        random_state=0,
    )
    model_gbr.fit(x_train[i], y_train[i])
    result_gbr.append(model_gbr.feature_importances_.tolist())

    result.append(model_gbr.score(x_train[i], y_train[i]))
    result1.append(model_gbr.score(x_train[j], y_train[j]))
    pred_gbr = model_gbr.predict(x_train[j])
    for bucket, metric in (
        (result2, mean_squared_error),
        (result3, mean_absolute_error),
        (result4, median_absolute_error),
        (result5, explained_variance_score),
    ):
        bucket.append(metric(y_train[j], pred_gbr))

# Averages over the 16 rolling windows.
for label, scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# result = []
# result1 = []
# result2 = []
# result3 = []
# result4 = []
# result5 = []
# result_gbr=[]
# result_gbr.append(names)
# for i in range(0,16):
#     j = i+1
#     param_distributions = {'n_estimators':[1000,2000,3000,5000],'max_depth':range(3,10),
#                        'subsample':np.linspace(0.1,1,10),'learning_rate':[0.001,0.01]}
#     kfold = KFold(n_splits = 5,shuffle = True,random_state = 0)
#     model_gbr = RandomizedSearchCV(GradientBoostingRegressor(random_state=0),
#                            param_distributions=param_distributions, n_iter = 10,
#                                cv = kfold,random_state = 0,n_jobs = -1,scoring='r2')
#     model_gbr.fit(x_train[i],y_train[i])
#     print(model_gbr.best_params_) 
# #     a = model_gbr.feature_importances_.tolist()
# #     result_gbr.append(a)
#     r2 = model_gbr.score(x_train[i],y_train[i])
#     result.append(r2)
#     r2a = model_gbr.score(x_train[j],y_train[j])
#     result1.append(r2a)
#     pred_gbr = model_gbr.predict(x_train[j])
#     mse_predict = mean_squared_error(y_train[j], pred_gbr)
#     result2.append(mse_predict)
#     mae_predict = mean_absolute_error(y_train[j], pred_gbr)
#     result3.append(mae_predict)
#     median_predict = median_absolute_error(y_train[j], pred_gbr)
#     result4.append(median_predict)
#     evs_predict = explained_variance_score(y_train[j], pred_gbr)
#     result5.append(evs_predict)
# print('样本内R方=','%.4f'%np.mean(result))
# print('样本外R方=','%.4f'%np.mean(result1)) 
# print('EVS=','%.4f'%np.mean(result5))
# print('MSE=','%.4f'%np.mean(result2))
# print('MAE=','%.4f'%np.mean(result3))
# print('MedAE=','%.4f'%np.mean(result4))
# # print(result)
# # print(result1)
# # print(result2)
# # print(result3)
# # print(result4)
# # print(result5)

In [None]:
# Random forest with randomized hyper-parameter search: rolling one-year-ahead
# evaluation (tune and fit on year i, test on year i+1).
result = []   # in-sample R^2 (year i)
result1 = []  # out-of-sample R^2 (year i+1)
result2 = []  # out-of-sample MSE
result3 = []  # out-of-sample MAE
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score

# Header row for an importance table. It is taken from the feature matrix of
# THIS section; the global `names` is only set by the baseline preprocessing
# cell and would be stale here. No importance rows are appended in this cell.
result_forest = [list(x_train[0].columns)]

# The search settings do not change between years, so build them once.
param_distributions = {
    'n_estimators': [1000, 2000, 3000, 4000, 5000],
    'max_features': range(5, 15),
}
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

for i in range(16):  # 17 yearly panels -> 16 (train, test) pairs
    # Shorter debugging run: for i in range(0, 7):
    j = i + 1
    model_forest = RandomizedSearchCV(
        RandomForestRegressor(random_state=0),
        param_distributions=param_distributions,
        n_iter=10,
        cv=kfold,
        random_state=0,
        n_jobs=-1,
    )
    model_forest.fit(x_train[i], y_train[i])
    print(model_forest.best_params_)
    result.append(model_forest.score(x_train[i], y_train[i]))
    result1.append(model_forest.score(x_train[j], y_train[j]))
    pred_forest = model_forest.predict(x_train[j])
    for bucket, metric in (
        (result2, mean_squared_error),
        (result3, mean_absolute_error),
        (result4, median_absolute_error),
        (result5, explained_variance_score),
    ):
        bucket.append(metric(y_train[j], pred_forest))

# Averages over the 16 rolling windows.
for label, scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(label, '%.4f' % np.mean(scores))

# Per-window values, in the same order as the lists are declared above.
for scores in (result, result1, result2, result3, result4, result5):
    print(scores)

In [None]:
# OLS benchmark restricted to the first 21 feature columns.
# The slices live in their own list so the shared x_train panels keep every
# column; slicing x_train in place would silently shrink the inputs of any
# model cell that is re-run after this one.
x_train_ols = [frame.iloc[:, :21] for frame in x_train]

result = []   # in-sample R^2 (year i)
result1 = []  # out-of-sample R^2 (year i+1)
result2 = []  # out-of-sample MSE
result3 = []  # out-of-sample MAE
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score

for i in range(16):  # 17 yearly panels -> 16 (train, test) pairs
    j = i + 1
    lr = LinearRegression()
    lr.fit(x_train_ols[i], y_train[i])
    result.append(lr.score(x_train_ols[i], y_train[i]))
    result1.append(lr.score(x_train_ols[j], y_train[j]))
    pred_ols = lr.predict(x_train_ols[j])
    for bucket, metric in (
        (result2, mean_squared_error),
        (result3, mean_absolute_error),
        (result4, median_absolute_error),
        (result5, explained_variance_score),
    ):
        bucket.append(metric(y_train[j], pred_ols))

# Averages over the 16 rolling windows.
for label, scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(label, '%.4f' % np.mean(scores))
# print(result)
# print(result1)
# print(result2)
# print(result3)
# print(result4)
# print(result5)

In [None]:
###### Preprocessing: add the type-II agency problem proxies
x1 = data.iloc[:, 16:22]
x2 = data.iloc[:, 30:]
x = pd.concat([x1, x2], axis=1)

y = data.iloc[:, 2]  # dividend payout ratio
# y = data.iloc[:, 1]  # whether a dividend was paid

# The scaler is fitted on the 2006 panel only and then reused for every year.
sc = StandardScaler()
sc.fit(x.loc[data['year'] == 2006])

# One standardized feature frame and one target series per year, 2006..2022.
# x_train[k] / y_train[k] hold year 2006 + k (17 panels in total).
x_train = []
y_train = []
for year in range(2006, 2023):
    in_year = data['year'] == year
    x_train.append(pd.DataFrame(sc.transform(x.loc[in_year]), columns=x.columns))
    y_train.append(y.loc[in_year])

# Keep the 2006 panel available under its historical names.
x_train1 = x_train[0]
y_train1 = y_train[0]

In [None]:
# Lasso: rolling one-year-ahead evaluation.
# Year i is used for fitting (alpha tuned by 5-fold CV); year i+1 is the hold-out.
result = []   # in-sample R^2 (year i)
result1 = []  # out-of-sample R^2 (year i+1)
result2 = []  # out-of-sample MSE
result3 = []  # out-of-sample MAE
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score

# The search settings do not change between years, so build them once.
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100, 1000]}
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

for i in range(16):  # 17 yearly panels -> 16 (train, test) pairs
    j = i + 1
    # Only six candidates exist, so evaluate them exhaustively and in a fixed
    # order. RandomizedSearchCV with its default n_iter=10 would visit the same
    # six values anyway, but in an unseeded random order.
    model_lasso = GridSearchCV(Lasso(), param_grid=param_grid, cv=kfold)
    model_lasso.fit(x_train[i], y_train[i])
    # print(model_lasso.best_params_)
    result.append(model_lasso.score(x_train[i], y_train[i]))
    result1.append(model_lasso.score(x_train[j], y_train[j]))
    pred_lasso = model_lasso.predict(x_train[j])
    for bucket, metric in (
        (result2, mean_squared_error),
        (result3, mean_absolute_error),
        (result4, median_absolute_error),
        (result5, explained_variance_score),
    ):
        bucket.append(metric(y_train[j], pred_lasso))

# Averages over the 16 rolling windows.
for label, scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# Decision tree: rolling one-year-ahead evaluation (fit on year i, test on year i+1).
# The random-forest importance table is deliberately not touched here; the
# former `result_forest` reset was copy-pasted and never used by this cell.
result = []   # in-sample R^2 (year i)
result1 = []  # out-of-sample R^2 (year i+1)
result2 = []  # out-of-sample MSE
result3 = []  # out-of-sample MAE
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score

for i in range(16):  # 17 yearly panels -> 16 (train, test) pairs
    j = i + 1
    model_tree = DecisionTreeRegressor(
        max_depth=3, max_features=6, random_state=0, splitter='random'
    )
    model_tree.fit(x_train[i], y_train[i])
    result.append(model_tree.score(x_train[i], y_train[i]))
    result1.append(model_tree.score(x_train[j], y_train[j]))
    pred_tree = model_tree.predict(x_train[j])
    for bucket, metric in (
        (result2, mean_squared_error),
        (result3, mean_absolute_error),
        (result4, median_absolute_error),
        (result5, explained_variance_score),
    ):
        bucket.append(metric(y_train[j], pred_tree))

# Averages over the 16 rolling windows.
for label, scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# RBF-kernel SVR with fixed hyper-parameters: rolling one-year-ahead evaluation
# (fit on year i, test on year i+1).
# The former `result_forest` reset was copy-pasted from the random-forest cell
# and never used here, so it has been dropped.
result = []   # in-sample R^2 (year i)
result1 = []  # out-of-sample R^2 (year i+1)
result2 = []  # out-of-sample MSE
result3 = []  # out-of-sample MAE
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score

for i in range(16):  # 17 yearly panels -> 16 (train, test) pairs
    j = i + 1
    model_svm = SVR(kernel='rbf', C=1, gamma=0.01)
    model_svm.fit(x_train[i], y_train[i])
    result.append(model_svm.score(x_train[i], y_train[i]))
    result1.append(model_svm.score(x_train[j], y_train[j]))
    pred_svm = model_svm.predict(x_train[j])
    for bucket, metric in (
        (result2, mean_squared_error),
        (result3, mean_absolute_error),
        (result4, median_absolute_error),
        (result5, explained_variance_score),
    ):
        bucket.append(metric(y_train[j], pred_svm))

# Averages over the 16 rolling windows.
for label, scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# Gradient boosting with randomized hyper-parameter search: rolling
# one-year-ahead evaluation (tune and fit on year i, test on year i+1).
result = []   # in-sample R^2 (year i)
result1 = []  # out-of-sample R^2 (year i+1)
result2 = []  # out-of-sample MSE
result3 = []  # out-of-sample MAE
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score

# Header row for an importance table. It is taken from the feature matrix of
# THIS section; the global `names` is only set by the baseline preprocessing
# cell and would be stale here. No importance rows are appended in this cell.
result_gbr = [list(x_train[0].columns)]

# The search settings do not change between years, so build them once.
param_distributions = {
    'n_estimators': [1000, 2000, 3000, 5000],
    'max_depth': range(3, 10),
    'subsample': np.linspace(0.1, 1, 10),
    'learning_rate': [0.001, 0.01],
}
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

for i in range(16):  # 17 yearly panels -> 16 (train, test) pairs
    j = i + 1
    model_gbr = RandomizedSearchCV(
        GradientBoostingRegressor(random_state=0),
        param_distributions=param_distributions,
        n_iter=10,
        cv=kfold,
        random_state=0,
        n_jobs=-1,
        scoring='r2',
    )
    model_gbr.fit(x_train[i], y_train[i])
    print(model_gbr.best_params_)
    # Feature importances are intentionally not collected here:
    #     a = model_gbr.feature_importances_.tolist()
    #     result_gbr.append(a)
    result.append(model_gbr.score(x_train[i], y_train[i]))
    result1.append(model_gbr.score(x_train[j], y_train[j]))
    pred_gbr = model_gbr.predict(x_train[j])
    for bucket, metric in (
        (result2, mean_squared_error),
        (result3, mean_absolute_error),
        (result4, median_absolute_error),
        (result5, explained_variance_score),
    ):
        bucket.append(metric(y_train[j], pred_gbr))

# Averages over the 16 rolling windows.
for label, scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(label, '%.4f' % np.mean(scores))
# print(result)
# print(result1)
# print(result2)
# print(result3)
# print(result4)
# print(result5)

In [None]:
# Random forest with randomized hyper-parameter search: rolling one-year-ahead
# evaluation (tune and fit on year i, test on year i+1).
result = []   # in-sample R^2 (year i)
result1 = []  # out-of-sample R^2 (year i+1)
result2 = []  # out-of-sample MSE
result3 = []  # out-of-sample MAE
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score

# Header row for an importance table. It is taken from the feature matrix of
# THIS section; the global `names` is only set by the baseline preprocessing
# cell and would be stale here. No importance rows are appended in this cell.
result_forest = [list(x_train[0].columns)]

# The search settings do not change between years, so build them once.
param_distributions = {
    'n_estimators': [1000, 2000, 3000, 4000, 5000],
    'max_features': range(5, 15),
}
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

for i in range(16):  # 17 yearly panels -> 16 (train, test) pairs
    # Shorter debugging run: for i in range(0, 7):
    j = i + 1
    model_forest = RandomizedSearchCV(
        RandomForestRegressor(random_state=0),
        param_distributions=param_distributions,
        n_iter=10,
        cv=kfold,
        random_state=0,
        n_jobs=-1,
    )
    model_forest.fit(x_train[i], y_train[i])
    print(model_forest.best_params_)
    result.append(model_forest.score(x_train[i], y_train[i]))
    result1.append(model_forest.score(x_train[j], y_train[j]))
    pred_forest = model_forest.predict(x_train[j])
    for bucket, metric in (
        (result2, mean_squared_error),
        (result3, mean_absolute_error),
        (result4, median_absolute_error),
        (result5, explained_variance_score),
    ):
        bucket.append(metric(y_train[j], pred_forest))

# Averages over the 16 rolling windows.
for label, scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(label, '%.4f' % np.mean(scores))

# Per-window values, in the same order as the lists are declared above.
for scores in (result, result1, result2, result3, result4, result5):
    print(scores)

In [None]:
# OLS benchmark restricted to the first 18 feature columns.
# NOTE(review): this section prepends 6 proxy columns (data.iloc[:, 16:22]).
# The other OLS cells keep 11 baseline columns (11 alone, 10 + 11 = 21), which
# would give 6 + 11 = 17 here, not 18 -- confirm the intended width.
# The slices live in their own list so the shared x_train panels keep every
# column; slicing x_train in place would silently shrink the inputs of any
# model cell that is re-run after this one.
x_train_ols = [frame.iloc[:, :18] for frame in x_train]

result = []   # in-sample R^2 (year i)
result1 = []  # out-of-sample R^2 (year i+1)
result2 = []  # out-of-sample MSE
result3 = []  # out-of-sample MAE
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score

for i in range(16):  # 17 yearly panels -> 16 (train, test) pairs
    j = i + 1
    lr = LinearRegression()
    lr.fit(x_train_ols[i], y_train[i])
    result.append(lr.score(x_train_ols[i], y_train[i]))
    result1.append(lr.score(x_train_ols[j], y_train[j]))
    pred_ols = lr.predict(x_train_ols[j])
    for bucket, metric in (
        (result2, mean_squared_error),
        (result3, mean_absolute_error),
        (result4, median_absolute_error),
        (result5, explained_variance_score),
    ):
        bucket.append(metric(y_train[j], pred_ols))

# Averages over the 16 rolling windows.
for label, scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(label, '%.4f' % np.mean(scores))
# print(result)
# print(result1)
# print(result2)
# print(result3)
# print(result4)
# print(result5)

In [None]:
###### Preprocessing: add the life-cycle characteristics
x1 = data.iloc[:, 22:24]
x2 = data.iloc[:, 30:]
x = pd.concat([x1, x2], axis=1)

y = data.iloc[:, 2]  # dividend payout ratio
# y = data.iloc[:, 1]  # whether a dividend was paid

# The scaler is fitted on the 2006 panel only and then reused for every year.
sc = StandardScaler()
sc.fit(x.loc[data['year'] == 2006])

# One standardized feature frame and one target series per year, 2006..2022.
# x_train[k] / y_train[k] hold year 2006 + k (17 panels in total).
x_train = []
y_train = []
for year in range(2006, 2023):
    in_year = data['year'] == year
    x_train.append(pd.DataFrame(sc.transform(x.loc[in_year]), columns=x.columns))
    y_train.append(y.loc[in_year])

# Keep the 2006 panel available under its historical names.
x_train1 = x_train[0]
y_train1 = y_train[0]

In [None]:
# Lasso: rolling one-year-ahead evaluation.
# Year i is used for fitting (alpha tuned by 5-fold CV); year i+1 is the hold-out.
result = []   # in-sample R^2 (year i)
result1 = []  # out-of-sample R^2 (year i+1)
result2 = []  # out-of-sample MSE
result3 = []  # out-of-sample MAE
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score

# The search settings do not change between years, so build them once.
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100, 1000]}
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

for i in range(16):  # 17 yearly panels -> 16 (train, test) pairs
    j = i + 1
    # Only six candidates exist, so evaluate them exhaustively and in a fixed
    # order. RandomizedSearchCV with its default n_iter=10 would visit the same
    # six values anyway, but in an unseeded random order.
    model_lasso = GridSearchCV(Lasso(), param_grid=param_grid, cv=kfold)
    model_lasso.fit(x_train[i], y_train[i])
    # print(model_lasso.best_params_)
    result.append(model_lasso.score(x_train[i], y_train[i]))
    result1.append(model_lasso.score(x_train[j], y_train[j]))
    pred_lasso = model_lasso.predict(x_train[j])
    for bucket, metric in (
        (result2, mean_squared_error),
        (result3, mean_absolute_error),
        (result4, median_absolute_error),
        (result5, explained_variance_score),
    ):
        bucket.append(metric(y_train[j], pred_lasso))

# Averages over the 16 rolling windows.
for label, scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# Decision tree: rolling one-year-ahead evaluation (fit on year i, test on year i+1).
# The random-forest importance table is deliberately not touched here; the
# former `result_forest` reset was copy-pasted and never used by this cell.
result = []   # in-sample R^2 (year i)
result1 = []  # out-of-sample R^2 (year i+1)
result2 = []  # out-of-sample MSE
result3 = []  # out-of-sample MAE
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score

for i in range(16):  # 17 yearly panels -> 16 (train, test) pairs
    j = i + 1
    model_tree = DecisionTreeRegressor(
        max_depth=3, max_features=5, random_state=0, splitter='random'
    )
    model_tree.fit(x_train[i], y_train[i])
    result.append(model_tree.score(x_train[i], y_train[i]))
    result1.append(model_tree.score(x_train[j], y_train[j]))
    pred_tree = model_tree.predict(x_train[j])
    for bucket, metric in (
        (result2, mean_squared_error),
        (result3, mean_absolute_error),
        (result4, median_absolute_error),
        (result5, explained_variance_score),
    ):
        bucket.append(metric(y_train[j], pred_tree))

# Averages over the 16 rolling windows.
for label, scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# RBF support-vector regression, one-year-ahead evaluation: fit on year i,
# score on year i (in-sample) and on year i + 1 (out-of-sample).
result, result1, result2, result3, result4, result5 = [], [], [], [], [], []
# Seeded with `names` only; nothing else is appended to it in this cell.
result_forest = [names]
for i in range(16):
    j = i + 1  # evaluation year
    model_svm = SVR(kernel='rbf', C=1, gamma=0.01)
    model_svm.fit(x_train[i], y_train[i])
    result.append(model_svm.score(x_train[i], y_train[i]))    # in-sample R^2
    result1.append(model_svm.score(x_train[j], y_train[j]))   # out-of-sample R^2
    pred_svm = model_svm.predict(x_train[j])
    result2.append(mean_squared_error(y_train[j], pred_svm))
    result3.append(mean_absolute_error(y_train[j], pred_svm))
    result4.append(median_absolute_error(y_train[j], pred_svm))
    result5.append(explained_variance_score(y_train[j], pred_svm))

# Averages over the 16 yearly fits.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# Gradient boosting tuned by a seeded randomized search (10 draws, 5-fold CV,
# R^2 scoring). For each year i the search is run on year i and the refitted
# best model is scored on year i and on year i + 1.
param_distributions = {
    'n_estimators': [1000, 2000, 3000, 5000],
    'max_depth': range(3, 10),
    'subsample': np.linspace(0.1, 1, 10),
    'learning_rate': [0.001, 0.01],
}
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

result, result1, result2, result3, result4, result5 = [], [], [], [], [], []
# Only holds `names` here; feature importances are not collected in this cell.
result_gbr = [names]
for i in range(16):
    j = i + 1  # evaluation year
    model_gbr = RandomizedSearchCV(
        GradientBoostingRegressor(random_state=0),
        param_distributions=param_distributions,
        n_iter=10, cv=kfold, random_state=0, n_jobs=-1, scoring='r2',
    )
    model_gbr.fit(x_train[i], y_train[i])
    print(model_gbr.best_params_)
    result.append(model_gbr.score(x_train[i], y_train[i]))    # in-sample R^2
    result1.append(model_gbr.score(x_train[j], y_train[j]))   # out-of-sample R^2
    pred_gbr = model_gbr.predict(x_train[j])
    result2.append(mean_squared_error(y_train[j], pred_gbr))
    result3.append(mean_absolute_error(y_train[j], pred_gbr))
    result4.append(median_absolute_error(y_train[j], pred_gbr))
    result5.append(explained_variance_score(y_train[j], pred_gbr))

# Averages over the 16 yearly fits.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# Gradient boosting with fixed hyper-parameters, one-year-ahead evaluation.
# Besides the metrics, the per-year feature importances are collected in
# result_gbr (first entry: `names`, then one list per fitted year).
result, result1, result2, result3, result4, result5 = [], [], [], [], [], []
result_gbr = [names]
for i in range(16):
    j = i + 1  # evaluation year
    model_gbr = GradientBoostingRegressor(n_estimators=3000, max_depth=5,
                                          subsample=0.7, learning_rate=0.001,
                                          random_state=0)
    model_gbr.fit(x_train[i], y_train[i])
    result_gbr.append(model_gbr.feature_importances_.tolist())
    result.append(model_gbr.score(x_train[i], y_train[i]))    # in-sample R^2
    result1.append(model_gbr.score(x_train[j], y_train[j]))   # out-of-sample R^2
    pred_gbr = model_gbr.predict(x_train[j])
    result2.append(mean_squared_error(y_train[j], pred_gbr))
    result3.append(mean_absolute_error(y_train[j], pred_gbr))
    result4.append(median_absolute_error(y_train[j], pred_gbr))
    result5.append(explained_variance_score(y_train[j], pred_gbr))

# Averages over the 16 yearly fits.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# Random forest tuned by a seeded randomized search (10 draws, 5-fold CV).
# For each year i the search runs on year i; the refitted best model is then
# scored on year i (in-sample) and on year i + 1 (out-of-sample).
param_distributions = {
    'n_estimators': [1000, 2000, 3000, 4000, 5000],
    'max_features': range(5, 15),
}
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

result, result1, result2, result3, result4, result5 = [], [], [], [], [], []
# Seeded with `names` only; nothing else is appended to it in this cell.
result_forest = [names]
for i in range(16):
    j = i + 1  # evaluation year
    model_forest = RandomizedSearchCV(
        RandomForestRegressor(random_state=0),
        param_distributions=param_distributions,
        n_iter=10, cv=kfold, random_state=0, n_jobs=-1,
    )
    model_forest.fit(x_train[i], y_train[i])
    print(model_forest.best_params_)
    result.append(model_forest.score(x_train[i], y_train[i]))    # in-sample R^2
    result1.append(model_forest.score(x_train[j], y_train[j]))   # out-of-sample R^2
    pred_forest = model_forest.predict(x_train[j])
    result2.append(mean_squared_error(y_train[j], pred_forest))
    result3.append(mean_absolute_error(y_train[j], pred_forest))
    result4.append(median_absolute_error(y_train[j], pred_forest))
    result5.append(explained_variance_score(y_train[j], pred_forest))

# Averages over the 16 yearly fits, followed by the raw per-year series.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))
for scores in (result, result1, result2, result3, result4, result5):
    print(scores)

In [None]:
# OLS benchmark on the leading 13 feature columns, evaluated one year ahead
# (fit on year i, score on year i and on year i + 1).
#
# The column subset is taken into a separate list rather than written back
# into x_train, so the model cells above still see the full feature set if
# they are re-run after this cell.
x_train_ols = [frame.iloc[:, :13] for frame in x_train]

result = []   # in-sample R^2
result1 = []  # out-of-sample R^2
result2 = []  # out-of-sample mean squared error
result3 = []  # out-of-sample mean absolute error
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score
for i in range(len(x_train_ols) - 1):
    j = i + 1  # evaluation year
    lr = LinearRegression()
    lr.fit(x_train_ols[i], y_train[i])
    result.append(lr.score(x_train_ols[i], y_train[i]))
    result1.append(lr.score(x_train_ols[j], y_train[j]))
    pred_ols = lr.predict(x_train_ols[j])
    result2.append(mean_squared_error(y_train[j], pred_ols))
    result3.append(mean_absolute_error(y_train[j], pred_ols))
    result4.append(median_absolute_error(y_train[j], pred_ols))
    result5.append(explained_variance_score(y_train[j], pred_ols))

# Report every metric averaged over the yearly fits.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))

In [None]:
###### Data preprocessing: add the corporate tax-rate features

x1 = data.iloc[:, 24:27]
x2 = data.iloc[:, 30:]
x = pd.concat([x1, x2], axis=1)

y = data.iloc[:, 2]  # dividend payout ratio
# y = data.iloc[:, 1]  # whether a dividend was paid

# The scaler is fit on the 2006 rows only and then applied to every year.
# NOTE(review): later years are therefore scaled with 2006 statistics even
# when they serve as the training year -- confirm this is intended.
sc = StandardScaler()
sc.fit(x.loc[data['year'] == 2006])

# Build one standardized feature frame and one target series per year
# (2006 .. 2022); position k in each list corresponds to year 2006 + k.
# The lists are built directly; no exec-generated placeholder variables.
x_train = []
y_train = []
for year in range(2006, 2023):
    in_year = data['year'] == year
    # transform() returns a bare array, so the column labels are restored.
    x_train.append(pd.DataFrame(sc.transform(x.loc[in_year]), columns=x.columns))
    y_train.append(y.loc[in_year])

# Kept for code that still refers to the 2006 split by name.
x_train1 = x_train[0]
y_train1 = y_train[0]

In [None]:
# Lasso with the tax-rate features, evaluated one year ahead: tuned and
# fitted on year i, scored on year i (in-sample) and year i + 1.
#
# Six alpha candidates are searched exhaustively with GridSearchCV; the
# former RandomizedSearchCV call (default n_iter=10 on a 6-point grid, no
# random_state) was already an unseeded exhaustive search in practice.
param_distributions = {'alpha': [0.01, 0.1, 1, 10, 100, 1000]}
kfold = KFold(n_splits=5, shuffle=True, random_state=0)  # seeded 5-fold splitter

result = []   # in-sample R^2
result1 = []  # out-of-sample R^2
result2 = []  # out-of-sample mean squared error
result3 = []  # out-of-sample mean absolute error
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score
for i in range(len(x_train) - 1):
    j = i + 1  # evaluation year
    model_lasso = GridSearchCV(Lasso(), param_grid=param_distributions, cv=kfold)
    model_lasso.fit(x_train[i], y_train[i])
    result.append(model_lasso.score(x_train[i], y_train[i]))
    result1.append(model_lasso.score(x_train[j], y_train[j]))
    pred_lasso = model_lasso.predict(x_train[j])
    result2.append(mean_squared_error(y_train[j], pred_lasso))
    result3.append(mean_absolute_error(y_train[j], pred_lasso))
    result4.append(median_absolute_error(y_train[j], pred_lasso))
    result5.append(explained_variance_score(y_train[j], pred_lasso))

# Report every metric averaged over the yearly fits.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# Decision tree with the tax-rate features, one-year-ahead evaluation:
# fit on year i, score on year i (in-sample) and year i + 1 (out-of-sample).
result, result1, result2, result3, result4, result5 = [], [], [], [], [], []
# Seeded with `names` only; nothing else is appended to it in this cell.
result_forest = [names]
for i in range(16):
    j = i + 1  # evaluation year
    model_tree = DecisionTreeRegressor(max_depth=4, max_features=12,
                                       random_state=0, splitter='random')
    model_tree.fit(x_train[i], y_train[i])
    result.append(model_tree.score(x_train[i], y_train[i]))    # in-sample R^2
    result1.append(model_tree.score(x_train[j], y_train[j]))   # out-of-sample R^2
    pred_tree = model_tree.predict(x_train[j])
    result2.append(mean_squared_error(y_train[j], pred_tree))
    result3.append(mean_absolute_error(y_train[j], pred_tree))
    result4.append(median_absolute_error(y_train[j], pred_tree))
    result5.append(explained_variance_score(y_train[j], pred_tree))

# Averages over the 16 yearly fits.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# RBF support-vector regression with the tax-rate features, one-year-ahead
# evaluation: fit on year i, score on year i and on year i + 1.
result, result1, result2, result3, result4, result5 = [], [], [], [], [], []
# Seeded with `names` only; nothing else is appended to it in this cell.
result_forest = [names]
for i in range(16):
    j = i + 1  # evaluation year
    model_svm = SVR(kernel='rbf', C=1, gamma=0.01)
    model_svm.fit(x_train[i], y_train[i])
    result.append(model_svm.score(x_train[i], y_train[i]))    # in-sample R^2
    result1.append(model_svm.score(x_train[j], y_train[j]))   # out-of-sample R^2
    pred_svm = model_svm.predict(x_train[j])
    result2.append(mean_squared_error(y_train[j], pred_svm))
    result3.append(mean_absolute_error(y_train[j], pred_svm))
    result4.append(median_absolute_error(y_train[j], pred_svm))
    result5.append(explained_variance_score(y_train[j], pred_svm))

# Averages over the 16 yearly fits.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# Gradient boosting with the tax-rate features, tuned by a seeded randomized
# search (10 draws, 5-fold CV, R^2 scoring). The search runs on year i; the
# refitted best model is scored on year i and on year i + 1.
param_distributions = {
    'n_estimators': [1000, 2000, 3000, 5000],
    'max_depth': range(3, 10),
    'subsample': np.linspace(0.1, 1, 10),
    'learning_rate': [0.001, 0.01],
}
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

result, result1, result2, result3, result4, result5 = [], [], [], [], [], []
# Only holds `names` here; feature importances are not collected in this cell.
result_gbr = [names]
for i in range(16):
    j = i + 1  # evaluation year
    model_gbr = RandomizedSearchCV(
        GradientBoostingRegressor(random_state=0),
        param_distributions=param_distributions,
        n_iter=10, cv=kfold, random_state=0, n_jobs=-1, scoring='r2',
    )
    model_gbr.fit(x_train[i], y_train[i])
    print(model_gbr.best_params_)
    result.append(model_gbr.score(x_train[i], y_train[i]))    # in-sample R^2
    result1.append(model_gbr.score(x_train[j], y_train[j]))   # out-of-sample R^2
    pred_gbr = model_gbr.predict(x_train[j])
    result2.append(mean_squared_error(y_train[j], pred_gbr))
    result3.append(mean_absolute_error(y_train[j], pred_gbr))
    result4.append(median_absolute_error(y_train[j], pred_gbr))
    result5.append(explained_variance_score(y_train[j], pred_gbr))

# Averages over the 16 yearly fits.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# Random forest with the tax-rate features, tuned by a seeded randomized
# search (10 draws, 5-fold CV). The search runs on year i; the refitted best
# model is scored on year i (in-sample) and year i + 1 (out-of-sample).
param_distributions = {
    'n_estimators': [1000, 2000, 3000, 4000, 5000],
    'max_features': range(5, 15),
}
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

result, result1, result2, result3, result4, result5 = [], [], [], [], [], []
# Seeded with `names` only; nothing else is appended to it in this cell.
result_forest = [names]
for i in range(16):
    j = i + 1  # evaluation year
    model_forest = RandomizedSearchCV(
        RandomForestRegressor(random_state=0),
        param_distributions=param_distributions,
        n_iter=10, cv=kfold, random_state=0, n_jobs=-1,
    )
    model_forest.fit(x_train[i], y_train[i])
    print(model_forest.best_params_)
    result.append(model_forest.score(x_train[i], y_train[i]))    # in-sample R^2
    result1.append(model_forest.score(x_train[j], y_train[j]))   # out-of-sample R^2
    pred_forest = model_forest.predict(x_train[j])
    result2.append(mean_squared_error(y_train[j], pred_forest))
    result3.append(mean_absolute_error(y_train[j], pred_forest))
    result4.append(median_absolute_error(y_train[j], pred_forest))
    result5.append(explained_variance_score(y_train[j], pred_forest))

# Averages over the 16 yearly fits, followed by the raw per-year series.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))
for scores in (result, result1, result2, result3, result4, result5):
    print(scores)

In [None]:
# OLS benchmark on the leading 14 feature columns of the tax-rate feature
# set, evaluated one year ahead (fit on year i, score on years i and i + 1).
#
# The column subset is taken into a separate list rather than written back
# into x_train, so the model cells above still see the full feature set if
# they are re-run after this cell.
x_train_ols = [frame.iloc[:, :14] for frame in x_train]

result = []   # in-sample R^2
result1 = []  # out-of-sample R^2
result2 = []  # out-of-sample mean squared error
result3 = []  # out-of-sample mean absolute error
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score
for i in range(len(x_train_ols) - 1):
    j = i + 1  # evaluation year
    lr = LinearRegression()
    lr.fit(x_train_ols[i], y_train[i])
    result.append(lr.score(x_train_ols[i], y_train[i]))
    result1.append(lr.score(x_train_ols[j], y_train[j]))
    pred_ols = lr.predict(x_train_ols[j])
    result2.append(mean_squared_error(y_train[j], pred_ols))
    result3.append(mean_absolute_error(y_train[j], pred_ols))
    result4.append(median_absolute_error(y_train[j], pred_ols))
    result5.append(explained_variance_score(y_train[j], pred_ols))

# Report every metric averaged over the yearly fits.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))

In [None]:
###### Data preprocessing: add the financing-demand features
x1 = data.iloc[:, 27:29]
x2 = data.iloc[:, 30:]
x = pd.concat([x1, x2], axis=1)

y = data.iloc[:, 2]  # dividend payout ratio
# y = data.iloc[:, 1]  # whether a dividend was paid

# The scaler is fit on the 2006 rows only and then applied to every year.
# NOTE(review): later years are therefore scaled with 2006 statistics even
# when they serve as the training year -- confirm this is intended.
sc = StandardScaler()
sc.fit(x.loc[data['year'] == 2006])

# Build one standardized feature frame and one target series per year
# (2006 .. 2022); position k in each list corresponds to year 2006 + k.
# The lists are built directly; no exec-generated placeholder variables.
x_train = []
y_train = []
for year in range(2006, 2023):
    in_year = data['year'] == year
    # transform() returns a bare array, so the column labels are restored.
    x_train.append(pd.DataFrame(sc.transform(x.loc[in_year]), columns=x.columns))
    y_train.append(y.loc[in_year])

# Kept for code that still refers to the 2006 split by name.
x_train1 = x_train[0]
y_train1 = y_train[0]

In [None]:
# Lasso with the financing-demand features, evaluated one year ahead: tuned
# and fitted on year i, scored on year i (in-sample) and year i + 1.
#
# Six alpha candidates are searched exhaustively with GridSearchCV; the
# former RandomizedSearchCV call (default n_iter=10 on a 6-point grid, no
# random_state) was already an unseeded exhaustive search in practice.
param_distributions = {'alpha': [0.01, 0.1, 1, 10, 100, 1000]}
kfold = KFold(n_splits=5, shuffle=True, random_state=0)  # seeded 5-fold splitter

result = []   # in-sample R^2
result1 = []  # out-of-sample R^2
result2 = []  # out-of-sample mean squared error
result3 = []  # out-of-sample mean absolute error
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score
for i in range(len(x_train) - 1):
    j = i + 1  # evaluation year
    model_lasso = GridSearchCV(Lasso(), param_grid=param_distributions, cv=kfold)
    model_lasso.fit(x_train[i], y_train[i])
    result.append(model_lasso.score(x_train[i], y_train[i]))
    result1.append(model_lasso.score(x_train[j], y_train[j]))
    pred_lasso = model_lasso.predict(x_train[j])
    result2.append(mean_squared_error(y_train[j], pred_lasso))
    result3.append(mean_absolute_error(y_train[j], pred_lasso))
    result4.append(median_absolute_error(y_train[j], pred_lasso))
    result5.append(explained_variance_score(y_train[j], pred_lasso))

# Report every metric averaged over the yearly fits.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# Decision tree with the financing-demand features, one-year-ahead
# evaluation: fit on year i, score on year i and on year i + 1.
result, result1, result2, result3, result4, result5 = [], [], [], [], [], []
# Seeded with `names` only; nothing else is appended to it in this cell.
result_forest = [names]
for i in range(16):
    j = i + 1  # evaluation year
    model_tree = DecisionTreeRegressor(max_depth=5, max_features=14,
                                       random_state=0, splitter='random')
    model_tree.fit(x_train[i], y_train[i])
    result.append(model_tree.score(x_train[i], y_train[i]))    # in-sample R^2
    result1.append(model_tree.score(x_train[j], y_train[j]))   # out-of-sample R^2
    pred_tree = model_tree.predict(x_train[j])
    result2.append(mean_squared_error(y_train[j], pred_tree))
    result3.append(mean_absolute_error(y_train[j], pred_tree))
    result4.append(median_absolute_error(y_train[j], pred_tree))
    result5.append(explained_variance_score(y_train[j], pred_tree))

# Averages over the 16 yearly fits.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# RBF support-vector regression with the financing-demand features,
# one-year-ahead evaluation: fit on year i, score on years i and i + 1.
result, result1, result2, result3, result4, result5 = [], [], [], [], [], []
# Seeded with `names` only; nothing else is appended to it in this cell.
result_forest = [names]
for i in range(16):
    j = i + 1  # evaluation year
    model_svm = SVR(kernel='rbf', C=1, gamma=0.01)
    model_svm.fit(x_train[i], y_train[i])
    result.append(model_svm.score(x_train[i], y_train[i]))    # in-sample R^2
    result1.append(model_svm.score(x_train[j], y_train[j]))   # out-of-sample R^2
    pred_svm = model_svm.predict(x_train[j])
    result2.append(mean_squared_error(y_train[j], pred_svm))
    result3.append(mean_absolute_error(y_train[j], pred_svm))
    result4.append(median_absolute_error(y_train[j], pred_svm))
    result5.append(explained_variance_score(y_train[j], pred_svm))

# Averages over the 16 yearly fits.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# Gradient boosting with the financing-demand features, tuned by a seeded
# randomized search (10 draws, 5-fold CV, R^2 scoring). The search runs on
# year i; the refitted best model is scored on year i and on year i + 1.
param_distributions = {
    'n_estimators': [1000, 2000, 3000, 5000],
    'max_depth': range(3, 10),
    'subsample': np.linspace(0.1, 1, 10),
    'learning_rate': [0.001, 0.01],
}
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

result, result1, result2, result3, result4, result5 = [], [], [], [], [], []
# Only holds `names` here; feature importances are not collected in this cell.
result_gbr = [names]
for i in range(16):
    j = i + 1  # evaluation year
    model_gbr = RandomizedSearchCV(
        GradientBoostingRegressor(random_state=0),
        param_distributions=param_distributions,
        n_iter=10, cv=kfold, random_state=0, n_jobs=-1, scoring='r2',
    )
    model_gbr.fit(x_train[i], y_train[i])
    print(model_gbr.best_params_)
    result.append(model_gbr.score(x_train[i], y_train[i]))    # in-sample R^2
    result1.append(model_gbr.score(x_train[j], y_train[j]))   # out-of-sample R^2
    pred_gbr = model_gbr.predict(x_train[j])
    result2.append(mean_squared_error(y_train[j], pred_gbr))
    result3.append(mean_absolute_error(y_train[j], pred_gbr))
    result4.append(median_absolute_error(y_train[j], pred_gbr))
    result5.append(explained_variance_score(y_train[j], pred_gbr))

# Averages over the 16 yearly fits.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# Random forest with the financing-demand features, tuned by a seeded
# randomized search (10 draws, 5-fold CV). The search runs on year i; the
# refitted best model is scored on year i and on year i + 1.
param_distributions = {
    'n_estimators': [1000, 2000, 3000, 4000, 5000],
    'max_features': range(5, 15),
}
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

result, result1, result2, result3, result4, result5 = [], [], [], [], [], []
# Seeded with `names` only; nothing else is appended to it in this cell.
result_forest = [names]
for i in range(16):
    j = i + 1  # evaluation year
    model_forest = RandomizedSearchCV(
        RandomForestRegressor(random_state=0),
        param_distributions=param_distributions,
        n_iter=10, cv=kfold, random_state=0, n_jobs=-1,
    )
    model_forest.fit(x_train[i], y_train[i])
    print(model_forest.best_params_)
    result.append(model_forest.score(x_train[i], y_train[i]))    # in-sample R^2
    result1.append(model_forest.score(x_train[j], y_train[j]))   # out-of-sample R^2
    pred_forest = model_forest.predict(x_train[j])
    result2.append(mean_squared_error(y_train[j], pred_forest))
    result3.append(mean_absolute_error(y_train[j], pred_forest))
    result4.append(median_absolute_error(y_train[j], pred_forest))
    result5.append(explained_variance_score(y_train[j], pred_forest))

# Averages over the 16 yearly fits, followed by the raw per-year series.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))
for scores in (result, result1, result2, result3, result4, result5):
    print(scores)

In [None]:
# OLS benchmark on the leading 13 feature columns of the financing-demand
# feature set, evaluated one year ahead (fit on year i, score on i and i + 1).
#
# The column subset is taken into a separate list rather than written back
# into x_train, so the model cells above still see the full feature set if
# they are re-run after this cell.
x_train_ols = [frame.iloc[:, :13] for frame in x_train]

result = []   # in-sample R^2
result1 = []  # out-of-sample R^2
result2 = []  # out-of-sample mean squared error
result3 = []  # out-of-sample mean absolute error
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score
for i in range(len(x_train_ols) - 1):
    j = i + 1  # evaluation year
    lr = LinearRegression()
    lr.fit(x_train_ols[i], y_train[i])
    result.append(lr.score(x_train_ols[i], y_train[i]))
    result1.append(lr.score(x_train_ols[j], y_train[j]))
    pred_ols = lr.predict(x_train_ols[j])
    result2.append(mean_squared_error(y_train[j], pred_ols))
    result3.append(mean_absolute_error(y_train[j], pred_ols))
    result4.append(median_absolute_error(y_train[j], pred_ols))
    result5.append(explained_variance_score(y_train[j], pred_ols))

# Report every metric averaged over the yearly fits.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))

In [None]:
###### Data preprocessing: add the investor-sentiment feature
x1 = data.iloc[:, 29:30]
x2 = data.iloc[:, 30:]
x = pd.concat([x1, x2], axis=1)

y = data.iloc[:, 2]  # dividend payout ratio
# y = data.iloc[:, 1]  # whether a dividend was paid

# The scaler is fit on the 2006 rows only and then applied to every year.
# NOTE(review): later years are therefore scaled with 2006 statistics even
# when they serve as the training year -- confirm this is intended.
sc = StandardScaler()
sc.fit(x.loc[data['year'] == 2006])

# Build one standardized feature frame and one target series per year
# (2006 .. 2022); position k in each list corresponds to year 2006 + k.
# The lists are built directly; no exec-generated placeholder variables.
x_train = []
y_train = []
for year in range(2006, 2023):
    in_year = data['year'] == year
    # transform() returns a bare array, so the column labels are restored.
    x_train.append(pd.DataFrame(sc.transform(x.loc[in_year]), columns=x.columns))
    y_train.append(y.loc[in_year])

# Kept for code that still refers to the 2006 split by name.
x_train1 = x_train[0]
y_train1 = y_train[0]

In [None]:
# Lasso with the investor-sentiment feature, evaluated one year ahead: tuned
# and fitted on year i, scored on year i (in-sample) and year i + 1.
#
# Six alpha candidates are searched exhaustively with GridSearchCV; the
# former RandomizedSearchCV call (default n_iter=10 on a 6-point grid, no
# random_state) was already an unseeded exhaustive search in practice.
param_distributions = {'alpha': [0.01, 0.1, 1, 10, 100, 1000]}
kfold = KFold(n_splits=5, shuffle=True, random_state=0)  # seeded 5-fold splitter

result = []   # in-sample R^2
result1 = []  # out-of-sample R^2
result2 = []  # out-of-sample mean squared error
result3 = []  # out-of-sample mean absolute error
result4 = []  # out-of-sample median absolute error
result5 = []  # out-of-sample explained variance score
for i in range(len(x_train) - 1):
    j = i + 1  # evaluation year
    model_lasso = GridSearchCV(Lasso(), param_grid=param_distributions, cv=kfold)
    model_lasso.fit(x_train[i], y_train[i])
    result.append(model_lasso.score(x_train[i], y_train[i]))
    result1.append(model_lasso.score(x_train[j], y_train[j]))
    pred_lasso = model_lasso.predict(x_train[j])
    result2.append(mean_squared_error(y_train[j], pred_lasso))
    result3.append(mean_absolute_error(y_train[j], pred_lasso))
    result4.append(median_absolute_error(y_train[j], pred_lasso))
    result5.append(explained_variance_score(y_train[j], pred_lasso))

# Report every metric averaged over the yearly fits.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# Decision tree with the investor-sentiment feature, one-year-ahead
# evaluation: fit on year i, score on year i and on year i + 1.
result, result1, result2, result3, result4, result5 = [], [], [], [], [], []
# Seeded with `names` only; nothing else is appended to it in this cell.
result_forest = [names]
for i in range(16):
    j = i + 1  # evaluation year
    model_tree = DecisionTreeRegressor(max_depth=5, max_features=14,
                                       random_state=0, splitter='random')
    model_tree.fit(x_train[i], y_train[i])
    result.append(model_tree.score(x_train[i], y_train[i]))    # in-sample R^2
    result1.append(model_tree.score(x_train[j], y_train[j]))   # out-of-sample R^2
    pred_tree = model_tree.predict(x_train[j])
    result2.append(mean_squared_error(y_train[j], pred_tree))
    result3.append(mean_absolute_error(y_train[j], pred_tree))
    result4.append(median_absolute_error(y_train[j], pred_tree))
    result5.append(explained_variance_score(y_train[j], pred_tree))

# Averages over the 16 yearly fits.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# RBF support-vector regression with the investor-sentiment feature,
# one-year-ahead evaluation: fit on year i, score on years i and i + 1.
result, result1, result2, result3, result4, result5 = [], [], [], [], [], []
# Seeded with `names` only; nothing else is appended to it in this cell.
result_forest = [names]
for i in range(16):
    j = i + 1  # evaluation year
    model_svm = SVR(kernel='rbf', C=1, gamma=0.01)
    model_svm.fit(x_train[i], y_train[i])
    result.append(model_svm.score(x_train[i], y_train[i]))    # in-sample R^2
    result1.append(model_svm.score(x_train[j], y_train[j]))   # out-of-sample R^2
    pred_svm = model_svm.predict(x_train[j])
    result2.append(mean_squared_error(y_train[j], pred_svm))
    result3.append(mean_absolute_error(y_train[j], pred_svm))
    result4.append(median_absolute_error(y_train[j], pred_svm))
    result5.append(explained_variance_score(y_train[j], pred_svm))

# Averages over the 16 yearly fits.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# Gradient boosting with the investor-sentiment feature, tuned by a seeded
# randomized search (10 draws, 5-fold CV, R^2 scoring). The search runs on
# year i; the refitted best model is scored on year i and on year i + 1.
param_distributions = {
    'n_estimators': [1000, 2000, 3000, 5000],
    'max_depth': range(3, 10),
    'subsample': np.linspace(0.1, 1, 10),
    'learning_rate': [0.001, 0.01],
}
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

result, result1, result2, result3, result4, result5 = [], [], [], [], [], []
# Only holds `names` here; feature importances are not collected in this cell.
result_gbr = [names]
for i in range(16):
    j = i + 1  # evaluation year
    model_gbr = RandomizedSearchCV(
        GradientBoostingRegressor(random_state=0),
        param_distributions=param_distributions,
        n_iter=10, cv=kfold, random_state=0, n_jobs=-1, scoring='r2',
    )
    model_gbr.fit(x_train[i], y_train[i])
    print(model_gbr.best_params_)
    result.append(model_gbr.score(x_train[i], y_train[i]))    # in-sample R^2
    result1.append(model_gbr.score(x_train[j], y_train[j]))   # out-of-sample R^2
    pred_gbr = model_gbr.predict(x_train[j])
    result2.append(mean_squared_error(y_train[j], pred_gbr))
    result3.append(mean_absolute_error(y_train[j], pred_gbr))
    result4.append(median_absolute_error(y_train[j], pred_gbr))
    result5.append(explained_variance_score(y_train[j], pred_gbr))

# Averages over the 16 yearly fits.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))

In [None]:
# Random forest with the investor-sentiment feature, tuned by a seeded
# randomized search (10 draws, 5-fold CV). The search runs on year i; the
# refitted best model is scored on year i and on year i + 1.
param_distributions = {
    'n_estimators': [1000, 2000, 3000, 4000, 5000],
    'max_features': range(5, 15),
}
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

result, result1, result2, result3, result4, result5 = [], [], [], [], [], []
# Seeded with `names` only; nothing else is appended to it in this cell.
result_forest = [names]
for i in range(16):
    j = i + 1  # evaluation year
    model_forest = RandomizedSearchCV(
        RandomForestRegressor(random_state=0),
        param_distributions=param_distributions,
        n_iter=10, cv=kfold, random_state=0, n_jobs=-1,
    )
    model_forest.fit(x_train[i], y_train[i])
    print(model_forest.best_params_)
    result.append(model_forest.score(x_train[i], y_train[i]))    # in-sample R^2
    result1.append(model_forest.score(x_train[j], y_train[j]))   # out-of-sample R^2
    pred_forest = model_forest.predict(x_train[j])
    result2.append(mean_squared_error(y_train[j], pred_forest))
    result3.append(mean_absolute_error(y_train[j], pred_forest))
    result4.append(median_absolute_error(y_train[j], pred_forest))
    result5.append(explained_variance_score(y_train[j], pred_forest))

# Averages over the 16 yearly fits, followed by the raw per-year series.
for label, scores in (('样本内R方=', result), ('样本外R方=', result1), ('EVS=', result5),
                      ('MSE=', result2), ('MAE=', result3), ('MedAE=', result4)):
    print(label, '%.4f' % np.mean(scores))
for scores in (result, result1, result2, result3, result4, result5):
    print(scores)

In [None]:
# OLS benchmark restricted to the first 12 feature columns of every slice.
# NOTE(review): this overwrites the entries of x_train in place. Any cell above
# that is re-run after this one will see only these 12 columns; which variables
# they are is not visible here -- confirm against the preprocessing cell.
for i in range(17):
    x_train[i] = x_train[i].iloc[:, :12]

result, result1, result2 = [], [], []    # R2 on the fitted slice, R2 on the next slice, MSE
result3, result4, result5 = [], [], []   # MAE, median absolute error, explained variance

for i in range(16):
    j = i + 1  # index of the slice used for out-of-sample evaluation

    # Plain least squares on slice i; no hyper-parameters to tune.
    lr = LinearRegression()
    lr.fit(x_train[i], y_train[i])

    # R2 on the fitted slice and on the next one (sample_weight defaults to None).
    r2 = lr.score(x_train[i], y_train[i])
    r2a = lr.score(x_train[j], y_train[j])

    # Error metrics on the next slice.
    pred_ols = lr.predict(x_train[j])
    mse_predict = mean_squared_error(y_train[j], pred_ols)
    mae_predict = mean_absolute_error(y_train[j], pred_ols)
    median_predict = median_absolute_error(y_train[j], pred_ols)
    evs_predict = explained_variance_score(y_train[j], pred_ols)

    result.append(r2)
    result1.append(r2a)
    result2.append(mse_predict)
    result3.append(mae_predict)
    result4.append(median_predict)
    result5.append(evs_predict)

# Report each metric averaged over the 16 fits.
for _label, _scores in (
    ('样本内R方=', result),
    ('样本外R方=', result1),
    ('EVS=', result5),
    ('MSE=', result2),
    ('MAE=', result3),
    ('MedAE=', result4),
):
    print(_label, '%.4f' % np.mean(_scores))