In [1]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay  # partial dependence plots
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error,median_absolute_error,explained_variance_score
from sklearn.model_selection import GridSearchCV,KFold,StratifiedKFold,RandomizedSearchCV  # cross-validation
from sklearn.model_selection import train_test_split  # train / test splitting
from sklearn.neural_network import MLPRegressor  # neural network
from sklearn.preprocessing import StandardScaler  # feature standardisation
from sklearn.svm import SVR  # support vector machine
from sklearn.tree import DecisionTreeRegressor  # decision tree
# import xgboost as xgb
# from xgboost.sklearn import XGBRegressor

warnings.filterwarnings("ignore")


In [None]:
TARGET_FOLDER = '参考文献/1/20240618102625WU_FILE_1'


def locate_project_root(target_folder=TARGET_FOLDER):
    """Return the closest directory, searching upward from the CWD, that contains ``target_folder``.

    The resolved current working directory is examined first, followed by each
    of its parents from nearest to farthest.

    Parameters
    ----------
    target_folder : str or path-like
        Relative path that must exist underneath the returned directory.

    Returns
    -------
    pathlib.Path
        The first directory for which ``directory / target_folder`` exists.

    Raises
    ------
    FileNotFoundError
        If neither the working directory nor any of its parents contains
        ``target_folder``.
    """
    current = Path.cwd().resolve()
    search_chain = (current,) + tuple(current.parents)
    project_root = next(
        (folder for folder in search_chain if (folder / target_folder).exists()),
        None,
    )
    if project_root is None:
        raise FileNotFoundError(f'未能在 {current} 及其父目录中定位 {target_folder}')
    return project_root

# Resolve the project root once; every other location is derived from it.
PROJECT_ROOT = locate_project_root()

# Input data lives inside the reference folder; generated artefacts go under output/.
OUTPUT_DIR = PROJECT_ROOT / 'output'
DATA_DIR = PROJECT_ROOT / TARGET_FOLDER / '数据' / '数据-python'
TABLE_DIR, FIG_DIR, ML_DIR = (OUTPUT_DIR / sub for sub in ('tables', 'figures', 'ml'))

# Create the output sub-directories up front; existing directories are left alone.
for path in (TABLE_DIR, FIG_DIR, ML_DIR):
    path.mkdir(parents=True, exist_ok=True)

print(f'PROJECT_ROOT: {PROJECT_ROOT}')


In [None]:
###### Data import
# read_csv already returns a DataFrame (first row used as the header), so the
# result is used directly instead of being re-wrapped in pd.DataFrame(...).
data = pd.read_csv(DATA_DIR / 'data.csv', header=0)
print(data.head(3))
print(data.shape)

In [None]:
###### Data preprocessing
# Selection is positional: the features are every column from position 6 onward
# and the target is the column at position 2. Adjust both if the CSV layout changes.
x = data.iloc[:, 6:]
y = data.iloc[:, 2]

# Random 70/30 train/test split with a fixed seed. Rows are pooled (not grouped
# by year) and no `stratify` argument is passed, so the split is a plain shuffle.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30,
                                                    random_state=0)

# Learn the scaling statistics from the training rows only, then apply the same
# transformation to the test rows so both sets are scaled consistently.
sc = StandardScaler()
sc.fit(x_train)

# transform() is wrapped back into a DataFrame here. Passing the original row
# index keeps every feature row aligned by label with its entry in
# y_train / y_test; without it the frames would get a fresh 0..n-1 index.
x_train = pd.DataFrame(sc.transform(x_train), columns=x.columns, index=x_train.index)
x_test = pd.DataFrame(sc.transform(x_test), columns=x.columns, index=x_test.index)
names = list(x_train.columns)

In [None]:
# Baseline model: ordinary least squares (fit() returns the fitted estimator).
lr = LinearRegression().fit(x_train, y_train)

# score() is evaluated on the training rows (in-sample) and the held-out rows
# (out-of-sample); the printed labels identify it as R-squared.
r2 = lr.score(x_train, y_train)
r2a = lr.score(x_test, y_test)
print('样本内R方=', format(r2, '.4f'))
print('样本外R方=', format(r2a, '.4f'))

# Error metrics computed on the held-out predictions.
pred_ols = lr.predict(x_test)
evs_predict = explained_variance_score(y_test, pred_ols)
mse_predict = mean_squared_error(y_test, pred_ols)
mae_predict = mean_absolute_error(y_test, pred_ols)
median_predict = median_absolute_error(y_test, pred_ols)
print('EVS=', format(evs_predict, '.4f'))
print('MSE=', format(mse_predict, '.4f'))
print('MAE=', format(mae_predict, '.4f'))
print('MedAE=', format(median_predict, '.4f'))

In [None]:
# Candidate penalty strengths for the Lasso.
param_distributions = {'alpha':[0.01,0.1,1,10,100,1000]}
kfold = KFold(n_splits = 5,shuffle = True,random_state = 0)

# The candidate space is a small explicit list, so search it exhaustively.
# RandomizedSearchCV (default n_iter=10) would end up evaluating the same six
# candidates, but in an unseeded random order, which makes tie-breaking
# between equally scoring alphas non-reproducible.
model_lasso = GridSearchCV(Lasso(), param_grid=param_distributions, cv=kfold)
model_lasso.fit(x_train,y_train)

# score()/predict() on the fitted search delegate to the refitted best estimator.
r2 = model_lasso.score(x_train,y_train)
r2a = model_lasso.score(x_test,y_test)
print('样本内R方=','%.4f'%r2)
print('样本外R方=','%.4f'%r2a)

# Error metrics computed on the held-out predictions.
pred_lasso = model_lasso.predict(x_test)
evs_predict = explained_variance_score(y_test, pred_lasso)
print('EVS=','%.4f'%evs_predict)
mse_predict = mean_squared_error(y_test, pred_lasso)
print('MSE=','%.4f'%mse_predict)
mae_predict = mean_absolute_error(y_test, pred_lasso)
print('MAE=','%.4f'%mae_predict)
median_predict = median_absolute_error(y_test, pred_lasso)
print('MedAE=','%.4f'%median_predict)

In [None]:
# Gradient boosting: 5000 trees of depth 7, each fitted on a 60% subsample,
# with a learning rate of 0.01 and a fixed seed.
model_gbr = GradientBoostingRegressor(
    learning_rate=0.01,
    n_estimators=5000,
    max_depth=7,
    subsample=0.6,
    random_state=0,
).fit(x_train, y_train)

# Feature importances as a plain Python list.
a = model_gbr.feature_importances_.tolist()

# score() on the training rows (in-sample) and on the held-out rows (out-of-sample).
r2 = model_gbr.score(x_train, y_train)
r2a = model_gbr.score(x_test, y_test)
print('样本内R方=', format(r2, '.4f'))
print('样本外R方=', format(r2a, '.4f'))

# Error metrics computed on the held-out predictions.
pred_gbr = model_gbr.predict(x_test)
evs_predict = explained_variance_score(y_test, pred_gbr)
mse_predict = mean_squared_error(y_test, pred_gbr)
mae_predict = mean_absolute_error(y_test, pred_gbr)
median_predict = median_absolute_error(y_test, pred_gbr)
print('EVS=', format(evs_predict, '.4f'))
print('MSE=', format(mse_predict, '.4f'))
print('MAE=', format(mae_predict, '.4f'))
print('MedAE=', format(median_predict, '.4f'))

In [None]:
# feature_importances_ is computed from the fitted trees on every access, so it
# is read once here instead of once per loop iteration.
gbr_importances = model_gbr.feature_importances_

# Positions of the features ordered from least to most important.
sorted_index = gbr_importances.argsort()
print(sorted_index)

# Feature names in that order, one per line ...
for feature_name in x.columns[sorted_index]:
    print(feature_name)
# ... followed by the matching importance values in the same order.
for importance in gbr_importances[sorted_index]:
    print(importance)

In [None]:
# Importance of every feature in the original column order. The importances are
# fetched once and iterated directly, rather than being recomputed from the
# fitted trees for each index.
for importance in model_gbr.feature_importances_:
    print(importance)

In [None]:
# Random forest: 500 trees, 11 candidate features per split, fixed seed,
# n_jobs=-1 for parallel fitting.
model_forest = RandomForestRegressor(
    n_estimators=500,
    max_features=11,
    n_jobs=-1,
    random_state=0,
).fit(x_train, y_train)

# Feature importances as a plain Python list.
a = model_forest.feature_importances_.tolist()

# score() on the training rows (in-sample) and on the held-out rows (out-of-sample).
r2 = model_forest.score(x_train, y_train)
r2a = model_forest.score(x_test, y_test)
print('样本内R方=', format(r2, '.4f'))
print('样本外R方=', format(r2a, '.4f'))

# Error metrics computed on the held-out predictions.
pred_forest = model_forest.predict(x_test)
evs_predict = explained_variance_score(y_test, pred_forest)
mse_predict = mean_squared_error(y_test, pred_forest)
mae_predict = mean_absolute_error(y_test, pred_forest)
median_predict = median_absolute_error(y_test, pred_forest)
print('EVS=', format(evs_predict, '.4f'))
print('MSE=', format(mse_predict, '.4f'))
print('MAE=', format(mae_predict, '.4f'))
print('MedAE=', format(median_predict, '.4f'))

In [None]:
# feature_importances_ is computed from the fitted trees on every access, so it
# is read once here instead of once per loop iteration.
forest_importances = model_forest.feature_importances_

# Positions of the features ordered from least to most important.
sorted_index = forest_importances.argsort()
print(sorted_index)

# Feature names in that order, one per line ...
for feature_name in x.columns[sorted_index]:
    print(feature_name)
# ... followed by the matching importance values in the same order.
for importance in forest_importances[sorted_index]:
    print(importance)

In [None]:
# Importance of every feature in the original column order. The importances are
# fetched once and iterated directly, rather than being recomputed from the
# fitted trees for each index.
for importance in model_forest.feature_importances_:
    print(importance)

In [None]:
# Support vector regression with an RBF kernel (C=1, gamma=0.01).
model_svm = SVR(kernel='rbf', gamma=0.01, C=1).fit(x_train, y_train)

# score() on the training rows (in-sample) and on the held-out rows (out-of-sample).
r2 = model_svm.score(x_train, y_train)
r2a = model_svm.score(x_test, y_test)
print('样本内R方=', format(r2, '.4f'))
print('样本外R方=', format(r2a, '.4f'))

# Error metrics computed on the held-out predictions.
pred_svm = model_svm.predict(x_test)
evs_predict = explained_variance_score(y_test, pred_svm)
mse_predict = mean_squared_error(y_test, pred_svm)
mae_predict = mean_absolute_error(y_test, pred_svm)
median_predict = median_absolute_error(y_test, pred_svm)
print('EVS=', format(evs_predict, '.4f'))
print('MSE=', format(mse_predict, '.4f'))
print('MAE=', format(mae_predict, '.4f'))
print('MedAE=', format(median_predict, '.4f'))

In [None]:
# Single decision tree: depth capped at 10, 7 candidate features per split, fixed seed.
model_tree = DecisionTreeRegressor(
    max_depth=10,
    max_features=7,
    random_state=0,
).fit(x_train, y_train)

# score() on the training rows (in-sample) and on the held-out rows (out-of-sample).
r2 = model_tree.score(x_train, y_train)
r2a = model_tree.score(x_test, y_test)
print('样本内R方=', format(r2, '.4f'))
print('样本外R方=', format(r2a, '.4f'))

# Error metrics computed on the held-out predictions.
pred_tree = model_tree.predict(x_test)
evs_predict = explained_variance_score(y_test, pred_tree)
mse_predict = mean_squared_error(y_test, pred_tree)
mae_predict = mean_absolute_error(y_test, pred_tree)
median_predict = median_absolute_error(y_test, pred_tree)
print('EVS=', format(evs_predict, '.4f'))
print('MSE=', format(mse_predict, '.4f'))
print('MAE=', format(mae_predict, '.4f'))
print('MedAE=', format(median_predict, '.4f'))