In [4]:
# 导入工具
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# import warnings
# warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor

In [5]:
# Load the raw data sets
train_data_file = '../zhengqi_train.txt'  # '../' refers to the parent directory
test_data_file = '../zhengqi_test.txt'
# Both files are tab-separated and decoded as UTF-8; train is read first, then test.
train_data, test_data = (
    pd.read_csv(path, sep='\t', encoding='utf-8')
    for path in (train_data_file, test_data_file)
)

In [6]:
# Min-max normalization of the feature columns
from sklearn import preprocessing
# All columns but the last are treated as features
# (assumes 'target' is the last column of the training frame -- TODO confirm).
features_columns = train_data.columns[:-1]
min_max_scaler = preprocessing.MinMaxScaler()
# The scaler is fitted on the training set only and then re-used, unchanged, on the
# test set, so that both sets are mapped with the same per-feature min/max.
train_data_scaler = min_max_scaler.fit_transform(train_data[features_columns])
# Fix: transform the TEST frame here. The previous code transformed train_data again,
# which made test_data_scaler a copy of the scaled training features.
test_data_scaler = min_max_scaler.transform(test_data[features_columns])

# Wrap the scaled arrays back into labelled frames; only the training frame has a target.
train_data_scaler = pd.DataFrame(train_data_scaler, columns=features_columns)
train_data_scaler['target'] = train_data['target']

test_data_scaler = pd.DataFrame(test_data_scaler, columns=features_columns)

In [7]:
# Feature dimensionality reduction with PCA (principal component analysis)
from sklearn.decomposition import PCA
pca = PCA(n_components=16)  # keep 16 principal components
# Fit the projection on the training features only (every column but the last one),
# then apply the already-fitted projection to the test features.
new_train_pca_16 = pd.DataFrame(pca.fit_transform(train_data_scaler.iloc[:, :-1]))
new_test_pca_16 = pd.DataFrame(pca.transform(test_data_scaler))

# Re-attach the target column to the projected training frame
new_train_pca_16['target'] = train_data_scaler['target']

In [8]:
# Keep the PCA feature columns (those present in the test frame) and split the data
# new_train_pca_16 = new_train_pca_16.fillna(0)
target = new_train_pca_16['target']
train = new_train_pca_16[new_test_pca_16.columns]
# 80% of the rows are used for fitting, 20% are held out; fixed seed for a stable split.
train_data, test_data, train_target, test_target = train_test_split(
    train,
    target,
    test_size=0.2,
    random_state=0,
)


In [24]:
# Underfitting case
# SGDRegressor estimates the linear-regression parameters by stochastic gradient
# descent; tol is the stopping criterion.
clf = SGDRegressor(tol=1e-2, max_iter=500).fit(train_data, train_target)
score_train = mean_squared_error(y_true=train_target, y_pred=clf.predict(train_data))
score_test = mean_squared_error(y_true=test_target, y_pred=clf.predict(test_data))
print('SGDRegressor train MSE:  ', score_train)
print('SGDRegressor test MSE:  ', score_test)
# Open question from the author: is this enough to demonstrate underfitting?
# NOTE: clf.score() returns R^2 (coefficient of determination), not an MSE,
# even though the printed label below says "MSE".
print('score of train MSE:  ', clf.score(train_data, train_target))
print('score of test MSE:  ', clf.score(test_data, test_target))


SGDRegressor train MSE:   0.151629689344153
SGDRegressor test MSE:   0.15603285820345816
score of train MSE:   0.8403929480003336
score of test MSE:   0.8498426663515907


In [26]:
# Degree-5 polynomial feature expansion followed by an SGD-trained linear regressor
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=5)
# The expansion is fitted on the training split and re-applied to the held-out split.
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.transform(test_data)
clf = SGDRegressor(tol=1e-3, max_iter=500).fit(train_data_poly, train_target)
score_train = mean_squared_error(y_true=train_target, y_pred=clf.predict(train_data_poly))
score_test = mean_squared_error(y_true=test_target, y_pred=clf.predict(test_data_poly))
print('SGDRegressor train MSE:  ', score_train)
print('SGDRegressor test MSE:  ', score_test)
# Open question from the author: is this enough to demonstrate overfitting?
# NOTE: clf.score() returns R^2 (coefficient of determination), not an MSE,
# even though the printed label below says "MSE".
print('score of train MSE:  ', clf.score(train_data_poly, train_target))
print('score of test MSE:  ', clf.score(test_data_poly, test_target))


SGDRegressor train MSE:   0.13240533361273493
SGDRegressor test MSE:   0.14499079186996386
score of train MSE:   0.8606287128967472
score of test MSE:   0.8604689360853949


In [29]:
# Well-fitted case: degree-3 polynomial features + SGD-trained linear regressor
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3)
# The expansion is fitted on the training split and re-applied to the held-out split.
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.transform(test_data)
clf = SGDRegressor(tol=1e-3, max_iter=1000).fit(train_data_poly, train_target)
score_train = mean_squared_error(y_true=train_target, y_pred=clf.predict(train_data_poly))
score_test = mean_squared_error(y_true=test_target, y_pred=clf.predict(test_data_poly))
print('SGDRegressor train MSE:  ', score_train)
print('SGDRegressor test MSE:  ', score_test)

# Author's observation: the well-fitted setting gives the lowest MSE on the held-out split.
# NOTE: clf.score() returns R^2 (coefficient of determination), not an MSE,
# even though the printed label below says "MSE".
print('score of train MSE:  ', clf.score(train_data_poly, train_target))
print('score of test MSE:  ', clf.score(test_data_poly, test_target))


SGDRegressor train MSE:   0.1341720456934358
SGDRegressor test MSE:   0.1425203974222481
score of train MSE:   0.85876905264055
score of test MSE:   0.862846306131678


2. 模型正则化

2.1 L2范数正则化

In [35]:
# L2-regularized SGD regression on degree-3 polynomial features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(3)
# The expansion is fitted on the training split and re-applied to the held-out split.
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.transform(test_data)
# Fix: the penalty name must be the lowercase 'l2'. The uppercase 'L2' is not a
# documented value and is rejected by scikit-learn releases that validate
# constructor parameters. alpha is the regularization strength.
clf = SGDRegressor(max_iter=1000, tol=1e-3, penalty='l2', alpha=0.0001)
clf.fit(train_data_poly, train_target)
score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
print('SGDRegressor train MSE:  ', score_train)
print('SGDRegressor test MSE:  ', score_test)
# NOTE: clf.score() returns R^2 (coefficient of determination), not an MSE,
# even though the printed label below says "MSE".
print('score of train MSE:  ', clf.score(train_data_poly, train_target))
print('score of test MSE:  ', clf.score(test_data_poly, test_target))

SGDRegressor train MSE:   0.1344405679773981
SGDRegressor test MSE:   0.1427679486217192
score of train MSE:   0.8584864031783978
score of test MSE:   0.8626080766428252


3. 模型交叉验证

3.1 简单交叉验证

In [36]:
# Simple hold-out validation
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
# Split the original training data again: 80% for fitting, 20% for validation.
train_data, test_data, train_target, test_target = train_test_split(
    train,
    target,
    test_size=0.2,
    random_state=0,
)
poly = PolynomialFeatures(degree=3)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.transform(test_data)
clf = SGDRegressor(tol=1e-3, max_iter=1000).fit(train_data_poly, train_target)
score_train = mean_squared_error(y_true=train_target, y_pred=clf.predict(train_data_poly))
score_test = mean_squared_error(y_true=test_target, y_pred=clf.predict(test_data_poly))
print('SGDRegressor train MSE:  ', score_train)
print('SGDRegressor test MSE:  ', score_test)


SGDRegressor train MSE:   0.1340462879480953
SGDRegressor test MSE:   0.14245751452624547


3.2 K折交叉验证

In [38]:
# K-fold cross-validation of a plain SGD-trained linear regressor
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)  # 5-fold cross-validation
for k, (train_index, test_index) in enumerate(kf.split(train)):
    # KFold.split yields POSITIONAL row indices, so both the features and the
    # target are selected by position. The target previously used label-based
    # `target[...]`, which is only correct while the Series keeps a default
    # 0..n-1 index; `.iloc` is correct for any index.
    train_data, test_data = train.values[train_index], train.values[test_index]
    train_target, test_target = target.iloc[train_index], target.iloc[test_index]
    # A fresh model is fitted for every fold.
    clf = SGDRegressor(max_iter=1000, tol=1e-3)
    clf.fit(train_data, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data))
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print(k, '折', 'SGDRegressor train MSE:  ', score_train)
    print(k, '折', 'SGDRegressor test MSE:  ', score_test,'\n')



0 折 SGDRegressor train MSE:   0.15004340474265546
0 折 SGDRegressor test MSE:   0.10581830160699959 

1 折 SGDRegressor train MSE:   0.13362022019827405
1 折 SGDRegressor test MSE:   0.18229636366915986 

2 折 SGDRegressor train MSE:   0.1465700934689044
2 折 SGDRegressor test MSE:   0.132894816958445 

3 折 SGDRegressor train MSE:   0.14147686443421836
3 折 SGDRegressor test MSE:   0.16299025644470327 

4 折 SGDRegressor train MSE:   0.1387519893104706
4 折 SGDRegressor test MSE:   0.16546101120757006 



留一法、留P法略

4. 模型超参空间及调参

4.1 网格搜索

In [39]:
# Tune a random-forest regressor with an exhaustive grid search
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
# Re-create the 80/20 split with the same fixed seed as the earlier cells.
train_data, test_data, train_target, test_target = train_test_split(
    train,
    target,
    test_size=0.2,
    random_state=0,
)

randomForestRegressor = RandomForestRegressor()
# Candidate values: every combination of tree count and maximum depth is tried.
parameters = dict(n_estimators=[50, 100, 200], max_depth=[1, 2, 3])
# cv sets the number of cross-validation folds used to score each combination.
clf = GridSearchCV(estimator=randomForestRegressor, param_grid=parameters, cv=5)

clf.fit(train_data, train_target)
# Predictions come from the fitted search object, evaluated on the held-out split.
score_test = mean_squared_error(y_true=test_target, y_pred=clf.predict(test_data))

print('RandomForestRegressor GridSearchCV test MSE:  ', score_test)


RandomForestRegressor GridSearchCV test MSE:   0.2542298810896775
