## 特征优化

### 导入数据

In [1]:
import pandas as pd
  
train_data_file = "data/zhengqi_train.txt"
test_data_file =  "data/zhengqi_test.txt"

train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')

### 定义特征构造方法，构造特征

In [2]:
epsilon=1e-5

#组交叉特征，可以自行定义，如增加： x*x/y, log(x)/y 等等
func_dict = {
            'add': lambda x,y: x+y,
            'mins': lambda x,y: x-y,
            'div': lambda x,y: x/(y+epsilon),
            'multi': lambda x,y: x*y
            }

### 定义特征构造的函数

In [3]:
def auto_features_make(train_data,test_data,func_dict,col_list):
    train_data, test_data = train_data.copy(), test_data.copy()
    for col_i in col_list:
        for col_j in col_list:
            for func_name, func in func_dict.items():
                for data in [train_data,test_data]:
                    func_features = func(data[col_i],data[col_j])
                    col_func_features = '-'.join([col_i,func_name,col_j])
                    data[col_func_features] = func_features
    return train_data,test_data

### 对训练集和测试集数据进行特征构造

In [4]:
train_data2, test_data2 = auto_features_make(train_data,test_data,func_dict,col_list=test_data.columns)

  if __name__ == "__main__":


In [5]:
from sklearn.decomposition import PCA   #主成分分析法

#PCA方法降维
pca = PCA(n_components=500)
train_data2_pca = pca.fit_transform(train_data2.iloc[:,0:-1])
test_data2_pca = pca.transform(test_data2)
train_data2_pca = pd.DataFrame(train_data2_pca)
test_data2_pca = pd.DataFrame(test_data2_pca)
train_data2_pca['target'] = train_data2['target']

Feature names unseen at fit time:
- V37-multi-V37
Feature names seen at fit time, yet now missing:
- target



In [6]:
X_train2 = train_data2[test_data2.columns].values
y_train = train_data2['target']

### 使用lightgbm模型对新构造的特征进行模型训练和评估

In [None]:
# ls_validation i
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import numpy as np

# 5折交叉验证
Folds=5
kf = KFold(len(X_train2),
            # n_splits=Folds,
            random_state=2019, shuffle=True)
# 记录训练和预测MSE
MSE_DICT = {
    'train_mse':[],
    'test_mse':[]
}

# 线下训练预测
for i, (train_index, test_index) in enumerate(kf.split(X_train2)):
    # lgb树模型
    lgb_reg = lgb.LGBMRegressor(
        learning_rate=0.01,
        max_depth=-1,
        n_estimators=5000,
        boosting_type='gbdt',
        random_state=2019,
        objective='regression',
    )
   
    # 切分训练集和预测集
    X_train_KFold, X_test_KFold = X_train2[train_index], X_train2[test_index]
    y_train_KFold, y_test_KFold = y_train[train_index], y_train[test_index]
    
    # 训练模型
    lgb_reg.fit(
            X=X_train_KFold,y=y_train_KFold,
            eval_set=[(X_train_KFold, y_train_KFold),(X_test_KFold, y_test_KFold)],
            eval_names=['Train','Test'],
            early_stopping_rounds=100,
            eval_metric='MSE',
            verbose=50
        )


    # 训练集预测 测试集预测
    y_train_KFold_predict = lgb_reg.predict(X_train_KFold,num_iteration=lgb_reg.best_iteration_)
    y_test_KFold_predict = lgb_reg.predict(X_test_KFold,num_iteration=lgb_reg.best_iteration_) 
    
    print('第{}折 训练和预测 训练MSE 预测MSE'.format(i))
    train_mse = mean_squared_error(y_train_KFold_predict, y_train_KFold)
    print('------\n', '训练MSE\n', train_mse, '\n------')
    test_mse = mean_squared_error(y_test_KFold_predict, y_test_KFold)
    print('------\n', '预测MSE\n', test_mse, '\n------\n')
    
    MSE_DICT['train_mse'].append(train_mse)
    MSE_DICT['test_mse'].append(test_mse)
print('------\n', '训练MSE\n', MSE_DICT['train_mse'], '\n', np.mean(MSE_DICT['train_mse']), '\n------')
print('------\n', '预测MSE\n', MSE_DICT['test_mse'], '\n', np.mean(MSE_DICT['test_mse']), '\n------')



[50]	Train's l2: 0.418976	Test's l2: 0.105755
[100]	Train's l2: 0.203665	Test's l2: 0.0242962
[150]	Train's l2: 0.114456	Test's l2: 0.00489616
[200]	Train's l2: 0.0741974	Test's l2: 2.93437e-07
[250]	Train's l2: 0.0535211	Test's l2: 0.000800416
第0折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.07475946684347008 
------
------
 预测MSE
 3.306387977686079e-08 
------





[50]	Train's l2: 0.419015	Test's l2: 0.140632
[100]	Train's l2: 0.203691	Test's l2: 0.0985215
[150]	Train's l2: 0.114536	Test's l2: 0.079759
[200]	Train's l2: 0.074307	Test's l2: 0.0605385
[250]	Train's l2: 0.0536499	Test's l2: 0.0513692
[300]	Train's l2: 0.0416162	Test's l2: 0.049854
[350]	Train's l2: 0.0335032	Test's l2: 0.0429859
[400]	Train's l2: 0.0276629	Test's l2: 0.0404468
[450]	Train's l2: 0.0231634	Test's l2: 0.0412943
[500]	Train's l2: 0.0195293	Test's l2: 0.0397455
[550]	Train's l2: 0.0165658	Test's l2: 0.0393974
[600]	Train's l2: 0.0141633	Test's l2: 0.0407649
第1折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.01672784298833254 
------
------
 预测MSE
 0.03935054336279483 
------





[50]	Train's l2: 0.418947	Test's l2: 0.0188852
[100]	Train's l2: 0.203568	Test's l2: 0.00100975
[150]	Train's l2: 0.114468	Test's l2: 0.00107227
[200]	Train's l2: 0.0742317	Test's l2: 0.00688086
第2折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.15690875227032425 
------
------
 预测MSE
 5.611463906468012e-08 
------





[50]	Train's l2: 0.41899	Test's l2: 0.193962
[100]	Train's l2: 0.203864	Test's l2: 0.0695358
[150]	Train's l2: 0.114674	Test's l2: 0.0277131
[200]	Train's l2: 0.0744482	Test's l2: 0.0140802
[250]	Train's l2: 0.0536803	Test's l2: 0.00457946
[300]	Train's l2: 0.0416071	Test's l2: 0.000561737
[350]	Train's l2: 0.0334907	Test's l2: 5.71213e-06
[400]	Train's l2: 0.0276055	Test's l2: 0.000415383
第3折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.03533201196066011 
------
------
 预测MSE
 1.0083048983817551e-06 
------





[50]	Train's l2: 0.418819	Test's l2: 0.159919
[100]	Train's l2: 0.203548	Test's l2: 0.0760221
[150]	Train's l2: 0.114445	Test's l2: 0.0471659
[200]	Train's l2: 0.0742303	Test's l2: 0.0327988
[250]	Train's l2: 0.0535198	Test's l2: 0.0245416
[300]	Train's l2: 0.0414231	Test's l2: 0.0224708
[350]	Train's l2: 0.0333567	Test's l2: 0.0180571
[400]	Train's l2: 0.0275814	Test's l2: 0.019877
[450]	Train's l2: 0.0231131	Test's l2: 0.0220159
第4折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.03179678613338918 
------
------
 预测MSE
 0.01758442125350803 
------





[50]	Train's l2: 0.419068	Test's l2: 0.00376511
[100]	Train's l2: 0.203825	Test's l2: 0.000139612
[150]	Train's l2: 0.114677	Test's l2: 0.00221127
第5折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.22973610045831822 
------
------
 预测MSE
 8.3928071866396e-07 
------





[50]	Train's l2: 0.418553	Test's l2: 1.33664
[100]	Train's l2: 0.20356	Test's l2: 0.593563
[150]	Train's l2: 0.114538	Test's l2: 0.291945
[200]	Train's l2: 0.0742104	Test's l2: 0.154165
[250]	Train's l2: 0.0535458	Test's l2: 0.0840137
[300]	Train's l2: 0.0414856	Test's l2: 0.0505137
[350]	Train's l2: 0.0333671	Test's l2: 0.0389641
[400]	Train's l2: 0.0274927	Test's l2: 0.0282909
[450]	Train's l2: 0.0230405	Test's l2: 0.0238994
[500]	Train's l2: 0.0194298	Test's l2: 0.0232937
[550]	Train's l2: 0.016478	Test's l2: 0.0212104
[600]	Train's l2: 0.0140918	Test's l2: 0.0173329
[650]	Train's l2: 0.0121263	Test's l2: 0.0158925
[700]	Train's l2: 0.0104849	Test's l2: 0.0150482
[750]	Train's l2: 0.00911054	Test's l2: 0.0152969
第6折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.010544990848799307 
------
------
 预测MSE
 0.015001785134157784 
------





[50]	Train's l2: 0.418867	Test's l2: 0.177295
[100]	Train's l2: 0.20364	Test's l2: 0.0421716
[150]	Train's l2: 0.114486	Test's l2: 0.00437618
[200]	Train's l2: 0.07424	Test's l2: 0.000285364
[250]	Train's l2: 0.0535354	Test's l2: 0.00643583
第7折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.08144304541375287 
------
------
 预测MSE
 8.824090263093309e-10 
------





[50]	Train's l2: 0.418962	Test's l2: 0.293013
[100]	Train's l2: 0.203771	Test's l2: 0.191966
[150]	Train's l2: 0.114611	Test's l2: 0.13172
[200]	Train's l2: 0.0743145	Test's l2: 0.0882778
[250]	Train's l2: 0.0535902	Test's l2: 0.0658243
[300]	Train's l2: 0.0414907	Test's l2: 0.0527119
[350]	Train's l2: 0.0334103	Test's l2: 0.0433898
[400]	Train's l2: 0.0275633	Test's l2: 0.039266
[450]	Train's l2: 0.0230627	Test's l2: 0.0375582
[500]	Train's l2: 0.0194333	Test's l2: 0.0344022
[550]	Train's l2: 0.0164686	Test's l2: 0.0353841
[600]	Train's l2: 0.0140851	Test's l2: 0.0336191
[650]	Train's l2: 0.0121325	Test's l2: 0.0334945
[700]	Train's l2: 0.0105084	Test's l2: 0.0310433
[750]	Train's l2: 0.00913884	Test's l2: 0.0295955
[800]	Train's l2: 0.00796443	Test's l2: 0.0272034
[850]	Train's l2: 0.00694732	Test's l2: 0.0258476
[900]	Train's l2: 0.00606858	Test's l2: 0.0254355
[950]	Train's l2: 0.00532056	Test's l2: 0.0245711
[1000]	Train's l2: 0.00467931	Test's l2: 0.0233079
[1050]	Train's l2: 0.0



[50]	Train's l2: 0.419036	Test's l2: 0.00239799
[100]	Train's l2: 0.203813	Test's l2: 0.00268442
[150]	Train's l2: 0.114609	Test's l2: 0.00743749
第9折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.3045285904844979 
------
------
 预测MSE
 1.1470940338059348e-06 
------





[50]	Train's l2: 0.418998	Test's l2: 0.0606575
[100]	Train's l2: 0.203724	Test's l2: 0.0116961
[150]	Train's l2: 0.114534	Test's l2: 0.00114861
[200]	Train's l2: 0.0743235	Test's l2: 5.64438e-05
[250]	Train's l2: 0.0536156	Test's l2: 1.94712e-05
[300]	Train's l2: 0.0415416	Test's l2: 0.000264356
第10折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.06774135627605922 
------
------
 预测MSE
 1.0860688773015748e-09 
------





[50]	Train's l2: 0.418588	Test's l2: 1.37381
[100]	Train's l2: 0.203524	Test's l2: 0.690864
[150]	Train's l2: 0.114436	Test's l2: 0.346181
[200]	Train's l2: 0.0742317	Test's l2: 0.189493
[250]	Train's l2: 0.0536249	Test's l2: 0.110866
[300]	Train's l2: 0.0414749	Test's l2: 0.069934
[350]	Train's l2: 0.0333309	Test's l2: 0.0494729
[400]	Train's l2: 0.027505	Test's l2: 0.0348813
[450]	Train's l2: 0.0230592	Test's l2: 0.027173
[500]	Train's l2: 0.0194485	Test's l2: 0.0221897
[550]	Train's l2: 0.0165568	Test's l2: 0.0184528
[600]	Train's l2: 0.0141469	Test's l2: 0.0179586
[650]	Train's l2: 0.0121837	Test's l2: 0.0162747
[700]	Train's l2: 0.010531	Test's l2: 0.0157988
[750]	Train's l2: 0.00912577	Test's l2: 0.0153905
[800]	Train's l2: 0.00793775	Test's l2: 0.016021
第11折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.009365707625869515 
------
------
 预测MSE
 0.014969987964355553 
------





[50]	Train's l2: 0.419045	Test's l2: 0.097808
[100]	Train's l2: 0.203876	Test's l2: 0.0476716
[150]	Train's l2: 0.114709	Test's l2: 0.0217354
[200]	Train's l2: 0.0743301	Test's l2: 0.00727449
[250]	Train's l2: 0.0536459	Test's l2: 0.00342079
[300]	Train's l2: 0.0415101	Test's l2: 0.000914694
[350]	Train's l2: 0.0334051	Test's l2: 0.000388864
[400]	Train's l2: 0.0276018	Test's l2: 0.000330803
[450]	Train's l2: 0.0231272	Test's l2: 4.72281e-05
[500]	Train's l2: 0.0194803	Test's l2: 1.97692e-05
[550]	Train's l2: 0.0165513	Test's l2: 6.51461e-05
第12折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.021363228053157143 
------
------
 预测MSE
 1.1479735117855689e-08 
------





[50]	Train's l2: 0.419006	Test's l2: 0.0167244
[100]	Train's l2: 0.203678	Test's l2: 0.0135922
[150]	Train's l2: 0.114581	Test's l2: 0.010354
[200]	Train's l2: 0.0743028	Test's l2: 0.00561155
[250]	Train's l2: 0.0535818	Test's l2: 0.00711933
第13折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.07486015450432343 
------
------
 预测MSE
 0.0053349757657680545 
------





[50]	Train's l2: 0.41905	Test's l2: 0.0259043
[100]	Train's l2: 0.203677	Test's l2: 0.05788
第14折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.9512117958443262 
------
------
 预测MSE
 0.0033614169494722975 
------





[50]	Train's l2: 0.417997	Test's l2: 2.6233
[100]	Train's l2: 0.203311	Test's l2: 0.953281
[150]	Train's l2: 0.114422	Test's l2: 0.367447
[200]	Train's l2: 0.0741854	Test's l2: 0.174931
[250]	Train's l2: 0.0535477	Test's l2: 0.084872
[300]	Train's l2: 0.0414811	Test's l2: 0.0489677
[350]	Train's l2: 0.033334	Test's l2: 0.0261398
[400]	Train's l2: 0.0275101	Test's l2: 0.0196824
[450]	Train's l2: 0.0230275	Test's l2: 0.0119682
[500]	Train's l2: 0.0194155	Test's l2: 0.00633378
[550]	Train's l2: 0.016443	Test's l2: 0.00380764
[600]	Train's l2: 0.0140302	Test's l2: 0.00372934
[650]	Train's l2: 0.0120808	Test's l2: 0.0035822
第15折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.01606915797637025 
------
------
 预测MSE
 0.003129310364319631 
------





[50]	Train's l2: 0.418345	Test's l2: 1.97427
[100]	Train's l2: 0.203341	Test's l2: 1.17486
[150]	Train's l2: 0.114403	Test's l2: 0.715629
[200]	Train's l2: 0.0743004	Test's l2: 0.48508
[250]	Train's l2: 0.0537036	Test's l2: 0.329718
[300]	Train's l2: 0.0415312	Test's l2: 0.228379
[350]	Train's l2: 0.0333408	Test's l2: 0.190106
[400]	Train's l2: 0.0275096	Test's l2: 0.159225
[450]	Train's l2: 0.0230432	Test's l2: 0.149278
[500]	Train's l2: 0.019421	Test's l2: 0.139129
[550]	Train's l2: 0.0164762	Test's l2: 0.124128
[600]	Train's l2: 0.0140842	Test's l2: 0.114802
[650]	Train's l2: 0.0121376	Test's l2: 0.106123
[700]	Train's l2: 0.0104972	Test's l2: 0.097351
[750]	Train's l2: 0.00911608	Test's l2: 0.0926291
[800]	Train's l2: 0.0079318	Test's l2: 0.0889121
[850]	Train's l2: 0.00693674	Test's l2: 0.0876726
[900]	Train's l2: 0.0060702	Test's l2: 0.0831002
[950]	Train's l2: 0.00534417	Test's l2: 0.0794914
[1000]	Train's l2: 0.00470762	Test's l2: 0.0783593
[1050]	Train's l2: 0.00415834	Test's 



[50]	Train's l2: 0.418985	Test's l2: 0.159932
[100]	Train's l2: 0.203624	Test's l2: 0.180288
第17折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.598408665817711 
------
------
 预测MSE
 0.1540314148501441 
------





[50]	Train's l2: 0.418895	Test's l2: 0.0228147
[100]	Train's l2: 0.203639	Test's l2: 0.0030529
[150]	Train's l2: 0.114536	Test's l2: 7.03107e-06
[200]	Train's l2: 0.0742068	Test's l2: 0.00066555
第18折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.11928881637914478 
------
------
 预测MSE
 1.3457122437056216e-07 
------





[50]	Train's l2: 0.419011	Test's l2: 0.102648
[100]	Train's l2: 0.203687	Test's l2: 0.110834
[150]	Train's l2: 0.114446	Test's l2: 0.0767824
[200]	Train's l2: 0.0741273	Test's l2: 0.0684767
[250]	Train's l2: 0.0535209	Test's l2: 0.0618702
[300]	Train's l2: 0.0414662	Test's l2: 0.0467024
[350]	Train's l2: 0.0333896	Test's l2: 0.0465377
[400]	Train's l2: 0.0275339	Test's l2: 0.0487733
第19折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.03942844430557029 
------
------
 预测MSE
 0.04408808877085342 
------





[50]	Train's l2: 0.418682	Test's l2: 0.887251
[100]	Train's l2: 0.203544	Test's l2: 0.399085
[150]	Train's l2: 0.114397	Test's l2: 0.178412
[200]	Train's l2: 0.0742163	Test's l2: 0.0883592
[250]	Train's l2: 0.0535723	Test's l2: 0.050775
[300]	Train's l2: 0.0414642	Test's l2: 0.0316489
[350]	Train's l2: 0.0334164	Test's l2: 0.0190467
[400]	Train's l2: 0.0275216	Test's l2: 0.0123161
[450]	Train's l2: 0.0230746	Test's l2: 0.00857246
[500]	Train's l2: 0.0194715	Test's l2: 0.00756381
[550]	Train's l2: 0.0165508	Test's l2: 0.00465872
[600]	Train's l2: 0.0141975	Test's l2: 0.00303928
[650]	Train's l2: 0.0122096	Test's l2: 0.00213325
[700]	Train's l2: 0.0105698	Test's l2: 0.00164455
[750]	Train's l2: 0.00916274	Test's l2: 0.00118621
[800]	Train's l2: 0.00798829	Test's l2: 0.000798988
[850]	Train's l2: 0.00698512	Test's l2: 0.000731094
[900]	Train's l2: 0.00613217	Test's l2: 0.000678491
[950]	Train's l2: 0.00538641	Test's l2: 0.000588114
[1000]	Train's l2: 0.00473489	Test's l2: 0.000519298
[105



[50]	Train's l2: 0.419044	Test's l2: 0.00225495
[100]	Train's l2: 0.203723	Test's l2: 0.00697722
第21折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.6963551415171719 
------
------
 预测MSE
 0.00022718615098842322 
------





[50]	Train's l2: 0.418896	Test's l2: 0.132942
[100]	Train's l2: 0.203585	Test's l2: 0.0640139
[150]	Train's l2: 0.114407	Test's l2: 0.0372986
[200]	Train's l2: 0.0741228	Test's l2: 0.0210578
[250]	Train's l2: 0.0534323	Test's l2: 0.0159583
[300]	Train's l2: 0.0413834	Test's l2: 0.0162243
[350]	Train's l2: 0.0333003	Test's l2: 0.0223327
第22折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.05081657768118204 
------
------
 预测MSE
 0.014877617179682922 
------





[50]	Train's l2: 0.418947	Test's l2: 0.0193329
[100]	Train's l2: 0.203666	Test's l2: 0.0145756
[150]	Train's l2: 0.11451	Test's l2: 0.00108674
[200]	Train's l2: 0.0743262	Test's l2: 0.0040723
[250]	Train's l2: 0.0536434	Test's l2: 0.0160333
第23折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.0982784845306778 
------
------
 预测MSE
 2.973757296419554e-07 
------





[50]	Train's l2: 0.41895	Test's l2: 0.390101
[100]	Train's l2: 0.203691	Test's l2: 0.215401
[150]	Train's l2: 0.114491	Test's l2: 0.143047
[200]	Train's l2: 0.0741461	Test's l2: 0.0992432
[250]	Train's l2: 0.0535279	Test's l2: 0.0742296
[300]	Train's l2: 0.041408	Test's l2: 0.0592328
[350]	Train's l2: 0.0333591	Test's l2: 0.0470125
[400]	Train's l2: 0.0274965	Test's l2: 0.0364706
[450]	Train's l2: 0.0230109	Test's l2: 0.0308877
[500]	Train's l2: 0.0193837	Test's l2: 0.0301554
[550]	Train's l2: 0.0164554	Test's l2: 0.0284144
第24折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.02053821505672746 
------
------
 预测MSE
 0.02780972377751253 
------





[50]	Train's l2: 0.418969	Test's l2: 0.0488661
[100]	Train's l2: 0.203749	Test's l2: 0.114015
第25折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.951219744452152 
------
------
 预测MSE
 0.010616336510140843 
------





[50]	Train's l2: 0.418894	Test's l2: 0.162979
[100]	Train's l2: 0.203635	Test's l2: 0.0373693
[150]	Train's l2: 0.114428	Test's l2: 0.0039264
[200]	Train's l2: 0.074223	Test's l2: 0.000555116
