In [1]:
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PowerTransformer
import math
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation warnings - consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Data-handling packages
import numpy as np
import pandas as pd
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score,KFold
from scipy.stats import skew
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from mlxtend.regressor import StackingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.svm import SVR

In [2]:
from imbdpack import skew_pro,lr_rmse_ave,lr_rmse_ave_fea,laso_rmse_ave,ElasticNet_rmse_ave,xgb_ave,svr_rmse_ave,rand_ave
from imbdpack import voting_ave,stack_ave,ave

In [3]:
# Number of features to keep, ranked by |correlation| with the target.
fea_number = 7

# Load the input table: the first CSV column becomes the index and the
# 'Number' column is dropped. Change this path to point at your copy of the data.
data = pd.read_csv('./BayesianRidge_Pre_0/Input_A3_015.csv', index_col=0).drop(['Number'], axis=1)
y = data.Predict

# skew_pro comes from the project-local imbdpack module
# (presumably a skewness correction - TODO confirm against its source).
skew_data = skew_pro(data)

# Correlation matrix over the non-object columns only.
num = skew_data.select_dtypes(exclude='object')
numcorr = num.corr()

# Remove the target by label before ranking. Dropping it by position (taking
# fea_number + 1 rows and slicing off the first) silently keeps 'Predict' as a
# feature whenever another column ties with it at |corr| == 1.
title = (
    numcorr['Predict']
    .drop('Predict')
    .abs()
    .sort_values(ascending=False)
    .head(fea_number)
    .index.to_numpy()
)

# Feature matrix without the target column.
skew_x = skew_data.drop(['Predict'], axis=1)
title

array(['Input_A2_015', 'Input_A6_015', 'Input_A1_015', 'Input_A4_015',
       'Input_A5_015', 'Input_C_137', 'Input_A4_012'], dtype=object)

## Linear Regression Baseline

In [4]:
# The second argument is presumably the number of features the helper selects
# (it receives the full frame rather than skew_data[title]) - verify in imbdpack.
# Reuse fea_number so this baseline cannot drift from the `title` selection.
lr_rmse_ave_fea(skew_data, fea_number)

train_rmse: 0.019713512282696334
test_rmse: 0.020504559214319984
test_r2: 0.5508460551254092


## Lasso

In [5]:
# Evaluate Lasso on the selected columns via the imbdpack helper.
# The third argument is presumably the Lasso alpha - verify in imbdpack.
lasso_alpha = 9e-05
lasso_features = skew_data[title]
laso_rmse_ave(lasso_features, y, lasso_alpha)

test_rmse_ave: 0.020376659127956644
[0.019595678141870715, 0.025034946710217745, 0.019748459743782903, 0.01722057450847281, 0.020191466306880998, 0.021051460347164982, 0.02180145803923424, 0.019527462811933516, 0.02008410499501116, 0.01951097967499735]


test_r2_ave: 0.5575229992804258
[0.6053347859044851, 0.36444092037359566, 0.5740507629884186, 0.663328425348323, 0.5561314020965205, 0.4690986943095694, 0.5482718725913468, 0.6300992037017203, 0.6350340872484874, 0.5294398382417908]


## ElasticNet

In [6]:
# Evaluate ElasticNet on the selected columns via the imbdpack helper.
# NOTE(review): the third argument is passed as a one-element list; confirm
# against imbdpack whether the helper expects a list or a scalar here.
enet_alpha = [0.02]
enet_l1_ratio = 0.015
enet_features = skew_data[title]
ElasticNet_rmse_ave(enet_features, y, enet_alpha, enet_l1_ratio)

test_rmse_ave: 0.020383014180187543
[0.019430199078172623, 0.024928376107396625, 0.0197436588685917, 0.01727244446105894, 0.020326660624560632, 0.02100937256195339, 0.021813599767353902, 0.01967778643424961, 0.020049446569995463, 0.019578597328542547]


test_r2_ave: 0.5573370734154943
[0.611972277480483, 0.369840392713519, 0.5742578353969808, 0.6612971993988458, 0.550167554962861, 0.47121941404366585, 0.5477685771336773, 0.624382244684153, 0.6362926177764856, 0.5261726205642723]


## SVR

In [7]:
# Linear-kernel SVR. scikit-learn documents `gamma` as the coefficient for the
# 'rbf', 'poly' and 'sigmoid' kernels, so it has no effect with kernel='linear'.
model = SVR(kernel='linear', C=1.2, gamma=1e-07, epsilon=0)

# The second argument is presumably the number of features the helper selects
# (it receives the full frame) - verify in imbdpack. Reuse fea_number instead
# of repeating the literal so all models stay on the same feature count.
svr_rmse_ave(skew_data, fea_number, model)

train_rmse: 0.020459710392575232
test_rmse: 0.021243930263697586
test_r2: 0.5175633683687253


## XGB

In [8]:
# Hyper-parameters for the gradient-boosted model, collected in one mapping
# so they can be read (and reused) at a glance.
xgb_params = dict(
    objective='reg:squarederror',
    booster='dart',
    learning_rate=0.1,
    n_estimators=80,
    max_depth=4,
    min_child_weight=2,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.6,
    reg_alpha=0,
    reg_lambda=0.5,
    seed=42,
)
best_xgb_model = XGBRegressor(**xgb_params)

# Evaluate on the selected columns via the imbdpack helper.
xgb_ave(skew_data[title], y, best_xgb_model)

test_rmse_ave: 0.019314022878319733
[0.018708557406221512, 0.022736903489053748, 0.016826896137625908, 0.020037382268678624, 0.019020752525853828, 0.019519288363624935, 0.018230421629519142, 0.019578933496952357, 0.019179014945796907, 0.01930207851987036]


test_r2_ave: 0.6019573658397293
[0.6402598960205762, 0.47576595508667463, 0.6907573296716079, 0.5441802431251226, 0.6061107807272861, 0.5435667261781567, 0.6841364728605484, 0.6281466572582409, 0.6671872674572534, 0.5394623300118255]


## RandomForest

In [9]:
# Random forest with a fixed seed for repeatable results.
# `criterion` is intentionally left at its default: the default is mean squared
# error in every scikit-learn release, whereas the explicit 'mse' spelling was
# deprecated in 1.0 and removed in 1.2 (where it raises an error).
rand_model = RandomForestRegressor(
    n_estimators=50,
    max_depth=9,
    min_samples_split=7,
    max_leaf_nodes=14,
    min_samples_leaf=4,
    random_state=42,
)

# Evaluate on the selected columns via the imbdpack helper.
rand_ave(skew_x[title], y, rand_model)

test_rmse_ave: 0.01900400321155791
[0.017769769914259874, 0.02212374567080392, 0.016477233385604258, 0.01862246715672049, 0.019188050881242875, 0.018992173620575408, 0.018509244524834073, 0.01962633862027239, 0.019840737294604338, 0.01889027104666146]


test_r2_ave: 0.6159384823977577
[0.6754572863688693, 0.503659287912025, 0.7034759160791153, 0.6062816903429231, 0.5991513484533801, 0.5678856585919713, 0.6744007165937668, 0.6263437915398141, 0.6438253950419099, 0.5589037330538018]


In [11]:
# Base estimators for the ensembles, using the same hyper-parameters as the
# individual evaluation cells above.
lr = LinearRegression()
lasso_mod = Lasso(alpha=9e-05)
# alpha must be a scalar: ElasticNet documents it as a float, and recent
# scikit-learn releases reject a list outright during parameter validation.
elastic_mod = ElasticNet(alpha=0.02, l1_ratio=0.015)
svr_mod = SVR(kernel='linear', C=1.2, gamma=1e-07, epsilon=0)

xgb_mod = XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.1,
    booster='dart',
    n_estimators=80,
    max_depth=4,
    min_child_weight=2,
    seed=42,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.6,
    reg_alpha=0,
    reg_lambda=0.5,
)

# `criterion` is left at its default (mean squared error in every scikit-learn
# release); the explicit 'mse' spelling was removed in scikit-learn 1.2.
random_mod = RandomForestRegressor(
    n_estimators=50,
    max_depth=9,
    min_samples_split=7,
    max_leaf_nodes=14,
    min_samples_leaf=4,
    random_state=42,
)

# Voting ensemble over the two linear models and the two tree ensembles.
vote_mod = VotingRegressor([
    ('Lasso', lasso_mod),
    ('Elastic', elastic_mod),
    ('XGBRegressor', xgb_mod),
    ('RandomForest', random_mod),
])

# Alternative ensemble kept for reference (not executed):
# vote_mod = VotingRegressor([ ('SVR', svr_mod), ('Linear', lr)
#                             ,('RandomForest', random_mod)])
voting_ave(skew_data[title], y, vote_mod)


test_rmse_ave: 0.019121331073299556
[0.01808578387518276, 0.02309159552351408, 0.017651042416506307, 0.017348823090511938, 0.019056933664170262, 0.01947798980066772, 0.019543061384357602, 0.018987729283709894, 0.019161108115153296, 0.01880924357922171]


test_r2_ave: 0.6108984825740356
[0.6638114426435127, 0.4592824466933929, 0.6597233851093135, 0.6582950935325698, 0.6046108490372127, 0.5454961097351929, 0.6370128534399709, 0.6502645225313216, 0.6678084504638873, 0.5626796725539802]


In [12]:

# First-level regressors whose predictions feed the meta-regressor.
base_regressors = [lasso_mod, elastic_mod, vote_mod]

# The random forest acts as meta-regressor; use_features_in_secondary=True
# is passed so the original features are supplied to it as well.
stack_mod = StackingRegressor(
    regressors=base_regressors,
    meta_regressor=random_mod,
    use_features_in_secondary=True,
)

# Evaluate on the selected columns via the imbdpack helper.
stack_ave(skew_data[title], y, stack_mod)

test_rmse_ave: 0.020301861871143405
[0.019728520669154394, 0.02348232321661313, 0.015852470248127866, 0.02161461772786086, 0.02106521977155073, 0.01940856985984515, 0.01934385583695409, 0.02033667467012044, 0.021129441979667554, 0.021056924731539844]


test_r2_ave: 0.5592694268479294
[0.5999656392509881, 0.4408289122736502, 0.7255360684209037, 0.4695966040103652, 0.5168848096416677, 0.5487300582243196, 0.644375110862109, 0.5988068494834895, 0.5960539245971594, 0.45191629171464187]


In [13]:
# Each number is passed immediately before a model; presumably these are the
# blending weights for that model (they sum to 1.0) - verify in imbdpack.
stack_weight, vote_weight, forest_weight = 0.2, 0.4, 0.4
ave(
    skew_data[title], y,
    stack_weight, stack_mod,
    vote_weight, vote_mod,
    forest_weight, random_mod,
)

test_rmse_ave: 0.018977701458031473
[0.01801583695122956, 0.022447430466346105, 0.0164791271044795, 0.01839496990068782, 0.019063308681771078, 0.018996135808670635, 0.01872955518845954, 0.01908806572848755, 0.019519079763813823, 0.019043504986369135]


test_r2_ave: 0.6166899166438329
[0.6664068383087564, 0.48902946804008274, 0.7034077534804692, 0.6158424783197323, 0.6043462698090138, 0.5677053425390104, 0.6666035422059273, 0.646558557508691, 0.6552803696983681, 0.5517185465282769]
