In [1]:
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PowerTransformer
import math
import warnings
warnings.filterwarnings('ignore')

# Data-handling packages
import numpy as np
import pandas as pd
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score,KFold
from scipy.stats import skew
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from mlxtend.regressor import StackingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.svm import SVR

In [2]:
from imbdpack import skew_pro,lr_rmse_ave,lr_rmse_ave_fea,laso_rmse_ave,ElasticNet_rmse_ave,xgb_ave,svr_rmse_ave,rand_ave
from imbdpack import voting_ave,stack_ave,ave

In [3]:
# Number of top-correlated features kept for the feature-subset models.
fea_number = 5

# Load the raw table (first CSV column becomes the index) and discard the 'Number' column.
data = pd.read_csv('./BayesianRidge_Pre_0/Input_A3_017.csv', index_col=0).drop(['Number'], axis=1)  # adjust the path for your environment
y = data.Predict

# skew_pro is an imbdpack helper; presumably it corrects skewed columns — TODO confirm.
skew_data = skew_pro(data)
num = skew_data.select_dtypes(exclude='object')
numcorr = num.corr()

# Rank columns by |correlation with Predict|, remove the target itself by label
# (instead of assuming it always sorts into first place), then keep the top
# fea_number column names as an object ndarray.
title = (
    abs(numcorr['Predict'])
    .sort_values(ascending=False)
    .drop('Predict')
    .head(fea_number)
    .index.to_numpy()
)

# Feature matrix without the target column.
skew_x = skew_data.drop(['Predict'], axis=1)
title

array(['Input_A3_019', 'Input_A1_016', 'Input_A4_016', 'Input_A2_019',
       'Input_A1_018'], dtype=object)

## Linear Regression Baseline

In [4]:
# Linear-regression baseline via the imbdpack helper; the recorded output reports
# train_rmse, test_rmse and test_r2.
# NOTE(review): the literal 5 is presumably the number of top-correlated features
# (it equals fea_number) — confirm against imbdpack before changing either value.
lr_rmse_ave_fea(skew_data,5)

train_rmse: 0.009021544601542492
test_rmse: 0.009185245312196517
test_r2: 0.5194517040383431


## Lasso

In [5]:
# Score Lasso on the selected feature columns via the imbdpack helper; the recorded
# output lists ten RMSE / R2 values together with their averages.
# NOTE(review): 3e-07 is presumably the Lasso alpha (the ensemble cell builds
# Lasso(alpha=3e-07)) — confirm against imbdpack.
laso_rmse_ave(skew_data[title],y,3e-07)

test_rmse_ave: 0.00915265946598786
[0.01062750198763127, 0.008673317787159884, 0.00994386176369197, 0.008939578064045368, 0.009945566589734396, 0.008430424097449754, 0.009745321367017846, 0.009006018749580881, 0.008403021442409763, 0.007811982811157472]


test_r2_ave: 0.5127135025130259
[0.42341258348195965, 0.5712795215682764, 0.5171594651920497, 0.5584025737690227, 0.4772080844813562, 0.4659500753571916, 0.4325322480300323, 0.46678132108470427, 0.6080015474223772, 0.6064076047432891]


## ElasticNet

In [6]:
# Score ElasticNet on the selected feature columns via the imbdpack helper; the
# recorded output lists ten RMSE / R2 values together with their averages.
# NOTE(review): [0.04] and 0.01 presumably map to a list of alphas and the l1_ratio —
# confirm against imbdpack.
ElasticNet_rmse_ave(skew_data[title],y,[0.04],0.01)

test_rmse_ave: 0.009230097048787068
[0.010712850925479455, 0.008792130351253567, 0.010074765589108081, 0.009202803075028248, 0.009927527433509496, 0.008270915189682867, 0.009711517230252325, 0.009112071378092598, 0.008554092848794324, 0.007942296466669701]


test_r2_ave: 0.5052574206935073
[0.41411430630033175, 0.5594533054152979, 0.504363289356982, 0.532014120812297, 0.4791028327153405, 0.48596800935175777, 0.43646223384147165, 0.4541492804609195, 0.5937799746249461, 0.5931668540557293]


## SVR

In [7]:
# Linear-kernel support vector regressor, evaluated through the imbdpack helper
# (the recorded output reports train_rmse, test_rmse and test_r2).
model = SVR(
    kernel='linear',
    C=1.3,
    epsilon=0,
    # Per the scikit-learn docs gamma is the coefficient for the 'rbf', 'poly'
    # and 'sigmoid' kernels; it is passed through unchanged here.
    gamma=1e-07,
)
# NOTE(review): the literal 5 presumably is the feature count (equals fea_number) — confirm.
svr_rmse_ave(skew_data, 5, model)

train_rmse: 0.009154731992021709
test_rmse: 0.009323699506015254
test_r2: 0.5042389568202571


## XGB

In [8]:
# Hyper-parameters for the gradient-boosted tree model, grouped by role.
xgb_params = dict(
    # learning task / booster
    objective='reg:squarederror',
    booster='gbtree',
    seed=42,
    # boosting schedule
    n_estimators=100,
    learning_rate=0.08,
    # tree shape
    max_depth=2,
    min_child_weight=4,
    gamma=0,
    # row / column sampling
    subsample=0.8,
    colsample_bytree=0.4,
    # regularisation
    reg_alpha=0.05,
    reg_lambda=2,
)
best_xgb_model = XGBRegressor(**xgb_params)

# Score the model on the selected feature columns via the imbdpack helper.
xgb_ave(skew_data[title], y, best_xgb_model)

test_rmse_ave: 0.009313335268602781
[0.010598929677171258, 0.00884100326981116, 0.010250998065664861, 0.009141604843163866, 0.010236671587176564, 0.008094949510017158, 0.010105390019861934, 0.009481101702687733, 0.008478444859080529, 0.007904259151392748]


test_r2_ave: 0.4956756506473011
[0.42650875579453085, 0.5545419486391667, 0.48687181646100275, 0.5382175962211698, 0.44615614063095754, 0.5076076448240685, 0.38982414861165493, 0.4090411084039933, 0.600933006489341, 0.5970543403971253]


## RandomForest

In [9]:
# Shallow random forest scored on the selected feature columns.
# The split criterion is left at its default on purpose: the default is the
# squared-error criterion in every scikit-learn release, whereas the explicit
# spelling 'mse' was deprecated in 1.0 and removed in 1.2 (the new spelling
# 'squared_error' is in turn unknown to older releases).
rand_model = RandomForestRegressor(
    n_estimators=70,
    max_depth=3,
    min_samples_split=3,
    max_leaf_nodes=8,
    min_samples_leaf=3,
    random_state=42,
)
rand_ave(skew_x[title], y, rand_model)

test_rmse_ave: 0.00926927863567096
[0.009960376687364077, 0.009191904829756122, 0.01024475199640912, 0.008475154918851924, 0.010505539986227328, 0.00874404108960199, 0.00997960197902898, 0.00966823117286096, 0.008206411067431944, 0.007716772629177148]


test_r2_ave: 0.4977234188693435
[0.49352933414788924, 0.5184795359647612, 0.48749693758891477, 0.6030938921449063, 0.4166804068772473, 0.4254770359226312, 0.40492007856865697, 0.38548326679016476, 0.6261305802295773, 0.6159431204586858]


In [10]:
# Base learners for the ensembles, configured with the same hyper-parameters as
# the single-model cells.
lr = LinearRegression()
lasso_mod = Lasso(alpha=3e-07)
# ElasticNet takes one scalar alpha; a list is rejected by scikit-learn releases
# that validate constructor parameters, so the value is passed as a plain float.
elastic_mod = ElasticNet(alpha=0.04, l1_ratio=0.01)
svr_mod = SVR(kernel='linear', C=1.3, gamma=1e-07, epsilon=0)

xgb_mod = XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.08,
    booster='gbtree',
    n_estimators=100,
    max_depth=2,
    min_child_weight=4,
    seed=42,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.4,
    reg_alpha=0.05,
    reg_lambda=2,
)

# The split criterion is left at its default (squared error in every
# scikit-learn release); the explicit 'mse' spelling was removed in 1.2.
random_mod = RandomForestRegressor(
    n_estimators=70,
    max_depth=3,
    min_samples_split=3,
    max_leaf_nodes=8,
    min_samples_leaf=3,
    random_state=42,
)

# Voting ensemble over all six base learners (VotingRegressor averages their
# predictions; no weights are given, so the average is uniform).
vote_mod = VotingRegressor([
    ('Lasso', lasso_mod),
    ('SVR', svr_mod),
    ('Elastic', elastic_mod),
    ('Linear', lr),
    ('XGBRegressor', xgb_mod),
    ('RandomForest', random_mod),
])

# Score the ensemble on the selected feature columns via the imbdpack helper.
voting_ave(skew_data[title], y, vote_mod)


test_rmse_ave: 0.009104355407628452
[0.010483049531019405, 0.008627783272018668, 0.009941242706720622, 0.00884722320360092, 0.009934263113868823, 0.008327059602393827, 0.009715926268739957, 0.009149322641056218, 0.00825889974371062, 0.007758783993155452]


test_r2_ave: 0.5175714803995206
[0.4389803848555146, 0.5757692303286177, 0.5174137769197524, 0.5674797360600665, 0.4783957508950366, 0.47896564576786405, 0.4359504242734521, 0.4496771487077116, 0.6213327048578658, 0.6117500013293257]


In [11]:

# First-level regressors whose predictions feed the meta-regressor.
base_regressors = [lasso_mod, elastic_mod, lr, random_mod]

# mlxtend stacking: the voting ensemble acts as meta-regressor, and
# use_features_in_secondary=True hands it the original features in addition to
# the first-level predictions.
stack_mod = StackingRegressor(
    regressors=base_regressors,
    meta_regressor=vote_mod,
    use_features_in_secondary=True,
)

# Score the stacked model on the selected feature columns via the imbdpack helper.
stack_ave(skew_data[title], y, stack_mod)

test_rmse_ave: 0.009160998502595682
[0.01015171410528812, 0.008958218975610926, 0.009952785483466167, 0.008526303370969136, 0.01012662833610381, 0.00874921382014681, 0.009828726457629217, 0.009404142280400743, 0.008020042679115229, 0.007892209517226667]


test_r2_ave: 0.5096487867124677
[0.47388397828577855, 0.5426517107606004, 0.5162924646080213, 0.5982886955472839, 0.45799967635486094, 0.4247970913782281, 0.4227773651464035, 0.41859596250354947, 0.6429189789403398, 0.5982819435996115]


In [12]:
# Final blend via the imbdpack helper; the recorded output lists ten RMSE / R2
# values together with their averages.
# NOTE(review): the arguments look like (weight, model) pairs — 0.3 stacking,
# 0.6 voting, 0.1 lasso, summing to 1.0 — confirm against imbdpack.ave.
ave(skew_data[title],y,0.3,stack_mod,0.6,vote_mod,0.1,lasso_mod)

test_rmse_ave: 0.00909336052693416
[0.010383408086083472, 0.008691989247842216, 0.009908442064011945, 0.008729699910615901, 0.009963212170153278, 0.00842478937556498, 0.009722808421473005, 0.009179199976317533, 0.0081649865470234, 0.007765069470255859]


test_r2_ave: 0.5182773176647133
[0.4495946886784411, 0.5694316811110809, 0.5205930624503275, 0.5788942992333936, 0.4753513475634973, 0.4666637327276617, 0.4351510665878856, 0.44607709498882975, 0.6298955080509083, 0.6111206952551071]
