In [1]:
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PowerTransformer
import math
import warnings
warnings.filterwarnings('ignore')

# Data-handling packages
import numpy as np
import pandas as pd
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score,KFold
from scipy.stats import skew
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from mlxtend.regressor import StackingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.svm import SVR

In [2]:
from imbdpack import skew_pro,lr_rmse_ave,lr_rmse_ave_fea,laso_rmse_ave,ElasticNet_rmse_ave,xgb_ave,svr_rmse_ave,rand_ave
from imbdpack import voting_ave,stack_ave,ave

In [3]:
# Number of features (those most correlated with the target) kept for the
# feature-subset models in the cells below.
fea_number = 5

# Location of the input CSV -- change this path to match your environment.
DATA_PATH = './BayesianRidge_Pre_0/Input_C_050.csv'

# Read the table using the first CSV column as the index and discard the
# 'Number' column.
data = pd.read_csv(DATA_PATH, index_col=0).drop(['Number'], axis=1)

# The target is taken from the raw frame, i.e. before skew_pro is applied.
y = data.Predict

# skew_pro is imported from imbdpack; presumably it corrects skewed columns
# -- TODO confirm against the package source.
skew_data = skew_pro(data)

# Pairwise correlations between the non-object columns.
num = skew_data.select_dtypes(exclude='object')
numcorr = num.corr()

# Rank the features by absolute correlation with 'Predict' and keep the top
# `fea_number` names. 'Predict' is removed by label instead of slicing off the
# first sorted entry: slicing is only correct when the target sorts first, and
# a feature tied at |corr| == 1 could otherwise leave the target in the list.
title = (
    numcorr['Predict']
    .drop('Predict')
    .abs()
    .sort_values(ascending=False)
    .head(fea_number)
    .index
    .to_numpy()
)

# Feature matrix without the target column.
skew_x = skew_data.drop(['Predict'], axis=1)
title

array(['Input_C_135', 'Input_C_051', 'Input_C_059', 'Input_A3_011',
       'Input_A1_011'], dtype=object)

## Linear Regression Baseline

In [4]:
# Baseline: linear regression scored by the imbdpack helper on the full
# skew-processed frame.
# NOTE(review): the literal 5 equals fea_number; presumably it is the number of
# features the helper selects -- confirm against imbdpack before replacing it.
lr_rmse_ave_fea(
    skew_data,
    5,
)

train_rmse: 0.001598742846204608
test_rmse: 0.0016368865585057665
test_r2: 0.6739558186643463


## Lasso

In [5]:
# Lasso scored on the selected feature columns only.
# The third argument (3e-06) is presumably the regularisation strength (alpha)
# -- verify against imbdpack.laso_rmse_ave.
laso_rmse_ave(
    skew_data[title],
    y,
    3e-06,
)

test_rmse_ave: 0.001653277153495793
[0.0018432419396380883, 0.002139548504724436, 0.0016060562940453538, 0.0014310739995660026, 0.001919131896262804, 0.0014257646272036475, 0.0015523792626167046, 0.0013231634655798928, 0.0018787134259488723, 0.0014136981193721295]


test_r2_ave: 0.669154073126158
[0.5921262945490694, 0.6157478203799795, 0.6368748084425747, 0.7462982270268697, 0.6271249501444753, 0.7271725280976655, 0.6131627705821467, 0.7388972666990549, 0.6705471834309576, 0.7235888819087881]


## ElasticNet

In [6]:
# ElasticNet scored on the selected feature columns only.
# NOTE(review): the helper receives a one-element list [0.013] followed by
# 0.01; presumably these are the alpha value(s) and the l1_ratio -- confirm
# against imbdpack.ElasticNet_rmse_ave.
ElasticNet_rmse_ave(
    skew_data[title],
    y,
    [0.013],
    0.01,
)

test_rmse_ave: 0.001672192147622186
[0.0018393379491441232, 0.00214889625275986, 0.0016613586286280042, 0.0014598881129249247, 0.0019627202550741617, 0.0014310601012175678, 0.0015967360662696186, 0.0013059375873835622, 0.0019265078451148716, 0.0013894786777051687]


test_r2_ave: 0.6611729746986525
[0.5938522196704747, 0.6123828682983828, 0.6114368294112068, 0.73597900444578, 0.6099947205041225, 0.7251421315572284, 0.590740406080121, 0.7456514534959171, 0.6535714224920555, 0.732978691031236]


## SVR

In [7]:
# Support vector regressor with a linear kernel.
# NOTE(review): scikit-learn documents gamma as the coefficient of the
# rbf/poly/sigmoid kernels, so it likely has no effect with kernel='linear'
# -- confirm before tuning it further.
model = SVR(
    kernel='linear',
    C=1.2,
    gamma=1e-07,
    epsilon=0,
)

# Scored on the full skew-processed frame; the literal 5 equals fea_number and
# is presumably the number of features the helper selects -- TODO confirm.
svr_rmse_ave(skew_data, 5, model)

train_rmse: 0.0016559259509810347
test_rmse: 0.0016674821328741534
test_r2: 0.6605401454753362


## XGB

In [8]:
# Gradient-boosted trees (DART booster) with hand-picked hyper-parameters.
best_xgb_model = XGBRegressor(
    objective='reg:squarederror',
    booster='dart',
    n_estimators=170,
    learning_rate=0.07,
    max_depth=5,
    min_child_weight=4,
    gamma=0,
    subsample=0.85,
    colsample_bytree=0.4,
    reg_alpha=0,
    reg_lambda=2,
    seed=42,
)

# Scored on the selected feature columns only.
xgb_ave(skew_data[title], y, best_xgb_model)

test_rmse_ave: 0.0017249584657416945
[0.0018900468674391774, 0.0022477277761675882, 0.0016769848396203108, 0.001494395217894074, 0.0019049639899098973, 0.0016517990946208364, 0.0015597700180762651, 0.0015504143587999695, 0.0018549847128041285, 0.0014184977820847018]


test_r2_ave: 0.6392424692605247
[0.5711492553672783, 0.5759085750664341, 0.6040930511905016, 0.7233502654198286, 0.6326100953418972, 0.6338096675322179, 0.6094705991941927, 0.641507577733811, 0.6788168031007594, 0.7217088026583274]


## RandomForest

In [9]:
# Shallow random forest with hand-picked hyper-parameters and a fixed seed.
#
# The explicit criterion='mse' was removed on purpose: scikit-learn renamed
# that value to 'squared_error' in 1.0 and rejects 'mse' from 1.2 onwards.
# Squared error is the default criterion under either name, so omitting the
# argument builds the same model on old releases and still runs on new ones.
rand_model = RandomForestRegressor(
    n_estimators=50,
    max_depth=3,
    min_samples_split=9,
    max_leaf_nodes=7,
    min_samples_leaf=6,
    random_state=42,
)

# Scored on the selected columns of the target-free feature matrix.
rand_ave(skew_x[title], y, rand_model)

test_rmse_ave: 0.0016849014741070509
[0.001888973196614534, 0.0021990619892660055, 0.001631892128356759, 0.001453129213155308, 0.0019092101193301195, 0.0015064130850039818, 0.0015604734103623398, 0.0013848494807546985, 0.0019593385732920464, 0.0013556735449347175]


test_r2_ave: 0.656621110126059
[0.5716363477872521, 0.5940738683450824, 0.6250980092840726, 0.7384180414735526, 0.6309704594432144, 0.6954345846127532, 0.609118294299242, 0.7139845052749471, 0.6416634370785695, 0.7458135536619043]


In [11]:
# ---------------------------------------------------------------------------
# Voting ensemble
# ---------------------------------------------------------------------------
# Fresh, unfitted base learners using the same hyper-parameters as the
# single-model cells above.
lr = LinearRegression()
lasso_mod = Lasso(alpha=3e-06)

# ElasticNet's alpha is a scalar penalty strength. It used to be passed as the
# one-element list [0.013], which recent scikit-learn releases reject during
# parameter validation; the plain float expresses the same setting.
elastic_mod = ElasticNet(alpha=0.013, l1_ratio=0.01)

# NOTE(review): gamma is documented for the rbf/poly/sigmoid kernels, so it
# likely has no effect with kernel='linear' -- confirm.
svr_mod = SVR(kernel='linear', C=1.2, gamma=1e-07, epsilon=0)

# Defined here but not part of the voting ensemble assembled below.
xgb_mod = XGBRegressor(
    objective='reg:squarederror',
    booster='dart',
    n_estimators=170,
    learning_rate=0.07,
    max_depth=5,
    min_child_weight=4,
    gamma=0,
    subsample=0.85,
    colsample_bytree=0.4,
    reg_alpha=0,
    reg_lambda=2,
    seed=42,
)

# criterion='mse' is omitted: that value was renamed 'squared_error' in
# scikit-learn 1.0 and is rejected from 1.2 onwards, while squared error is the
# default criterion either way.
random_mod = RandomForestRegressor(
    n_estimators=50,
    max_depth=3,
    min_samples_split=9,
    max_leaf_nodes=7,
    min_samples_leaf=6,
    random_state=42,
)

# Combine the five base learners in a VotingRegressor.
vote_mod = VotingRegressor([
    ('Lasso', lasso_mod),
    ('SVR', svr_mod),
    ('Elastic', elastic_mod),
    ('Linear', lr),
    ('RandomForest', random_mod),
])

# Scored on the selected feature columns only.
voting_ave(skew_data[title], y, vote_mod)


test_rmse_ave: 0.001652875347587239
[0.0018343453785490604, 0.0021565506148572234, 0.0016470508927777486, 0.0014282207234883096, 0.0019264512063572382, 0.0014172775671241374, 0.0015356674025094337, 0.0013135906809372945, 0.0019046499620647406, 0.0013649490472072026]


test_r2_ave: 0.6693586161970859
[0.5960540657351762, 0.6096165681272214, 0.618100676561619, 0.7473088799526546, 0.6242753362129874, 0.7304109468742636, 0.6214467926106177, 0.7426616370383703, 0.661387885661654, 0.742323373196295]


In [15]:

# Stacked ensemble: the voting ensemble plus three of the linear-type learners
# act as first-level regressors and the Lasso model is the meta-regressor.
# use_features_in_secondary=True presumably also feeds the original features
# to the meta-regressor -- confirm in the mlxtend documentation.
stack_mod = StackingRegressor(
    regressors=[vote_mod, lr, elastic_mod, svr_mod],
    meta_regressor=lasso_mod,
    use_features_in_secondary=True,
)

# Scored on the selected feature columns only.
stack_ave(skew_data[title], y, stack_mod)

test_rmse_ave: 0.001653277153495793
[0.0018432419396380883, 0.002139548504724436, 0.0016060562940453538, 0.0014310739995660024, 0.001919131896262804, 0.0014257646272036475, 0.0015523792626167046, 0.0013231634655798928, 0.001878713425948872, 0.0014136981193721295]


test_r2_ave: 0.669154073126158
[0.5921262945490694, 0.6157478203799795, 0.6368748084425747, 0.7462982270268698, 0.6271249501444753, 0.7271725280976655, 0.6131627705821467, 0.7388972666990549, 0.6705471834309578, 0.7235888819087881]


In [16]:
# Weighted blend of the stacked, voting and linear models.
# The positional arguments after y appear to be (weight, model) pairs; the
# three weights 0.3, 0.3 and 0.4 sum to 1 -- confirm the expected order
# against imbdpack.ave.
ave(
    skew_data[title], y,
    0.3, stack_mod,
    0.3, vote_mod,
    0.4, lr,
)

test_rmse_ave: 0.0016512793756847898
[0.0018382501187289323, 0.0021428947197185706, 0.0016157309525255841, 0.001428603538887442, 0.0019200110680620912, 0.0014214163677405951, 0.0015460251499481532, 0.001318776336435138, 0.001883629349073639, 0.0013974561557277521]


test_r2_ave: 0.6699828459665742
[0.5943324894947823, 0.6145449536269709, 0.632486801029318, 0.7471734008708583, 0.6267832369791901, 0.7288341145938391, 0.616323050946034, 0.7406258398106846, 0.6688208066363108, 0.7299037656777538]
